Showing preview only (5,481K chars total). Download the full file or copy to clipboard to get everything.
Repository: walaj/svaba
Branch: master
Commit: 645fab67c906
Files: 399
Total size: 131.2 MB
Directory structure:
gitextract_ugvk6jc2/
├── CLAUDE.md
├── CMakeLists.txt
├── Dockerfile
├── LICENSE
├── R/
│ └── archive_non_functional/
│ ├── create-databases.R
│ ├── gen_quals.R
│ ├── svaba-annotate.R
│ ├── svaba-asqg2pdf.R
│ ├── svaba-bam-qcplot.R
│ ├── svaba-benchmark.R
│ ├── svaba-bps-to-maflite.R
│ ├── svaba-circos.R
│ ├── svaba-create-pon.R
│ ├── svaba-event-plot.R
│ ├── svaba-histogram.R
│ ├── svaba-nozzle.R
│ ├── svaba-sig.R
│ └── svaba-vcf-to-maflite.R
├── README.md
├── docs/
│ ├── README.md
│ ├── alignments_viewer.html
│ ├── app.js
│ ├── bps_explorer.html
│ ├── bps_viewer.html
│ ├── comparison.html
│ ├── index.html
│ ├── learn_explorer.html
│ ├── r2c_explorer.html
│ ├── runtime_explorer.html
│ └── styles.css
├── notes
├── opt/
│ ├── jemalloc_test.sh
│ ├── memprof.sh
│ ├── memprof_osx.sh
│ ├── memusg.sh
│ ├── profiler.sh
│ └── runtime.R
├── scripts/
│ ├── combine_blacklists.sh
│ ├── extract_by_qname.sh
│ ├── extract_discordants.sh
│ ├── extract_pairs_by_seq.sh
│ ├── filter_contig_supporting_reads.sh
│ ├── gcloud_teardown.sh
│ ├── mosdepth_lowmapq_blacklist.sh
│ ├── plot_learn.sh
│ ├── r2c_for_contig.sh
│ ├── search_sequence.sh
│ ├── sort_and_dedupe_bps_old.sh
│ ├── sort_bps.sh
│ ├── svaba_cloud.sh
│ ├── svaba_local_function.sh
│ ├── svaba_postprocess.sh
│ └── update_svaba_image.sh
├── src/
│ ├── SGA/
│ │ ├── Algorithm/
│ │ │ ├── ClusterProcess.cpp
│ │ │ ├── ClusterProcess.h
│ │ │ ├── ConnectProcess.cpp
│ │ │ ├── ConnectProcess.h
│ │ │ ├── DPAlignment.cpp
│ │ │ ├── DPAlignment.h
│ │ │ ├── ErrorCorrectProcess.cpp
│ │ │ ├── ErrorCorrectProcess.h
│ │ │ ├── ExtensionDP.cpp
│ │ │ ├── ExtensionDP.h
│ │ │ ├── FMMergeProcess.cpp
│ │ │ ├── FMMergeProcess.h
│ │ │ ├── GapFillProcess.cpp
│ │ │ ├── GapFillProcess.h
│ │ │ ├── HaplotypeBuilder.cpp
│ │ │ ├── HaplotypeBuilder.h
│ │ │ ├── KmerOverlaps.cpp
│ │ │ ├── KmerOverlaps.h
│ │ │ ├── LRAlignment.cpp
│ │ │ ├── LRAlignment.h
│ │ │ ├── Makefile.am
│ │ │ ├── Makefile.in
│ │ │ ├── OverlapAlgorithm.cpp
│ │ │ ├── OverlapAlgorithm.h
│ │ │ ├── OverlapBlock.cpp
│ │ │ ├── OverlapBlock.h
│ │ │ ├── OverlapTools.cpp
│ │ │ ├── OverlapTools.h
│ │ │ ├── QCProcess.cpp
│ │ │ ├── QCProcess.h
│ │ │ ├── ReadCluster.cpp
│ │ │ ├── ReadCluster.h
│ │ │ ├── SearchHistory.cpp
│ │ │ ├── SearchHistory.h
│ │ │ ├── SearchSeed.cpp
│ │ │ ├── SearchSeed.h
│ │ │ ├── StatsProcess.cpp
│ │ │ ├── StatsProcess.h
│ │ │ ├── StringGraphGenerator.cpp
│ │ │ ├── StringGraphGenerator.h
│ │ │ ├── StringThreader.cpp
│ │ │ ├── StringThreader.h
│ │ │ ├── VariationBuilderCommon.cpp
│ │ │ └── VariationBuilderCommon.h
│ │ ├── Bigraph/
│ │ │ ├── Bigraph.cpp
│ │ │ ├── Bigraph.h
│ │ │ ├── Edge.cpp
│ │ │ ├── Edge.h
│ │ │ ├── EdgeDesc.cpp
│ │ │ ├── EdgeDesc.h
│ │ │ ├── GraphCommon.h
│ │ │ ├── Makefile.am
│ │ │ ├── Makefile.in
│ │ │ ├── Vertex.cpp
│ │ │ └── Vertex.h
│ │ ├── SGA/
│ │ │ ├── Makefile.am
│ │ │ ├── Makefile.in
│ │ │ ├── OverlapCommon.cpp
│ │ │ ├── OverlapCommon.h
│ │ │ ├── SGACommon.h
│ │ │ ├── index.cpp
│ │ │ ├── index.h
│ │ │ ├── overlap.cpp
│ │ │ └── overlap.h
│ │ ├── SQG/
│ │ │ ├── ASQG.cpp
│ │ │ ├── ASQG.h
│ │ │ ├── Makefile.am
│ │ │ ├── Makefile.in
│ │ │ ├── SQG.cpp
│ │ │ └── SQG.h
│ │ ├── StringGraph/
│ │ │ ├── CompleteOverlapSet.cpp
│ │ │ ├── CompleteOverlapSet.h
│ │ │ ├── GraphSearchTree.h
│ │ │ ├── Makefile.am
│ │ │ ├── Makefile.in
│ │ │ ├── RemovalAlgorithm.cpp
│ │ │ ├── RemovalAlgorithm.h
│ │ │ ├── SGAlgorithms.cpp
│ │ │ ├── SGAlgorithms.h
│ │ │ ├── SGSearch.cpp
│ │ │ ├── SGSearch.h
│ │ │ ├── SGUtil.cpp
│ │ │ ├── SGUtil.h
│ │ │ ├── SGVisitors.cpp
│ │ │ ├── SGVisitors.h
│ │ │ ├── SGWalk.cpp
│ │ │ └── SGWalk.h
│ │ ├── SuffixTools/
│ │ │ ├── BWT.h
│ │ │ ├── BWTAlgorithms.cpp
│ │ │ ├── BWTAlgorithms.h
│ │ │ ├── BWTCABauerCoxRosone.cpp
│ │ │ ├── BWTCABauerCoxRosone.h
│ │ │ ├── BWTCARopebwt.cpp
│ │ │ ├── BWTCARopebwt.h
│ │ │ ├── BWTDiskConstruction.cpp
│ │ │ ├── BWTDiskConstruction.h
│ │ │ ├── BWTIndexSet.h
│ │ │ ├── BWTInterval.h
│ │ │ ├── BWTIntervalCache.cpp
│ │ │ ├── BWTIntervalCache.h
│ │ │ ├── BWTReader.cpp
│ │ │ ├── BWTReader.h
│ │ │ ├── BWTReaderAscii.cpp
│ │ │ ├── BWTReaderAscii.h
│ │ │ ├── BWTReaderBinary.cpp
│ │ │ ├── BWTReaderBinary.h
│ │ │ ├── BWTTraverse.cpp
│ │ │ ├── BWTTraverse.h
│ │ │ ├── BWTWriter.cpp
│ │ │ ├── BWTWriter.h
│ │ │ ├── BWTWriterAscii.cpp
│ │ │ ├── BWTWriterAscii.h
│ │ │ ├── BWTWriterBinary.cpp
│ │ │ ├── BWTWriterBinary.h
│ │ │ ├── FMMarkers.h
│ │ │ ├── GapArray.cpp
│ │ │ ├── GapArray.h
│ │ │ ├── HitData.h
│ │ │ ├── InverseSuffixArray.cpp
│ │ │ ├── InverseSuffixArray.h
│ │ │ ├── Makefile.am
│ │ │ ├── Makefile.in
│ │ │ ├── Occurrence.cpp
│ │ │ ├── Occurrence.h
│ │ │ ├── PopulationIndex.cpp
│ │ │ ├── PopulationIndex.h
│ │ │ ├── QuickBWT.cpp
│ │ │ ├── QuickBWT.h
│ │ │ ├── RLBWT.cpp
│ │ │ ├── RLBWT.h
│ │ │ ├── RLUnit.h
│ │ │ ├── RankProcess.cpp
│ │ │ ├── RankProcess.h
│ │ │ ├── SACAInducedCopying.cpp
│ │ │ ├── SACAInducedCopying.h
│ │ │ ├── SAReader.cpp
│ │ │ ├── SAReader.h
│ │ │ ├── SAWriter.cpp
│ │ │ ├── SAWriter.h
│ │ │ ├── SBWT.cpp
│ │ │ ├── SBWT.h
│ │ │ ├── STCommon.cpp
│ │ │ ├── STCommon.h
│ │ │ ├── STGlobals.h
│ │ │ ├── SampledSuffixArray.cpp
│ │ │ ├── SampledSuffixArray.h
│ │ │ ├── SparseGapArray.h
│ │ │ ├── SuffixArray.cpp
│ │ │ ├── SuffixArray.h
│ │ │ ├── SuffixCompare.cpp
│ │ │ └── SuffixCompare.h
│ │ └── Util/
│ │ ├── Alphabet.cpp
│ │ ├── Alphabet.h
│ │ ├── BWT4Codec.h
│ │ ├── BWTCodec.h
│ │ ├── Bamreader.cpp
│ │ ├── BitChar.cpp
│ │ ├── BitChar.h
│ │ ├── BitVector.cpp
│ │ ├── BitVector.h
│ │ ├── BloomFilter.cpp
│ │ ├── BloomFilter.h
│ │ ├── ClusterReader.cpp
│ │ ├── ClusterReader.h
│ │ ├── Contig.cpp
│ │ ├── Contig.h
│ │ ├── CorrectionThresholds.cpp
│ │ ├── CorrectionThresholds.h
│ │ ├── DNACodec.h
│ │ ├── DNADouble.h
│ │ ├── DNAString.cpp
│ │ ├── DNAString.h
│ │ ├── EncodedString.h
│ │ ├── HashMap.h
│ │ ├── Interval.cpp
│ │ ├── Interval.h
│ │ ├── IntervalTree.h
│ │ ├── KmerDistribution.cpp
│ │ ├── KmerDistribution.h
│ │ ├── Makefile.am
│ │ ├── Makefile.in
│ │ ├── Match.cpp
│ │ ├── Match.h
│ │ ├── Metrics.h
│ │ ├── MultiAlignment.cpp
│ │ ├── MultiAlignment.h
│ │ ├── MultiOverlap.cpp
│ │ ├── MultiOverlap.h
│ │ ├── NoCodec.h
│ │ ├── Pileup.cpp
│ │ ├── Pileup.h
│ │ ├── PrimerScreen.cpp
│ │ ├── PrimerScreen.h
│ │ ├── Profiler.h
│ │ ├── Quality.cpp
│ │ ├── Quality.h
│ │ ├── QualityCodec.h
│ │ ├── QualityTable.cpp
│ │ ├── QualityTable.h
│ │ ├── QualityVector.cpp
│ │ ├── QualityVector.h
│ │ ├── Read2Contig.h
│ │ ├── ReadInfoTable.cpp
│ │ ├── ReadInfoTable.h
│ │ ├── ReadTable.cpp
│ │ ├── ReadTable.h
│ │ ├── ReadTableNew.cpp
│ │ ├── ReadTableNew.h
│ │ ├── ReadTableS.h
│ │ ├── SGAStats.cpp
│ │ ├── SGAStats.h
│ │ ├── SeqCoord.cpp
│ │ ├── SeqCoord.h
│ │ ├── SeqReader.cpp
│ │ ├── SeqReader.h
│ │ ├── SimpleAllocator.h
│ │ ├── SimplePool.h
│ │ ├── StdAlnTools.cpp
│ │ ├── StdAlnTools.h
│ │ ├── Timer.h
│ │ ├── Util.cpp
│ │ ├── Util.h
│ │ ├── VCFUtil.cpp
│ │ ├── VCFUtil.h
│ │ ├── VariantIndex.cpp
│ │ ├── VariantIndex.h
│ │ ├── Verbosity.h
│ │ ├── bamreader.h
│ │ ├── bucketSort.cpp
│ │ ├── bucketSort.h
│ │ ├── gzstream.C
│ │ ├── gzstream.h
│ │ ├── mkqs.h
│ │ ├── old.AlignedContig.h
│ │ ├── stdaln.c
│ │ └── stdaln.h
│ ├── svaba/
│ │ ├── AlignedContig.cpp
│ │ ├── AlignedContig.h
│ │ ├── AlignmentFragment.cpp
│ │ ├── AlignmentFragment.h
│ │ ├── BamStats.cpp
│ │ ├── BamStats.h
│ │ ├── BreakPoint.cpp
│ │ ├── BreakPoint.h
│ │ ├── ContigAlignmentScore.cpp
│ │ ├── ContigAlignmentScore.h
│ │ ├── DBSnpFilter.cpp
│ │ ├── DBSnpFilter.h
│ │ ├── DiscordantCluster.cpp
│ │ ├── DiscordantCluster.h
│ │ ├── DiscordantRealigner.cpp
│ │ ├── DiscordantRealigner.h
│ │ ├── Histogram.cpp
│ │ ├── Histogram.h
│ │ ├── KmerFilter.cpp
│ │ ├── KmerFilter.h
│ │ ├── LearnBamParams.cpp
│ │ ├── LearnBamParams.h
│ │ ├── ReadToContigAligner.h
│ │ ├── STCoverage.cpp
│ │ ├── STCoverage.h
│ │ ├── SvabaASQG.cpp
│ │ ├── SvabaASQG.h
│ │ ├── SvabaAssemble.cpp
│ │ ├── SvabaAssemble.h
│ │ ├── SvabaAssemblerConfig.h
│ │ ├── SvabaAssemblerEngine.cpp
│ │ ├── SvabaAssemblerEngine.h
│ │ ├── SvabaBamWalker.cpp
│ │ ├── SvabaBamWalker.h
│ │ ├── SvabaDebug.h
│ │ ├── SvabaFileLoader.cpp
│ │ ├── SvabaFileLoader.h
│ │ ├── SvabaLogger.cpp
│ │ ├── SvabaLogger.h
│ │ ├── SvabaModels.cpp
│ │ ├── SvabaModels.h
│ │ ├── SvabaOptions.cpp
│ │ ├── SvabaOptions.h
│ │ ├── SvabaOutputWriter.cpp
│ │ ├── SvabaOutputWriter.h
│ │ ├── SvabaOverlapAlgorithm.cpp
│ │ ├── SvabaOverlapAlgorithm.h
│ │ ├── SvabaPostprocess.cpp
│ │ ├── SvabaPostprocess.h
│ │ ├── SvabaRead.cpp
│ │ ├── SvabaRead.h
│ │ ├── SvabaRegionProcessor.cpp
│ │ ├── SvabaRegionProcessor.h
│ │ ├── SvabaSharedConfig.h
│ │ ├── SvabaThreadUnit.cpp
│ │ ├── SvabaThreadUnit.h
│ │ ├── SvabaUtils.cpp
│ │ ├── SvabaUtils.h
│ │ ├── refilter.cpp
│ │ ├── refilter.h
│ │ ├── run_svaba.cpp
│ │ ├── svaba.cpp
│ │ ├── test_svaba.cpp
│ │ ├── threadpool.h
│ │ ├── tovcf.cpp
│ │ ├── vcf.cpp
│ │ └── vcf.h
│ └── svabautils/
│ ├── AssemblyBamWalker.cpp
│ ├── AssemblyBamWalker.h
│ ├── BamSplitter.cpp
│ ├── BamSplitter.h
│ ├── Fractions.cpp
│ ├── Fractions.h
│ ├── Makefile.am
│ ├── Makefile.in
│ ├── PowerLawSim.cpp
│ ├── PowerLawSim.h
│ ├── ReadSim.cpp
│ ├── ReadSim.h
│ ├── SeqFrag.cpp
│ ├── SeqFrag.h
│ ├── SimGenome.cpp
│ ├── SimGenome.h
│ ├── SimTrainerWalker.cpp
│ ├── SimTrainerWalker.h
│ ├── assembly2vcf.cpp
│ ├── assembly2vcf.h
│ ├── benchmark.cpp
│ ├── benchmark.h
│ ├── configure
│ ├── configure.ac
│ ├── snowmanutils.cpp
│ ├── snowtools.cpp
│ ├── splitcounter.cpp
│ └── splitcounter.h
├── svaba_jemalloc
└── tracks/
├── README.md
├── genome.hg38.sorted.bed
├── hg38.bed
├── hg38.blacklist.sorted.bed
├── hg38.combined_blacklist.bed
├── hg38.high_runtime.bed
├── hg38.manual.blacklist.bed
├── hg38.nonstd_chr.blacklist.bed
├── hg38.rmsk.simple_repeat.bed
├── hg38_arms_excl_centromeres.bed
├── lowmap30perc.bed
├── lowmap50perc.bed
├── lowmap70perc.bed
└── region_generator.R
================================================
FILE CONTENTS
================================================
================================================
FILE: CLAUDE.md
================================================
# CLAUDE.md — svaba working notes
This file captures conventions, file landmarks, and open investigations for the
svaba SV/indel caller project so future sessions can pick up quickly. Update it
as understanding changes — it's the crash-safety net, not the README.
## Project at a glance
svaba is a structural variant (SV) and indel caller that uses local assembly +
read realignment to call variants from short-read BAMs. The canonical use case
is tumor/normal somatic calling, but it also supports germline and multi-sample
modes.
Top-level layout:
- `src/svaba/` — main C++ sources. Entry point is `run_svaba.cpp`; assembly,
realignment, breakpoint scoring, VCF output, and postprocess all live here.
File-naming convention is **PascalCase** — e.g. `SvabaOptions.cpp`,
`SvabaOutputWriter.h`. A few intentional exceptions: `refilter.cpp/h`,
`run_svaba.cpp`, `svaba.cpp`, `threadpool.h`, `tovcf.cpp`, `vcf.cpp/h`.
- `src/SGA/` — String Graph Assembler sources (vendored).
- `SeqLib/` — vendored htslib/bwa/fermi-lite wrapper used for BAM I/O,
alignment, and assembly primitives. See "Build system" for how its flags
get set.
- `bin/`, `build/` — build artifacts; don't edit by hand.
- `R/`, `viewer/`, `tracks/` — downstream analysis/visualization helpers.
- `tests/`, `example_data/` — test fixtures.
- `scripts/` — post-processing and utility shell helpers, all kept here
(not at the repo root): `svaba_postprocess.sh`, `combine_blacklists.sh`,
`extract_discordants.sh`, `filter_contig_supporting_reads.sh`,
`r2c_for_contig.sh`, `sort_bps.sh`, `svaba_cloud.sh`. Profiling
helpers (`memprof*.sh`) live under `opt/` (the user's ad-hoc tooling
dir).
- `somlod_maxlod_analysis.html` — deep-dive writeup of the somatic log-odds
scoring model. See "Statistical model" below.
Scripts that used to live here and are gone, in case you're looking for them:
`sort_output.sh` and `sort_and_deduplicate_bps.sh` were subsumed into the
unified `svaba_postprocess.sh`.
## Build system
`cmake -B build && cmake --build build` defaults to `CMAKE_BUILD_TYPE =
RelWithDebInfo`, which gives you **`-O2 -g -DNDEBUG -fno-omit-frame-pointer`**
for the svaba/SeqLib/SGA C++ code. Not O0, not O3.
Critical gotcha: **the `-O2` is hardcoded across all of the vendored
submodules**, and CMake's build-type doesn't reach them:
- `SeqLib/bwa/Makefile`: `CFLAGS = -g -Wall -Wno-unused-function -O2`
(hardcoded; doesn't read the parent CMake). Makes `libbwa.a`, owner of ~38%
of wall-time in real runs.
- `SeqLib/fermi-lite/Makefile`: `CFLAGS = -g -Wall -O2 -Wno-unused-function`
(same story). Makes `libfml.a`, ~27% of wall-time.
- `SeqLib/CMakeLists.txt:10` does `set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")`
— appends `-O2` to whatever you passed, and because later `-O` wins on
gcc/clang, this silently defeats any top-level attempt to set `-O3` via
`CMAKE_CXX_FLAGS` unless you pass it via `CMAKE_CXX_FLAGS_<CONFIG>` (which
is appended after, and therefore wins).
- htslib is external; whoever built yours picked its flags.
To push the submodules to `-O3 -mcpu=native` (the single biggest free-perf
knob on Apple Silicon — 65% of compute is at -O2 generic right now):
```bash
make -C SeqLib/bwa clean
make -C SeqLib/fermi-lite clean
make -C SeqLib/bwa -j CFLAGS="-g -O3 -mcpu=native -fno-omit-frame-pointer -Wall -Wno-unused-function"
make -C SeqLib/fermi-lite -j CFLAGS="-g -O3 -mcpu=native -fno-omit-frame-pointer -Wall -Wno-unused-function"
cd build && make -j
```
Sanity-check: `make VERBOSE=1 2>&1 | grep -oE -- "-O[0-9sg]" | sort | uniq -c`.
## Statistical model — the files that matter
For anything related to how variants are scored (LOD, somatic vs germline,
error model), the two files to read first are:
- `src/svaba/SvabaModels.cpp` / `.h` — self-contained statistical primitives.
- `LogLikelihood(d, a, f, e_fwd, e_rev)` (~lines 11-63) is the per-sample
two-state error model: `p_ref = (1-f)(1-e_fwd) + f*e_rev` and
`p_alt = f(1-e_rev) + (1-f)*e_fwd`. Returns log10 likelihood of observing
`a` alt reads out of `d`. This is the primitive every higher-level score
is built from.
- `SomaticLOD(...)` (~lines 70-83) — public wrapper; forwards to the
split-error implementation.
- `SomaticLOD_withSplitErrors(...)` (~lines 86-189) — the active somatic
model. Enumerates sub-hypotheses: `SOM_true`, `SOM_art`, `GERM_het`,
`GERM_hom`, `GERM_art`, `GERM_shared`. Returns
`log10( P(somatic) / P(any non-somatic) )`.
- Line ~79: `const double eN_fwd = std::min(e_art_fwd, 0.005);` — hard cap
on the normal-sample forward error rate. This cap is important: it means
even in regions where the artifact model infers a high error rate, the
normal sample is assumed to be clean. That's the knob you'd relax if you
want somlod to be "artifact-aware" on the normal side.
- Lines ~138-147: `GERM_shared` free-MLE branch. This is the sub-hypothesis
that fits a single pooled allele fraction across tumor+normal. It exists
to catch LOH (loss of heterozygosity) germline events where tumor VAF can
be much higher than 0.5 while normal is still ~0.5. It is also the main
reason `somlod` asymptotes slowly as tumor alt-support grows (see below).
- `src/svaba/BreakPoint.cpp` / `.h` — per-breakpoint scoring glue. Points of
interest:
- `BreakPoint::score_somatic()` at ~line 975 is the entry point that sets
`LO_s` (the somatic LOD). Calls
`SvabaModels::SomaticLOD(scaled_alt_n, a_cov_n, scaled_alt_t, a_cov_t, error_fwd, error_rev)`
around lines 1028-1031.
- `SampleInfo::modelSelection()` (~lines 1562-1674) computes the per-sample
`LO = ll_alt - ll_err` at line ~1610. These are unnormalized log10
likelihoods — absolute value is not meaningful, the difference is.
- `max_lod` is computed at lines ~198-200 and ~1196-1198 as the max of
`al.LO` across samples. This is the "is this artifact or not" score; it
does grow with additional supporting reads because it compares a variant
hypothesis to a pure-error hypothesis with no germline branch.
- Lines ~1072-1073: the current INDEL somatic gate only tests `somlod` —
it does not use `maxlod` as a co-gate. This is one of the levers in the
proposed fixes.
## The somlod / maxlod investigation (still open)
**Problem statement.** Users observe that `maxlod` grows with tumor alt
support, but `somlod` barely moves once tumor alt-support gets high. For a
clean normal (`aN=0` or `aN=1`) you'd naively expect `somlod` to also keep
climbing as tumor evidence accumulates, but it asymptotes around ~9 for
`dN ≈ 30`.
**Diagnosis.**
- There is a real statistical ceiling on somlod roughly equal to
`dN · log10(1/(1-fT_hat))`. With 30 normal reads you can only ever rule out
a shared germline hypothesis so hard — it's a finite amount of evidence.
- The `GERM_shared` free-MLE branch makes `somlod` *approach* that ceiling
slowly at sub-clonal tumor VAFs: when the MLE of a pooled AF lands in a
germline-plausible band, it provides a strong non-somatic explanation that
the somatic hypothesis has to beat.
- You cannot simply delete `GERM_shared` because doing so makes LOH germline
cases (e.g. `aN=15/dN=30, aT=285/dT=300`) return `somlod ≈ +36`, a false
positive. Verified numerically while writing the analysis.
- The `eN_fwd ≤ 0.005` cap means that in truly high-artifact regions, the
model refuses to "forgive" 2-3 alt reads in the normal as artifacts, which
is both good (prevents somatic false positives where normal is contaminated
by real signal) and limiting (prevents somatic calls where the artifact
model really does explain the normal reads).
**Proposed fixes (see `somlod_maxlod_analysis.html` for the full writeup).**
- Fix 1 — disjunction gate on `GERM_shared`: only let the free-MLE pooled
branch influence somlod when `shared_is_germline_plausible || normal_evidence > 1.0`,
where `normal_evidence = LL_N(dN, aN, f_n_mle) - LL_N(dN, aN, 0)`. This is
error-rate aware and is the fix I'd land first.
- Fix 2 — loosen `eN_fwd` cap in known high-artifact regions.
- Fix 3 — BIC penalty on the free-MLE branch (1 free parameter costs
~`0.5 log10(dN+dT)` log10-likelihood units (bans; 1 ban ≈ 3.32 bits) of evidence).
- Fix 4 — joint `maxlod` + `somlod` gate for INDELs (require both above
threshold), since `maxlod` moves freely with tumor depth.
- Fix 5 — debug dump of sub-hypothesis LLs when `somlod` is within some
epsilon of the gate, to make future failures diagnosable.
**Related fix landed already (SvABA2.0 v3 split-coverage gate):** the
old `both_split && homlen > 0` / `one_split && homlen == 0` branching
in `BreakPoint::splitCoverage` was removed. A read is now credited as
a split-supporter iff (a) its r2c alignment scores strictly higher
than its native alignment (`r2c_score > native_score`, no percentage
margin — see `src/svaba/SvabaOptions.h`), and (b) it spans at least
one breakend on the contig. Long junction homology → r2c and native
tie → read doesn't credit either sample, which is the correct
conservative behavior (rather than the old "homology=0 one_split is
fine, homology>0 you need both_split" which nuked normal support
specifically when homology was long, biasing toward somatic calls).
The repeat_seq-length padding on the buffers is also gone — same
rationale, subsumed by the comparative score gate. See the user-
facing bp-id (v3 schema) work for how to trace a specific read's
current support attribution end-to-end.
**v3.1 fix — remove T_R2C_MIN_MARGIN (set to 0):**
the 10% `T_R2C_MIN_MARGIN` was killing all tumor alt-supporting reads
for small indels. A 1bp deletion on a 150bp read gives r2c=150 vs
native=143, a 4.9% improvement — mathematically impossible to clear
the 10% threshold. Traced via the `SVABA_TRACE_CONTIG` system on
contig `c_fermi_chr2_215869501_215894501_13C` (CIGAR `392M1D530M`):
every tumor read hit TP8 with r2c=150 vs threshold=157.3 → SKIP →
0 split support → LOWLOD → hasMinimal fail → variant dropped.
Fix: set `T_R2C_MIN_MARGIN` to 0.0 in `SvabaOptions.h` — same as
normal, strict greater-than only. Any percentage margin is inherently
read-length-dependent: a 1bp del gives 4.9% improvement on 150bp
reads but only 2.9% on 250bp reads, so any fixed percentage either
blocks long reads or is too loose for short reads. There's no
percentage that works across all read lengths.
The margin was belt-and-suspenders on top of the LOD model. In the
junction-homology case it was designed for: if both tumor and normal
credit borderline reads equally (r2c barely > native by 1-2 points),
the downstream LOD model sees similar split support in both samples →
low somlod → correctly not called somatic. Normal already used
margin=0, so the asymmetry was the only thing preventing normal from
crediting those reads — and it wasn't, because N_R2C_MIN_MARGIN was
always 0. The somatic/germline distinction is the LOD model's job.
**Important correctness notes (earned the hard way):**
- Don't propose `aN >= 2` style hard count gates without an error-rate
adjustment. In a high-artifact region, 2-3 normal alt reads can be
genuine artifacts, and gating on raw counts overcalls the case away. Always
reason about `normal_evidence` (LL delta against `f=0`) instead of raw `aN`.
- A LL ratio reported by `LogLikelihood` is log10. Multiplying by ~3.32 gives
bits. Per-read surprise is `(LL_alt - LL_ref) / d`. Absolute LL values have
no meaning — always compare two hypotheses at the same data.
- The ~9 ceiling at `dN=30` is a real statistical bound; no reformulation of
the somatic test can push above it. `GERM_shared` changes the *slope* of
approach, not the asymptote.
## Postprocess pipeline
Everything post-`svaba run` is orchestrated by `scripts/svaba_postprocess.sh`.
It's the unified replacement for the old `sort_output.sh` +
`sort_and_deduplicate_bps.sh` pair, which no longer exist.
Five steps per invocation, all idempotent (missing inputs log a one-liner and
continue):
1. **Merge per-thread BAMs** — `${ID}.thread*.${suffix}.bam` → `${ID}.${suffix}.bam`
for `discordant` / `weird` / `corrected`. Single-file inputs are moved;
no-file inputs silently skipped.
2. **`svaba postprocess`** — the C++ subcommand in `src/svaba/SvabaPostprocess.cpp`.
For each suffix (`weird`, `corrected`, `discordant`, `contigs`):
- `samtools sort -@ per_job_threads -m MEM` (shell out — htslib doesn't
expose its sort as a library call). **Auto-skipped** when the BAM
already declares `@HD SO:coordinate` — `isCoordinateSorted()`
inspects the header via `readHeaderOnly()` and logs
"already coordinate-sorted; skipping sort" so reruns are a no-op.
- Native streaming dedup (only for `weird`/`corrected`/`discordant`):
reads BAM via SeqLib::BamReader, collapses exact (qname, flag)
duplicates at each locus, and **unions their `bi:Z` / `bz:Z` comma-token
lists** so alt-supporting-contig evidence isn't lost when the same read
got emitted by two overlapping assembly windows. Key function:
`mergeCommaTokens` — boundary-aware union, mirrors
`SvabaOutputWriter::stamp_tag`.
Both the reader and writer have the htslib BGZF thread pool enabled
via a new `SeqLib::BamReader::SetThreads(int)` /
`SeqLib::BamWriter::SetThreads(int)` API that calls
`hts_set_threads(fp, n)`. `streamDedup` accepts a `threads` parameter
(wired from the full postprocess budget, not the per-suffix slice —
see two-phase note below) and applies it on both sides.
Without this pool, BGZF decompress + compress is single-threaded
and dominates wall time (40 GB BAM → ~2 hours). With `-t 4..8`
this typically drops to 25–40 min, a 3–5× speedup.
**Buckets-clear gotcha** (landed alongside the thread-pool fix):
the per-locus `idx_by_key` is a `std::unordered_map<std::string,
size_t>`. `unordered_map::clear()` is **O(bucket_count)** — it
memsets the entire bucket array even when size is 0, and
`bucket_count` only grows, never shrinks. One pileup locus
(centromere / simple repeat / HLA) inflates buckets to 100k+
for the rest of the BAM, and every subsequent locus transition
(hundreds of millions of them) pays that memset. Pre-fix perf
showed 95% of main-thread CPU going to
`__memset_evex_unaligned_erms` called from `unordered_map::clear`.
`flushLocus` now swaps with a fresh small map when
`bucket_count() > 256`, paying the inflated-map destructor cost
ONCE per pileup exit rather than once per locus. Lesson: never
trust `unordered_map::clear()` in a transient reuse pattern where
the map briefly inflates.
**Two-phase driver.** `svaba postprocess` runs in two phases so the
thread budget lands where it actually helps:
Phase 1 (PARALLEL): samtools sort across all active suffixes
concurrently, each worker with `o.threads / n_active` threads.
Sort is disk+CPU bound and scales linearly across files.
Phase 2 (SERIAL): dedup + reheader + index, one BAM at a time,
each with the full `o.threads` as its BGZF pool. Serial here
is deliberate — running dedup in parallel across suffixes
would oversubscribe (each BAM needs its own read+write BGZF
pool), and BGZF parallelism has diminishing returns so
`4 workers × 2 threads` is worse wall-clock than
`1 worker × 8 threads` iterated four times.
**Idempotency.** Every phase has its own auto-skip so rerunning
the pipeline on an already-finished BAM is essentially instant:
- Phase 1 sort: `isCoordinateSorted()` inspects `@HD SO:coordinate`
and bypasses the sort when already done.
- Phase 2 dedup: `hasSvabaPostprocessPg()` scans the @PG chain for
any `svaba_postprocess` (or uniquified `.1`, `.2` variants); if
present, dedup AND the subsequent reheader step are both skipped
(only the `.bai` is rebuilt, which is cheap and covers the
missing-index case). The first successful `streamDedup` stamps
the PG line, so a second `svaba_postprocess.sh` run on the same
outputs no-ops almost entirely.
- The shell-layer merge step (`scripts/svaba_postprocess.sh` step 1)
is already a no-op when per-thread `.thread*.bam` files aren't
present (nothing to merge), so all three steps compose naturally.
- **@PG stamp**: writes an `@PG ID:svaba_postprocess PN:svaba VN:<ver> CL:<argv> PP:<prev chain tail>`
line into the output header. For dedup-eligible suffixes this is free
(done in the writer during dedup); for `contigs` (no dedup) it's done
via `samtools reheader`. ID is auto-uniquified if `svaba postprocess`
has been run before on the same BAM.
- **BAI index** via `sam_index_build(fn, 0)` directly from htslib — one
less subprocess.
- Intermediate filenames use a `.postprocess.*.tmp.bam` suffix and are
renamed over on success / unlinked on failure. End state per suffix is
exactly `${ID}.${suffix}.bam(.bai)`; no `.sorted` / `.deduped` flotsam.
3. **Sort + dedup + PASS-filter `bps.txt.gz`** — external-memory sort via
GNU sort (`gsort` on macOS from homebrew coreutils). Sort keys:
`chr1(V), pos1(n), strand1, chr2(V), pos2(n), strand2, maxlod(gr)` —
maxlod descending so the "best SV per junction" survives the dedup.
Produces three files: `.bps.sorted.txt.gz`, `.bps.sorted.dedup.txt.gz`
(one row per unique breakpoint pair), `.bps.sorted.dedup.pass.txt.gz`
(col 32 == "PASS" only).
- Column positions hard-coded: `col 30 = cname (contig_and_region)`,
`col 32 = confidence`, `col 38 = maxlod`. These come from
`BreakPoint::toFileString` — change there and the script breaks.
4. **Filter `r2c.txt.gz` to PASS contigs (+ PASS-somatic subset)** —
resolves PASS cnames from `bps.txt.gz` (col 32 == "PASS") and the
PASS-somatic subset (PASS rows that additionally have col 37, `somlod`, >= 1) in one pass. Then
one pass over `r2c.txt.gz` writes two outputs via awk pipe-to-command
(gzip compressors run in parallel as child processes):
- `${ID}.r2c.pass.txt.gz` (all PASS)
- `${ID}.r2c.pass.somatic.txt.gz` (PASS ∩ somlod >= 1)
Both are cname-keyed; contig and read rows survive together since
they share col 2. Either is suitable for `bps_explorer.html`'s
alignments sub-panel; the somatic subset is the lighter load when
you only care about the tumor-specific calls.
5. **Optional split-by-source** — `--split-by-source` (or env
`SPLIT_BY_SOURCE=1`) demuxes the deduped BAMs by the first 4 chars of
each QNAME into `${ID}.${suffix}.${prefix}.bam`.
CLI: `scripts/svaba_postprocess.sh -t THREADS -m MEM [other flags] <ID>`. Flags:
`-t/--threads`, `-m/--mem`, `--sort-buffer`, `--split-by-source`,
`--input-dir`, `--output-dir`, `--svaba`, `--keep-tmp`, `--skip-bam`,
`--skip-dedup`, `--skip-bps`, `--skip-r2c`, `--skip-split`, `-h/--help`.
`--skip-dedup` maps to `--sort-only` on the C++ CLI: keeps sort + @PG +
index but skips the dedup pass. Combined with the C++'s auto-skip of
sort when the BAM header already declares `@HD SO:coordinate`
(`isCoordinateSorted()` in `SvabaPostprocess.cpp`), a rerun on
already-postprocessed files is effectively instant — useful for
refreshing just the index or PG stamp. Env fallbacks for
backward compat: `THREADS`, `MEM`, `BUFFER_SIZE`, `SVABA`, `SAM`,
`INPUT_DIR`, `OUTPUT_DIR`, `SPLIT_BY_SOURCE`, `KEEP_TMP`.
Gotcha: `zcat` on macOS is BSD zcat (looks for `.Z`), not GNU. This script
uses `gzip -dc` everywhere for portability — if you add a new decompression
step, do the same or it'll fail on Mac.
## `svaba tovcf` subcommand
Standalone converter: pre-deduplicated `bps.txt.gz` → two VCFv4.5 files
(`${id}.sv.vcf.gz` + `${id}.indel.vcf.gz`). Lives in `src/svaba/tovcf.cpp`;
dispatch wired into `src/svaba/svaba.cpp`. Runs the VCFFile parser with
`skip_dedup=true` because the postprocess pipeline has already sorted
and deduplicated the bps.txt.gz upstream — the internal interval-tree
dedup would just be wasted work and can spuriously dup things via its
SV-pileup blacklist.
Design calls made (see `docs/vcf-design-decisions.md` if created; for
now this is the reference):
- **VCF spec:** declares `##fileformat=VCFv4.5` (latest formal spec,
Oct 2024). Backwards-compatible at the record level with anything
that accepts 4.2+.
- **File structure:** one SV VCF + one indel VCF per sample-set. Every
record (somatic + germline) goes into both files; somatic rows carry
the `SOMATIC` flag INFO so `bcftools view -f .,PASS -i 'INFO/SOMATIC'`
peels them off cleanly. Replaces the older 4-file split
(sv.somatic / sv.germline / indel.somatic / indel.germline).
- **SV representation:** intrachrom events with unambiguous orientation
become single-record symbolic alleles — `+/-` → `<DEL>`,
`-/+` → `<DUP>`, `+/+` or `-/-` → `<INV>`. Inter-chrom and anything else falls
through to paired BND records with mate-bracket notation. Override
with `--always-bnd` to force BND for every SV (useful if downstream
tooling gets confused by symbolic alleles). Classification lives in
`classify_symbolic_kind()` in `vcf.cpp`.
- **EVENT grouping:** both BND records of a pair get `EVENT=<bp_id>`
(taken from col 52 of bps.txt.gz, the v3 per-BP identifier). This
uses the same namespace as `r2c.txt.gz`'s `split_bps`/`disc_bps` and
the BAM `bi:Z` tag, so a user can follow a single variant across all
svaba outputs with one key.
- **QUAL column:** defaults to `.` (missing). QUAL was historically the
Phred of `Σ per-sample LO`, which is strongly correlated with
`INFO/MAXLOD` but can mislead users into filtering on QUAL when they
should be filtering on SOMLOD / MAXLOD / FILTER. `.` is VCF-spec-valid
missing and makes downstream tools fall through to the INFO fields
where the canonical scores live. Override with `--qual maxlod`
(writes `round(10 * maxlod)` capped at 99) or `--qual sum` (legacy).
- **SVCLAIM:** per VCF 4.5. `J` for pure assembly-only SVs, `DJ` for
ASDIS (both assembly + discordant evidence), `D` for discordant-only.
svaba is junction-native so most calls end up `J` or `DJ`.
Flag surface:
```
svaba tovcf -i BPS.txt.gz -b BAM -a ID [options]
--sv-out FILE override SV path (default ${ID}.sv.vcf.gz)
--indel-out FILE override indel path (default ${ID}.indel.vcf.gz)
--plain write plain .vcf (no bgzip)
--always-bnd force BND for every SV
--qual MODE missing (default) | maxlod | sum
--include-nonpass include FILTER != PASS records
--dedup re-run legacy interval-tree dedup
-v / --verbose
-h / --help
```
Gotcha: the BAM is required only for the chromosome name/length table
(used by contig `##contig` lines in the VCF header and by the
BreakPoint parser to turn chrom-name strings into chr IDs). No reads are
actually read from it — any BAM that shares the reference is fine.
Not yet done on this subcommand: proper BGZF output. Without `--plain`,
the current output is plain gzip, which bcftools accepts but tabix cannot
index correctly. For now, pipe through `bcftools sort -Oz` + `tabix -p vcf`
afterwards. If this becomes painful, revisit with htslib's `hts_open` +
`vcf_write` path instead of ogzstream.
## BreakPoint IDs (v3 schema)
Every BP gets a unique stable identifier of the form `bpTTTNNNNNNNN`
where `TTT` is the 3-digit zero-padded worker thread ID and
`NNNNNNNN` is an 8-digit zero-padded per-thread running counter.
Generation is lock-free: the counter lives on `svabaThreadUnit` (see
`next_bp_id()` in `src/svaba/SvabaThreadUnit.h`). Assignment happens
exactly once per BP in `SvabaRegionProcessor::process()` right before
the pointer is pushed into `unit.m_bps`. Because `AlignedContig`
holds BPs via `shared_ptr`, the id mutation is immediately visible
through every reference (global, multi-map, indel breaks).
The id lands as the 52nd core column of `bps.txt.gz` (right after
`flipped`, before per-sample blocks — this is the v3 schema; v2 had
51 cols, LEGACY had 41). It's also carried into `r2c.txt.gz` (see
next section) so a read's support attribution is unambiguously
linked to the exact BP row in bps.txt, eliminating the old "which BP
on this contig did this read actually support?" puzzle.
`svaba_bps_cols` (from `scripts/svaba_local_function.sh`) documents
the full layout; column 52 is the bp_id field.
**BAM aux tags `bi:Z` / `bz:Z` (v3).** The two aux tags svaba stamps
on weird/discordant/corrected BAM outputs now live in *different*
identifier namespaces — choose the right one for the join you want:
- `bi:Z` — comma-joined list of **bp_ids** this read supports as
ALT. Matches the per-BP resolution of `r2c.txt.gz`'s `split_bps`
/ `disc_bps` columns and `bps.txt.gz`'s col 52. Pre-v3 this
carried cnames (contig-level), which couldn't disambiguate a
contig that hosted multiple BPs (global + multi + indel). To pull
every ALT-supporting read for a specific variant row:
`samtools view corrected.bam | grep bi:Z:bp00100000042`. The tag
is populated in `SvabaRegionProcessor::process` at the BP
finalization loop (`tag_with_bp_id` lambda), mirrored into
`svabaThreadUnit::alt_bp_ids_by_name` for the corrected-BAM
restamp path in `SvabaOutputWriter::writeUnit`.
- `bz:Z` — comma-joined list of **cnames** this read r2c-aligned
to. Stays cname-keyed because "which contigs did this read align
to" is a contig-level concept. Populated inside the r2c loop in
`SvabaRegionProcessor` alongside `svabaRead::AddR2C(...)`,
mirrored into `svabaThreadUnit::all_cnames_by_name`. Superset of
`bi:Z` in the sense that every ALT-supporter r2c-aligned to its
contig, but uses a different key namespace, so join back to
bps.txt by col 30 (cname) here vs col 52 (bp_id) for `bi:Z`.
`scripts/r2c_for_contig.sh` defaults to `bz:Z` to pull all r2c'd
reads for a contig; set `TAG=bi` and pass a bp_id instead of a
cname if you want the ALT-supporter subset for a specific variant.
## r2c TSV format (re-plot-able alignments)
`${ID}.r2c.txt.gz` is the structured, re-plot-able alignment dump. It
replaces the old pre-rendered `${ID}.alignments.txt.gz` ASCII output,
which has been removed entirely — anything that lived implicitly in the
ASCII art (fragment orientation, leading/trailing soft-clip bases, etc.)
is now available as an explicit field in the TSV, and
`viewer/r2c_explorer.html` re-plots it in-browser on demand.
**Per-thread emission + postprocess merge.** Each svaba worker writes its
own `${ID}.thread${N}.r2c.txt.gz` during the run (stream lives in
`svabaThreadUnit::r2c_out_`; opened in the ctor, closed in the dtor,
gated on `opts.dump_alignments`). The write happens in
`SvabaOutputWriter::writeUnit` **before** `writeMutex_` is acquired —
each thread's gzip stream is independent, so deflate runs in parallel
across all workers. The first worker (threadId == 1; the worker pool in
`threadpool.h` numbers threads 1..N, so there is no thread 0) writes the
column-header line on open; other threads start with data.
`scripts/svaba_postprocess.sh` step 1 merges the per-thread files via
`cat`: gzip is concatenation-safe per RFC 1952, and the postprocess step
numerically sorts `.threadN.r2c.txt.gz` so thread 1 is first in the cat,
which means the merged file has exactly one header at the top. This is
the same architectural pattern as the per-thread BAM writers; for the
rationale, see "Perf notes" — in short, a mutex-shared gzip stream
serializes `deflate()` across all threads, losing ~15/16 of compression
parallelism at `-p 16`.
Gotcha: the header-writing branch is keyed to `threadId == 1` in
`SvabaThreadUnit::svabaThreadUnit`. Older revisions checked
`threadId == 0`; that never fired because the pool hands out 1..N,
which silently produced headerless `r2c.txt.gz` files. If you ever
change the worker-numbering base, that line must change with it.
Schema (documented by `AlignedContig::r2cTsvHeader()` in
`src/svaba/AlignedContig.cpp`) — 21 tab-separated columns, record-type
discriminated:
```
record_type contig_name contig_len contig_seq frags bps n_reads
# contig-only fields above — NA on read rows
read_id read_chrom read_pos read_flag r2c_cigar r2c_start
r2c_rc r2c_nm support split_bps disc_bps r2c_score
native_score read_seq
# read-only fields — NA on the contig row
```
`split_bps` / `disc_bps` are comma-joined `bp_id` lists — the unambiguous
per-BP attribution of each read. The categorical `support` field
(`split` / `disc` / `both` / `none`) is derived from these (whichever is
non-NA) and kept for grep-friendliness. The `bps` field on the contig
row also carries each BP's id as the 2nd subfield so viewers can join
back without a second file. On older r2c files without these columns
(pre-v3 emitter) the viewer falls back to the categorical support only.
One `contig` row per variant-bearing contig, followed by one `read` row per
r2c-aligned read. `contig_name` is the shared join key (same value as
bps.txt's col-30 `cname`), so grouping reads back to their contig is O(n).
Format details:
- `frags`: `|`-separated per fragment; within a frag, `:`-separated
`chr:pos:strand:cigar:mapq:cpos_break1:cpos_break2:gpos_break1:gpos_break2:flipped`.
- `bps`: `|`-separated; within a bp, `:`-separated
`kind:chr1:pos1:strand1:chr2:pos2:strand2:span:insertion`. `kind ∈ {global, multi, indel}`.
Insertion is `.` when absent so the field count stays fixed at 9.
- All cell values are TSV-escaped (tab/CR/LF → space) via `r2cEscape`.
- Per-sample support classification (`split`/`disc`/`both`/`none`) and the
scores are computed **identically** to the ASCII emitter — same
`r2c_score > native_score` gate, same `corrected_native_cig` precedence,
same `split_supporters`/`disc_supporters` sets. The two emitters share
enough code to prevent drift.
## Blacklist-aware region pruning
`run_svaba.cpp` drops any queued region that is 100% covered by the
blacklist BEFORE the region is sent to a worker thread. Lives right after
`loader.countJobs(regionsToRun)`, before `sc.total_regions_to_process` is
set. Rule: for each `r` in `regionsToRun`, if
`sc.blacklist.FindOverlapWidth(r, true) >= r.Width()`, drop it.
This was a measured win. Previously each fully-blacklisted chunk (e.g. a
25 kb window on a decoy/alt/random contig) paid the full pipeline cost:
`QueryRegion` on the ref, `walker->SetRegion` + `readBam` (which
decompresses every BGZF block overlapping the region, parses each
bam1_t, allocates an `svabaRead`) — only to have `sc.blacklist.CountOverlaps`
drop every read at `SvabaBamWalker.cpp:181-182`. Now those regions never
hit a thread.
Safe because `sc.blacklist` has had `MergeOverlappingIntervals()` +
`CreateTreeMap()` called, so `FindOverlapWidth` can't double-count and
wrongly drop a partially-callable region.
The per-read and per-BP blacklist checks at `SvabaRegionProcessor.cpp:74,
818` still run for regions that **partially** overlap — this prune only
short-circuits the 100%-covered case.
Pruned regions don't get a `runtime.txt` row. That's intentional and
actually makes the runtime file cleaner (only regions that did work).
## Options surface
`src/svaba/SvabaOptions.h`:
- `dump_weird_reads` is **compile-time-only** (`static constexpr bool`).
Flip the literal in the header and rebuild to enable; there is
deliberately no CLI path. Because it's `static constexpr false` by
default, the compiler dead-code-eliminates every
`if (sc.opts.dump_weird_reads) { ... }` branch at -O2+.
- `dump_discordant_reads`, `dump_corrected_reads`, and `dump_alignments`
default **false** and are all enabled together by the single
`--dump-reads` runtime flag (switch-case 1800 in `SvabaOptions.cpp`).
They control, respectively:
- `${ID}.discordant.bam`
- `${ID}.corrected.bam`
- `${ID}.r2c.txt.gz` (per-thread; merged by the postprocess step)
The fields are kept separate so individual callsites can key off
their own concern (e.g. `svabaThreadUnit` gates `r2c_out_` opening
on `dump_alignments` only), but there's no runtime path to toggle
them individually — all three flip as a unit under `--dump-reads`.
- Without `--dump-reads`, svaba produces the lean output set only:
`bps.txt.gz`, VCFs, `contigs.bam`, `runtime.txt`, `discordant.txt.gz`
(cluster-level, tiny). No per-thread `r2c.txt.gz`, no `corrected.bam`,
no `discordant.bam`. This is the production default because the gated
outputs can run to tens of gigabytes on deep samples.
- **`alignments.txt.gz` is gone.** The pre-rendered ASCII viewer output
was replaced in full by `r2c.txt.gz` (same information, not
pre-formatted). `AlignedContig::printToAlignmentsFile` and
`BreakPoint::printDeletionMarksForAlignmentsFile` were removed; the
surviving `AlignmentFragment::printToAlignmentsFile` is kept only
because one `std::cerr` debug print in `BreakPoint.cpp` still calls
it. `viewer/alignments_viewer.html` still exists and still works on
old `.alignments.txt.gz` files from previous runs, but new runs don't
produce that file — use the r2c sub-panel in `viewer/bps_explorer.html`
instead.
## Viewer suite (`viewer/`)
Entry point is `viewer/index.html`, a card grid pointing at the
sub-viewers. All client-side, no server required.
- **`bps_explorer.html`** — primary viewer. Sortable table of bps rows,
numeric filters (somlod/maxlod/qual/span/etc.), chip filters (counts
are live — they reflect the current filter, not the full dataset),
per-sample detail panel, small histograms for somlod/maxlod/span,
IGV-navigation links in the IGV1/IGV2 columns (fires
`fetch('http://localhost:60151/goto?locus=…', {mode:'no-cors'})`;
requires IGV running with port 60151 enabled). r2c re-plot sub-panel
was removed — that capability now lives in the standalone
`r2c_explorer.html` below.
- **`r2c_explorer.html`** — standalone re-plotter for the structured
r2c TSV (emitted by `svaba run --dump-reads`, or filtered to
PASS / PASS-somatic by `scripts/svaba_postprocess.sh`). Upload an
`.r2c.txt.gz` / `.r2c.pass.txt.gz` / `.r2c.pass.somatic.txt.gz`,
type or pick a contig name in the search box (browser `<datalist>`
autocomplete, capped at 5000 entries), and get the alignment plot
rendered in-browser: 10bp ruler, contig sequence, per-BWA-fragment
summaries with `>/<` orientation rendering, per-breakpoint summary
rows, and per-read gap-expanded CIGAR rendering with lowercase
leading/trailing soft-clip bases placed at `start - lead_clip_len`
(exact mirror of the old `AlignedContig::printToAlignmentsFile`).
Prev/Next/Random navigation, imperfect-only and per-support-kind
toggles, click-to-hide source-prefix legend. Load-once, explore
many contigs — no need to load a bps.txt.gz first.
- **`alignments_viewer.html`** — legacy ASCII-plot viewer for
`alignments.txt.gz` outputs produced by svaba runs *before* the r2c
migration. New runs no longer produce `alignments.txt.gz`;
`r2c_explorer.html` is the replacement. Kept in the tree so historical
outputs still render.
- **`runtime_explorer.html`** — explorer for `runtime.txt` (the per-region
timing TSV produced by `svabaTimer::logRuntime`, schema in
`SvabaUtils.cpp`). Sortable table, region-range filter,
runtime/contigs/discordant filters, IGV-click on the region column,
prominent runtime histogram (defaults to log10 because runtime
distributions are always long-tailed — see "Perf notes"). 17-column
schema hardcoded from `SvabaUtils.cpp::svabaTimer::header`.
- **`comparison.html`** — side-by-side of two bps runs.
- **`bps_viewer.html`** — legacy light-theme viewer, uses external
`app.js` + `styles.css`.
## tracks/ and blacklists
`tracks/hg38.combined_blacklist.bed` is the one to feed `svaba run
--blacklist`. It's a **regeneratable artifact** — produced by
`scripts/combine_blacklists.sh` from the component files in the same dir. Don't
hand-edit it; edit a component file and re-run the script.
Components (as of last pass):
- `hg38.blacklist.sorted.bed` — ENCODE-style high-signal regions.
- `hg38.high_runtime.bed` — regions empirically slow to assemble.
- `hg38.manual.blacklist.bed` — ad-hoc bad-list, curated.
- `hg38.nonstd_chr.blacklist.bed` — full-contig entries for every
chrUn/*_decoy/*_alt/*_random/chrEBV/HLA-* contig in the reference,
generated from `tracks/chr` (a GRCh38 fasta-header dump).
- `hg38.rmsk.simple_repeat.bed` — UCSC RepeatMasker simple-repeat
regions.
`scripts/combine_blacklists.sh` has three modes: plain concat (default),
`--merge` (sort + `bedtools merge` with distinct-label aggregation), and
`--clip GENOME` (clip interval ends to real contig length, because some
input BEDs use oversize-end sentinels like `end=250000000` that would
otherwise inflate covered-bp totals into the trillions — ask me how I
know). Preferred invocation for refreshing the combined blacklist:
```bash
./scripts/combine_blacklists.sh --merge --clip tracks/chr \
-o tracks/hg38.combined_blacklist.bed \
tracks/hg38.blacklist.sorted.bed \
tracks/hg38.high_runtime.bed \
tracks/hg38.manual.blacklist.bed \
tracks/hg38.nonstd_chr.blacklist.bed \
tracks/hg38.rmsk.simple_repeat.bed
```
## Perf notes (empirical, from profiling the HLA region on M3 Ultra)
From a `sample`-based CPU profile with `-p 16`:
- **96% of worker time is inside `SvabaRegionProcessor::process`**.
Four buckets soak it up:
- BWA alignment: **~38%** (`mem_align1_core`, `mem_chain`, seed lookups
`bwt_sa`/`bwt_occ*`, `ksw_extend2`)
- Fermi-lite assembly: **~27%** (`fml_assemble`, `fml_fmi2mag*`,
`rld_rank2a`, `unitig_unidir`)
- BFC error correction: **~17%** (`SeqLib::BFC::ErrorCorrect/Train`,
`kmer_correct`, `bfc_*`). User-controllable via the ec flags.
- BAM walking: **~9%** (`svabaBamWalker::readBam`)
- **IPC ≈ 3.0** on Apple Silicon P-cores (theoretical max ~8, typical
real workloads 1.2–2.5). The code is already running near the silicon's
sustained envelope; algorithmic rewrites have diminishing returns.
- **Voluntary context switches ≈ 42 across a 56s run** — i.e. essentially
zero. Which means **there is no allocator contention, no mutex
contention, no I/O blocking** to fix. jemalloc/mimalloc on macOS **lost
to libmalloc by 5-10×** in an A/B test (DYLD interposition overhead +
Apple's nanomalloc fast path for small allocs). Save the allocator swap
for the Linux production box — on Linux it's often a 10–20% win; on
macOS don't bother.
- **BWA and fermi-lite default to `n_threads = 1` internally**
(`mem_opt_init` at `bwamem.c:100`, `fml_opt_init` at `bfc.c:21`) and
svaba never mutates them. `kt_for` samples in the profile are the same
worker thread running inline, not spawned threads. So svaba's `-p N`
is 1:1 with worker threads; there's no multiplicative thread fan-out.
Thread-count guidance on this 20 P-core + 8 E-core box:
- `-p 20` is the default sweet spot. One worker per P-core; E-cores absorb
macOS background work, jemalloc bg thread, etc.
- `-p 26` or `-p 28` is possible but the extra threads land on E-cores
and become tail latency (E-core is ~50% the speed of a P-core). Usually
a wash or slight loss vs `-p 20`.
- Load imbalance is the main remaining utilization gap. On a 3 Mb test
region, utilization was ~63%; on a 10 Mb region it climbed to ~78%; on
full chromosome / WGS expect 85–90%. Tail regions (HLA, centromere,
high-coverage weird spots) dominate wall clock.
Build flag recommendation for production runs (see "Build system"):
`-O3 -mcpu=native` on bwa + fermi-lite + (via `CMAKE_CXX_FLAGS_RELWITHDEBINFO`)
the svaba C++. Measured 5–15% wall-time gain expected on top of the default
RelWithDebInfo build.
## Cloud scatter-gather (`scripts/svaba_cloud.sh`)
`svaba_cloud.sh` parallelizes a WGS run across multiple GCP VMs — one
per chromosome partition — sharing a single read-only persistent disk.
Each VM runs `svaba run -k <partition>` independently; outputs go to a
GCS bucket; an optional `--merge` step concatenates the per-partition
`bps.txt.gz` files and runs `svaba_postprocess.sh`.
Architecture rationale: svaba's bottleneck is BWA FM-index random
lookups, which are latency-bound and NUMA-hostile. Multi-socket servers
waste half their threads on cross-socket access. Small single-socket VMs
(e.g. `c3d-highcpu-30`, AMD Genoa, ~128 MB L3, no NUMA) give each
worker full-speed local memory. Horizontal scaling across VMs beats
vertical scaling to more threads on a big box.
On dual-socket Xeon servers (measured on 2×20-core Xeon @ 2.8 GHz),
jemalloc is the single biggest optimization — 37% wall-time reduction at
38 threads by eliminating glibc arena lock contention. NUMA pinning
(`numactl --cpunodebind --membind`) is second-order at ≤40 threads on
a 2-socket box but matters more on 4-socket or at higher thread counts.
Disk I/O is ~9% of wall time (sequential BAM streaming). NVMe is
unnecessary; standard persistent disk or even gcsfuse over a GCS bucket
is fine.
Interchromosomal SVs: both breakends get assembled independently by
whichever partition contains the discordant read pileup. The merge +
dedup step in `svaba_postprocess.sh` pairs them. No calls are lost.
## Conventions
- **File naming**: `src/svaba/Svaba*.{cpp,h}` for the svaba-specific files
(PascalCase, first letter capitalized). Intentional lowercase / snake_case
exceptions live at the top of `src/svaba/` (`refilter`, `run_svaba`,
`svaba`, `threadpool`, `tovcf`, `vcf`). Don't introduce new lowercase-first
names; if you rename something on a case-insensitive filesystem (macOS),
do it via `git rm --cached <lower> && git add <Upper>` or git's `core.ignorecase`
will hide the rename from the index.
- **C++ style**: snake_case methods inside ClassName, 2-space indent,
header/impl split. Don't introduce new formatting unless asked.
- **Statistical code lives in `SvabaModels.*`**; breakpoint glue lives in
`BreakPoint.*`. Keep statistical primitives in the models file, not
inlined into BreakPoint.
- **LL/LOD values in this codebase are always log10**, not natural log.
- **`aN, dN, aT, dT`** = alt count / depth in normal and tumor; **`f`** =
allele fraction; **`e_fwd`/`e_rev`** = forward/reverse error rates from
the artifact model. These names are used consistently in the analysis
HTML too.
- **Option codes** in `SvabaOptions.cpp::longOpts`: 1001-1099 = mode,
1100s = assembly, 1200s = EC, 1300s = discordant, 1400s = filter,
1500 = chunking, 1600s = bwa-mem tuning, 1700s = output/DBs, 1800 =
dump-reads. Keep the ranges coherent when adding new options.
## Mate-region lookup pipeline
When svaba encounters discordant reads (insert size too large or wrong
orientation), it collects their mate loci and considers doing a secondary
"mate-region" assembly to catch the other breakend of an SV.
**Six gates** a mate candidate must pass (in order):
1. **Primary MAPQ** (`minMateMAPQ`, default -1 = no gate): the discordant
read itself must have MAPQ ≥ this. Set to e.g. 10 to skip
multi-mapped primaries.
2. **Chromosome ID** (`maxMateChrID`, default 23): mate must land on
chr ≤ this ID (0-indexed: 0=chr1 .. 22=chrX, 23=chrY). Skips
chrM/alt/decoy in human. Set to -1 (via `--non-human`) to disable
entirely for non-human genomes.
3. **Blacklist**: mate locus checked against `sc.blacklist`.
4. **Min count** (`mateRegionMinCount`, default 2): merged region must
have ≥ N supporting reads to survive the BamWalker filter.
5. **Somatic mateLookupMin** (default 3, `MATE_LOOKUP_MIN`): in
`SvabaRegionProcessor`, only look up regions with ≥ this many
somatic-only reads.
6. **Max regions** (6): cap at 6 mate regions per assembly window.
All constants live in `SvabaOptions.h` as `inline constexpr` with
runtime overrides in the `SvabaOptions` class:
```
--min-mate-mapq N (default -1, no gate)
--max-mate-chr N (default 23, through chrY; set -1 for no limit)
--mate-min-count N (default 2)
--non-human (sets maxMateChrID = -1, removes human assumptions)
```
Code: `SvabaBamWalker.cpp::calculateMateRegions()`.
## Compile-time read & contig tracing
Two zero-cost compile-time trace systems for debugging why a specific
read was or wasn't credited / a contig was or wasn't called:
**`SVABA_TRACE_READ`** — traces a single read (by QNAME) through the
entire pipeline: BamWalker intake → BFC correction → r2c alignment →
native realignment → splitCoverage scoring → output tagging.
```bash
cmake .. -DCMAKE_CXX_FLAGS='-DSVABA_TRACE_READ="\"LH00306:129:227V5CLT4:6:1204:38807:7191\""'
```
Trace points (19 total across 3 files):
- `SvabaBamWalker.cpp`: initial read filter decisions (existing)
- `SvabaRegionProcessor.cpp`: BFC correction result, r2c alignment
per-contig, native realignment reuse/done/miss, bi:Z tagging
- `BreakPoint.cpp`: splitCoverage entry, TP8 r2c-vs-native comparison,
TP9 del/ins near break, TP10 span check, TP11 del covers,
CREDITED/NOT CREDITED final decision
**`SVABA_TRACE_CONTIG`** — traces a single contig through assembly,
alignment, and scoring:
```bash
cmake .. -DCMAKE_CXX_FLAGS='-DSVABA_TRACE_CONTIG="\"c_fermi_chr2_215869501_215894501_13C\""'
```
Both can be combined. Both are `#ifdef`-guarded so they compile to
nothing when not defined. See `src/svaba/SvabaDebug.h` for the macro
definitions and `README.md` for full recipes.
## Useful jump points
- Somatic LOD calc: `src/svaba/SvabaModels.cpp:86`
- Per-sample LO: `src/svaba/BreakPoint.cpp:1610`
- Somatic LOD entry: `src/svaba/BreakPoint.cpp:975`
- INDEL somatic gate: `src/svaba/BreakPoint.cpp:1072`
- Region-queue blacklist prune: `src/svaba/run_svaba.cpp` (right after
`loader.countJobs(regionsToRun)`)
- Per-read blacklist filter: `src/svaba/SvabaBamWalker.cpp:181-182`
- r2c TSV emitter: `src/svaba/AlignedContig.cpp::printToR2CTsv` +
`::r2cTsvHeader`
- Postprocess (C++): `src/svaba/SvabaPostprocess.cpp`
- Postprocess (shell orchestration): `scripts/svaba_postprocess.sh`
- `svaba tovcf` driver: `src/svaba/tovcf.cpp::runToVCF`
- VCF engine (parse + dedup + emit): `src/svaba/vcf.cpp` + `vcf.h`
- Symbolic SV classifier: `vcf.cpp::classify_symbolic_kind`
- Single-file VCF writers: `vcf.cpp::writeSvsSingleFile` + `writeIndelsSingleFile`
- Blacklist combiner: `scripts/combine_blacklists.sh`
- Runtime-file schema: `src/svaba/SvabaUtils.cpp::svabaTimer::header`
- Options parsing: `src/svaba/SvabaOptions.cpp::SvabaOptions::parse`
- Analysis writeup (somlod/maxlod): `somlod_maxlod_analysis.html`
- Mate-region lookup: `src/svaba/SvabaBamWalker.cpp::calculateMateRegions`
- Mate-region constants: `src/svaba/SvabaOptions.h` (lines 126-141)
- Read trace macro: `src/svaba/SvabaDebug.h`
- Read trace (BFC/r2c/native): `src/svaba/SvabaRegionProcessor.cpp`
- Read trace (splitCoverage): `src/svaba/BreakPoint.cpp`
- Debugging recipes: `README.md` (in svaba_opt root)
================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.10)
# Project declaration. Languages are left implicit (C and C++ are both
# compiled: the SOURCES list below includes src/SGA/Util/stdaln.c).
project(svaba) # LANGUAGES CXX)
# Require exactly C++17, with no compiler extensions and no silent
# fallback to an older standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# Default single-config builds to RelWithDebInfo when the user did not
# choose a build type (multi-config generators set
# CMAKE_CONFIGURATION_TYPES instead, so they are excluded here).
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING
"Default build type" FORCE)
endif()
# Keep frame pointers so sampling profilers produce usable stack traces.
add_compile_options(-fno-omit-frame-pointer)
# Build the vendored SeqLib tree (provides the `seqlib` library target
# linked below).
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/SeqLib)
# Include directories for headers: the repo root plus the SeqLib tree and
# its bundled bwa / fermi-lite subdirectories.
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/SeqLib
${CMAKE_CURRENT_SOURCE_DIR}/SeqLib/bwa
${CMAKE_CURRENT_SOURCE_DIR}/SeqLib/fermi-lite
)
# Look for htslib on the system. QUIET: this is only the first of four
# detection paths — the cascade below handles the not-found case.
find_package(htslib QUIET)
# Find required system level type libraries.
find_package(Threads REQUIRED)
find_package(ZLIB REQUIRED)
## LZMA — located with raw find_path/find_library rather than CMake's
## FindLibLZMA module; the resulting LZMA_LIBRARY is consumed directly in
## target_link_libraries below.
find_path(LZMA_INCLUDE_DIR NAMES lzma.h)
find_library(LZMA_LIBRARY NAMES lzma)
if(NOT LZMA_INCLUDE_DIR OR NOT LZMA_LIBRARY)
message(FATAL_ERROR "LZMA library or headers not found!")
endif()
## BZip2 — provides the BZip2::BZip2 imported target used at link time.
find_package(BZip2 REQUIRED)
## HTSLIB
#
# Detection order:
# 1. find_package(htslib) — works only if htslib ships a
# htslibConfig.cmake (rare outside conda/vcpkg installs).
# 2. pkg-config — system-installed htslib typically provides a
# htslib.pc, so `pkg-config --libs htslib` works.
# 3. find_path/find_library for htslib/hts.h and libhts on the
# default system paths (/usr/include, /usr/local/include,
# /opt/homebrew/include, etc.).
# 4. Explicit -DHTSLIB_DIR=/path/to/htslib for non-standard installs.
#
# The first three paths auto-detect a system install, so a vanilla
# `cmake .. && make` works when htslib is installed system-wide. Only
# fall back to HTSLIB_DIR when all auto-detections fail, matching the
# pattern used here for ZLIB / BZip2 (also auto-detected).
#
# Outputs of the cascade (consumed by target_link_libraries below):
# SVABA_HTSLIB_FOUND — TRUE once any path succeeded
# SVABA_HTSLIB_LIBRARIES — library name(s) or absolute path to link
set(SVABA_HTSLIB_FOUND FALSE)
set(SVABA_HTSLIB_LIBRARIES "")
# Path 1: package-config result from the find_package(htslib QUIET) above.
if (htslib_FOUND)
message(STATUS "htslib: found via find_package()")
set(SVABA_HTSLIB_FOUND TRUE)
set(SVABA_HTSLIB_LIBRARIES hts)
endif()
# Path 2: pkg-config (htslib.pc).
if (NOT SVABA_HTSLIB_FOUND)
find_package(PkgConfig QUIET)
if (PkgConfig_FOUND)
pkg_check_modules(HTSLIB_PC QUIET htslib)
if (HTSLIB_PC_FOUND)
message(STATUS "htslib: found via pkg-config (version ${HTSLIB_PC_VERSION})")
include_directories(${HTSLIB_PC_INCLUDE_DIRS})
link_directories(${HTSLIB_PC_LIBRARY_DIRS})
set(SVABA_HTSLIB_FOUND TRUE)
set(SVABA_HTSLIB_LIBRARIES ${HTSLIB_PC_LIBRARIES})
endif()
endif()
endif()
# Path 3: raw header/library search on default system paths.
if (NOT SVABA_HTSLIB_FOUND)
find_path(HTSLIB_INCLUDE_DIR NAMES htslib/hts.h
DOC "Directory containing htslib/hts.h")
find_library(HTSLIB_LIBRARY NAMES hts
DOC "Path to the hts library")
if (HTSLIB_INCLUDE_DIR AND HTSLIB_LIBRARY)
message(STATUS "htslib: found headers at ${HTSLIB_INCLUDE_DIR}, lib at ${HTSLIB_LIBRARY}")
include_directories(${HTSLIB_INCLUDE_DIR})
set(SVABA_HTSLIB_FOUND TRUE)
# Unlike paths 1 and 4 (bare name `hts`), this stores an absolute path.
set(SVABA_HTSLIB_LIBRARIES ${HTSLIB_LIBRARY})
endif()
endif()
# Path 4: explicit -DHTSLIB_DIR escape hatch; otherwise fail loudly with
# installation hints.
if (NOT SVABA_HTSLIB_FOUND)
set(HTSLIB_DIR "" CACHE PATH "Path to HTSLib root directory")
if (HTSLIB_DIR)
message(STATUS "htslib: using explicit HTSLIB_DIR=${HTSLIB_DIR}")
include_directories(${HTSLIB_DIR}/include)
link_directories(${HTSLIB_DIR}/lib)
set(SVABA_HTSLIB_FOUND TRUE)
set(SVABA_HTSLIB_LIBRARIES hts)
else()
message(FATAL_ERROR
"htslib not found. Install htslib system-wide (e.g. "
"`brew install htslib` or `apt install libhts-dev`) so "
"pkg-config / find_library locate it automatically, "
"or pass -DHTSLIB_DIR=/path/to/htslib to point at a "
"manual build.")
endif()
endif()
# Find all source files.
# NOTE: this is a hand-maintained explicit list (no file(GLOB)) — a new
# .cpp added to the tree must also be added here or it silently will not
# be compiled.
set(SOURCES
# --- svaba proper (src/svaba) ---
src/svaba/run_svaba.cpp
src/svaba/SvabaOutputWriter.cpp
src/svaba/BreakPoint.cpp
src/svaba/ContigAlignmentScore.cpp
src/svaba/AlignedContig.cpp
src/svaba/AlignmentFragment.cpp
src/svaba/DiscordantCluster.cpp
src/svaba/DBSnpFilter.cpp
src/svaba/SvabaUtils.cpp
src/svaba/svaba.cpp
src/svaba/SvabaAssemblerEngine.cpp
src/svaba/tovcf.cpp
# NOTE(review): test sources are compiled into the production binary —
# presumably gated at runtime; confirm this is intentional.
src/svaba/test_svaba.cpp
src/svaba/vcf.cpp
src/svaba/DiscordantRealigner.cpp
src/svaba/SvabaOverlapAlgorithm.cpp
src/svaba/SvabaASQG.cpp
src/svaba/SvabaAssemble.cpp
src/svaba/KmerFilter.cpp
src/svaba/SvabaThreadUnit.cpp
src/svaba/SvabaRegionProcessor.cpp
src/svaba/SvabaBamWalker.cpp
src/svaba/refilter.cpp
src/svaba/SvabaPostprocess.cpp
src/svaba/LearnBamParams.cpp
src/svaba/STCoverage.cpp
src/svaba/SvabaModels.cpp
##src/svaba/Histogram.cpp
##src/svaba/BamStats.cpp
src/svaba/SvabaRead.cpp
src/svaba/SvabaOptions.cpp
src/svaba/SvabaLogger.cpp
src/svaba/SvabaFileLoader.cpp
# --- vendored SGA assembler (src/SGA) ---
src/SGA/SuffixTools/STCommon.cpp
src/SGA/SuffixTools/Occurrence.cpp
src/SGA/SuffixTools/SuffixArray.cpp
src/SGA/SuffixTools/SuffixCompare.cpp
src/SGA/SuffixTools/InverseSuffixArray.cpp
src/SGA/SuffixTools/SACAInducedCopying.cpp
src/SGA/SuffixTools/BWTAlgorithms.cpp
src/SGA/SuffixTools/BWTReader.cpp
src/SGA/SuffixTools/BWTWriter.cpp
src/SGA/SuffixTools/SAReader.cpp
src/SGA/SuffixTools/SAWriter.cpp
src/SGA/SuffixTools/SBWT.cpp
src/SGA/SuffixTools/RLBWT.cpp
src/SGA/SuffixTools/BWTWriterBinary.cpp
src/SGA/SuffixTools/BWTReaderBinary.cpp
src/SGA/SuffixTools/BWTWriterAscii.cpp
src/SGA/SuffixTools/BWTReaderAscii.cpp
src/SGA/SuffixTools/BWTIntervalCache.cpp
src/SGA/SuffixTools/SampledSuffixArray.cpp
src/SGA/Algorithm/OverlapAlgorithm.cpp
src/SGA/Algorithm/DPAlignment.cpp
src/SGA/Algorithm/SearchSeed.cpp
src/SGA/Algorithm/OverlapBlock.cpp
src/SGA/Algorithm/SearchHistory.cpp
src/SGA/Algorithm/OverlapTools.cpp
src/SGA/Bigraph/Bigraph.cpp
src/SGA/Bigraph/Vertex.cpp
src/SGA/Bigraph/Edge.cpp
src/SGA/Bigraph/EdgeDesc.cpp
src/SGA/SGA/OverlapCommon.cpp
src/SGA/SQG/SQG.cpp
src/SGA/SQG/ASQG.cpp
src/SGA/StringGraph/SGUtil.cpp
src/SGA/StringGraph/SGAlgorithms.cpp
src/SGA/StringGraph/SGVisitors.cpp
src/SGA/StringGraph/CompleteOverlapSet.cpp
src/SGA/StringGraph/RemovalAlgorithm.cpp
src/SGA/StringGraph/SGSearch.cpp
src/SGA/StringGraph/SGWalk.cpp
src/SGA/Util/Util.cpp
# stdaln.c is plain C — the reason the project() call leaves the
# language list open instead of LANGUAGES CXX only.
src/SGA/Util/stdaln.c
src/SGA/Util/Alphabet.cpp
src/SGA/Util/Contig.cpp
src/SGA/Util/ReadTable.cpp
src/SGA/Util/ReadInfoTable.cpp
src/SGA/Util/SeqReader.cpp
src/SGA/Util/DNAString.cpp
src/SGA/Util/Match.cpp
src/SGA/Util/Pileup.cpp
src/SGA/Util/Interval.cpp
src/SGA/Util/SeqCoord.cpp
src/SGA/Util/QualityVector.cpp
src/SGA/Util/Quality.cpp
src/SGA/Util/PrimerScreen.cpp
src/SGA/Util/CorrectionThresholds.cpp
src/SGA/Util/ClusterReader.cpp
src/SGA/Util/QualityTable.cpp
src/SGA/Util/gzstream.C
src/SGA/Util/BitChar.cpp
src/SGA/Util/MultiOverlap.cpp
)
# Additional include roots: the binary dir (presumably for generated
# headers — TODO confirm) plus every SGA subtree, so the SGA sources'
# flat #include "Header.h" style resolves without path prefixes.
include_directories(
${CMAKE_CURRENT_BINARY_DIR}
src/SGA/Algorithm
src/SGA/StringGraph
src/SGA/SGA
src/SGA/SuffixTools
src/SGA/Util
src/SGA/SQG
src/SGA/Bigraph
src/svaba
src/SGA
)
# ---------------------------------------------------------------------------
# Optional compile-time contig tracing (SvabaDebug.h).
#
# cmake .. -DSVABA_TRACE_CONTIG="c_fermi_chr2_215869501_215894501_11C"
#
# Prints detailed stderr trace for every decision point that contig
# touches. Pass -DSVABA_TRACE_ALL=ON to trace ALL contigs (very noisy).
# ---------------------------------------------------------------------------
# Cache variables so the trace settings survive re-configures; empty
# string / OFF means the corresponding #ifdef is never defined.
set(SVABA_TRACE_CONTIG "" CACHE STRING "Contig name to trace (empty=off)")
option(SVABA_TRACE_ALL "Trace ALL contigs (very noisy)" OFF)
set(SVABA_TRACE_READ "" CACHE STRING "Read QNAME to trace (empty=off)")
option(SVABA_TRACE_ALL_READS "Trace ALL reads (extremely noisy)" OFF)
# Generate the executable
add_executable(svaba ${SOURCES})
# Forward each trace setting as a PRIVATE preprocessor define on the svaba
# target only; the ="..." form turns the cache value into a C string
# literal the SvabaDebug.h macros compare against.
if (NOT "${SVABA_TRACE_CONTIG}" STREQUAL "")
message(STATUS "svaba: tracing contig '${SVABA_TRACE_CONTIG}'")
target_compile_definitions(svaba PRIVATE
SVABA_TRACE_CONTIG="${SVABA_TRACE_CONTIG}")
endif()
if (SVABA_TRACE_ALL)
message(STATUS "svaba: tracing ALL contigs (noisy!)")
target_compile_definitions(svaba PRIVATE SVABA_TRACE_ALL=1)
endif()
if (NOT "${SVABA_TRACE_READ}" STREQUAL "")
message(STATUS "svaba: tracing read '${SVABA_TRACE_READ}'")
target_compile_definitions(svaba PRIVATE
SVABA_TRACE_READ="${SVABA_TRACE_READ}")
endif()
if (SVABA_TRACE_ALL_READS)
message(STATUS "svaba: tracing ALL reads (extremely noisy!)")
target_compile_definitions(svaba PRIVATE SVABA_TRACE_ALL_READS=1)
endif()
# If the submodules already have Makefiles, you can use custom commands to
# invoke make in those directories.
#
# FIX: the previous invocation was `add_custom_target(COMMAND make ...)`.
# add_custom_target's FIRST argument is the target NAME, so that created a
# target literally named "COMMAND" whose command was `make` — almost
# certainly not the intent. The target now has an explicit, descriptive
# name; its behavior (run `make` in ${CMAKE_CURRENT_SOURCE_DIR}) is
# unchanged. Build it explicitly with `make submodule_make` — it is not
# part of ALL.
add_custom_target(submodule_make
COMMAND make
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
# ---------------------------------------------------------------------------
# Optional jemalloc support.
#
# On Linux with high thread counts (-p 16+), jemalloc eliminates glibc
# arena-lock contention that can cost 30-40% wall time. Link it at
# compile time so no LD_PRELOAD is needed at runtime.
#
# cmake .. -DUSE_JEMALLOC=ON # link jemalloc (recommended on Linux)
# cmake .. -DUSE_JEMALLOC=OFF # use system malloc (default)
#
# macOS users should NOT enable this — Apple's libmalloc with its
# nanomalloc fast path outperforms jemalloc on this workload.
# ---------------------------------------------------------------------------
option(USE_JEMALLOC "Link against jemalloc for reduced allocator contention" OFF)
# Empty by default so the target_link_libraries expansion below is a no-op
# when jemalloc is disabled.
set(JEMALLOC_LIBRARIES "")
if (USE_JEMALLOC)
find_library(JEMALLOC_LIB NAMES jemalloc
PATHS /usr/lib/x86_64-linux-gnu /usr/lib /usr/local/lib
DOC "Path to libjemalloc")
if (JEMALLOC_LIB)
message(STATUS "jemalloc: found at ${JEMALLOC_LIB}")
set(JEMALLOC_LIBRARIES ${JEMALLOC_LIB})
else()
# Hard failure rather than a silent fallback to system malloc: the
# user explicitly requested jemalloc.
message(FATAL_ERROR
"USE_JEMALLOC=ON but libjemalloc not found. "
"Install it (apt install libjemalloc-dev) or set "
"-DJEMALLOC_LIB=/path/to/libjemalloc.so")
endif()
endif()
# Linking
# NOTE(review): the ordering looks deliberate — seqlib before the static
# bwa/fermi-lite archives it uses, with htslib/zlib/curl/lzma/bzip2 after
# them; confirm dependencies before reordering static archives.
# JEMALLOC_LIBRARIES expands to nothing when USE_JEMALLOC=OFF.
target_link_libraries(svaba
seqlib
${CMAKE_CURRENT_SOURCE_DIR}/SeqLib/bwa/libbwa.a
${CMAKE_CURRENT_SOURCE_DIR}/SeqLib/fermi-lite/libfml.a
Threads::Threads
${SVABA_HTSLIB_LIBRARIES}
ZLIB::ZLIB
${CURL_LIBRARY}
${LZMA_LIBRARY}
BZip2::BZip2
${JEMALLOC_LIBRARIES}
)
# ---------------------------------------------------------------------------
# Install support.
#
# `make install` (equivalent: `cmake --install build`) copies the svaba
# binary to ${CMAKE_INSTALL_PREFIX}/bin. GNUInstallDirs sets the usual
# distro-friendly defaults for BINDIR / LIBDIR / etc., so this follows
# standard Unix conventions.
#
# Default CMAKE_INSTALL_PREFIX is /usr/local (may need sudo). To stage
# into a non-system location, pass -DCMAKE_INSTALL_PREFIX=/some/path at
# cmake time, or use `cmake --install build --prefix /some/path` at
# install time. To drop the binary into this repo's `bin/` directory:
#     cmake --install build --prefix ${CMAKE_SOURCE_DIR}
# ---------------------------------------------------------------------------
include(GNUInstallDirs)
# Install the svaba executable into <prefix>/bin.
install(TARGETS svaba
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
# PROGRAMS (unlike FILES) installs the script with execute permission set.
install(PROGRAMS ${CMAKE_SOURCE_DIR}/scripts/svaba_postprocess.sh
DESTINATION ${CMAKE_INSTALL_BINDIR})
================================================
FILE: Dockerfile
================================================
# Start with an Ubuntu LTS base image
FROM ubuntu:20.04

# Avoid interactive prompts from tzdata (timezones) during package install
ENV DEBIAN_FRONTEND=noninteractive

# Install build dependencies for htslib and svaba.
# FIX: use apt-get rather than apt — the apt front-end warns that it does
# not have a stable CLI and should not be used in scripts.
# --no-install-recommends keeps the image smaller; ca-certificates is
# added explicitly since https git clones need it once recommends are
# no longer pulled in.
RUN apt-get update && apt-get install -y --no-install-recommends \
    autoconf \
    automake \
    make \
    gcc \
    g++ \
    git \
    ca-certificates \
    perl \
    zlib1g-dev \
    libbz2-dev \
    liblzma-dev \
    libcurl4-gnutls-dev \
    libssl-dev \
    cmake \
    && rm -rf /var/lib/apt/lists/*

# Build and install htslib from source into /usr/local
WORKDIR /opt
RUN git clone --recursive https://github.com/samtools/htslib.git && \
    cd htslib && \
    autoheader && \
    autoconf && \
    ./configure && \
    make && \
    make install

# Refresh the shared-library cache so the freshly installed libhts is found
RUN ldconfig

# Clone svaba (with its submodules) and prepare an out-of-source build dir
WORKDIR /opt
RUN git clone --recursive https://github.com/walaj/svaba.git && cd svaba && mkdir build

# Compile svaba against the htslib installed above
WORKDIR /opt/svaba/build
RUN cmake .. \
    -DHTSLIB_DIR=/usr/local && \
    make

# Put the svaba binary on the PATH
ENV PATH="/opt/svaba/build:${PATH}"
================================================
FILE: LICENSE
================================================
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.
================================================
FILE: R/archive_non_functional/create-databases.R
================================================
####################
## load CCDS and name table
## Builds a per-gene CCDS transcript table (one transcript per gene symbol),
## used below to construct exon/intron annotation tracks.
## Assumes data.table is attached; reads hard-coded track files.
ccds <- fread("/xchip/gistic/Jeremiah/tracks/ccdsGene.hg19.txt")
## load and format name table (CCDS to gene name conversion)
nm <- fread("/xchip/gistic/Jeremiah/tracks/name_conversion.tsv")
## column names contain spaces (e.g. "Approved Symbol"); make them identifier-safe
setnames(nm, colnames(nm), gsub(" ","_",colnames(nm)))
## repeat each approved symbol once per CCDS id in its comma-separated CCDS_IDs field
app_sym <- rep(nm$Approved_Symbol, times=nm[, length(strsplit(CCDS_IDs, ",")[[1]]), by=HGNC_ID]$V1)
nmm <- data.table(ccds=gsub(" ", "", unlist(strsplit(nm$CCDS_IDs,","))), gene=app_sym)
setkey(nmm, ccds)
## merge names with CCDS
## strip the version suffix (e.g. "CCDS123.1" -> "CCDS123") before the keyed join
ccds[, ccds := gsub("(.*?)\\..*", "\\1", name)]
setkey(ccds, ccds)
ccds <- nmm[ccds]
## switch frames 1 and 2, so that 2 refers to 2/3 of codon, 1 to 1/3
## eg. ++***+++***+++ starts with 2/3 of codon in its sense dir, so frame is 2
## three-step swap through placeholder "A" so the 1<->2 substitutions don't collide
ccds$exonFrames <- gsub("1","A", ccds$exonFrames)
ccds$exonFrames <- gsub("2","1", ccds$exonFrames)
ccds$exonFrames <- gsub("A","2", ccds$exonFrames)
## dedupe on gene, taking the largest one
## sort by (gene, descending exonCount) so the first row per gene is the
## transcript with the most exons, then keep only that row
ccds[, negc := -exonCount]
setkey(ccds, gene, negc)
ccds <- ccds[!duplicated(gene)]
ccds[, negc := NULL]
## create exon intron track
## Introns are derived from the exon list: intron i spans from the end of exon i
## to the start of exon i+1 (coordinates always stored in '+' genomic order).
ccds[, exon_start := min(as.numeric(strsplit(exonStarts, ",")[[1]])), by=c("name", "chrom", "txStart")]
ccds[, exon_end := max(as.numeric(strsplit(exonEnds, ",")[[1]])), by=c("name", "chrom", "txStart")]
## intron ends = exon starts 2..N; the length.out trick yields an empty sequence
## (hence "") for single-exon transcripts, which have no introns
ccds[, intron_ends := { ll <- as.numeric(strsplit(exonStarts, ",")[[1]]); paste(ll[seq(2, length(ll), length.out=ifelse(length(ll) > 1, length(ll)-1, 0))], collapse=",")} , by=c("name", "chrom", "txStart")]
ccds[, intron_starts := { ll <- as.numeric(strsplit(exonEnds, ",")[[1]]); paste(ll[seq(1, length(ll)-1, length.out=ifelse(length(ll) > 1, length(ll)-1, 0))], collapse=",")} , by=c("name", "chrom", "txStart")]
## intron frame = frame of the downstream (in sense direction) exon
ccds[, intron_frame := {
ll <- as.numeric(strsplit(exonFrames, ",")[[1]])
if (length(ll) == 1) ## no exons
""
else if (strand == '+') ## in sense dir, give frame to exon after
paste(ll[seq(2, length(ll))], collapse=",")
else ## exons ordered in + direction, so sense exon after intron is actually the one BEFORE intron for this ordering
paste(ll[seq(1, length(ll)-1)], collapse=",")
} , by=c("name", "chrom", "txStart")]
## flatten the per-transcript comma lists into parallel vectors; introns count
## exonCount-1 per transcript, exons count exonCount
intrEnds <- as.numeric(unlist(strsplit(ccds$intron_ends, ",")))
intrStarts <- as.numeric(unlist(strsplit(ccds$intron_starts, ",")))
intrFrame <- as.numeric(unlist(strsplit(ccds$intron_frame, ",")))
intrName <- rep(ccds$name, times=as.numeric(ccds$exonCount)- 1)
intrChr <- gsub("chr", "", rep(ccds$chrom, times=as.numeric(ccds$exonCount)-1))
## intron/exon numbers follow the sense direction (reversed for '-' strand)
intrNums <- as.numeric(ccds[, { s <- seq(length.out=ifelse(as.numeric(exonCount)==1, 0, as.numeric(exonCount)-1)); if (strand == '+') { s } else { rev(s) } }, by=c("name","chrom","txStart")]$V1)
exonNums <- as.numeric(ccds[, { s <- seq(as.numeric(exonCount)); if (strand=='+') { s } else { rev(s) }}, by=c("name","chrom","txStart")]$V1)
intrGene <- rep(ccds$gene, times=as.numeric(ccds$exonCount)-1)
exonGene <- rep(ccds$gene, times=as.numeric(ccds$exonCount))
ccdsGene <- as.numeric(ccds[, seq(as.numeric(exonCount)), by=c("name","chrom","txStart")]$V1)
ccds[, exon_end := max(as.numeric(strsplit(exonEnds, ",")[[1]])), by=c("name", "chrom", "txStart")]
exonFrames <- as.numeric(unlist(strsplit(ccds$exonFrames, ",")))
exonStarts <- as.numeric(unlist(strsplit(ccds$exonStarts, ",")))
exonEnds <- as.numeric(unlist(strsplit(ccds$exonEnds, ",")))
ccdsName <- rep(ccds$name, times=as.numeric(ccds$exonCount))
exonStrand <- rep(ccds$strand, times=as.numeric(ccds$exonCount))
#exonStarts[exonStrand == '+'] <- exonStarts[exonStrand == '+'] + 1 ## off by one?
## NOTE(review): presumably converts 0-based UCSC starts to 1-based GRanges
## coordinates -- confirm against the ccdsGene table convention
exonStarts <- exonStarts + 1 ## off by one?
#exonEnds[exonStrand == '-'] <- exonEnds[exonStrand == '-'] + 1 ## off by one?
intrStrand <- rep(ccds$strand, times=as.numeric(ccds$exonCount)-1)
ccdschrom <- gsub("chr","",rep(ccds$chrom, times=as.numeric(ccds$exonCount)))
## gr.fix / si are gUtils helpers + seqinfo assumed to be in scope already
gr.exons <- sort(gr.fix(GRanges(ccdschrom, IRanges(exonStarts, exonEnds), strand=exonStrand, name=ccdsName, num=exonNums, frame=exonFrames, gene=exonGene), si))
gr.introns <- sort(gr.fix(GRanges(intrChr, IRanges(intrStarts, intrEnds), strand=intrStrand, name=intrName, num=intrNums, frame=intrFrame, gene=intrGene), si))
#### genes
## whole-gene track from a UCSC genes table; filtered to drop large gene families
## and any gene span >= 3 Mb
gr.genes = sort(gr.fix(gr.nochr(with(fread("/xchip/gistic/Jeremiah/tracks/genes.hg19.ucsc.txt", sep="\t"), GRanges(chr, IRanges(beg, end), gene=symbol))), si))
gr.genes <- gr.genes[!grepl("^ULK|^NBPF|^MIR|^LOC|^OR|^SNO|^FAM|^ZNF|^SMN|^NF1P2|^POTEB|^RNF|^RGPD5|^RGPD2|^SNAR|^NBPF", gr.genes$gene) & width(gr.genes) < 3e6]
gr.genes$gene <- as.character(gr.genes$gene)
gg <- grbind(gr.exons, gr.introns)
#gg <- gr.val(gg, gr.genes, 'gene', sep="_")
#gg$gene[nchar(gg$gene) > 50] <- "MANYGENES"
## get complement
## intergenic regions = genome minus (exons+introns); padded by 1 bp so each
## region overlaps its flanking genic intervals, letting us label left/right genes
gg <- gr.stripstrand(gr.fix(gg, si))
gg$gene <- as.character(gg$gene)
gene.comp <- setdiff(gr.stripstrand(si2gr(si)), gg) + 1
fo <- gr2dt(gr.findoverlaps(gene.comp, gg))
fo <- fo[query.id %in% as.numeric(names(table(fo$query.id)[table(fo$query.id) == 2]))] ## now each intergenic region has front/back overlap
## subject.id[1]/[2] are the genic intervals on either side of the region
fo[, right := gg$gene[subject.id[2]], by=query.id]
fo[, left := gg$gene[subject.id[1]], by=query.id]
gene.comp$left <- gene.comp$right <- ""
gene.comp$left[fo$query.id] <- fo$left
gene.comp$right[fo$query.id] <- fo$right
## undo the 1 bp padding applied above
gene.comp <- gene.comp - 1
## tubio
## load the Tubio et al. retrotransposition master database as a GRanges
ff <- fread("/xchip/gistic/Jeremiah/tracks/master_db_29062016_ranges.txt")
setnames(ff, c("V1","V2","V3","V4","V5","V6"), c("seqnames","start","end","dir","type","sample"))
ff$strand <- ifelse(ff$dir == "plus","+","-")
gr.tub <- dt2gr(ff)
================================================
FILE: R/archive_non_functional/gen_quals.R
================================================
## gen_quals: rewrite the QUAL column (V6) of a svaba/snowman indel VCF so it
## carries the SL value (the 10th ':'-separated field of the sample column V10),
## then emit the patched VCF.  Paths are hard-coded Broad filesystem locations.
require(data.table)

## header ('#'-prefixed) and record lines are read through grep separately
records <- fread("grep -v ^# /broad/broadsv/NA12878/GCAT/gcat_illumina_150x/snowman/v115/v115.snowman.indel.vcf", sep='\t')
header  <- fread("grep ^# /broad/broadsv/NA12878/GCAT/gcat_illumina_150x/snowman/v115/v115.snowman.indel.vcf", sep="\t", header=FALSE)

## rescale the quality score: strip the first nine colon-delimited FORMAT
## fields from V10, leaving the trailing SL score, and store it as QUAL
records[, SL := gsub(".*?:.*?:.*?:.*?:.*?:.*?:.*?:.*?:.*?:(.*?)","\\1", V10)]
records[, V6 := SL]

## reassemble header + tab-joined records and write the patched VCF
out_lines <- c(header$V1, records[, paste(V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,sep="\t")])
writeLines(out_lines, "/broad/hptmp/jwala/test.vcf")
================================================
FILE: R/archive_non_functional/svaba-annotate.R
================================================
#!/usr/bin/env Rscript
## svaba-annotate: annotate a svaba VCF with gene/exon/intron overlaps and
## candidate fusion calls using UCSC annotation tables.
library(optparse)
option_list = list(
  make_option(c("-i", "--input"),  type = "character", default = NULL,     help = "Input VCF file"),
  make_option(c("-s", "--style"),  type = "character", default = "ncbi",   help = "[ncbi] for 1,2,...X, or [ucsc] for chr1, chr2, ...chrX"),
  make_option(c("-g", "--genome"), type = "character", default = NULL,     help = "genome build (hg19 or hg38)"),
  make_option(c("-b", "--db"),     type = "character", default = "refseq", help = "[refseq] gene database to use (must be refseq or gencode)"),
  make_option(c("-o", "--output"), type = "character", default = "no_id",  help = "Output annotation name")
)
parseobj = OptionParser(option_list=option_list)
opt = parse_args(parseobj)
## --input is required
if (is.null(opt$input))
  stop(print_help(parseobj))
if (!tolower(opt$style) %in% c("ncbi","ucsc"))
  stop("Must specify style as ncbi or ucsc")
## --genome has no default: check NULL explicitly, because tolower(NULL) is a
## zero-length vector and if() on it aborted with an unhelpful
## "argument is of length zero" error instead of this message
if (is.null(opt$genome) || !tolower(opt$genome) %in% c("hg38","hg19"))
  stop("Must specify --genome as hg19 or hg38")
write("...loading required packages\n", stderr())
library(roverlaps) ## can find at https://github.com/walaj/roverlaps
##library(plyr)
library(RMySQL)
library(data.table)
## set up the connection to UCSC's public MySQL mirror of the browser tables
write("...downloading annotation tracks from UCSC\n", stderr())
assembly <- opt$genome
mychannel <- dbConnect(MySQL(), user="genome", host="genome-mysql.cse.ucsc.edu")
## small helper: run a query against the open connection
query <- function(...) dbGetQuery(mychannel, ...)
if (tolower(opt$db) == tolower("refseq")) {
  ## download the exons and their gene names from RefSeq (UCSC track)
  ## - reminder that refseq ids can be interpreted:
  ##   https://www.ncbi.nlm.nih.gov/books/NBK50679/#RefSeqFAQ.what_is_the_difference_between
  ## - (from above): Accession numbers that begin with the prefix XM_ (mRNA), XR_ (non-coding RNA), and XP_ (protein)
  ##   are model RefSeqs produced either by NCBIs genome annotation pipeline or copied from
  ##   computationally annotated submissions to the INSDC.NCBI
  ##   ...subsequently curated RefSeq records (with NM_, NR_, or NP_ accession prefixes)
  genes <- suppressWarnings(data.table::as.data.table(query(paste0("SELECT name, name2, chrom, txStart, txEnd, strand, exonStarts, exonEnds, exonCount, exonFrames FROM ", assembly, ".refGene"))))
  genes[, prefix := gsub("([A-Z]+)_.*","\\1", name)] ## accession prefix (NM/NR/XM/XR)
  ## mark the single longest transcript per gene symbol (name2)
  genes[, longest := { a=rep(FALSE, .N); a[which.max(txEnd - txStart)] <- TRUE; a }, by=name2]
  genes[, numTranscriptAnnotations := .N, by=name2]
  genes <- genes[txEnd - txStart >= 100] ## only genes longer than 100 bp
  genes <- genes[genes$longest] ## just take the longest transcript
  ## bugfix: this progress message previously went to stdout (write()'s default
  ## target), corrupting the annotation table that is itself written to stdout
  ## at the end; all sibling messages already go to stderr()
  write('...defaulting to annotating only the longest transcript', stderr())
  genes[, longest := NULL]
  setnames(genes, c("name2","name"), c("geneSymbol","id"))
} else if (tolower(opt$db) == "gencode") {
  ## if want to use GENCODE instead, need to get table to convert GENCODE ids to gene names
  genes <- suppressWarnings(data.table::as.data.table(query(paste0("SELECT name, chrom, txStart, txEnd, strand, exonStarts, exonEnds, exonCount FROM ", assembly, ".knownGene"))))
  codes <- suppressWarnings(data.table::as.data.table(query(paste0("SELECT kgID, mRNA, geneSymbol, spID, refSeq FROM ", assembly, ".kgXref"))))
  ## merge the gene ids to gene names
  setnames(genes, c("name"), c("kgID"))
  setkey(genes, kgID)
  setkey(codes, kgID)
  genes <- codes[genes]
  stopifnot(!any(duplicated(genes$kgID)))
  genes[, geneSymbol := toupper(geneSymbol)]
  setnames(genes, "kgID", "id")
} else {
  stop("db option must be either GENCODE or RefSeq")
}
write("...parsing downloaded annotation data\n", stderr())
## convert to key'ed format
setnames(genes, c("chrom", "txStart","txEnd"), c("seqnames","start","end"))
## drop the "chr" prefix for ncbi-style contig names
if (tolower(opt$style) == "ncbi")
genes[, seqnames := gsub("^chr", "", seqnames)]
## make the exons structure
## per-transcript comma-separated coordinate lists are split into parallel vectors
## NOTE(review): exonFrames is only selected on the refseq path above; this line
## will fail for --db gencode -- confirm before using that path
starts <- strsplit(genes[, exonStarts], ",")
ends <- strsplit(genes[, exonEnds], ",")
frames <- strsplit(genes[, exonFrames], ",")
## sanity: starts/ends/frames lists and exonCount must all agree in length
stopifnot(sum(sapply(starts, length)) == sum(sapply(ends, length)))
stopifnot(sum(sapply(starts, length)) == sum(sapply(frames, length)))
stopifnot(sum(sapply(starts, length)) == sum(genes$exonCount))
## one row per exon; transcript-level fields repeated exonCount times
exons <- data.table(id = genes[, rep(id, exonCount)],
geneSymbol = genes[, rep(geneSymbol, exonCount)],
exonCount = genes[, rep(exonCount, exonCount)],
seqnames = genes[, rep(seqnames, exonCount)],
frames = as.numeric(unlist(frames)),
start = as.numeric(unlist(starts)),
end = as.numeric(unlist(ends)),
strand = genes[, rep(strand, exonCount)])
## exon numbering follows the sense direction (reversed for '-' strand)
exons[, exonNum := ifelse(strand=="+", seq(.N), rev(seq(.N))), by=id]
## make an introns track
## for the frames, define the intron from as the one of the previous exon, respecting strand
## EEEEiiiiiiiEEEEiiiiiiiiEEEEEiiiiiEEE <<< intron frame is frame of RIGHT exon (iiiiEEEE)
## EEEEiiiiiiiEEEEiiiiiiiiEEEEEiiiiiEEE >>> intron from is frame of LEFT exon (EEEEiii)
## intron i runs from the end of exon i to the start of exon i+1
ends <- exons[exonCount > 1, start[seq(2, .N)], by=id]
starts <- exons[exonCount > 1, end[seq(1, .N-1)], by=id]
if ("exonFrames" %in% colnames(genes)) {
frames <- exons[exonCount > 1, {
if (strand[1]=="+")
frames[seq(1,.N-1)]
else
frames[seq(2,.N)]
}, by=id]
stopifnot(nrow(ends) == nrow(frames))
}
stopifnot(nrow(ends) == nrow(starts))
stopifnot(all(ends$id == starts$id))
stopifnot(min(as.numeric(names(table(exons[exonCount>1, exonCount])))) > 1)
if ("exonFrames" %in% colnames(genes)) {
introns <- data.table(id=ends$id, start=starts$V1, end=ends$V1, frame=frames$V1)
} else {
introns <- data.table(id=ends$id, start=starts$V1, end=ends$V1)
}
## attach transcript-level fields (symbol, contig, strand) to the introns via keyed join
short <- exons[!duplicated(id), .(id, geneSymbol, seqnames, strand, exonCount)]
setkey(introns, id)
setkey(short, id)
introns <- short[introns]
introns[, intronCount := exonCount - 1]
introns[, exonCount := NULL]
introns[, intronNum := ifelse(strand == "+", seq(.N), rev(seq(.N))), by=id]
## make the inter-genic track
#ro <- roverlaps(genes, genes)
#ro[,max(end), by=query.id]$V1
#setkey(genes, seqnames, start)
#starts <- genes[,end[seq(2,.N)], by=seqnames]$V1
#ends <- genes[,start[seq(1,.N-1)], by=seqnames]$V1
## read in file
write("...reading VCF file\n", stderr())
## header lines are stripped with grep; gzipped input is decompressed first
if (grepl("gz$", opt$input)) {
print(paste("gunzip -c", opt$input, "| grep -v ^#"))
vcf <- data.table::fread(cmd=paste("gunzip -c", opt$input, "| grep -v ^#"), sep="\t")
} else {
vcf <- data.table::fread(cmd=paste("grep -v ^#", opt$input), sep="\t")
}
## first nine VCF columns; any genotype columns after V10 keep their default names
setnames(vcf, paste0("V", seq(9)), c("seqnames","start","id","ref","alt","qual","filter","info","geno"))
## bail early if nothing found
if (nrow(vcf) == 0) {
write("No rearrangements found\n", stderr())
quit(status=0)
} else {
write(paste("Found", nrow(vcf), "rearrangements\n"), stderr())
}
## grl.ix is unique ID for each rearrangement
## grl.iix is either 1 or 2 (for each side of the rearrangement)
## both parsed from svaba's "N:M" ID convention
vcf[, grl.ix := gsub("([0-9]+):[0-9]","\\1",id)]
vcf[, grl.iix := as.numeric(gsub("[0-9]+:([0-9])","\\1",id))]
## set the strand info for BND format
## a leading '[' or ']' in the ALT field means the joined piece extends left ('-')
vcf[grepl("SVTYPE=BND", info), strand := ifelse(grepl("^\\[", alt) | grepl("^\\]", alt), '-', '+')]
## inversion-type if both breakends share a strand
vcf[, inv := strand[1] == strand[2], by=grl.ix]
vcf[, altstrand := rev(strand), by=grl.ix]
## mate position/contig parsed out of the BND ALT string (e.g. A[2:321682[)
vcf[, altpos := as.integer(gsub(".*?:([0-9]+).*", "\\1", alt))]
vcf[, altchr := gsub(".*?(\\[|\\])(.*?):([0-9]+).*", "\\2", alt)]
## span = distance between breakends, or -1 for inter-chromosomal events
vcf[, span := ifelse(seqnames==altchr, abs(start - altpos), -1)]
## annotate with gene overlaps
## each ro row is one (breakend, gene) overlap pair; per-breakend values are
## collapsed with "_" before being written back onto vcf.  NOTE(review): the
## vcf[ro$query.id, ...] assignments have duplicated indices when a breakend
## hits several genes; this relies on the collapsed value being identical
## across duplicates -- confirm if adding new uncollapsed columns
ro <- roverlaps::roverlaps(vcf, genes)
ro[, geneSymbol := genes$geneSymbol[subject.id]]
ro[, geneSymbol := paste(unique(geneSymbol), collapse="_"), by=query.id]
## NOTE(review): numTranscriptAnnotations exists only on the refseq path
ro[, geneTranscriptCount := genes$numTranscriptAnnotations[subject.id]]
ro[, geneStrand := genes$strand[subject.id]]
vcf[ro$query.id, geneSymbol := ro$geneSymbol]
vcf[ro$query.id, geneStrand := ro$geneStrand]
## *Alt columns hold the partner breakend's value (rows come in grl.iix order 1,2)
vcf[, geneSymbolAlt := geneSymbol[rev(grl.iix)], by=grl.ix]
vcf[, geneStrandAlt := geneStrand[rev(grl.iix)], by=grl.ix]
vcf[ro$query.id, geneTranscriptCount := ro$geneTranscriptCount]
## annotate with genes within 20kb
## pad each gene by 10kb on both sides and re-run the overlap
PAD <- 20e3
genes20 <- data.table::copy(genes)
genes20[, start := start - PAD/2]
genes20[, end := end + PAD/2]
ro <- roverlaps::roverlaps(vcf, genes20)
ro[, geneSymbol20kb := genes$geneSymbol[subject.id]]
ro[, geneSymbol20kb := paste(unique(geneSymbol20kb), collapse="_"), by=query.id]
vcf[ro$query.id, geneSymbol20kb := ro$geneSymbol20kb]
vcf[, geneSymbol20kbAlt := geneSymbol20kb[rev(grl.iix)], by=grl.ix]
## overlap introns
ro <- roverlaps::roverlaps(vcf, introns)
ro[, intronNum := introns$intronNum[subject.id]]
ro[, intronGene := introns$geneSymbol[subject.id]]
## intronStart = the sense-direction 5' boundary of the intron
ro[, intronStart := ifelse(introns$strand[subject.id] == "+", introns$start[subject.id], introns$end[subject.id])]
ro[, intronGene := paste(unique(intronGene), collapse="_"), by=query.id]
vcf[ro$query.id, intronGene := ro$intronGene]
vcf[ro$query.id, intronNum := ro$intronNum]
vcf[ro$query.id, intronStart := ro$intronStart]
vcf[, intronGeneAlt := intronGene[rev(grl.iix)], by=grl.ix]
vcf[, intronNumAlt := intronNum[rev(grl.iix)], by=grl.ix]
## distance of the breakend into the intron (from its sense 5' boundary)
vcf[, intronBpsIn := abs(intronStart - start)]
## cdsFrame only available when the introns table carries frame info (refseq path)
if ("frame" %in% colnames(introns)) {
ro[, cdsFrame := introns$frame[subject.id]]
vcf[ro$query.id, cdsFrame := ro$cdsFrame]
}
vcf[!is.na(intronNum), bk_msg := paste(intronBpsIn, "bp_into_intron", intronNum, "of", intronGene, sep="_")]
## overlap exons
## mirrors the intron annotation above; exon hits overwrite bk_msg
ro <- roverlaps(vcf, exons)
ro[, exonNum := exons$exonNum[subject.id]]
ro[, exonGene := exons$geneSymbol[subject.id]]
ro[, exonStart := ifelse(exons$strand[subject.id] == "+", exons$start[subject.id], exons$end[subject.id])]
ro[, exonGene := paste(unique(exonGene), collapse="_"), by=query.id]
vcf[ro$query.id, exonGene := ro$exonGene]
vcf[ro$query.id, exonNum := ro$exonNum]
vcf[ro$query.id, exonStart := ro$exonStart]
vcf[, exonGeneAlt := exonGene[rev(grl.iix)], by=grl.ix]
vcf[, exonNumAlt := exonNum[rev(grl.iix)], by=grl.ix]
vcf[, exonBpsIn := abs(exonStart - start)]
vcf[!is.na(exonNum), bk_msg := paste(exonBpsIn, "bp_into_exon", exonNum, "of", exonGene, sep="_")]
## annotate fusions: a candidate fusion is a rearrangement whose two breakends
## both land in an intron/exon (bk_msg set) of two DIFFERENT genes
vcf[, fusion := {
  f <- sum(grepl("intron|exon", bk_msg)) == 2 && (geneSymbol[1] != geneSymbol[2]);
  if (is.na(f)) { FALSE } else { f }
} , by = grl.ix]
## sense check: both breakends in annotated genes, with gene-strand orientation
## compatible with the rearrangement strands.
## NOTE(review): the original condition OR'ed two byte-identical clauses; one was
## likely meant to cover the flipped case
## (geneStrand[1] != geneStrand[2] && strand[1] == strand[2]).  Behavior is kept
## as-is here, simplified to the single clause -- confirm intended logic.
vcf[, sense :=
  all(geneStrand != "") &&
  (geneStrand[1] == geneStrand[2] && strand[1] != strand[2]),
  by = grl.ix]
## DECIDE WHICH PIECE IS "first" in the fusion gene (where Tx starts)
## RAR_STRAND GENE_STRAND
## ++ or -- must be +- or -+. Starts on +
## +- or -- must be -- or ++. Starts on side where gstrand == rarstrand
if (any(vcf$fusion)) {
  vcf[vcf$fusion, gorder := {
    mk <- c(NA,NA)
    if (strand[1] == strand[2]) {
      ## inversion-type rearrangement: the '+'-strand gene comes second
      if (geneStrand[1] == '+')
        mk <- c(2,1)
      else
        mk <- c(1,2)
    } else if (geneStrand[1] == geneStrand[2]) {
      ## non-inversion type
      ## bugfix: this condition previously read geneStrand[1] == geneStrand[1]
      ## (always TRUE), and was a free-standing `if` that unconditionally
      ## clobbered the inversion branch above
      mk <- c(1,2)
    } else {
      mk <- c(2,1)
    }
    mk
  }, by=grl.ix]
} else {
  vcf[, gorder := 1] ##dummy
}
## decide if in frame
if ("cdsFrame" %in% colnames(vcf)) {
  vcf[, in_frame := {
    f = (cdsFrame[gorder[2]] - cdsFrame[gorder[1]]);
    xbases <- 0; ##xbases <- nchar(INSERTION[1]) - nchar(HOMSEQ[1]);
    if (is.na(f[1]))
      FALSE
    else if (all(grepl('intron', bk_msg)) && sense[1]) ## intron to intron
      f == 0
    else
      ## bugfix: was `f + xbases %% 3 == 0`, which parses as
      ## (f + (xbases %% 3)) == 0; the frame difference plus any extra bases
      ## must be taken modulo 3 as a whole
      (f + xbases) %% 3 == 0
  }, by=grl.ix]
  ## make the fusion messages
  ## vectorized `&` (values are constant within each grl.ix group); the original
  ## used `&&` on length-2 group vectors, an error in R >= 4.3
  vcf[, in_frame_fusion := in_frame & fusion & sense, by=grl.ix]
  vcf[vcf$in_frame_fusion, msg := paste("In-frame sense fusion from:", bk_msg[gorder[1]], "to", bk_msg[gorder[2]]), by=grl.ix]
  vcf[!vcf$in_frame_fusion & vcf$fusion & vcf$sense, msg := paste("Out-of-frame sense fusion between:", bk_msg[1], "and", bk_msg[2]), by=grl.ix]
  vcf[!vcf$sense, msg := paste("Anti-sense fusion between:", bk_msg[1], "and", bk_msg[2]), by=grl.ix]
} else {
  ## no frame info available (e.g. the gencode path): create the columns anyway
  ## so the final column selection for write.table cannot fail
  vcf[, c("in_frame", "in_frame_fusion", "msg") := NA]
}
vcf[, c("fusion","gorder") := NULL]
## writing to stdout
## NOTE(review): in_frame / in_frame_fusion / msg are created only inside the
## frame-aware ("cdsFrame") branch above; make sure they exist on every code
## path (e.g. the gencode db path) or this column selection will fail
write.table(vcf[,.(seqnames, start, start, altchr, altpos, altpos, strand, altstrand, id, ref, alt, qual, filter, info, grl.ix, grl.iix, span, geneSymbol, geneStrand, geneSymbol20kb, intronGene, intronNum, bk_msg, exonGene, exonNum, in_frame, in_frame_fusion, msg)],
row.names=FALSE, col.names=TRUE, quote=FALSE, sep="\t", file=stdout())
## ##llr$bk_msg[!grepl("intron|exon", llr$bk_msg)] <- ""
## write("...annotating intergenic")
## fo <- gr.findoverlaps(llr, gene.comp)
## if (nrow(fo)) {
## fo[, left := gene.comp$left[subject.id], by=query.id]
## fo[, right := gene.comp$right[subject.id], by=query.id]
## fo[, right_start := end(gene.comp)[subject.id], by=subject.id]
## fo[, left_end := start(gene.comp)[subject.id], by=subject.id]
## fo[, left_dist := start - left_end, by=query.id]
## fo[, right_dist := right_start - end, by=query.id]
## fo[, mmm := paste(left_dist, "bp to right of", left, "and", right_dist, "bp to left of", right)]
## llr$msg[!nchar(llr$msg)] <- ""
## llr$bk_msg[fo$query.id] <- ifelse(!nchar(llr$bk_msg[fo$query.id]), fo$mmm, llr$bk_msg[fo$query.id])
## }
## ############## make the circos plot
## write("...making Circos plot")
## library(RCircos)
## data(UCSC.HG19.Human.CytoBandIdeogram);
## chr.exclude <- NULL;
## cyto.info <- UCSC.HG19.Human.CytoBandIdeogram;
## tracks.inside <- 10;
## tracks.outside <- 0;
## RCircos.Set.Core.Components(cyto.info, chr.exclude, tracks.inside, tracks.outside);
## ## get the gene label dat
## #genes <- genes[width(genes) < 2e6]
## #fo1 <- gr.findoverlaps(gr1+10e3, genes)
## #fo2 <- gr.findoverlaps(gr2+10e3, genes)
## ## annoying bug with seqinfo clash on 'c'
## #if (length(fo1) && length(fo2)) {
## # fo <- c(fo1,fo2)
## #} else if (length(fo1)) {
## # fo <- fo1
## #} else {
## # fo <- fo2
## #}
## write("...constructing Circos plot")
## ## set the gene labels
## gene.dat <- data.frame(Chromsome=c(dt.bks$chr1, dt.bks$chr2), chromStart=c(dt.bks$pos1, dt.bks$pos2),
## chromEnd=c(dt.bks$pos1,dt.bks$pos2), Gene=as.character(c(dt.bks$gene1, dt.bks$gene2)),
## stringsAsFactors=FALSE)
## if (nrow(gene.dat))
## gene.dat <- gene.dat[nchar(gene.dat$Gene) > 0,]
## if (nrow(gene.dat))
## gene.dat <- gene.dat[!duplicated(gene.dat$Gene),]
## links = data.frame()
## if (length(bks))
## links = with(dt.bks, data.frame(Chromosome=chr1, chromStart=pos1, chromEnd=pos1, Chromsome.1=chr2, chromStart.1=pos2, chromEnd.1=pos2))
## ## plot the PDF
## pdf(file=paste0(opt$output,".pdf"), height=opt$height, width=opt$width, compress=TRUE);
## RCircos.Set.Plot.Area();
## RCircos.Chromosome.Ideogram.Plot();
## if (opt$genes != 0 && nrow(gene.dat) > 0) {
## track.num <- 1
## RCircos.Gene.Connector.Plot(gene.dat, track.num, "in");
## track.num <- 2;
## name.col <- 4;
## RCircos.Gene.Name.Plot(gene.dat, name.col, track.num, "in");
## }
## if (nrow(links) > 0) {
## track.num = 1
## RCircos.Link.Plot(links, track.num, by.chromosome=TRUE) ## by.chromosome is for color
## }
## dev.off()
================================================
FILE: R/archive_non_functional/svaba-asqg2pdf.R
================================================
#!/usr/bin/env Rscript
## svaba-asqg2pdf: render a string graph (.asqg from svaba --write-asqg or sga)
## as a PDF, optionally coloring vertices by connected-component labels (.cc file).
library(optparse)
option_list = list(
make_option(c("-i", "--input"), type = "character", default = NULL, help = "Input asqg file from snowman run --write-asqg or sga"),
make_option(c("-c", "--inputcc"), type = "character", default = NULL, help = "Input cc file from snowman run --write-asqg or sga"),
make_option(c("-o", "--output"), type = "character", default = "graph.pdf", help = "Output pdf to write the graph"),
make_option(c("-d", "--height"), type = "numeric", default = 20, help = "Height"),
make_option(c("-w", "--width"), type = "numeric", default = 20, help = "Width"),
make_option(c("-m", "--mincount"), type = "numeric", default = 1, help = "Remove nodes with fewer than m components")
)
parseobj = OptionParser(option_list=option_list)
opt = parse_args(parseobj)
## --input is required
if (is.null(opt$input))
stop(print_help(parseobj))
require(igraph)
require(data.table)
#opt$input ="/xchip/gistic/Jeremiah/Projects/SnowmanPaper/Benchmark/150830/tmp.graph.after.asqg"
## side file that will hold the extracted VT (vertex) records
vert.file = paste(opt$input, "vert", sep=".")
##reads.file = file.path(dir, 'plots', 'readsForR_som.txt')
##reads <- import.snowman.reads(reads.file)
#cont = 'c_8:43487254-43492254_11'
##cont = 'c_8:43487254-43492254_0'
##this_reads <- reads$rname[as.character(seqnames(reads)) %in% cont]
## read the edges
## count the VT/HT header records so read.delim can skip straight to the edge
## records.  NOTE(review): assumes all VT/HT lines fall within the first 1000
## lines of the file -- confirm for large graphs
ln= readLines(opt$input, n = 1000)
nskip = sum(grepl('^VT|^HT', ln))
tab.e <- read.delim(opt$input, skip=nskip, strings=FALSE, header=FALSE, sep=" ")
## drop the leading record-type tag (e.g. "ED ") from the first field
tab.e$V1 <- substring(tab.e$V1, 4)
colnames(tab.e) <- c('seq1', 'seq2', 'overlap_start1', 'overlap_end1', 'len1',
'overlap_start2', 'overlap_end2', 'len2', 'orientation', 'numdiff')
edges <- data.frame(from=tab.e$seq1, to=tab.e$seq2)
## read the verts
## extract the VT records into a side file via grep, then parse that file
cmd = paste("grep", '"VT"', opt$input, ">", vert.file)
system(cmd)
tab <- read.delim(vert.file, skip=0, strings=FALSE, header=FALSE, sep="\t")
## VT lines carry 3 or 4 tab-separated fields depending on the writer
if (ncol(tab) == 4) {
colnames(tab) <- c("V", "rname", 'seq', 'ss')
} else if (ncol(tab) == 3) {
colnames(tab) <- c("V", "rname", 'seq')
}
## vertex set = every name seen as a VT record or as an edge endpoint
verts <- data.frame(verts=unique(c(as.character(tab$rname), as.character(edges$to), as.character(edges$from))))
g <- graph.data.frame(edges, directed=FALSE, vertices=verts)
## vert lengths
## sequence length per vertex name, for labeling
vert.lens <- structure(nchar(tab$seq), names=tab$rname)
## open the cc file (optional): color vertices by their connected-component id.
## bugfix: guard against a missing -c argument (NULL default) and a nonexistent
## file -- file.info(NULL)$size is zero-length and file.info on a missing path
## gives NA, so the unguarded if() previously aborted the script
if (!is.null(opt$inputcc) && file.exists(opt$inputcc) && file.info(opt$inputcc)$size > 0) {
  cc <- fread(opt$inputcc)
  ## one random color per unique component id (V2), mapped back to each row
  cc$col <- sample(colors(), length(unique(cc$V2)))[match(cc$V2, unique(cc$V2))]
  setkey(cc, col)
  V(g)$color = "black"
  ## paint every vertex whose name (V1) belongs to the component keyed by color x
  for (x in cc$col) {
    V(g)$color[verts$verts %in% cc[x]$V1] <- x
  }
}
## format the verts
## vertex label: read/contig name plus its sequence length
V(g)$names = paste(as.character(verts$verts), "len:", vert.lens[as.character(verts$verts)])
##V(g)$color[V(g)$names %in% this_reads] <- 'red'
## format the edges
## edge label: overlap length on the second sequence
E(g)$lab <- tab.e$overlap_end2 - tab.e$overlap_start2
## cluster
## connected components; drop any component smaller than --mincount
V(g)$community <- membership(cl<-clusters(g))
good_communities <- which(cl$csize >= opt$mincount)
g <- delete.vertices(g, V(g)[!community %in% good_communities])
## render the graph to the requested PDF
pdf(opt$output, height=opt$height, width=opt$width)
plot(g, vertex.color=V(g)$color, vertex.label=V(g)$names, edge.label=E(g)$lab, vertex.size=2)
dev.off()
================================================
FILE: R/archive_non_functional/svaba-bam-qcplot.R
================================================
#!/usr/bin/env Rscript
library(optparse)
option_list = list(
make_option(c("-i", "--input"), type = "character", default = "qcreport.txt", help = "Input txt file from a snowman preprocess qcreport.txt"),
make_option(c("-o", "--output"), type = "character", default = "qcreport.pdf", help = "Output pdf to generate")
)
parseobj = OptionParser(option_list=option_list)
opt = parse_args(parseobj)
if (is.null(opt$input))
stop(print_help(parseobj))
if (!file.exists(opt$input)) {
print(print_help(parseobj))
stop(paste("Input file does not exist", opt$input, ". Must supply path to valid qcreport.txt file (generated from snowman preprocess or snowman run"))
}
print(opt)
require(ggplot2)
require(reshape2)
require(gridExtra)
## read the table
con <- file(opt$input, open = "r")
df.mapq <- df.nm <- df.isize <- df.as <- df.xp <- df.len <- df.phred <- df.clip <- data.frame()
rg <- list()
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
## found a new one
if (grepl("READGROUP", line)) {
thisrg = gsub("READGROUP:BI:(.*)", "\\1", line)
rg[[thisrg]] <- data.frame(readgroup = thisrg)
} else if (grepl("total", line)) {
rg[[thisrg]]$total = as.numeric(gsub("total,([0-9]+)", "\\1", line))
} else if (grepl("unmap", line)) {
rg[[thisrg]]$unmap = as.numeric(gsub("unmap,([0-9]+)", "\\1", line))
} else if (grepl("qcfail", line)) {
rg[[thisrg]]$qcfail = as.numeric(gsub("qcfail,([0-9]+)", "\\1", line))
} else if (grepl("duplicate", line)) {
rg[[thisrg]]$duplicate = as.numeric(gsub("duplicate,([0-9]+)", "\\1", line))
} else if (grepl("supplementary", line)) {
rg[[thisrg]]$supp = as.numeric(gsub("supplementary,([0-9]+)", "\\1", line))
} else if (grepl("mapq", line)) {
df.mapq <- rbind(df.mapq, as.numeric(strsplit(gsub("mapq,([0-9]+)", "\\1", line), ",")[[1]]))
} else if (grepl("nm", line)) {
df.nm <- rbind(df.nm, as.numeric(strsplit(gsub("nm,([0-9]+)", "\\1", line), ",")[[1]]))
} else if (grepl("isize", line)) {
df.isize <- rbind(df.isize, as.numeric(strsplit(gsub("isize,([0-9]+)", "\\1", line), ",")[[1]]))
} else if (grepl("as", line)) {
df.as <- rbind(df.as, as.numeric(strsplit(gsub("as,([0-9]+)", "\\1", line), ",")[[1]]))
} else if (grepl("xp", line)) {
df.xp <- rbind(df.xp, as.numeric(strsplit(gsub("xp,([0-9]+)", "\\1", line), ",")[[1]]))
} else if (grepl("clip", line)) {
df.clip <- rbind(df.clip, as.numeric(strsplit(gsub("clip,([0-9]+)", "\\1", line), ",")[[1]]))
} else if (grepl("len", line)) {
df.len <- rbind(df.len, as.numeric(strsplit(gsub("len,([0-9]+)", "\\1", line), ",")[[1]]))
} else if (grepl("phred", line)) {
df.phred <- rbind(df.phred, as.numeric(strsplit(gsub("phred,([0-9]+)", "\\1", line), ",")[[1]]))
} else {
stop(paste("Failed to read file at line:", line))
}
}
close(con)
## Label histogram columns with their bin values (0-based).
## NOTE(review): assumes df.mapq always has exactly 61 columns (MAPQ 0-60) --
## confirm against the producer of the qc file; the other histograms size
## their labels from ncol() instead.
colnames(df.mapq) <- seq(from=0,to=60)
colnames(df.nm) <- seq(from=0,to=ncol(df.nm)-1)
colnames(df.isize) <- seq(from=0,to=ncol(df.isize)-1)
colnames(df.xp) <- seq(from=0, to=ncol(df.xp)-1)
colnames(df.as) <- seq(from=0, to=ncol(df.as)-1)
colnames(df.len) <- seq(from=0, to=ncol(df.len)-1)
colnames(df.phred) <- seq(from=0, to=ncol(df.phred)-1)
colnames(df.clip) <- seq(from=0, to=ncol(df.clip)-1)
## attach the read-group label (one histogram row per read group)
readg <- sapply(rg, function(x) x$readgroup)
df.mapq$readgroup <- readg
df.nm$readgroup <- readg
df.isize$readgroup <- readg
df.xp$readgroup <- readg
df.as$readgroup <- readg
df.phred$readgroup <- readg
df.len$readgroup <- readg
df.clip$readgroup <- readg
## One log-scale line plot per metric, colored by read group.
## NOTE(review): each ggplot() call below also ASSIGNS the melted data frame
## (e.g. df.isize.m) as a side effect; df.isize.m is reused further down, so
## these statements are order-dependent.
g.mapq <- ggplot(df.mapq.m <- melt(df.mapq, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(30,61)) + ylab('Reads') + xlab('Mapping Quality') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) + theme(legend.text = element_text(size=6), legend.title = element_text(size=6), text = element_text(size=9))
g.nm <- ggplot(df.nm.m <- melt(df.nm, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,50)) + ylab('Reads') + xlab('NM Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) + theme(legend.text = element_text(size=6), legend.title = element_text(size=6), text = element_text(size=9))
g.isize <- ggplot(df.isize.m <- melt(df.isize, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(10,2001)) + ylab('Reads') + xlab('InsertSize') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) + theme(legend.text = element_text(size=6), legend.title = element_text(size=6), text = element_text(size=9))
g.xp <- ggplot(df.xp.m <- melt(df.xp, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,100)) + ylab('Reads') + xlab('XP Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) + theme(legend.text = element_text(size=6), legend.title = element_text(size=6), text = element_text(size=9))
g.as <- ggplot(df.as.m <- melt(df.as, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,100)) + ylab('Reads') + xlab('AS Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) + theme(legend.text = element_text(size=6), legend.title = element_text(size=6), text = element_text(size=9))
g.len <- ggplot(df.len.m <- melt(df.len, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(20,102)) + ylab('Reads') + xlab('Read Length') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) + theme(legend.text = element_text(size=6), legend.title = element_text(size=6), text = element_text(size=9))
g.clip <- ggplot(df.clip.m <- melt(df.clip, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,85)) + ylab('Reads') + xlab('Clipped bases') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) + theme(legend.text = element_text(size=6), legend.title = element_text(size=6), text = element_text(size=9))
g.phred <- ggplot(df.phred.m <- melt(df.phred, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,43)) + ylab('Reads') + xlab('Mean read Phred quality') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) + theme(legend.text = element_text(size=6), legend.title = element_text(size=6), text = element_text(size=9))
## Re-plot insert size on a linear scale restricted to (0, 800).
## NOTE(review): this overwrites the log-scale g.isize built above, which is
## therefore never printed.
df.isize.m$variable <- as.numeric(as.character(df.isize.m$variable))
df.disco <- df.isize.m[df.isize.m$variable > 0 & df.isize.m$variable < 800, ]
g.isize <- ggplot(df.disco, aes(x=variable, y=value, color=readgroup)) + geom_line() + theme_bw() + ylab('Reads') + xlab('Insert Size') + theme(legend.text = element_text(size=6), legend.title = element_text(size=6), text = element_text(size=9))
## get percentages
# disc, proper
df.disc.pie <- data.frame(rbind(rowSums(df.isize[, (800+1):(2000+1)]), rowSums(df.isize[, 1:800])), class=c("\"Discordant\" (I > 800)", "\"Proper\"(I < 800)"))
## NOTE(review): readg is a character vector (from sapply), so levels(readg)
## is NULL and this likely drops the intended read-group column names --
## as.character(unique(readg)) or just readg may have been meant; verify.
colnames(df.disc.pie) <- c(as.character(levels(readg)), "Class")
df.disc.pie.m <- melt(df.disc.pie, id='Class')
g.disc.pie <- ggplot(df.disc.pie.m, aes(x=factor(1), y=value, fill=Class)) + geom_bar(stat='identity', position="fill") + facet_wrap( ~ variable) + coord_polar(theta="y") + xlab("") + ylab("") + theme(legend.text = element_text(size=6), legend.title = element_text(size=6), text = element_text(size=9), legend.position="bottom")
df.clip.pie <- data.frame(rbind(rowSums(df.clip[, 6:102]), rowSums(df.clip[, 1:5])), class=c("\"Clipped\" (clip >= 5)", "\"Matched\"(clip < 5)"))
colnames(df.clip.pie) <- c(as.character(levels(readg)), "Class")
df.clip.pie.m <- melt(df.clip.pie, id='Class')
g.clip.pie <- ggplot(df.clip.pie.m, aes(x=factor(1), y=value, fill=Class)) + geom_bar(stat='identity', position="fill") + facet_wrap( ~ variable) + coord_polar(theta="y") + xlab("") + ylab("") + theme(legend.text = element_text(size=6), legend.title = element_text(size=6), text = element_text(size=9), legend.position="bottom")
## write all panels to a single multi-panel PDF
pdf(opt$output, width=22, height=12)
print(grid.arrange(g.mapq, g.nm, g.isize, g.len, g.clip, g.phred, g.disc.pie, g.clip.pie, ncol=3))
dev.off()
================================================
FILE: R/archive_non_functional/svaba-benchmark.R
================================================
#!/usr/bin/env Rscript
## svaba-benchmark: analyze svaba benchmarking output tables and emit plots.
require(ggplot2)
require(data.table)
## BUGFIX: the original assigned a character vector to a variable named
## ".libPaths" (a silent no-op); .libPaths() must be *called* for the extra
## library directories to join the search path. Also fixed the shebang,
## which read "#!/use/bin/env/ Rscript".
.libPaths(c("/xchip/gistic/Jeremiah/R", "/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.1.1-bioconductor-3.0/lib64/R/library"))
library(optparse)
## benchmarking modes this script knows how to analyze
AVAIL_MODES <- c("realign-test")
option_list = list(
  make_option(c("-i", "--input"), type = "character", default = NULL, help = "Input SV VCF file"),
  make_option(c("-o", "--output"), type = "character", default = "no_id", help = "Output annotation name"),
  make_option(c("-m", "--mode"), type = "character", default = "realign-test", help = "Benchmarking mode to analyze")
)
parseobj = OptionParser(option_list=option_list)
opt = parse_args(parseobj)
## an input file is required
if (is.null(opt$input))
  stop(print_help(parseobj))
## validate the requested mode early
if (!opt$mode %in% AVAIL_MODES)
  stop(paste("Mode", opt$mode, "must be one of", paste(AVAIL_MODES, collapse=", ")))
## Convert raw alignment-outcome counts into per-row fractions and melt them
## to long format for stacked-bar plotting.
## NOTE(review): the 'dfh2' argument is overwritten on the first line from the
## GLOBAL 'dfh', so whatever the caller passes is ignored. This only works
## because 'dfh' exists at top level when the function is invoked (see the
## realign-test block); the signature is misleading -- confirm intent before
## reusing this function elsewhere.
.format_flag_df <- function(dfh2) {
## subset the global dfh to insertion-only rows (no deletions, no SNVs)
dfh2 <- data.table(dfh[dfh$del_size == "0" & dfh$snv_rate == "0" & dfh$ins_size != "0", ])
dfh2$index = seq(nrow(dfh2))
## per-row total across the four alignment-outcome classes
dfh2[, total := sum(c(wrong_align, too_align, no_align, correct)), by=index]
dfh2 = data.frame(dfh2)
## normalize each outcome column by the row total
dfh2[, c("no_align", "wrong_align", "too_align","correct")] = sweep(dfh2[, c('no_align', 'wrong_align', 'too_align', 'correct')], 1, dfh2$total, "/")
## long format: one row per (simulation condition, outcome class)
dfh2 <- melt(dfh2, id.vars=c("width","ins_size", "del_size", "snv_rate", "total", "index"))
return (dfh2)
}
## ---- realign-test mode ----------------------------------------------------
## Reads a table of simulated realignment results and, for every simulation
## condition (sequence width, SNV rate, deletion size, insertion size), plots
## the CDF of the number of alignments plus stacked bars of alignment
## outcomes. Output PDFs are written to hard-coded ~/public_html paths.
if (opt$mode == "realign-test") {
ff <- fread(opt$input, header=TRUE)
df <- data.frame();
dfh <- data.frame();
## iterate the full simulation-parameter grid
for (j in unique(ff$width)) {
for (k in unique(ff$snv_rate)) {
for (d in unique(ff$del_size)) {
for (ii in unique(ff$ins_size)) {
#ii = ifelse(d == 0, 0, ii) ## if del is zero, then make ins zero too, since just want to evaluate zeros
#d = ifelse(ii == 0, 0, d) ## if ins is zero, then make del zero too, since we just want to evaulate zeros
## only evaluate conditions where at most one of ins/del is non-zero
if (ii == 0 || d == 0) {
## NOTE(review): 'd' iterates del_size and 'ii' iterates ins_size, yet the
## mask below matches ins_size against d and del_size against ii -- these
## look swapped; it is harmless only when both are zero. Verify against the
## input table before trusting nonzero conditions.
ix <- ff$width == j & ff$snv_rate == k & ff$ins_size == d & ff$del_size == ii
if (sum(ix)) {
## empirical CDF of the alignment count for this condition
cf <- ecdf(ff$num_align[ix])
s <- seq(0,max(ff$num_align))
df <- rbind(df, data.frame(num_align=s, cdf=cf(s), width=as.character(j), snv_rate=as.character(k), del_size = as.character(d), ins_size=as.character(ii), one.x = 0, one.y = cf(1)))
## outcome counts: unmapped / wrong single hit / multi-hit / correct
dfh <- rbind(dfh, data.frame(no_align=sum(ix & ff$num_align == 0),
wrong_align=sum(ix & ff$correct_hit_num == -1 & ff$num_align == 1),
too_align=sum(ix & ff$num_align > 1),
correct=sum(ix & ff$correct_hit_num == 0),
width=as.character(j),
del_size = as.character(d),
ins_size = as.character(ii),
snv_rate = as.character(k)))
}
}
}
}
}
}
## SNV ONLY
df2 <- df[df$ins_size == "0" & df$del_size == "0", ]
g <- ggplot(data=df2) + geom_line(aes(x=num_align, y=cdf, color=width)) + geom_point(aes(x=one.x, y=one.y, color=width)) + theme_bw() + xlab("Number of alignments") + ylab("CDF") + facet_wrap(~ snv_rate, nrow=1) + scale_y_continuous(limits=c(min(df2$one.y)-0.1,1), breaks=seq(0,1,by=0.05)) + labs(color="Sequence Length")
pdf("~/public_html/realign_test_snv_cigcheck.pdf", width=7, height=2)
print(g)
dev.off()
## DEL ONLY
df2 <- df[df$ins_size == "0" & df$snv_rate == "0", ]
g <- ggplot(data=df2) + geom_line(aes(x=num_align, y=cdf, color=width)) + geom_point(aes(x=one.x, y=one.y, color=width)) + theme_bw() + xlab("Number of alignments") + ylab("CDF") + facet_wrap(~ del_size, nrow=1) + scale_y_continuous(limits=c(min(df2$one.y)-0.1,1), breaks=seq(0,1,by=0.2)) + labs(color="Sequence Length")
pdf("~/public_html/realign_test_del_cigcheck.pdf", width=7, height=2)
print(g)
dev.off()
## INS ONLY
df2 <- df[df$del_size == "0" & df$snv_rate == "0" & df$ins_size != "0", ]
g <- ggplot(data=df2) + geom_line(aes(x=num_align, y=cdf, color=width)) + geom_point(aes(x=one.x, y=one.y, color=width)) + theme_bw() + xlab("Number of alignments") + ylab("CDF") + facet_wrap(~ ins_size, nrow=1) + scale_y_continuous(limits=c(min(df2$one.y)-0.1,1), breaks=seq(0,1,by=0.2)) + labs(color="Sequence Length")
pdf("~/public_html/realign_test_ins_cigcheck.pdf", width=7, height=2); print(g); dev.off()
## NOTE(review): df2 here is the CDF frame (it lacks the outcome-count
## columns); this call only works because .format_flag_df actually reads the
## global 'dfh' and ignores its argument -- see the note on that function.
dfh2 <- .format_flag_df(df2)
g2 <- ggplot(data=dfh2,aes(x=factor(width),y=value,fill=factor(variable))) + geom_bar(position="stack", stat='identity') + facet_wrap(~ ins_size, nrow=1) + scale_fill_manual(values=c("no_align"="black", "wrong_align"="red", "too_align"="purple","correct"="dark green"), labels=c("Unmapped", "Incorrect alignment", "> 1 alignment", "Accurate"), name="Alignment") + scale_y_continuous(breaks=seq(0,1,by=0.25), labels=c("0", "25", "50", "75", "100"), name="Percentage") + xlab("Sequence Length")
pdf("~/public_html/realign_ins_flag.pdf", width=7, height=2); print(g2); dev.off()
#g <- ggplot(data=ff) + geom_histogram(aes(x=num_aligns)) + scale_y_log10(limits=c(1, 10)) + theme_bw()
pdf("~/public_html/realign_test.pdf", width=7, height=7)
print(g)
dev.off()
}
## ---- disabled scratch analysis --------------------------------------------
## Everything in this if (FALSE) block never executes; it is one-off analysis
## code (hard-coded cluster paths) kept as a record of how benchmark figures
## were produced.
if (FALSE) {
## compare called breakpoints against simulated truth junctions
snow <- ra_breaks("/xchip/gistic/Jeremiah/Projects/SnowmanPaper/Benchmark/snow2/chr1.broad-snowman.DATECODE.somatic.sv.vcf")
simd <- fread("/xchip/gistic/Jeremiah/Projects/SnowmanPaper/Benchmark/connections.tsv")
gr.sim <- with(simd, GRanges(c(V1,V1)+1, IRanges(c(V2,V4), width=1), strand=ifelse(c(V3, V5)=='+', '+', '-'), id=rep(seq(nrow(simd)),each=2)))
grl.sim <- split(gr.sim, gr.sim$id)
ra.overlaps(snow, grl.sim, pad=10, ignore.strand=TRUE)
## read it
dt <- fread("/xchip/gistic/Jeremiah/Projects/SnowmanPaper/150805benchmark.csv")
## mean and sd of contig coverage per (kmer_corr, coverage, error_rate) cell
dt [, mean_cc := mean(contig_coverage), by=c('kmer_corr', 'coverage', 'error_rate')]
dt [, se_cc := sd(contig_coverage), by=c('kmer_corr', 'coverage', 'error_rate')]
setkey(dt, kmer_corr, coverage, error_rate)
dt <- unique(dt)
dt <- readRDS("/xchip/gistic/Jeremiah/tracks/100map.dt.rds")
## relabel names
en <- c("0"="Error Rate: 0", "0.001"="Error Rate: 1e-3", "0.005"="Error Rate 5e-3",
"0.01"="Error Rate: 0.01", "0.03"="Error Rate: 0.03", "0.05"="Error Rate: 0.05", "0.1"="Error Rate: 0.1")
## facet labeller mapping raw error-rate values to display strings
.labeller <- function(variable,value){
return(en[value])
}
## plot it
df = data.frame(dt)
df$error_rate = as.character(df$error_rate)
df$kmer_corr = factor(df$kmer_corr, levels=c(0,1))
g <- ggplot(df) + geom_line(aes(x=coverage, y=mean_cc, color=kmer_corr), size=1) +
geom_errorbar(aes(x=coverage,ymin=mean_cc-se_cc, ymax=mean_cc+se_cc, color=kmer_corr), width=1) +
theme_bw() + xlab("Coverage") + ylab("Percent Re-assembled") +
scale_x_continuous(breaks=seq(0,40,by=5)) + facet_grid(error_rate ~ ., labeller=.labeller) +
coord_cartesian(ylim=c(0,1.3)) +
scale_y_continuous(breaks=seq(0,1,by=0.25))
pdf("~/public_html/plot.pdf", width=6, height=12); print(g); dev.off()
}
================================================
FILE: R/archive_non_functional/svaba-bps-to-maflite.R
================================================
#!/usr/bin/env Rscript
## Convert a svaba indel VCF into a tab-delimited MAFLITE-style table.
library(optparse)
suppressMessages(suppressWarnings(require(VariantAnnotation, quietly=TRUE)))
option_list = list(
  make_option(c("-i", "--input"), type = "character", default = NULL, help = "Input bps.txt.gz file"),
  make_option(c("-o", "--output"), type = "character", default = "graph.pdf", help = "Output MAFLITE")
)
parseobj = OptionParser(option_list=option_list)
opt = parse_args(parseobj)
## an input VCF is required
if (is.null(opt$input))
  stop(print_help(parseobj))
v <- readVcf(opt$input, "hg19")
inf <- info(v)
## NOTE(review): rowData() on a VCF is deprecated in newer VariantAnnotation
## releases in favor of rowRanges(); kept for the pinned Bioconductor install.
gr <- rowData(v)
## keep only indel records (EVDNC == "INDEL")
gr <- gr[inf$EVDNC=="INDEL",]
inf <- inf[inf$EVDNC=="INDEL",]
## assemble the MAFLITE columns
tab <- data.frame(chr=as.character(seqnames(gr)), startr=start(gr), end=end(gr), ref=as.character(gr$REF), alt=as.character(gr$ALT), stringsAsFactors=FALSE)
## classify each record by comparing REF/ALT lengths
del = sapply(tab$ref, nchar) > sapply(tab$alt, nchar)
ins = sapply(tab$ref, nchar) < sapply(tab$alt, nchar)
## insertions: ALT is the inserted sequence, REF becomes "-"
tab$alt[ins] <- inf$INSERTION[ins]
## deletions: strip the shared leading base from REF, ALT becomes "-"
tab$ref[del] <- sapply(tab$ref[del], function(x) substr(x, 2, nchar(x)))
tab$alt[del] = "-"
tab$ref[ins] = "-"
tab <- cbind(tab, inf[, c("SCTG", "SPAN", "MAPQ", "TSPLIT", "NSPLIT")])
## BUGFIX: writeLines() requires a character vector and errors on a
## data.frame; emit a tab-delimited table with a header instead.
write.table(tab, file=opt$output, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
================================================
FILE: R/archive_non_functional/svaba-circos.R
================================================
#!/usr/bin/env Rscript
## set the right library paths
## BUGFIX: the original line assigned a character vector to a variable named
## ".libPaths" (a silent no-op); .libPaths() must be *called* for the extra
## library directories to take effect.
.libPaths(c("/xchip/gistic/Jeremiah/R", "/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.1.1-bioconductor-3.0/lib64/R/library"))
require(data.table)
## Parse rearrangement breakpoints into a GRangesList of junction pairs.
##
## rafile        -- path to a .vcf/.vcf.gz of BND records, or a
##                  data.frame/data.table of breakpoints (chr1/pos1/str1 ...)
## keep.features -- attach the input columns as metadata on the output
## seqlengths    -- named chromosome-length vector used to build GRanges
## chr.convert   -- convert numeric 23/24/25 chromosome codes to X/Y/M
## snowman       -- flip strands so breakends point away from the junction
## breakpointer  -- use T_BPpos1/T_BPpos2 columns as positions
## seqlevels     -- optional map from file chromosome labels to actual levels
## get.loose     -- if TRUE, return list(junctions=..., loose.ends=...)
ra_breaks <-
function(rafile, keep.features = T, seqlengths = hg_seqlengths(), chr.convert = T, snowman = FALSE, breakpointer = FALSE, seqlevels = NULL,
get.loose = FALSE ## if TRUE will return a list with fields $junctions and $loose.ends
)
{
if (is.character(rafile))
{
## ---- VCF path: parse BND breakend pairs ----
if (grepl('(vcf$)|(vcf.gz$)', rafile))
{
library(VariantAnnotation)
vcf = readVcf(rafile, Seqinfo(seqnames = names(seqlengths), seqlengths = seqlengths))
if (!('SVTYPE' %in% names(info(vcf)))) {
warning('Vcf not in proper format. Is this a rearrangement vcf?')
return(GRangesList());
}
vgr = rowData(vcf) ## parse BND format
## no events
if (length(vgr) == 0)
return (GRangesList())
## fix mateids if not included: infer partner from a trailing 1/2 in names
if (!"MATEID"%in%colnames(mcols(vgr))) {
nm <- vgr$MATEID <- names(vgr)
ix <- grepl("1$",nm)
vgr$MATEID[ix] = gsub("(.*?)(1)$", "\\12", nm[ix])
vgr$MATEID[!ix] = gsub("(.*?)(2)$", "\\11", nm[!ix])
vgr$SVTYPE="BND"
}
if (!any(c("MATEID", "SVTYPE") %in% colnames(mcols(vgr))))
stop("MATEID or SVTYPE not included. Required")
vgr$mateid = info(vcf)$MATEID
vgr$svtype = info(vcf)$SVTYPE
if (!is.null(info(vcf)$SCTG))
vgr$SCTG = info(vcf)$SCTG
if (sum(vgr$svtype == 'BND')==0)
stop('Vcf not in proper format. Will only process rearrangements in BND format')
## NOTE(review): the count is passed to warning(), not sprintf(); sprintf
## with an unfilled %s errors when this branch triggers. The closing paren
## is misplaced -- should be sprintf('...', sum(...)).
if (!all(vgr$svtype == 'BND'))
warning(sprintf('%s rows of vcf do not have svtype BND, ignoring these'), sum(vgr$svtype != 'BND'))
bix = which(vgr$svtype == "BND")
vgr = vgr[bix]
vgr$first = !grepl('^(\\]|\\[)', vgr$ALT) ## ? is this row the "first breakend" in the ALT string (i.e. does the ALT string not begin with a bracket)
vgr$right = grepl('\\[', vgr$ALT) ## ? are the (sharp ends) of the brackets facing right or left
vgr$coord = as.character(paste(seqnames(vgr), ':', start(vgr), sep = ''))
vgr$mcoord = as.character(gsub('.*(\\[|\\])(.*\\:.*)(\\[|\\]).*', '\\2', vgr$ALT))
vgr$mcoord = gsub('chr', '', vgr$mcoord)
## if MATEID is entirely missing, try to reconstruct the pairing
if (all(is.na(vgr$mateid)))
if (!is.null(names(vgr)) & !any(duplicated(names(vgr))))
{
warning('MATEID tag missing, guessing BND partner by parsing names of vgr')
vgr$mateid = paste(gsub('::\\d$', '', names(vgr)), (sapply(strsplit(names(vgr), '\\:\\:'), function(x) as.numeric(x[length(x)])))%%2 + 1, sep = '::')
}
else if (!is.null(vgr$SCTG))
{
warning('MATEID tag missing, guessing BND partner from coordinates and SCTG')
require(igraph)
ucoord = unique(c(vgr$coord, vgr$mcoord))
vgr$mateid = paste(vgr$SCTG, vgr$mcoord, sep = '_')
if (any(duplicated(vgr$mateid)))
{
warning('DOUBLE WARNING! inferred mateids not unique, check VCF')
bix = bix[!duplicated(vgr$mateid)]
vgr = vgr[!duplicated(vgr$mateid)]
}
}
else
stop('MATEID tag missing')
## pair each breakend with its mate; keep only rows whose mate is present
vgr$mix = as.numeric(match(vgr$mateid, names(vgr)))
pix = which(!is.na(vgr$mix))
vgr.pair = vgr[pix]
if (length(vgr.pair)==0)
stop('No mates found despite nonzero number of BND rows in VCF')
vgr.pair$mix = match(vgr.pair$mix, pix)
## keep one representative per pair (the lower-index breakend)
vix = which(1:length(vgr.pair)<vgr.pair$mix )
vgr.pair1 = vgr.pair[vix]
vgr.pair2 = vgr.pair[vgr.pair1$mix]
## now need to reorient pairs so that the breakend strands are pointing away from the breakpoint
## if "first" and "right" then we set this entry "-" and the second entry "+"
tmpix = vgr.pair1$first & vgr.pair1$right
if (any(tmpix))
{
strand(vgr.pair1)[tmpix] = '-'
strand(vgr.pair2)[tmpix] = '+'
}
## if "first" and "left" then "-", "-"
tmpix = vgr.pair1$first & !vgr.pair1$right
if (any(tmpix))
{
strand(vgr.pair1)[tmpix] = '-'
strand(vgr.pair2)[tmpix] = '-'
}
## if "second" and "left" then "+", "-"
tmpix = !vgr.pair1$first & !vgr.pair1$right
if (any(tmpix))
{
strand(vgr.pair1)[tmpix] = '+'
strand(vgr.pair2)[tmpix] = '-'
}
## if "second" and "right" then "+", "+"
tmpix = !vgr.pair1$first & vgr.pair1$right
if (any(tmpix))
{
strand(vgr.pair1)[tmpix] = '+'
strand(vgr.pair2)[tmpix] = '+'
}
pos1 = as.logical(strand(vgr.pair1)=='+') ## positive strand junctions shift left by one (ie so that they refer to the base preceding the break for these junctions
if (any(pos1))
{
start(vgr.pair1)[pos1] = start(vgr.pair1)[pos1]-1
end(vgr.pair1)[pos1] = end(vgr.pair1)[pos1]-1
}
pos2 = as.logical(strand(vgr.pair2)=='+') ## positive strand junctions shift left by one (ie so that they refer to the base preceding the break for these junctions
if (any(pos2))
{
start(vgr.pair2)[pos2] = start(vgr.pair2)[pos2]-1
end(vgr.pair2)[pos2] = end(vgr.pair2)[pos2]-1
}
## build the junction GRangesList and attach VCF fixed+info columns
ra = grl.pivot(GRangesList(vgr.pair1[, c()], vgr.pair2[, c()]))
this.inf = info(vcf)[bix[pix[vix]], ]
if (is.null(this.inf$POS))
this.inf = cbind(data.frame(POS = ''), this.inf)
if (is.null(this.inf$CHROM))
this.inf = cbind(data.frame(CHROM = ''), this.inf)
## NOTE(review): the check reads $MATL but the column created/assigned below
## is MALT -- looks like a typo, so the guard never matches the real column.
if (is.null(this.inf$MATL))
this.inf = cbind(data.frame(MALT = ''), this.inf)
this.inf$CHROM = seqnames(vgr.pair1)
this.inf$POS = start(vgr.pair1)
this.inf$MATECHROM = seqnames(vgr.pair2)
this.inf$MATEPOS = start(vgr.pair2)
this.inf$MALT = vgr.pair2$ALT
values(ra) = cbind(fixed(vcf)[bix[pix[vix]],], this.inf)
if (is.null(values(ra)$TIER))
values(ra)$tier = ifelse(values(ra)$FILTER == "PASS", 2, 3) ## baseline tiering of PASS vs non PASS variants
else
values(ra)$tier = values(ra)$TIER
if (!get.loose)
return(ra)
else
{
## unpaired breakends become "loose ends"
npix = is.na(vgr$mix)
vgr.loose = vgr[npix, c()] ## these are possible "loose ends" that we will add to the segmentation
values(vgr.loose) = cbind(fixed(vcf)[bix[npix], ], info(vcf)[bix[npix], ])
return(list(junctions = ra, loose.ends = vgr.loose))
}
}
else
rafile = read.delim(rafile)
}
## ---- tab-delimited path: build junctions from a table ----
if (is.data.table(rafile))
rafile = as.data.frame(rafile)
if (nrow(rafile)==0)
{
out = GRangesList()
values(out) = rafile
return(out)
}
if (snowman) ## flip breaks so that they are pointing away from junction
{
rafile$str1 = ifelse(rafile$strand1 == '+', '-', '+')
rafile$str2 = ifelse(rafile$strand2 == '+', '-', '+')
}
if (!is.null(seqlevels)) ## convert seqlevels from notation in tab delim file to actual
{
rafile$chr1 = seqlevels[rafile$chr1]
rafile$chr2 = seqlevels[rafile$chr2]
}
if (is.null(rafile$str1))
rafile$str1 = rafile$strand1
if (is.null(rafile$str2))
rafile$str2 = rafile$strand2
if (!is.null(rafile$pos1) & !is.null(rafile$pos2))
{
if (breakpointer)
{
rafile$pos1 = rafile$T_BPpos1
rafile$pos2 = rafile$T_BPpos2
}
if (!is.numeric(rafile$pos1))
rafile$pos1 = as.numeric(rafile$pos1)
if (!is.numeric(rafile$pos2))
rafile$pos2 = as.numeric(rafile$pos2)
## clean the parenthesis from the string
rafile$str1 <- gsub('[()]', '', rafile$str1)
rafile$str2 <- gsub('[()]', '', rafile$str2)
## normalize 0/1 strand codes to -/+
if (is.character(rafile$str1) | is.factor(rafile$str1))
rafile$str1 = gsub('0', '-', gsub('1', '+', rafile$str1))
if (is.character(rafile$str2) | is.factor(rafile$str2))
rafile$str2 = gsub('0', '-', gsub('1', '+', rafile$str2))
if (is.numeric(rafile$str1))
rafile$str1 = ifelse(rafile$str1>0, '+', '-')
if (is.numeric(rafile$str2))
rafile$str2 = ifelse(rafile$str2>0, '+', '-')
rafile$rowid = 1:nrow(rafile)
## drop rows with missing/invalid coordinates or strands
bad.ix = is.na(rafile$chr1) | is.na(rafile$chr2) | is.na(rafile$pos1) | is.na(rafile$pos2) | is.na(rafile$str1) | is.na(rafile$str2) | rafile$str1 == '*'| rafile$str2 == '*' | rafile$pos1<0 | rafile$pos2<0
rafile = rafile[which(!bad.ix), ]
if (nrow(rafile)==0)
return(GRanges())
## two rows per junction (one per side), then split by junction id
seg = rbind(data.frame(chr = rafile$chr1, pos1 = rafile$pos1, pos2 = rafile$pos1, strand = rafile$str1, ra.index = rafile$rowid, ra.which = 1, stringsAsFactors = F),
data.frame(chr = rafile$chr2, pos1 = rafile$pos2, pos2 = rafile$pos2, strand = rafile$str2, ra.index = rafile$rowid, ra.which = 2, stringsAsFactors = F))
if (chr.convert)
seg$chr = gsub('25', 'M', gsub('24', 'Y', gsub('23', 'X', seg$chr)))
out = seg2gr(seg, seqlengths = seqlengths)[, c('ra.index', 'ra.which')];
out = split(out, out$ra.index)
}
else if (!is.null(rafile$start1) & !is.null(rafile$start2) & !is.null(rafile$end1) & !is.null(rafile$end2))
{
## interval-style input (start/end per side)
ra1 = gr.flip(GRanges(rafile$chr1, IRanges(rafile$start1, rafile$end1), strand = rafile$str1))
ra2 = gr.flip(GRanges(rafile$chr2, IRanges(rafile$start2, rafile$end2), strand = rafile$str2))
out = grl.pivot(GRangesList(ra1, ra2))
}
if (keep.features)
values(out) = rafile[, ]
return(out)
}
## Transpose a GRangesList: element i of the result collects the i-th range
## from every element of the input list.
grl.pivot <- function(x) {
  if (length(x) == 0) {
    ## empty input: return a list of two empty GRanges with matching seqlengths
    empty <- GRanges(seqlengths = seqlengths(x))
    return(GRangesList(empty, empty))
  }
  grouping <- rep(1:length(x[[1]]), length(x))
  split(unlist(x), grouping)
}
## Load a human genome object: either a cached refgene FFT RDS, or the
## BSgenome Hsapiens object for hg19 (default) / hg18.
## NOTE(review): REFGENE.FILE.HG19.FFT is not defined in this file; it must
## exist in the calling environment when fft=TRUE -- confirm.
read_hg <- function(hg19 = T, fft = F) {
  if (fft) {
    return(readRDS(REFGENE.FILE.HG19.FFT))
  }
  require(BSgenome)
  pkg <- if (hg19) "BSgenome.Hsapiens.UCSC.hg19" else "BSgenome.Hsapiens.UCSC.hg18"
  library(pkg, character.only = TRUE)
  return(Hsapiens)
}
## Named vector of chromosome lengths for hg19 (default) or hg18.
## include.junk=FALSE keeps only chr1-22, X, Y, M; chr=FALSE strips the
## "chr" prefix from the names.
hg_seqlengths <- function(hg19 = T, chr = F, include.junk = F) {
  require(BSgenome)
  genome <- read_hg(hg19)
  lens <- seqlengths(genome)
  if (!include.junk) {
    keep <- c(paste('chr', 1:22, sep = ''), 'chrX', 'chrY', 'chrM')
    lens <- lens[keep]
  }
  if (!chr)
    names(lens) <- gsub('chr', '', names(lens))
  return(lens)
}
## Flatten a GRangesList into a single GRanges, recording for each range the
## list element it came from (grl.ix) and its position within that element
## (grl.iix). List-level metadata columns are propagated onto the result.
grl.unlist <- function(grl) {
  if (length(grl) == 0) ## JEREMIAH
    return(GRanges())
  names(grl) <- NULL
  df <- as.data.frame(grl)
  ## column name differs across BioC versions: "element" vs "group"
  elem <- df$element
  if (is.null(elem))
    elem <- df$group
  flat <- unlist(grl)
  flat$grl.ix <- elem
  runs <- rle(elem)
  flat$grl.iix <- unlist(sapply(runs$lengths, function(n) 1:n))
  values(flat) <- cbind(values(grl)[flat$grl.ix, , drop = FALSE], values(flat))
  return(flat)
}
## Convert a GRanges (or a plain data.frame-like object) into a data.table by
## building a data.table(...) constructor call as a string and eval()ing it,
## coercing S4 column types (IntegerList, StringSet, etc.) to base R types.
grdt <-
function(x)
{
require(data.table)
## new approach just directly instantiating data table
cmd = 'data.table(';
if (is(x, 'GRanges'))
{
was.gr = TRUE
## the fixed GRanges slots and their coercion wrappers; each f2[i] supplies
## the opening of a call completed by the "(x))" suffix pasted below
f = c('seqnames', 'start', 'end', 'strand', 'width')
f2 = c('as.character(seqnames', 'c(start', 'c(end', 'as.character(strand', 'as.numeric(width')
cmd = paste(cmd, paste(f, '=', f2, '(x))', sep = '', collapse = ','), sep = '')
value.f = names(values(x))
}
else
{
was.gr = FALSE
value.f = names(x)
}
if (length(value.f)>0)
{
if (was.gr)
cmd = paste(cmd, ',', sep = '')
## discover the class of every metadata column to pick a coercion
class.f = sapply(value.f, function(f) eval(parse(text=sprintf("class(x$'%s')", f))))
.StringSetListAsList = function(x) ### why do I need to do this, bioconductor peeps??
{
tmp1 = as.character(unlist(x))
## NOTE(review): elementLengths() was deprecated in later BioC releases in
## favor of elementNROWS(); kept as-is for the pinned install.
tmp2 = rep(1:length(x), elementLengths(x))
return(split(tmp1, tmp2))
}
## take care of annoying S4 / DataFrame / data.frame (wish-they-were-non-)issues
as.statement = ifelse(grepl('Integer', class.f), 'as.integer',
ifelse(grepl('Character', class.f), 'as.character',
ifelse(grepl('StringSetList', class.f), '.StringSetListAsList',
ifelse(grepl('StringSet$', class.f), 'as.character',
ifelse(grepl('List', class.f), 'as.list',
ifelse(grepl('List', class.f), 'as.list', 'c'))))))
cmd = paste(cmd, paste(value.f, '=', as.statement, "(x$'", value.f, "')", sep = '', collapse = ','), sep = '')
}
cmd = paste(cmd, ')', sep = '')
return(eval(parse(text =cmd)))
}
## Find overlaps between two sets of genomic intervals (GRanges or
## data.table with seqnames/start/end[/strand] columns), returning the
## intersected ranges annotated with query.id / subject.id.
##
## Three engines are selected internally:
##   * data.table::foverlaps (default, controlled by GRFO_FOVERLAPS env var)
##   * pintersect on a seqnames-joined cartesian product (many seqlevels)
##   * IRanges::findOverlaps (fallback; the only engine supporting 'type')
## Very large query x subject products are split into chunks (max.chunk) and
## recombined, optionally in parallel (mc.cores).
##
## first  -- keep only the first hit per query range
## qcol/scol -- metadata columns of query/subject to copy onto the result
## by     -- metadata column that must match between query and subject
## return.type -- 'GRanges', 'data.table', or 'same' (match query's type)
gr.findoverlaps <-
function(query, subject, ignore.strand = T, first = F,
qcol = NULL, ## any query meta data columns to add to result
scol = NULL, ## any subject meta data columns to add to resultx
max.chunk = 1e13,
foverlaps = ifelse(is.na(as.logical(Sys.getenv('GRFO_FOVERLAPS'))), TRUE, as.logical(Sys.getenv('GRFO_FOVERLAPS'))) & exists('foverlaps'),
pintersect = NA,
verbose = F,
type = 'any',
by = NULL,
mc.cores = 1,
return.type = 'same',
...)
{
## only the findOverlaps engine honors a non-'any' overlap type
if (type != 'any')
{
foverlaps = FALSE
pintersect = FALSE
}
## NOTE(review): nchar() on a logical is suspicious here -- foverlaps is
## logical at this point, so nchar(foverlaps) is 4 or 5, never 0; this guard
## appears to be dead. Confirm original intent (likely an env-var check).
if (nchar(foverlaps)==0)
foverlaps = TRUE
if (is.na(foverlaps))
foverlaps = TRUE
isdt <- any(class(query) == 'data.table' )
if (return.type == 'same')
return.type <- ifelse(isdt, 'data.table', 'GRanges')
if (!((inherits(subject, 'GRanges') | inherits(subject, 'data.table')) & (inherits(query, 'GRanges') | inherits(query, 'data.table'))))
stop('both subject and query have to be GRanges or data.table')
## auto-enable pintersect when there are many sequence levels
if (is.na(pintersect))
if (isdt)
pintersect <- length(unique(query$seqnames)) > 50 & length(unique(subject$seqnames)) > 50
else
## NOTE(review): this compares the character vector seqlevels(query) to 50
## elementwise -- probably meant length(seqlevels(query)) > 50; verify.
pintersect <- seqlevels(query) > 50 && seqlevels(subject) > 50
if (is.na(pintersect))
pintersect <- FALSE
## validate requested metadata columns up front
if (!is.null(qcol))
if (!all(qcol %in% names(values(query))))
stop('Some qcol are not present in meta data of query')
if (!is.null(scol))
if (!all(scol %in% names(values(subject))))
stop('Some scol are not present in meta data of subject')
if (!is.null(by))
if (!(by %in% names(values(query)) & by %in% names(values(subject))))
stop('"by" field must be meta data column of both query and subject')
## ---- chunked recursion for very large problems ----
if ((as.numeric(length(query)) * as.numeric(length(subject))) > max.chunk)
{
if (verbose)
cat('Overflow .. computing overlaps in chunks. Adjust max.chunk parameter to gr.findoverlaps to avoid chunked computation\n')
chunk.size = floor(sqrt(max.chunk));
ix1 = c(seq(1, length(query), chunk.size), length(query)+1)
ix2 = c(seq(1, length(subject), chunk.size), length(subject)+1)
ij = cbind(rep(1:(length(ix1)-1), length(ix2)-1), rep(1:(length(ix2)-1), each = length(ix1)-1))
if (verbose)
print(paste('Number of chunks:', nrow(ij)))
out = do.call('c', mclapply(1:nrow(ij),
function(x)
{
if (verbose)
cat(sprintf('chunk i = %s-%s (%s), j = %s-%s (%s)\n', ix1[ij[x,1]], ix1[ij[x,1]+1]-1, length(query),
ix2[ij[x,2]], (ix2[ij[x,2]+1]-1), length(subject)))
i.chunk = ix1[ij[x,1]]:(ix1[ij[x,1]+1]-1)
j.chunk = ix2[ij[x,2]]:(ix2[ij[x,2]+1]-1)
out = gr.findoverlaps(query[i.chunk], subject[j.chunk], ignore.strand = ignore.strand, first = first, pintersect=pintersect, by = by, qcol = qcol, verbose = verbose, foverlaps = foverlaps, scol = scol, type = type, ...)
## remap chunk-local ids back to global indices
out$query.id = i.chunk[out$query.id]
out$subject.id = j.chunk[out$subject.id]
return(out)
}, mc.cores=mc.cores))
## NOTE(review): 'convert' is assigned but never used below.
convert = FALSE
if ((return.type == 'same' & is(query, 'data.table')) | return.type == 'data.table')
out = grdt(out)
return(out)
}
## ---- engine 1: data.table::foverlaps ----
if (foverlaps)
{
if (verbose)
print('overlaps by data.table::foverlaps')
if (ignore.strand)
by = c(by, 'seqnames', 'start', 'end')
else
by = c(by, 'seqnames', 'strand', 'start', 'end')
if (!is.data.table(query))
{
names(query) = NULL
querydt = grdt(query[, setdiff(by, c('seqnames', 'start', 'end', 'strand'))])
}
else
{
if (!all(by %in% names(query)))
stop(paste('the following columns are missing from query:',
paste(by, collapse = ',')))
querydt = query[, by, with = FALSE]
}
if (!is.data.table(subject))
{
names(subject) = NULL
subjectdt = grdt(subject[, setdiff(by, c('seqnames', 'start', 'end', 'strand'))])
}
else
{
if (!all(by %in% names(subject)))
stop(paste('the following columns are missing from subejct:',
paste(by, collapse = ',')))
subjectdt = subject[, by, with = FALSE]
}
ix1 = querydt$query.id = 1:nrow(querydt)
ix2 = subjectdt$subject.id = 1:nrow(subjectdt)
## drop malformed (start > end) intervals before keying
querydt = querydt[start<=end, ]
subjectdt = subjectdt[start<=end, ]
querydt = querydt[, c('query.id', by), with = F]
subjectdt = subjectdt[, c('subject.id', by), with = F]
setkeyv(querydt, by)
setkeyv(subjectdt, by)
h.df = foverlaps(querydt, subjectdt, by.x = by, by.y = by, mult = 'all', type = 'any', verbose = verbose)
h.df = h.df[!is.na(subject.id) & !is.na(query.id), ]
## clip each hit to the intersection of the two intervals
h.df[, start := pmax(start, i.start)]
h.df[, end := pmin(end, i.end)]
if (verbose)
cat(sprintf('Generated %s overlaps\n', nrow(h.df)))
}
else
{
## ---- engines 2 and 3 share this seqname pre-filter ----
if (isdt) {
sn1 <- query$seqnames
sn2 <- subject$seqnames
} else {
sn1 = as.character(seqnames(query))
sn2 = as.character(seqnames(subject))
}
if (is.null(by))
{
ix1 = which(sn1 %in% sn2)
ix2 = which(sn2 %in% sn1)
}
else
{
by1 = values(query)[, by]
by2 = values(subject)[, by]
ix1 = which(sn1 %in% sn2 & by1 %in% by2)
ix2 = which(sn2 %in% sn1 & by2 %in% by1)
by1 = by1[ix1]
by2 = by2[ix2]
}
query.ix = query[ix1]
subject.ix = subject[ix2]
sn1 = sn1[ix1]
sn2 = sn2[ix2]
## ---- engine 2: cartesian join on seqnames + pintersect ----
if (pintersect)
{
if (verbose)
print('overlaps by pintersect')
require(data.table)
if (length(sn1)>0 & length(sn2)>0)
{
if (is.null(by))
{
dt1 <- data.table(i=seq_along(sn1), sn=sn1, key="sn")
dt2 <- data.table(j=seq_along(sn2), sn=sn2, key="sn")
ij <- merge(dt1, dt2, by = 'sn', allow.cartesian=TRUE)
}
else
{
dt1 <- data.table(i=seq_along(sn1), sn=sn1, by = by1, key=c("sn", "by"))
dt2 <- data.table(j=seq_along(sn2), sn=sn2, by = by2, key=c("sn", "by"))
ij <- merge(dt1, dt2, by = c('sn', 'by'), allow.cartesian=TRUE)
}
if (ignore.strand && isdt)
subject$strand <- '*'
else if (ignore.strand)
strand(subject) = '*'
qr <- query.ix[ij$i]
sb <- subject.ix[ij$j]
if (!isdt) {
seqlengths(qr) <- rep(NA, length(seqlengths(qr)))
seqlengths(sb) <- rep(NA, length(seqlengths(sb)))
}
if (!isdt && any(as.character(seqnames(qr)) != as.character(seqnames(sb))))
warning('gr.findoverlaps: violated pintersect assumption')
## changed to ranges(qr) etc rather than just GRanges call. Major problem if too many seqlevels
if (isdt) {
rqr <- IRanges(start=qr$start, end=qr$end)
rsb <- IRanges(start=sb$start, end=sb$end)
} else {
rqr <- ranges(qr)
rsb <- ranges(sb)
}
tmp <- pintersect(rqr, rsb, resolve.empty = 'start.x', ...)
names(tmp) = NULL
## width-zero results are non-overlapping pairs; drop them
non.empty = which(width(tmp)!=0)
h.df = as.data.frame(tmp[non.empty])
if (isdt)
h.df$seqnames <- qr$seqnames[non.empty]
else
h.df$seqnames <- as.character(seqnames(qr))[non.empty]
h.df$query.id = ij$i[non.empty]
h.df$subject.id = ij$j[non.empty]
}
else
h.df = data.frame()
}
else
{
## ---- engine 3: IRanges::findOverlaps, then filter by seqname/by ----
if (verbose)
print('overlaps by findOverlaps')
if (isdt) {
rqr <- IRanges(start=query.ix$start, end=query.ix$end)
rsb <- IRanges(start=subject.ix$start, end=subject.ix$end)
} else {
rqr <- ranges(query.ix)
rsb <- ranges(subject.ix)
}
h <- findOverlaps(rqr, rsb, type = type)
## NOTE(review): ranges(hits, query, subject) was deprecated in later
## IRanges releases (overlapsRanges); kept for the pinned install.
r <- ranges(h, rqr, rsb)
h.df <- data.frame(start = start(r), end = end(r), query.id = queryHits(h), subject.id = subjectHits(h), stringsAsFactors = F);
# sn.query = as.character(seqnames(query))[h.df$query.id]
# sn.subject = as.character(seqnames(subject))[h.df$subject.id]
sn.query <- sn1[h.df$query.id]
sn.subject <- sn2[h.df$subject.id]
if (is.null(by))
keep.ix <- sn.query == sn.subject
else
{
by.query <- by1[h.df$query.id]
by.subject <- by2[h.df$subject.id]
keep.ix <- sn.query == sn.subject & by.query == by.subject
}
h.df <- h.df[keep.ix, ]
h.df$seqnames <- sn.query[keep.ix];
}
## strand filtering ('*' matches anything)
if (!ignore.strand)
{
h.df$strand <- str.query <- as.character(strand(query)[ix1[h.df$query.id]])
str.subject <- as.character(strand(subject)[ix2[h.df$subject.id]])
h.df <- h.df[which(str.query == str.subject | str.query == '*' | str.subject == '*'),]
}
else if (nrow(h.df)>0)
h.df$strand = '*'
}
if (first)
h.df = h.df[!duplicated(h.df$query.id), ]
## ---- package the result in the requested type ----
if (return.type=='GRanges')
if (nrow(h.df)>0)
{
if (('strand' %in% names(h.df)))
out.gr = GRanges(h.df$seqnames, IRanges(h.df$start, h.df$end),
query.id = ix1[h.df$query.id], subject.id = ix2[h.df$subject.id], strand = h.df$strand, seqlengths = seqlengths(query))
else
out.gr = GRanges(h.df$seqnames, IRanges(h.df$start, h.df$end),
query.id = ix1[h.df$query.id], subject.id = ix2[h.df$subject.id], seqlengths = seqlengths(query))
if (!is.null(qcol))
values(out.gr) = cbind(values(out.gr), values(query)[out.gr$query.id, qcol, drop = FALSE])
if (!is.null(scol))
values(out.gr) = cbind(values(out.gr), values(subject)[out.gr$subject.id, scol, drop = FALSE])
return(out.gr)
}
else
return(GRanges(seqlengths = seqlengths(query)))
else
if (nrow(h.df)>0) {
if (!is.data.table(h.df))
h.df = as.data.table(h.df)
h.df$query.id <- ix1[h.df$query.id]
h.df$subject.id <- ix2[h.df$subject.id]
if (!is.null(qcol))
h.df = cbind(h.df, as.data.table(as.data.frame(values(query))[h.df$query.id, qcol, drop = FALSE]))
if (!is.null(scol))
h.df = cbind(h.df, as.data.table(as.data.frame(values(subject))[h.df$subject.id, scol, drop = FALSE]))
## drop foverlaps' i.start/i.end working columns if present
if ('i.start' %in% colnames(h.df))
h.df[, i.start := NULL]
if ('i.end' %in% colnames(h.df))
h.df[, i.end := NULL]
return(h.df)
} else {
return(data.table())
}
}
library(optparse)
option_list = list(
  make_option(c("-i", "--input"), type = "character", default = NULL, help = "Input SV VCF file"),
  make_option(c("-o", "--output"), type = "character", default = "circos", help = "Output basename of pdf to write the graph"),
  make_option(c("-g", "--genes"), type = "logical", default = TRUE, help = "Add genes to the plot?"),
  make_option(c("-H", "--height"), type = "numeric", default = 10, help = "Height"),
  make_option(c("-W", "--width"), type = "numeric", default = 10, help = "Width")
)
parseobj = OptionParser(option_list=option_list)
opt = parse_args(parseobj)

if (is.null(opt$input))
  stop(print_help(parseobj))

bks <- ra_breaks(opt$input)

## filter out discordant-only calls.
## FIX: guard against an empty which() -- in R, x[-integer(0)] selects ZERO
## elements, so the original unconditional `bks[-which(...)]` silently
## dropped EVERY call whenever no DSCRD-only call was present
dscrd.ix <- which(mcols(bks)$EVDNC == "DSCRD")
if (length(dscrd.ix) > 0)
  bks <- bks[-dscrd.ix]

require(RCircos)
data(UCSC.HG19.Human.CytoBandIdeogram);
chr.exclude <- NULL;
cyto.info <- UCSC.HG19.Human.CytoBandIdeogram;
tracks.inside <- 10;
tracks.outside <- 0;
RCircos.Set.Core.Components(cyto.info, chr.exclude, tracks.inside, tracks.outside);

## one GRanges row per breakend; grl.iix is 1 or 2 within each junction pair
b <- grl.unlist(bks)

## get the gene label data
## NOTE(review): hard-coded cluster path -- script only runs where this
## directory is mounted; consider making it an option
basedir <- '/xchip/gistic/Jeremiah/tracks'
genes <- readRDS(file.path(basedir, 'gr.allgenes.rds'))
genes <- genes[width(genes) < 2e6]
fo <- gr.findoverlaps(b+10e3, genes)
fo <- fo[!duplicated(fo$subject.id)]
gene.dat = data.frame(Chromosome = seqnames(genes[fo$subject.id]), chromStart=start(genes[fo$subject.id]),
                      chromEnd=end(genes[fo$subject.id]), Gene=genes$gene[fo$subject.id])
print(gene.dat)

gename.col <- 4;
side <- "in";
track.num <- 1;

x1 = b$grl.iix==1
x2 = b$grl.iix==2
## one link row per junction: breakend 1 columns, then breakend 2 columns
## (fixed column-name typo chromeEnd.1 -> chromEnd.1; RCircos reads the
## columns positionally so this is cosmetic but keeps the names consistent)
links = data.frame(Chromosome=seqnames(b[x1]), chromStart=start(b[x1]), chromEnd=end(b[x1]),
                   Chromosome.1=seqnames(b[x2]), chromStart.1=start(b[x2]), chromEnd.1=end(b[x2]))

## plot the PDF
pdf(file=paste0(opt$output,".pdf"), height=opt$height, width=opt$width, compress=TRUE);
RCircos.Set.Plot.Area();
RCircos.Chromosome.Ideogram.Plot();
if (opt$genes && nrow(gene.dat) > 0) {
  RCircos.Gene.Connector.Plot(gene.dat, track.num, side);
  track.num <- 2;
  name.col <- 4;
  RCircos.Gene.Name.Plot(gene.dat, name.col, track.num, side);
}
if (nrow(links) > 0)
  RCircos.Link.Plot(links, track.num, by.chromosome=TRUE) ## by.chromosome controls link color
dev.off()
================================================
FILE: R/archive_non_functional/svaba-create-pon.R
================================================
#!/usr/bin/env Rscript

## Build a panel-of-normals (PON) table: read a list of germline VCF paths
## and row-bind their breakpoints into a single data.table.

library(optparse)
## rbindlist / mclapply come from data.table / parallel; the original
## script never loaded either and would fail when run standalone
suppressMessages(suppressWarnings(require(data.table, quietly = TRUE)))
suppressMessages(suppressWarnings(require(parallel, quietly = TRUE)))

option_list = list(
  make_option(c("-i", "--input"), type = "character", default = "qcreport.txt", help = "Input file containing paths to germline "),
  make_option(c("-o", "--output"), type = "character", default = "qcreport.pdf", help = "Output panel of normals file")
)
parseobj = OptionParser(option_list=option_list)
opt = parse_args(parseobj)

if (is.null(opt$input))
  stop(print_help(parseobj))
if (!file.exists(opt$input)) {
  print(print_help(parseobj))
  stop(paste("Input file does not exist", opt$input, ". Must supply path to valid qcreport.txt file (generated from snowman preprocess or snowman run"))
}
print(opt)

## FIX: removed the hard-coded debug overrides of opt$input / opt$output
## that silently clobbered the command-line arguments parsed above (and
## made the file.exists() check meaningless)

## the first column of the input file holds one VCF path per line
opt$vcf_files = read.delim(opt$input, stringsAsFactors=FALSE)[,1]

dt.all <- mclapply(opt$vcf_files, function(x) {
  print(x)
  if (!file.exists(x)) {
    warning("File does not exist")
    ## FIX: return NULL (not -1) so rbindlist() below skips missing files
    ## instead of erroring on a non-data.table list element
    return(NULL)
  }
  ## NOTE(review): read_vcf / gr2dt are project helpers defined elsewhere
  ab <- read_vcf(x)
  ab$file = x
  return(gr2dt(ab))
}, mc.cores=15)

dt <- rbindlist(dt.all)
## need to add indel type, also SVs
================================================
FILE: R/archive_non_functional/svaba-event-plot.R
================================================
#!/usr/bin/env Rscript
## source all the required packages
## Load every package and in-house R source file this script depends on.
## Pure side effects: attaches ~20 packages (quietly) and source()s five
## helper files from the GIT_HOME tree. Progress is printed as it goes.
## NOTE(review): hard-coded fallback path and source() targets only exist
## on the original cluster -- this script lives in archive_non_functional.
source.all <- function() {
## root of the in-house code checkout; fall back to the cluster path when
## the GIT_HOME environment variable is unset/empty
githome <- Sys.getenv('GIT_HOME')
if (!nchar(githome))
githome <- '/xchip/gistic/Jeremiah/GIT'
suppressMessages(suppressWarnings(require(ff, quietly=TRUE)))
suppressMessages(suppressWarnings(require(VariantAnnotation, quietly=TRUE)))
suppressMessages(suppressWarnings(require(rtracklayer, quietly=TRUE)))
suppressMessages(suppressWarnings(require(data.table, quietly=TRUE)))
suppressMessages(suppressWarnings(require(plyr, quietly=TRUE)))
print('...sourced 5 packages')
suppressMessages(suppressWarnings(require(ggplot2, quietly=TRUE)))
suppressMessages(suppressWarnings(require(reshape2, quietly=TRUE)))
suppressMessages(suppressWarnings(require(GenomicRanges, quietly=TRUE)))
suppressMessages(suppressWarnings(require(popbio, quietly=TRUE)))
suppressMessages(suppressWarnings(require(BSgenome.Hsapiens.UCSC.hg19, quietly=TRUE)))
print('...sourced 10 packages')
suppressMessages(suppressWarnings(require(bitops, quietly=TRUE)))
suppressMessages(suppressWarnings(require(seqinr, quietly=TRUE)))
suppressMessages(suppressWarnings(require(Rsamtools, quietly=TRUE)))
## NOTE(review): ff is required twice (also above) and multicore was
## superseded by the parallel package in modern R -- confirm environment
suppressMessages(suppressWarnings(require(ff, quietly=TRUE)))
suppressMessages(suppressWarnings(require(multicore, quietly=TRUE)))
print('...sourced 15 packages')
suppressMessages(suppressWarnings(require(Biostrings, quietly=TRUE)))
suppressMessages(suppressWarnings(require(rtracklayer, quietly=TRUE)))
suppressMessages(suppressWarnings(require(lattice, quietly=TRUE)))
suppressMessages(suppressWarnings(require(RColorBrewer, quietly=TRUE)))
suppressMessages(suppressWarnings(require(Matrix, quietly=TRUE)))
print('...sourced 20 packages')
## in-house utility files sourced from the checkout
source(file.path(githome, "grUtils", "grUtils.R"))
source(file.path(githome, "trackData", "trackData.R"))
source(file.path(githome, "marcin", "R", "functions.R"))
source(file.path(githome, "marcin", "R", "db.R"))
source(file.path(githome, "isva", "TaigaSig", "sigUtils.R"))
print('...sourced grUtils.R, trackData.R, marcin/functions.R, marcin/db.R, sigUtils.R')
}
## attach all packages and in-house helpers before doing anything else
source.all()
##################
## PARSE OPTIONS
##################
library(optparse)
option_list = list(
make_option(c("-f", "--FHworkspace"), type = "character", default = NULL, help = "Firehose workspace to retrieve data from"),
make_option(c("-o", "--outdir"), type = "character", default = getwd(), help = "Firehose workspace to retrieve data from"),
make_option(c("-p", "--FHpairset"), type = "character", default = NULL, help = "Firehose pairset to retreive data from"),
make_option(c("-a", "--FHannotation"), type = "character", default = NULL, help = "Firehose annoation to retreive data from"),
make_option(c("-c", "--cores"), type = "numeric", default = 1, help = "Number of cores to use"),
make_option(c("-v", "--VCFlist"), type = "character", default = "/home/unix/jwala/test.vcflist.txt", help = "File containing a list of VCFs"))
parseobj = OptionParser(option_list=option_list)
opt = parse_args(parseobj)
### test data
## NOTE(review): these hard-coded test-data paths override the -o/--outdir
## and -v/--VCFlist options parsed above -- remove them to honor the
## command-line arguments (file is in archive_non_functional)
opt$outdir = '/xchip/gistic/Jeremiah/tmp_sig'
dir.create(opt$outdir, showWarnings=FALSE)
opt$VCFlist = '/xchip/gistic/Jeremiah/Projects/Significance/Sanger578/list.txt'
## all subsequent relative paths resolve under the output directory
setwd(opt$outdir)
###################
###################
================================================
FILE: R/archive_non_functional/svaba-histogram.R
================================================
#!/usr/bin/env Rscript
## set the right library paths.
## FIX: .libPaths is a function and must be CALLED -- the original
## `.libPaths = c(...)` merely bound a character vector to a variable
## named .libPaths and never changed the library search path.
.libPaths(c("/xchip/gistic/Jeremiah/R", "/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.1.1-bioconductor-3.0/lib64/R/library"))
## Extract the directory component of each path, trailing slash included;
## a bare filename with no directory part yields "". Vectorized over paths.
file.dir <- function(paths) {
  ## group 2 captures everything up to and including the final slash
  dir.pattern <- '(^|(.*\\/))?([^\\/]*)$'
  gsub(dir.pattern, '\\2', paths)
}
require(data.table)
suppressMessages(suppressWarnings(require(GenomicRanges, quietly = TRUE)))
suppressMessages(suppressWarnings(require(gplots, quietly = TRUE)))
## Parse structural-variant breakpoints into a GRangesList of breakend pairs.
##
## rafile:        path to a VCF (.vcf/.vcf.gz, BND notation) or tab-delimited
##                rearrangement table, or an already-loaded data.frame/data.table
## keep.features: attach remaining input columns as values() on the output
## seqlengths:    named chromosome lengths used to build the VCF Seqinfo
## chr.convert:   map chromosomes 23/24/25 -> X/Y/M (tab-delimited path only)
## snowman:       flip strands so breakends point away from the junction
## breakpointer:  take positions from T_BPpos1/T_BPpos2 columns
## seqlevels:     optional map from file chromosome notation to actual seqlevels
## get.loose:     if TRUE (VCF path only) return list(junctions=, loose.ends=)
##
## Fixes vs. original: (1) the "non-BND rows" warning passed the row count to
## warning() instead of sprintf(), leaving the %s unfilled; (2) the MALT
## column presence check tested a misspelled name (MATL).
ra_breaks <-
function(rafile, keep.features = T, seqlengths = hg_seqlengths(), chr.convert = T, snowman = FALSE, breakpointer = FALSE, seqlevels = NULL,
         get.loose = FALSE ## if TRUE will return a list with fields $junctions and $loose.ends
         )
{
  if (is.character(rafile))
  {
    if (grepl('(vcf$)|(vcf.gz$)', rafile))
    {
      library(VariantAnnotation)
      vcf = readVcf(rafile, Seqinfo(seqnames = names(seqlengths), seqlengths = seqlengths))
      if (!('SVTYPE' %in% names(info(vcf)))) {
        warning('Vcf not in proper format. Is this a rearrangement vcf?')
        return(GRangesList());
      }
      ## NOTE(review): rowData() on a VCF object was deprecated in newer
      ## VariantAnnotation in favor of rowRanges() -- confirm installed version
      vgr = rowData(vcf) ## parse BND format
      ## no events
      if (length(vgr) == 0)
        return (GRangesList())
      ## fix mateids if not included: infer the partner id by swapping a
      ## trailing 1 <-> 2 on the record name
      if (!"MATEID"%in%colnames(mcols(vgr))) {
        nm <- vgr$MATEID <- names(vgr)
        ix <- grepl("1$",nm)
        vgr$MATEID[ix] = gsub("(.*?)(1)$", "\\12", nm[ix])
        vgr$MATEID[!ix] = gsub("(.*?)(2)$", "\\11", nm[!ix])
        vgr$SVTYPE="BND"
      }
      if (!any(c("MATEID", "SVTYPE") %in% colnames(mcols(vgr))))
        stop("MATEID or SVTYPE not included. Required")
      vgr$mateid = info(vcf)$MATEID
      vgr$svtype = info(vcf)$SVTYPE
      if (!is.null(info(vcf)$SCTG))
        vgr$SCTG = info(vcf)$SCTG
      if (sum(vgr$svtype == 'BND')==0)
        stop('Vcf not in proper format. Will only process rearrangements in BND format')
      if (!all(vgr$svtype == 'BND'))
        ## FIX: the count must be an sprintf() argument; originally it was
        ## passed to warning() and the %s placeholder was never filled
        warning(sprintf('%s rows of vcf do not have svtype BND, ignoring these', sum(vgr$svtype != 'BND')))
      bix = which(vgr$svtype == "BND")
      vgr = vgr[bix]
      vgr$first = !grepl('^(\\]|\\[)', vgr$ALT) ## ? is this row the "first breakend" in the ALT string (i.e. does the ALT string not begin with a bracket)
      vgr$right = grepl('\\[', vgr$ALT) ## ? are the (sharp ends) of the brackets facing right or left
      vgr$coord = as.character(paste(seqnames(vgr), ':', start(vgr), sep = ''))
      ## mate coordinate is the chr:pos between the brackets of the ALT string
      vgr$mcoord = as.character(gsub('.*(\\[|\\])(.*\\:.*)(\\[|\\]).*', '\\2', vgr$ALT))
      vgr$mcoord = gsub('chr', '', vgr$mcoord)
      ## MATEID entirely missing: try two fallback strategies before giving up
      if (all(is.na(vgr$mateid)))
        if (!is.null(names(vgr)) & !any(duplicated(names(vgr))))
        {
          warning('MATEID tag missing, guessing BND partner by parsing names of vgr')
          vgr$mateid = paste(gsub('::\\d$', '', names(vgr)), (sapply(strsplit(names(vgr), '\\:\\:'), function(x) as.numeric(x[length(x)])))%%2 + 1, sep = '::')
        }
        else if (!is.null(vgr$SCTG))
        {
          warning('MATEID tag missing, guessing BND partner from coordinates and SCTG')
          require(igraph)
          ucoord = unique(c(vgr$coord, vgr$mcoord))
          vgr$mateid = paste(vgr$SCTG, vgr$mcoord, sep = '_')
          if (any(duplicated(vgr$mateid)))
          {
            warning('DOUBLE WARNING! inferred mateids not unique, check VCF')
            bix = bix[!duplicated(vgr$mateid)]
            vgr = vgr[!duplicated(vgr$mateid)]
          }
        }
        else
          stop('MATEID tag missing')
      ## mix = index of each record's mate within vgr (NA when unmatched)
      vgr$mix = as.numeric(match(vgr$mateid, names(vgr)))
      pix = which(!is.na(vgr$mix))
      vgr.pair = vgr[pix]
      if (length(vgr.pair)==0)
        stop('No mates found despite nonzero number of BND rows in VCF')
      vgr.pair$mix = match(vgr.pair$mix, pix)
      ## keep each pair once: the member whose index precedes its mate's
      vix = which(1:length(vgr.pair)<vgr.pair$mix )
      vgr.pair1 = vgr.pair[vix]
      vgr.pair2 = vgr.pair[vgr.pair1$mix]
      ## now need to reorient pairs so that the breakend strands are pointing away from the breakpoint
      ## if "first" and "right" then we set this entry "-" and the second entry "+"
      tmpix = vgr.pair1$first & vgr.pair1$right
      if (any(tmpix))
      {
        strand(vgr.pair1)[tmpix] = '-'
        strand(vgr.pair2)[tmpix] = '+'
      }
      ## if "first" and "left" then "-", "-"
      tmpix = vgr.pair1$first & !vgr.pair1$right
      if (any(tmpix))
      {
        strand(vgr.pair1)[tmpix] = '-'
        strand(vgr.pair2)[tmpix] = '-'
      }
      ## if "second" and "left" then "+", "-"
      tmpix = !vgr.pair1$first & !vgr.pair1$right
      if (any(tmpix))
      {
        strand(vgr.pair1)[tmpix] = '+'
        strand(vgr.pair2)[tmpix] = '-'
      }
      ## if "second" and "right" then "+", "+"
      tmpix = !vgr.pair1$first & vgr.pair1$right
      if (any(tmpix))
      {
        strand(vgr.pair1)[tmpix] = '+'
        strand(vgr.pair2)[tmpix] = '+'
      }
      pos1 = as.logical(strand(vgr.pair1)=='+') ## positive strand junctions shift left by one (ie so that they refer to the base preceding the break for these junctions
      if (any(pos1))
      {
        start(vgr.pair1)[pos1] = start(vgr.pair1)[pos1]-1
        end(vgr.pair1)[pos1] = end(vgr.pair1)[pos1]-1
      }
      pos2 = as.logical(strand(vgr.pair2)=='+') ## positive strand junctions shift left by one (ie so that they refer to the base preceding the break for these junctions
      if (any(pos2))
      {
        start(vgr.pair2)[pos2] = start(vgr.pair2)[pos2]-1
        end(vgr.pair2)[pos2] = end(vgr.pair2)[pos2]-1
      }
      ## pivot so element i of the GRangesList holds breakends 1 and 2 of junction i
      ra = grl.pivot(GRangesList(vgr.pair1[, c()], vgr.pair2[, c()]))
      this.inf = info(vcf)[bix[pix[vix]], ]
      if (is.null(this.inf$POS))
        this.inf = cbind(data.frame(POS = ''), this.inf)
      if (is.null(this.inf$CHROM))
        this.inf = cbind(data.frame(CHROM = ''), this.inf)
      ## FIX: the original tested the misspelled this.inf$MATL, so the MALT
      ## placeholder column was added even when MALT already existed
      if (is.null(this.inf$MALT))
        this.inf = cbind(data.frame(MALT = ''), this.inf)
      this.inf$CHROM = seqnames(vgr.pair1)
      this.inf$POS = start(vgr.pair1)
      this.inf$MATECHROM = seqnames(vgr.pair2)
      this.inf$MATEPOS = start(vgr.pair2)
      this.inf$MALT = vgr.pair2$ALT
      values(ra) = cbind(fixed(vcf)[bix[pix[vix]],], this.inf)
      if (is.null(values(ra)$TIER))
        values(ra)$tier = ifelse(values(ra)$FILTER == "PASS", 2, 3) ## baseline tiering of PASS vs non PASS variants
      else
        values(ra)$tier = values(ra)$TIER
      if (!get.loose)
        return(ra)
      else
      {
        npix = is.na(vgr$mix)
        vgr.loose = vgr[npix, c()] ## these are possible "loose ends" that we will add to the segmentation
        values(vgr.loose) = cbind(fixed(vcf)[bix[npix], ], info(vcf)[bix[npix], ])
        return(list(junctions = ra, loose.ends = vgr.loose))
      }
    }
    else
      rafile = read.delim(rafile)
  }
  if (is.data.table(rafile))
    rafile = as.data.frame(rafile)
  ## empty table: return an empty GRangesList carrying the (empty) features
  if (nrow(rafile)==0)
  {
    out = GRangesList()
    values(out) = rafile
    return(out)
  }
  if (snowman) ## flip breaks so that they are pointing away from junction
  {
    rafile$str1 = ifelse(rafile$strand1 == '+', '-', '+')
    rafile$str2 = ifelse(rafile$strand2 == '+', '-', '+')
  }
  if (!is.null(seqlevels)) ## convert seqlevels from notation in tab delim file to actual
  {
    rafile$chr1 = seqlevels[rafile$chr1]
    rafile$chr2 = seqlevels[rafile$chr2]
  }
  if (is.null(rafile$str1))
    rafile$str1 = rafile$strand1
  if (is.null(rafile$str2))
    rafile$str2 = rafile$strand2
  if (!is.null(rafile$pos1) & !is.null(rafile$pos2))
  {
    if (breakpointer)
    {
      rafile$pos1 = rafile$T_BPpos1
      rafile$pos2 = rafile$T_BPpos2
    }
    if (!is.numeric(rafile$pos1))
      rafile$pos1 = as.numeric(rafile$pos1)
    if (!is.numeric(rafile$pos2))
      rafile$pos2 = as.numeric(rafile$pos2)
    ## clean the parenthesis from the string
    rafile$str1 <- gsub('[()]', '', rafile$str1)
    rafile$str2 <- gsub('[()]', '', rafile$str2)
    ## normalize strand notation: 0/1 -> -/+ for text, sign for numerics
    if (is.character(rafile$str1) | is.factor(rafile$str1))
      rafile$str1 = gsub('0', '-', gsub('1', '+', rafile$str1))
    if (is.character(rafile$str2) | is.factor(rafile$str2))
      rafile$str2 = gsub('0', '-', gsub('1', '+', rafile$str2))
    if (is.numeric(rafile$str1))
      rafile$str1 = ifelse(rafile$str1>0, '+', '-')
    if (is.numeric(rafile$str2))
      rafile$str2 = ifelse(rafile$str2>0, '+', '-')
    rafile$rowid = 1:nrow(rafile)
    ## drop rows with missing/invalid coordinates or strands
    bad.ix = is.na(rafile$chr1) | is.na(rafile$chr2) | is.na(rafile$pos1) | is.na(rafile$pos2) | is.na(rafile$str1) | is.na(rafile$str2) | rafile$str1 == '*'| rafile$str2 == '*' | rafile$pos1<0 | rafile$pos2<0
    rafile = rafile[which(!bad.ix), ]
    if (nrow(rafile)==0)
      ## NOTE(review): returns GRanges() here but GRangesList() elsewhere --
      ## callers relying on a list type should confirm this is intended
      return(GRanges())
    seg = rbind(data.frame(chr = rafile$chr1, pos1 = rafile$pos1, pos2 = rafile$pos1, strand = rafile$str1, ra.index = rafile$rowid, ra.which = 1, stringsAsFactors = F),
                data.frame(chr = rafile$chr2, pos1 = rafile$pos2, pos2 = rafile$pos2, strand = rafile$str2, ra.index = rafile$rowid, ra.which = 2, stringsAsFactors = F))
    if (chr.convert)
      seg$chr = gsub('25', 'M', gsub('24', 'Y', gsub('23', 'X', seg$chr)))
    out = seg2gr(seg, seqlengths = seqlengths)[, c('ra.index', 'ra.which')];
    out = split(out, out$ra.index)
  }
  else if (!is.null(rafile$start1) & !is.null(rafile$start2) & !is.null(rafile$end1) & !is.null(rafile$end2))
  {
    ra1 = gr.flip(GRanges(rafile$chr1, IRanges(rafile$start1, rafile$end1), strand = rafile$str1))
    ra2 = gr.flip(GRanges(rafile$chr2, IRanges(rafile$start2, rafile$end2), strand = rafile$str2))
    out = grl.pivot(GRangesList(ra1, ra2))
  }
  if (keep.features)
    values(out) = rafile[, ]
  return(out)
}
## Transpose a GRangesList: output element i collects the i-th range of every
## input element (e.g. a list of N breakend pairs becomes a pair of N-length
## GRanges). An empty input yields a two-element list of empty GRanges.
grl.pivot <- function(x) {
  if (length(x) == 0) {
    empty.gr <- GRanges(seqlengths = seqlengths(x))
    return(GRangesList(empty.gr, empty.gr))
  }
  flat <- unlist(x)
  grouping <- rep(1:length(x[[1]]), length(x))
  split(flat, grouping)
}
## Load and return a human reference genome object.
## fft = TRUE short-circuits to an RDS read; otherwise attach the hg19 or
## hg18 BSgenome package and return its Hsapiens object.
read_hg <- function(hg19 = T, fft = F) {
  ## NOTE(review): REFGENE.FILE.HG19.FFT must be defined by the caller's
  ## environment -- it is not defined in this file
  if (fft)
    return(readRDS(REFGENE.FILE.HG19.FFT))
  require(BSgenome)
  genome.pkg <- if (hg19) 'BSgenome.Hsapiens.UCSC.hg19' else 'BSgenome.Hsapiens.UCSC.hg18'
  library(genome.pkg, character.only = TRUE)
  Hsapiens
}
## Named vector of human chromosome lengths.
## include.junk = FALSE restricts to chr1-22, X, Y, M;
## chr = FALSE strips the 'chr' prefix from the names.
hg_seqlengths <- function(hg19 = T, chr = F, include.junk = F) {
  require(BSgenome)
  genome.lengths <- seqlengths(read_hg(hg19))
  if (!include.junk) {
    canonical <- c(paste('chr', 1:22, sep = ''), 'chrX', 'chrY', 'chrM')
    genome.lengths <- genome.lengths[canonical]
  }
  if (!chr)
    names(genome.lengths) <- gsub('chr', '', names(genome.lengths))
  genome.lengths
}
## Flatten a GRangesList into one GRanges, recording provenance:
## grl.ix = index of the originating list element, grl.iix = position within
## that element. List-level metadata columns are propagated to each range.
grl.unlist <- function(grl) {
  if (length(grl) == 0) ## JEREMIAH
    return(GRanges())
  names(grl) <- NULL
  grl.df <- as.data.frame(grl)
  ## the grouping column is named 'element' or 'group' depending on the
  ## GenomicRanges version
  group.ix <- grl.df$element
  if (is.null(group.ix))
    group.ix <- grl.df$group
  flat <- unlist(grl)
  flat$grl.ix <- group.ix
  ## within-element counter: 1..len for each run of equal group indices
  runs <- rle(group.ix)
  flat$grl.iix <- unlist(sapply(runs$lengths, function(n) 1:n))
  values(flat) <- cbind(values(grl)[flat$grl.ix, , drop = FALSE], values(flat))
  flat
}
## Convert a GRanges (or any object whose columns are reachable via x$name)
## into a data.table, coercing S4 column types to base-R equivalents.
## Builds a data.table(...) call as a string and evaluates it.
grdt <-
function(x)
{
  require(data.table)
  ## new approach just directly instantiating data table
  cmd = 'data.table(';
  if (is(x, 'GRanges'))
  {
    was.gr = TRUE
    ## core GRanges columns and their coercion wrappers
    f = c('seqnames', 'start', 'end', 'strand', 'width')
    f2 = c('as.character(seqnames', 'c(start', 'c(end', 'as.character(strand', 'as.numeric(width')
    cmd = paste(cmd, paste(f, '=', f2, '(x))', sep = '', collapse = ','), sep = '')
    value.f = names(values(x))
  }
  else
  {
    was.gr = FALSE
    value.f = names(x)
  }
  if (length(value.f)>0)
  {
    if (was.gr)
      cmd = paste(cmd, ',', sep = '')
    class.f = sapply(value.f, function(f) eval(parse(text=sprintf("class(x$'%s')", f))))
    ## flatten an XStringSetList into a plain list of character vectors
    .StringSetListAsList = function(x) ### why do I need to do this, bioconductor peeps??
    {
      tmp1 = as.character(unlist(x))
      ## NOTE(review): elementLengths() was deprecated in S4Vectors in favor
      ## of elementNROWS() -- confirm against the installed Bioconductor
      tmp2 = rep(1:length(x), elementLengths(x))
      return(split(tmp1, tmp2))
    }
    ## take care of annoying S4 / DataFrame / data.frame (wish-they-were-non-)issues:
    ## map each column's S4 class onto a base-R coercion function.
    ## FIX: removed the duplicated, unreachable grepl('List', ...) branch the
    ## original nested ifelse carried (the inner copy could never be reached).
    as.statement = ifelse(grepl('Integer', class.f), 'as.integer',
                   ifelse(grepl('Character', class.f), 'as.character',
                   ifelse(grepl('StringSetList', class.f), '.StringSetListAsList',
                   ifelse(grepl('StringSet$', class.f), 'as.character',
                   ifelse(grepl('List', class.f), 'as.list', 'c')))))
    cmd = paste(cmd, paste(value.f, '=', as.statement, "(x$'", value.f, "')", sep = '', collapse = ','), sep = '')
  }
  cmd = paste(cmd, ')', sep = '')
  return(eval(parse(text =cmd)))
}
gr.findoverlaps <-
function(query, subject, ignore.strand = T, first = F,
qcol = NULL, ## any query meta data columns to add to result
scol = NULL, ## any subject meta data columns to add to resultx
max.chunk = 1e13,
foverlaps = ifelse(is.na(as.logical(Sys.getenv('GRFO_FOVERLAPS'))), TRUE, as.logical(Sys.getenv('GRFO_FOVERLAPS'))) & exists('foverlaps'),
pintersect = NA,
verbose = F,
type = 'any',
by = NULL,
mc.cores = 1,
return.type = 'same',
...)
{
if (type != 'any')
{
foverlaps = FALSE
pintersect = FALSE
}
if (nchar(foverlaps)==0)
foverlaps = TRUE
if (is.na(foverlaps))
foverlaps = TRUE
isdt <- any(class(query) == 'data.table' )
if (return.type == 'same')
return.type <- ifelse(isdt, 'data.table', 'GRanges')
if (!((inherits(subject, 'GRanges') | inherits(subject, 'data.table')) & (inherits(query, 'GRanges') | inherits(query, 'data.table'))))
stop('both subject and query have to be GRanges or data.table')
if (is.na(pintersect))
if (isdt)
pintersect <- length(unique(query$seqnames)) > 50 & length(unique(subject$seqnames)) > 50
else
pintersect <- seqlevels(query) > 50 && seqlevels(subject) > 50
if (is.na(pintersect))
pintersect <- FALSE
if (!is.null(qcol))
if (!all(qcol %in% names(values(query))))
stop('Some qcol are not present in meta data of query')
if (!is.null(scol))
if (!all(scol %in% names(values(subject))))
stop('Some scol are not present in meta data of subject')
if (!is.null(by))
if (!(by %in% names(values(query)) & by %in% names(values(subject))))
stop('"by" field must be meta data column of both query and subject')
if ((as.numeric(length(query)) * as.numeric(length(subject))) > max.chunk)
{
if (verbose)
cat('Overflow .. computing overlaps in chunks. Adjust max.chunk parameter to gr.findoverlaps to avoid chunked computation\n')
chunk.size = floor(sqrt(max.chunk));
ix1 = c(seq(1, length(query), chunk.size), length(query)+1)
ix2 = c(seq(1, length(subject), chunk.size), length(subject)+1)
ij = cbind(rep(1:(length(ix1)-1), length(ix2)-1), rep(1:(length(ix2)-1), each = length(ix1)-1))
if (verbose)
print(paste('Number of chunks:', nrow(ij)))
out = do.call('c', mclapply(1:nrow(ij),
function(x)
{
if (verbose)
cat(sprintf('chunk i = %s-%s (%s), j = %s-%s (%s)\n', ix1[ij[x,1]], ix1[ij[x,1]+1]-1, length(query),
ix2[ij[x,2]], (ix2[ij[x,2]+1]-1), length(subject)))
i.chunk = ix1[ij[x,1]]:(ix1[ij[x,1]+1]-1)
j.chunk = ix2[ij[x,2]]:(ix2[ij[x,2]+1]-1)
out = gr.findoverlaps(query[i.chunk], subject[j.chunk], ignore.strand = ignore.strand, first = first, pintersect=pintersect, by = by, qcol = qcol, verbose = verbose, foverlaps = foverlaps, scol = scol, type = type, ...)
out$query.id = i.chunk[out$query.id]
out$subject.id = j.chunk[out$subject.id]
return(out)
}, mc.cores=mc.cores))
convert = FALSE
if ((return.type == 'same' & is(query, 'data.table')) | return.type == 'data.table')
out = grdt(out)
return(out)
}
if (foverlaps)
{
if (verbose)
print('overlaps by data.table::foverlaps')
if (ignore.strand)
by = c(by, 'seqnames', 'start', 'end')
else
by = c(by, 'seqnames', 'strand', 'start', 'end')
if (!is.data.table(query))
{
names(query) = NULL
querydt = grdt(query[, setdiff(by, c('seqnames', 'start', 'end', 'strand'))])
}
else
{
if (!all(by %in% names(query)))
stop(paste('the following columns are missing from query:',
paste(by, collapse = ',')))
querydt = query[, by, with = FALSE]
}
if (!is.data.table(subject))
{
names(subject) = NULL
subjectdt = grdt(subject[, setdiff(by, c('seqnames', 'start', 'end', 'strand'))])
}
else
{
if (!all(by %in% names(subject)))
stop(paste('the following columns are missing from subejct:',
paste(by, collapse = ',')))
subjectdt = subject[, by, with = FALSE]
}
ix1 = querydt$query.id = 1:nrow(querydt)
ix2 = subjectdt$subject.id = 1:nrow(subjectdt)
querydt = querydt[start<=end, ]
subjectdt = subjectdt[start<=end, ]
querydt = querydt[, c('query.id', by), with = F]
subjectdt = subjectdt[, c('subject.id', by), with = F]
setkeyv(querydt, by)
setkeyv(subjectdt, by)
h.df = foverlaps(querydt, subjectdt, by.x = by, by.y = by, mult = 'all', type = 'any', verbose = verbose)
h.df = h.df[!is.na(subject.id) & !is.na(query.id), ]
h.df[, start := pmax(start, i.start)]
h.df[, end := pmin(end, i.end)]
if (verbose)
cat(sprintf('Generated %s overlaps\n', nrow(h.df)))
}
else
{
if (isdt) {
sn1 <- query$seqnames
sn2 <- subject$seqnames
} else {
sn1 = as.character(seqnames(query))
sn2 = as.character(seqnames(subject))
}
if (is.null(by))
{
ix1 = which(sn1 %in% sn2)
ix2 = which(sn2 %in% sn1)
}
else
{
by1 = values(query)[, by]
by2 = values(subject)[, by]
ix1 = which(sn1 %in% sn2 & by1 %in% by2)
ix2 = which(sn2 %in% sn1 & by2 %in% by1)
by1 = by1[ix1]
by2 = by2[ix2]
}
query.ix = query[ix1]
subject.ix = subject[ix2]
sn1 = sn1[ix1]
sn2 = sn2[ix2]
if (pintersect)
{
if (verbose)
print('overlaps by pintersect')
require(data.table)
if (length(sn1)>0 & length(sn2)>0)
{
if (is.null(by))
{
dt1 <- data.table(i=seq_along(sn1), sn=sn1, key="sn")
dt2 <- data.table(j=seq_along(sn2), sn=sn2, key="sn")
ij <- merge(dt1, dt2, by = 'sn', allow.cartesian=TRUE)
}
else
{
dt1 <- data.table(i=seq_along(sn1), sn=sn1, by = by1, key=c("sn", "by"))
dt2 <- data.table(j=seq_along(sn2), sn=sn2, by = by2, key=c("sn", "by"))
ij <- merge(dt1, dt2, by = c('sn', 'by'), allow.cartesian=TRUE)
}
if (ignore.strand && isdt)
subject$strand <- '*'
else if (ignore.strand)
strand(subject) = '*'
qr <- query.ix[ij$i]
sb <- subject.ix[ij$j]
if (!isdt) {
seqlengths(qr) <- rep(NA, length(seqlengths(qr)))
seqlengths(sb) <- rep(NA, length(seqlengths(sb)))
}
if (!isdt && any(as.character(seqnames(qr)) != as.character(seqnames(sb))))
warning('gr.findoverlaps: violated pintersect assumption')
## changed to ranges(qr) etc rather than just GRanges call. Major problem if too many seqlevels
if (isdt) {
rqr <- IRanges(start=qr$start, end=qr$end)
rsb <- IRanges(start=sb$start, end=sb$end)
} else {
rqr <- ranges(qr)
rsb <- ranges(sb)
}
tmp <- pintersect(rqr, rsb, resolve.empty = 'start.x', ...)
names(tmp) = NULL
non.empty = which(width(tmp)!=0)
h.df = as.data.frame(tmp[non.empty])
if (isdt)
h.df$seqnames <- qr$seqnames[non.empty]
else
h.df$seqnames <- as.character(seqnames(qr))[non.empty]
h.df$query.id = ij$i[non.em
gitextract_ugvk6jc2/
├── CLAUDE.md
├── CMakeLists.txt
├── Dockerfile
├── LICENSE
├── R/
│ └── archive_non_functional/
│ ├── create-databases.R
│ ├── gen_quals.R
│ ├── svaba-annotate.R
│ ├── svaba-asqg2pdf.R
│ ├── svaba-bam-qcplot.R
│ ├── svaba-benchmark.R
│ ├── svaba-bps-to-maflite.R
│ ├── svaba-circos.R
│ ├── svaba-create-pon.R
│ ├── svaba-event-plot.R
│ ├── svaba-histogram.R
│ ├── svaba-nozzle.R
│ ├── svaba-sig.R
│ └── svaba-vcf-to-maflite.R
├── README.md
├── docs/
│ ├── README.md
│ ├── alignments_viewer.html
│ ├── app.js
│ ├── bps_explorer.html
│ ├── bps_viewer.html
│ ├── comparison.html
│ ├── index.html
│ ├── learn_explorer.html
│ ├── r2c_explorer.html
│ ├── runtime_explorer.html
│ └── styles.css
├── notes
├── opt/
│ ├── jemalloc_test.sh
│ ├── memprof.sh
│ ├── memprof_osx.sh
│ ├── memusg.sh
│ ├── profiler.sh
│ └── runtime.R
├── scripts/
│ ├── combine_blacklists.sh
│ ├── extract_by_qname.sh
│ ├── extract_discordants.sh
│ ├── extract_pairs_by_seq.sh
│ ├── filter_contig_supporting_reads.sh
│ ├── gcloud_teardown.sh
│ ├── mosdepth_lowmapq_blacklist.sh
│ ├── plot_learn.sh
│ ├── r2c_for_contig.sh
│ ├── search_sequence.sh
│ ├── sort_and_dedupe_bps_old.sh
│ ├── sort_bps.sh
│ ├── svaba_cloud.sh
│ ├── svaba_local_function.sh
│ ├── svaba_postprocess.sh
│ └── update_svaba_image.sh
├── src/
│ ├── SGA/
│ │ ├── Algorithm/
│ │ │ ├── ClusterProcess.cpp
│ │ │ ├── ClusterProcess.h
│ │ │ ├── ConnectProcess.cpp
│ │ │ ├── ConnectProcess.h
│ │ │ ├── DPAlignment.cpp
│ │ │ ├── DPAlignment.h
│ │ │ ├── ErrorCorrectProcess.cpp
│ │ │ ├── ErrorCorrectProcess.h
│ │ │ ├── ExtensionDP.cpp
│ │ │ ├── ExtensionDP.h
│ │ │ ├── FMMergeProcess.cpp
│ │ │ ├── FMMergeProcess.h
│ │ │ ├── GapFillProcess.cpp
│ │ │ ├── GapFillProcess.h
│ │ │ ├── HaplotypeBuilder.cpp
│ │ │ ├── HaplotypeBuilder.h
│ │ │ ├── KmerOverlaps.cpp
│ │ │ ├── KmerOverlaps.h
│ │ │ ├── LRAlignment.cpp
│ │ │ ├── LRAlignment.h
│ │ │ ├── Makefile.am
│ │ │ ├── Makefile.in
│ │ │ ├── OverlapAlgorithm.cpp
│ │ │ ├── OverlapAlgorithm.h
│ │ │ ├── OverlapBlock.cpp
│ │ │ ├── OverlapBlock.h
│ │ │ ├── OverlapTools.cpp
│ │ │ ├── OverlapTools.h
│ │ │ ├── QCProcess.cpp
│ │ │ ├── QCProcess.h
│ │ │ ├── ReadCluster.cpp
│ │ │ ├── ReadCluster.h
│ │ │ ├── SearchHistory.cpp
│ │ │ ├── SearchHistory.h
│ │ │ ├── SearchSeed.cpp
│ │ │ ├── SearchSeed.h
│ │ │ ├── StatsProcess.cpp
│ │ │ ├── StatsProcess.h
│ │ │ ├── StringGraphGenerator.cpp
│ │ │ ├── StringGraphGenerator.h
│ │ │ ├── StringThreader.cpp
│ │ │ ├── StringThreader.h
│ │ │ ├── VariationBuilderCommon.cpp
│ │ │ └── VariationBuilderCommon.h
│ │ ├── Bigraph/
│ │ │ ├── Bigraph.cpp
│ │ │ ├── Bigraph.h
│ │ │ ├── Edge.cpp
│ │ │ ├── Edge.h
│ │ │ ├── EdgeDesc.cpp
│ │ │ ├── EdgeDesc.h
│ │ │ ├── GraphCommon.h
│ │ │ ├── Makefile.am
│ │ │ ├── Makefile.in
│ │ │ ├── Vertex.cpp
│ │ │ └── Vertex.h
│ │ ├── SGA/
│ │ │ ├── Makefile.am
│ │ │ ├── Makefile.in
│ │ │ ├── OverlapCommon.cpp
│ │ │ ├── OverlapCommon.h
│ │ │ ├── SGACommon.h
│ │ │ ├── index.cpp
│ │ │ ├── index.h
│ │ │ ├── overlap.cpp
│ │ │ └── overlap.h
│ │ ├── SQG/
│ │ │ ├── ASQG.cpp
│ │ │ ├── ASQG.h
│ │ │ ├── Makefile.am
│ │ │ ├── Makefile.in
│ │ │ ├── SQG.cpp
│ │ │ └── SQG.h
│ │ ├── StringGraph/
│ │ │ ├── CompleteOverlapSet.cpp
│ │ │ ├── CompleteOverlapSet.h
│ │ │ ├── GraphSearchTree.h
│ │ │ ├── Makefile.am
│ │ │ ├── Makefile.in
│ │ │ ├── RemovalAlgorithm.cpp
│ │ │ ├── RemovalAlgorithm.h
│ │ │ ├── SGAlgorithms.cpp
│ │ │ ├── SGAlgorithms.h
│ │ │ ├── SGSearch.cpp
│ │ │ ├── SGSearch.h
│ │ │ ├── SGUtil.cpp
│ │ │ ├── SGUtil.h
│ │ │ ├── SGVisitors.cpp
│ │ │ ├── SGVisitors.h
│ │ │ ├── SGWalk.cpp
│ │ │ └── SGWalk.h
│ │ ├── SuffixTools/
│ │ │ ├── BWT.h
│ │ │ ├── BWTAlgorithms.cpp
│ │ │ ├── BWTAlgorithms.h
│ │ │ ├── BWTCABauerCoxRosone.cpp
│ │ │ ├── BWTCABauerCoxRosone.h
│ │ │ ├── BWTCARopebwt.cpp
│ │ │ ├── BWTCARopebwt.h
│ │ │ ├── BWTDiskConstruction.cpp
│ │ │ ├── BWTDiskConstruction.h
│ │ │ ├── BWTIndexSet.h
│ │ │ ├── BWTInterval.h
│ │ │ ├── BWTIntervalCache.cpp
│ │ │ ├── BWTIntervalCache.h
│ │ │ ├── BWTReader.cpp
│ │ │ ├── BWTReader.h
│ │ │ ├── BWTReaderAscii.cpp
│ │ │ ├── BWTReaderAscii.h
│ │ │ ├── BWTReaderBinary.cpp
│ │ │ ├── BWTReaderBinary.h
│ │ │ ├── BWTTraverse.cpp
│ │ │ ├── BWTTraverse.h
│ │ │ ├── BWTWriter.cpp
│ │ │ ├── BWTWriter.h
│ │ │ ├── BWTWriterAscii.cpp
│ │ │ ├── BWTWriterAscii.h
│ │ │ ├── BWTWriterBinary.cpp
│ │ │ ├── BWTWriterBinary.h
│ │ │ ├── FMMarkers.h
│ │ │ ├── GapArray.cpp
│ │ │ ├── GapArray.h
│ │ │ ├── HitData.h
│ │ │ ├── InverseSuffixArray.cpp
│ │ │ ├── InverseSuffixArray.h
│ │ │ ├── Makefile.am
│ │ │ ├── Makefile.in
│ │ │ ├── Occurrence.cpp
│ │ │ ├── Occurrence.h
│ │ │ ├── PopulationIndex.cpp
│ │ │ ├── PopulationIndex.h
│ │ │ ├── QuickBWT.cpp
│ │ │ ├── QuickBWT.h
│ │ │ ├── RLBWT.cpp
│ │ │ ├── RLBWT.h
│ │ │ ├── RLUnit.h
│ │ │ ├── RankProcess.cpp
│ │ │ ├── RankProcess.h
│ │ │ ├── SACAInducedCopying.cpp
│ │ │ ├── SACAInducedCopying.h
│ │ │ ├── SAReader.cpp
│ │ │ ├── SAReader.h
│ │ │ ├── SAWriter.cpp
│ │ │ ├── SAWriter.h
│ │ │ ├── SBWT.cpp
│ │ │ ├── SBWT.h
│ │ │ ├── STCommon.cpp
│ │ │ ├── STCommon.h
│ │ │ ├── STGlobals.h
│ │ │ ├── SampledSuffixArray.cpp
│ │ │ ├── SampledSuffixArray.h
│ │ │ ├── SparseGapArray.h
│ │ │ ├── SuffixArray.cpp
│ │ │ ├── SuffixArray.h
│ │ │ ├── SuffixCompare.cpp
│ │ │ └── SuffixCompare.h
│ │ └── Util/
│ │ ├── Alphabet.cpp
│ │ ├── Alphabet.h
│ │ ├── BWT4Codec.h
│ │ ├── BWTCodec.h
│ │ ├── Bamreader.cpp
│ │ ├── BitChar.cpp
│ │ ├── BitChar.h
│ │ ├── BitVector.cpp
│ │ ├── BitVector.h
│ │ ├── BloomFilter.cpp
│ │ ├── BloomFilter.h
│ │ ├── ClusterReader.cpp
│ │ ├── ClusterReader.h
│ │ ├── Contig.cpp
│ │ ├── Contig.h
│ │ ├── CorrectionThresholds.cpp
│ │ ├── CorrectionThresholds.h
│ │ ├── DNACodec.h
│ │ ├── DNADouble.h
│ │ ├── DNAString.cpp
│ │ ├── DNAString.h
│ │ ├── EncodedString.h
│ │ ├── HashMap.h
│ │ ├── Interval.cpp
│ │ ├── Interval.h
│ │ ├── IntervalTree.h
│ │ ├── KmerDistribution.cpp
│ │ ├── KmerDistribution.h
│ │ ├── Makefile.am
│ │ ├── Makefile.in
│ │ ├── Match.cpp
│ │ ├── Match.h
│ │ ├── Metrics.h
│ │ ├── MultiAlignment.cpp
│ │ ├── MultiAlignment.h
│ │ ├── MultiOverlap.cpp
│ │ ├── MultiOverlap.h
│ │ ├── NoCodec.h
│ │ ├── Pileup.cpp
│ │ ├── Pileup.h
│ │ ├── PrimerScreen.cpp
│ │ ├── PrimerScreen.h
│ │ ├── Profiler.h
│ │ ├── Quality.cpp
│ │ ├── Quality.h
│ │ ├── QualityCodec.h
│ │ ├── QualityTable.cpp
│ │ ├── QualityTable.h
│ │ ├── QualityVector.cpp
│ │ ├── QualityVector.h
│ │ ├── Read2Contig.h
│ │ ├── ReadInfoTable.cpp
│ │ ├── ReadInfoTable.h
│ │ ├── ReadTable.cpp
│ │ ├── ReadTable.h
│ │ ├── ReadTableNew.cpp
│ │ ├── ReadTableNew.h
│ │ ├── ReadTableS.h
│ │ ├── SGAStats.cpp
│ │ ├── SGAStats.h
│ │ ├── SeqCoord.cpp
│ │ ├── SeqCoord.h
│ │ ├── SeqReader.cpp
│ │ ├── SeqReader.h
│ │ ├── SimpleAllocator.h
│ │ ├── SimplePool.h
│ │ ├── StdAlnTools.cpp
│ │ ├── StdAlnTools.h
│ │ ├── Timer.h
│ │ ├── Util.cpp
│ │ ├── Util.h
│ │ ├── VCFUtil.cpp
│ │ ├── VCFUtil.h
│ │ ├── VariantIndex.cpp
│ │ ├── VariantIndex.h
│ │ ├── Verbosity.h
│ │ ├── bamreader.h
│ │ ├── bucketSort.cpp
│ │ ├── bucketSort.h
│ │ ├── gzstream.C
│ │ ├── gzstream.h
│ │ ├── mkqs.h
│ │ ├── old.AlignedContig.h
│ │ ├── stdaln.c
│ │ └── stdaln.h
│ ├── svaba/
│ │ ├── AlignedContig.cpp
│ │ ├── AlignedContig.h
│ │ ├── AlignmentFragment.cpp
│ │ ├── AlignmentFragment.h
│ │ ├── BamStats.cpp
│ │ ├── BamStats.h
│ │ ├── BreakPoint.cpp
│ │ ├── BreakPoint.h
│ │ ├── ContigAlignmentScore.cpp
│ │ ├── ContigAlignmentScore.h
│ │ ├── DBSnpFilter.cpp
│ │ ├── DBSnpFilter.h
│ │ ├── DiscordantCluster.cpp
│ │ ├── DiscordantCluster.h
│ │ ├── DiscordantRealigner.cpp
│ │ ├── DiscordantRealigner.h
│ │ ├── Histogram.cpp
│ │ ├── Histogram.h
│ │ ├── KmerFilter.cpp
│ │ ├── KmerFilter.h
│ │ ├── LearnBamParams.cpp
│ │ ├── LearnBamParams.h
│ │ ├── ReadToContigAligner.h
│ │ ├── STCoverage.cpp
│ │ ├── STCoverage.h
│ │ ├── SvabaASQG.cpp
│ │ ├── SvabaASQG.h
│ │ ├── SvabaAssemble.cpp
│ │ ├── SvabaAssemble.h
│ │ ├── SvabaAssemblerConfig.h
│ │ ├── SvabaAssemblerEngine.cpp
│ │ ├── SvabaAssemblerEngine.h
│ │ ├── SvabaBamWalker.cpp
│ │ ├── SvabaBamWalker.h
│ │ ├── SvabaDebug.h
│ │ ├── SvabaFileLoader.cpp
│ │ ├── SvabaFileLoader.h
│ │ ├── SvabaLogger.cpp
│ │ ├── SvabaLogger.h
│ │ ├── SvabaModels.cpp
│ │ ├── SvabaModels.h
│ │ ├── SvabaOptions.cpp
│ │ ├── SvabaOptions.h
│ │ ├── SvabaOutputWriter.cpp
│ │ ├── SvabaOutputWriter.h
│ │ ├── SvabaOverlapAlgorithm.cpp
│ │ ├── SvabaOverlapAlgorithm.h
│ │ ├── SvabaPostprocess.cpp
│ │ ├── SvabaPostprocess.h
│ │ ├── SvabaRead.cpp
│ │ ├── SvabaRead.h
│ │ ├── SvabaRegionProcessor.cpp
│ │ ├── SvabaRegionProcessor.h
│ │ ├── SvabaSharedConfig.h
│ │ ├── SvabaThreadUnit.cpp
│ │ ├── SvabaThreadUnit.h
│ │ ├── SvabaUtils.cpp
│ │ ├── SvabaUtils.h
│ │ ├── refilter.cpp
│ │ ├── refilter.h
│ │ ├── run_svaba.cpp
│ │ ├── svaba.cpp
│ │ ├── test_svaba.cpp
│ │ ├── threadpool.h
│ │ ├── tovcf.cpp
│ │ ├── vcf.cpp
│ │ └── vcf.h
│ └── svabautils/
│ ├── AssemblyBamWalker.cpp
│ ├── AssemblyBamWalker.h
│ ├── BamSplitter.cpp
│ ├── BamSplitter.h
│ ├── Fractions.cpp
│ ├── Fractions.h
│ ├── Makefile.am
│ ├── Makefile.in
│ ├── PowerLawSim.cpp
│ ├── PowerLawSim.h
│ ├── ReadSim.cpp
│ ├── ReadSim.h
│ ├── SeqFrag.cpp
│ ├── SeqFrag.h
│ ├── SimGenome.cpp
│ ├── SimGenome.h
│ ├── SimTrainerWalker.cpp
│ ├── SimTrainerWalker.h
│ ├── assembly2vcf.cpp
│ ├── assembly2vcf.h
│ ├── benchmark.cpp
│ ├── benchmark.h
│ ├── configure
│ ├── configure.ac
│ ├── snowmanutils.cpp
│ ├── snowtools.cpp
│ ├── splitcounter.cpp
│ └── splitcounter.h
├── svaba_jemalloc
└── tracks/
├── README.md
├── genome.hg38.sorted.bed
├── hg38.bed
├── hg38.blacklist.sorted.bed
├── hg38.combined_blacklist.bed
├── hg38.high_runtime.bed
├── hg38.manual.blacklist.bed
├── hg38.nonstd_chr.blacklist.bed
├── hg38.rmsk.simple_repeat.bed
├── hg38_arms_excl_centromeres.bed
├── lowmap30perc.bed
├── lowmap50perc.bed
├── lowmap70perc.bed
└── region_generator.R
SYMBOL INDEX (1041 symbols across 253 files)
FILE: docs/app.js
function resetFilters (line 225) | function resetFilters() {
function syncFilterState (line 256) | function syncFilterState() {
function loadBundledExample (line 271) | async function loadBundledExample() {
function loadFromBlob (line 288) | async function loadFromBlob(blob, fileName) {
function readBlobText (line 301) | async function readBlobText(blob, fileName) {
function readResponseText (line 312) | async function readResponseText(response, fileName) {
function ingestText (line 323) | function ingestText(text, fileName) {
function makeRow (line 359) | function makeRow(index, cells, sampleHeaders) {
function deriveEventType (line 400) | function deriveEventType(row) {
function parseSampleField (line 432) | function parseSampleField(header, raw, evidence) {
function rebuildChipGroups (line 453) | function rebuildChipGroups() {
function renderChipList (line 460) | function renderChipList(container, counts, filterKey) {
function render (line 491) | function render() {
function applyFilters (line 501) | function applyFilters(rows) {
function sortRows (line 546) | function sortRows(rows) {
function sortableValue (line 569) | function sortableValue(row, key) {
function updateSummary (line 579) | function updateSummary() {
function renderTable (line 602) | function renderTable() {
function renderDetail (line 683) | function renderDetail() {
function detailCard (line 766) | function detailCard(label, value) {
function detailPair (line 770) | function detailPair(label, value) {
function reconcileSelection (line 774) | function reconcileSelection() {
function findSelectedRow (line 781) | function findSelectedRow() {
function exportFilteredRows (line 785) | function exportFilteredRows() {
function countBy (line 800) | function countBy(rows, key) {
function topCountValue (line 809) | function topCountValue(rows, key) {
function toggleSetValue (line 822) | function toggleSetValue(set, value) {
function setStatus (line 830) | function setStatus(message) {
function passesNumericRange (line 834) | function passesNumericRange(value, min, max) {
function passesNumericMinimum (line 850) | function passesNumericMinimum(value, min) {
function parseNullableNumber (line 860) | function parseNullableNumber(raw) {
function parseMaybeNumber (line 868) | function parseMaybeNumber(raw) {
function formatNumber (line 876) | function formatNumber(value) {
function presentOrX (line 888) | function presentOrX(value) {
function normalizeSortNumber (line 892) | function normalizeSortNumber(value) {
function zeroIfNull (line 902) | function zeroIfNull(value) {
function stripLeadingHash (line 906) | function stripLeadingHash(value) {
function escapeHtml (line 910) | function escapeHtml(value) {
FILE: src/SGA/Algorithm/ClusterProcess.cpp
function ClusterResult (line 30) | ClusterResult ClusterProcess::process(const SequenceWorkItem& item)
function ClusterResult (line 106) | ClusterResult ClusterProcess::process(const ClusterVector& inSequences)
FILE: src/SGA/Algorithm/ClusterProcess.h
type ClusterParameters (line 21) | struct ClusterParameters
type ClusterResult (line 32) | struct ClusterResult
function class (line 38) | class ClusterProcess
function class (line 56) | class ClusterPostProcess
FILE: src/SGA/Algorithm/ConnectProcess.cpp
function ConnectResult (line 33) | ConnectResult ConnectProcess::process(const SequenceWorkItemPair& workIt...
FILE: src/SGA/Algorithm/ConnectProcess.h
function class (line 21) | class ConnectResult
function class (line 28) | class ConnectProcess
function class (line 47) | class ConnectPostProcess
FILE: src/SGA/Algorithm/DPAlignment.cpp
function DPScoringScheme (line 45) | DPScoringScheme DPAlignment::getScoringScheme() const
function DPOperation (line 147) | DPOperation DPAlignment::getPathOperationToCell(const std::string& s1, c...
function DPPath (line 187) | DPPath DPAlignment::calculatePath(const std::string& s1, const std::stri...
FILE: src/SGA/Algorithm/DPAlignment.h
function min3 (line 14) | inline int min3(int a, int b, int c)
function max3 (line 19) | inline int max3(int a, int b, int c)
function class (line 26) | class EditDistanceScoring
function class (line 35) | class SimilarityScoring
type DPOperation (line 44) | enum DPOperation
type DPPathNode (line 54) | struct DPPathNode
function compare3 (line 133) | inline int compare3(int a, int b, int c) const
type std (line 151) | typedef std::vector<int> IntVector;
type std (line 152) | typedef std::vector<IntVector> IntMatrix;
FILE: src/SGA/Algorithm/ErrorCorrectProcess.cpp
function ErrorCorrectResult (line 34) | ErrorCorrectResult ErrorCorrectProcess::process(const SequenceWorkItem& ...
function ErrorCorrectResult (line 42) | ErrorCorrectResult ErrorCorrectProcess::correct(const SequenceWorkItem& ...
function ErrorCorrectResult (line 79) | ErrorCorrectResult ErrorCorrectProcess::overlapCorrection(const Sequence...
function ErrorCorrectResult (line 160) | ErrorCorrectResult ErrorCorrectProcess::overlapCorrectionNew(const Seque...
function ErrorCorrectResult (line 209) | ErrorCorrectResult ErrorCorrectProcess::kmerCorrection(const SequenceWor...
function ErrorCorrectResult (line 412) | ErrorCorrectResult ErrorCorrectProcess::threadingCorrection(const Sequen...
FILE: src/SGA/Algorithm/ErrorCorrectProcess.h
type ErrorCorrectAlgorithm (line 23) | enum ErrorCorrectAlgorithm
type ECFlag (line 31) | enum ECFlag
type ErrorCorrectParameters (line 40) | struct ErrorCorrectParameters
function class (line 63) | class ErrorCorrectResult
function class (line 79) | class ErrorCorrectProcess
function class (line 102) | class ErrorCorrectPostProcess
FILE: src/SGA/Algorithm/ExtensionDP.cpp
function max3 (line 26) | inline int max3(int a, int b, int c)
function min3 (line 32) | inline int min3(int a, int b, int c)
function BandedDPColumn (line 92) | const BandedDPColumn* BandedDPColumn::getPreviousColumn() const
function BandedDPColumn (line 192) | BandedDPColumn* ExtensionDP::createNewColumn(char b, const std::string& ...
function ExtensionDPAlignment (line 376) | ExtensionDPAlignment ExtensionDP::findTrimmedAlignment(const BandedDPCol...
function ExtensionDPAlignment (line 447) | ExtensionDPAlignment ExtensionDP::findGlocalAlignment(const BandedDPColu...
FILE: src/SGA/Algorithm/ExtensionDP.h
type std (line 30) | typedef std::vector<DPCell> CellVector;
type ExtensionDPAlignment (line 34) | struct ExtensionDPAlignment
function getBestRowIndex (line 68) | int getBestRowIndex() const;
FILE: src/SGA/Algorithm/FMMergeProcess.cpp
function FMMergeResult (line 30) | FMMergeResult FMMergeProcess::process(const SequenceWorkItem& item)
FILE: src/SGA/Algorithm/FMMergeProcess.h
type FMMergeResult (line 19) | struct FMMergeResult
type FMMergeCandidate (line 33) | struct FMMergeCandidate
type std (line 39) | typedef std::queue<FMMergeCandidate> FMMergeQueue;
function class (line 42) | class FMMergeProcess
function class (line 74) | class FMMergePostProcess
FILE: src/SGA/Algorithm/GapFillProcess.cpp
function GapFillResult (line 54) | GapFillResult GapFillProcess::processScaffold(const std::string& scaffol...
function GapFillReturnCode (line 143) | GapFillReturnCode GapFillProcess::processGap(size_t k, int estimatedSize...
function AnchorSequence (line 190) | AnchorSequence GapFillProcess::findAnchor(size_t k, const std::string& s...
function GapFillReturnCode (line 228) | GapFillReturnCode GapFillProcess::selectGapSequence(int estimatedSize, c...
FILE: src/SGA/Algorithm/GapFillProcess.h
type GapFillParameters (line 28) | struct GapFillParameters
type GapFillResult (line 46) | struct GapFillResult
type GapFillReturnCode (line 51) | enum GapFillReturnCode
type GapFillStats (line 66) | struct GapFillStats
function class (line 81) | class GapFillProcess
function class (line 123) | class GapFillPostProcess
FILE: src/SGA/Algorithm/HaplotypeBuilder.cpp
function HaplotypeBuilderReturnCode (line 65) | HaplotypeBuilderReturnCode HaplotypeBuilder::run()
function HaplotypeBuilderReturnCode (line 136) | HaplotypeBuilderReturnCode HaplotypeBuilder::parseWalks(HaplotypeBuilder...
FILE: src/SGA/Algorithm/HaplotypeBuilder.h
type AnchorSequence (line 20) | struct AnchorSequence
type std (line 32) | typedef std::vector<AnchorSequence> AnchorVector;
type HaplotypeBuilderResult (line 35) | struct HaplotypeBuilderResult
type HaplotypeBuilderReturnCode (line 41) | enum HaplotypeBuilderReturnCode
function class (line 52) | class HaplotypeBuilder
FILE: src/SGA/Algorithm/KmerOverlaps.cpp
function MultipleAlignment (line 18) | MultipleAlignment KmerOverlaps::buildMultipleAlignment(const std::string...
type KmerMatch (line 37) | struct KmerMatch
type KmerMatchKey (line 58) | struct KmerMatchKey
function SequenceOverlapPairVector (line 67) | SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::strin...
type SeedEdit (line 215) | struct SeedEdit
method SeedEdit (line 217) | SeedEdit(int i, char b) : index(i), base(b) {}
type ApproxSeed (line 224) | struct ApproxSeed
function _approximateSeededMatch (line 240) | void _approximateSeededMatch(const std::string& in_query,
function SequenceOverlapPairVector (line 454) | SequenceOverlapPairVector KmerOverlaps::approximateMatch(const std::stri...
FILE: src/SGA/Algorithm/KmerOverlaps.h
type SequenceOverlapPair (line 18) | struct SequenceOverlapPair
type std (line 27) | typedef std::vector<SequenceOverlapPair> SequenceOverlapPairVector;
function namespace (line 29) | namespace KmerOverlaps
FILE: src/SGA/Algorithm/LRAlignment.cpp
type LRAlignment (line 18) | namespace LRAlignment
function bwaswAlignment (line 58) | void bwaswAlignment(const std::string& query, const BWT* pTargetBWT, c...
function updateStack (line 272) | int updateStack(LRStack* pStack,
function saveBestPositionHits (line 354) | void saveBestPositionHits(const SuffixArray* pQuerySA, LRStackEntry* u...
function saveAllHits (line 402) | void saveAllHits(const SuffixArray* pQuerySA, const SampledSuffixArray...
function saveTerminalHits (line 437) | void saveTerminalHits(const SuffixArray* pQuerySA, const SampledSuffix...
function initializeDAWGHash (line 476) | void initializeDAWGHash(BWT* pQueryBWT, LRHash& hashTable)
function mergeStackEntries (line 515) | void mergeStackEntries(LRStackEntry* u, LRStackEntry* v)
function removeDuplicateCells (line 533) | void removeDuplicateCells(LRStackEntry* u, LRHash& hash)
function fillCells (line 589) | int fillCells(const LRParams& params, int match_score, LRCell* c[4])
function resolveDuplicateHitsByID (line 625) | int resolveDuplicateHitsByID(const BWT* pTargetBWT, const SampledSuffi...
function resolveDuplicateHits (line 688) | int resolveDuplicateHits(const BWT* pTargetBWT, const SampledSuffixArr...
function cutTail (line 838) | void cutTail(LRStackEntry* u, const LRParams& params)
function cutTailByScorePercent (line 861) | void cutTailByScorePercent(LRStackEntry* u, const LRParams& params)
function cutTailByZBest (line 889) | void cutTailByZBest(LRStackEntry* u, const LRParams& params)
function cutTailByStratifiedZBest (line 940) | void cutTailByStratifiedZBest(LRStackEntry* u, const LRParams& params)
function MultiAlignment (line 1026) | MultiAlignment convertHitsToMultiAlignment(const std::string& query,
function extendHitFullLength (line 1104) | void extendHitFullLength(LRHit& hit, uint8_t* pQueryPacked, uint8_t* p...
FILE: src/SGA/Algorithm/LRAlignment.h
function namespace (line 22) | namespace LRAlignment
type LRHit (line 64) | struct LRHit
FILE: src/SGA/Algorithm/OverlapAlgorithm.cpp
function OverlapResult (line 19) | OverlapResult OverlapAlgorithm::overlapRead(const SeqRecord& read, int m...
function OverlapResult (line 34) | OverlapResult OverlapAlgorithm::overlapReadInexact(const SeqRecord& read...
function OverlapResult (line 138) | OverlapResult OverlapAlgorithm::alignReadDuplicate(const SeqRecord& read...
function OverlapResult (line 153) | OverlapResult OverlapAlgorithm::overlapReadExact(const SeqRecord& read, ...
FILE: src/SGA/Algorithm/OverlapAlgorithm.h
type OverlapMode (line 21) | enum OverlapMode
type OverlapResult (line 27) | struct OverlapResult
function class (line 34) | class OverlapAlgorithm
function setExactModeIrreducible (line 75) | void setExactModeIrreducible(bool b) { m_exactModeIrreducible = b; }
function BWT (line 78) | const BWT* getBWT() const { return m_pBWT; }
function BWT (line 79) | const BWT* getRBWT() const { return m_pRevBWT; }
FILE: src/SGA/Algorithm/OverlapBlock.cpp
function BWT (line 35) | const BWT* OverlapBlock::getExtensionBWT(const BWT* pBWT, const BWT* pRe...
function AlphaCount64 (line 44) | AlphaCount64 OverlapBlock::getCanonicalExtCount(const BWT* pBWT, const B...
function BWTInterval (line 62) | BWTInterval OverlapBlock::getCanonicalInterval() const
function EdgeDir (line 114) | EdgeDir OverlapBlock::getEdgeDir() const
function Overlap (line 123) | Overlap OverlapBlock::toOverlap(const std::string queryID, const std::st...
function printBlockList (line 161) | void printBlockList(const OverlapBlockList* pList)
function removeSubMaximalBlocks (line 170) | void removeSubMaximalBlocks(OverlapBlockList* pList, const BWT* pBWT, co...
type TracingInterval (line 218) | struct TracingInterval
function OverlapBlockList (line 232) | OverlapBlockList resolveOverlap(const OverlapBlock& A, const OverlapBloc...
function partitionBlockList (line 434) | void partitionBlockList(int readLen, OverlapBlockList* pCompleteList,
function removeContainmentBlocks (line 449) | void removeContainmentBlocks(int readLen, OverlapBlockList* pList)
function MultiOverlap (line 462) | MultiOverlap blockListToMultiOverlap(const SeqRecord& record, OverlapBlo...
function makeIdxString (line 504) | std::string makeIdxString(int64_t idx)
FILE: src/SGA/Algorithm/OverlapBlock.h
function isQueryRev (line 24) | struct AlignFlags
function write (line 54) | void write(std::ostream& out)
function read (line 59) | void read(std::istream& in)
function setTargetRev (line 67) | void setTargetRev(bool b) { data.set(TARGETREV_BIT, b); }
function setQueryComp (line 68) | void setQueryComp(bool b) { data.set(QUERYCOMP_BIT, b); }
function sortSizeDescending (line 78) | struct OverlapBlock
function sortIntervalLeft (line 148) | static bool sortIntervalLeft(const OverlapBlock& ob1, const OverlapBlock...
type std (line 190) | typedef std::list<OverlapBlock> OverlapBlockList;
type OverlapBlockList (line 191) | typedef OverlapBlockList::iterator OBLIter;
FILE: src/SGA/Algorithm/OverlapTools.h
function namespace (line 17) | namespace OverlapTools
FILE: src/SGA/Algorithm/QCProcess.cpp
type KmerWindow (line 14) | struct KmerWindow
method getCount (line 16) | int64_t getCount(bool bothStrand) const
function QCResult (line 49) | QCResult QCProcess::process(const SequenceWorkItem& workItem)
function DuplicateCheckResult (line 202) | DuplicateCheckResult QCProcess::performDuplicateCheck(const SequenceWork...
FILE: src/SGA/Algorithm/QCProcess.h
function setDefaults (line 20) | struct QCParameters
function class (line 105) | class QCResult
type DuplicateCheckResult (line 116) | enum DuplicateCheckResult
function class (line 124) | class QCProcess
function class (line 153) | class QCPostProcess
FILE: src/SGA/Algorithm/ReadCluster.cpp
function ClusterNode (line 30) | ClusterNode ReadCluster::addSeed(const std::string& sequence, bool bChec...
function ClusterNodeVector (line 195) | ClusterNodeVector ReadCluster::getOutput() const
FILE: src/SGA/Algorithm/ReadCluster.h
function equal (line 17) | struct ClusterNode
type std (line 34) | typedef std::queue<ClusterNode> ClusterNodeQueue;
type std (line 35) | typedef std::vector<ClusterNode> ClusterNodeVector;
type std (line 36) | typedef std::set<int64_t> ClusterIntervalSet;
function class (line 39) | class ReadCluster
FILE: src/SGA/Algorithm/SearchHistory.cpp
function SearchHistoryLink (line 38) | SearchHistoryLink& SearchHistoryLink::operator=(SearchHistoryLink const&...
function SearchHistoryLink (line 72) | SearchHistoryLink SearchHistoryNode::createChild(int var_pos, char var_b...
function SearchHistoryLink (line 78) | SearchHistoryLink SearchHistoryNode::createRoot()
function SearchHistoryVector (line 84) | SearchHistoryVector SearchHistoryNode::getHistoryVector()
FILE: src/SGA/Algorithm/SearchHistory.h
type SearchHistoryItem (line 15) | struct SearchHistoryItem
type std (line 32) | typedef std::vector<SearchHistoryItem> HistoryItemVector;
function class (line 35) | class SearchHistoryVector
function class (line 71) | class SearchHistoryLink
function class (line 95) | class SearchHistoryNode
FILE: src/SGA/Algorithm/SearchSeed.h
type ExtendDirection (line 19) | enum ExtendDirection
function isSeed (line 26) | struct SearchSeed
FILE: src/SGA/Algorithm/StatsProcess.cpp
function StatsResult (line 43) | StatsResult StatsProcess::process(const SequenceWorkItem& workItem)
FILE: src/SGA/Algorithm/StatsProcess.h
function class (line 19) | class StatsResult
function class (line 31) | class StatsProcess
function class (line 52) | class StatsPostProcess
FILE: src/SGA/Algorithm/StringGraphGenerator.cpp
function SGWalkVector (line 105) | SGWalkVector StringGraphGenerator::searchWalks()
function Vertex (line 178) | Vertex* StringGraphGenerator::addTerminalVertex(const SeqRecord& record)
FILE: src/SGA/Algorithm/StringGraphGenerator.h
type GraphFrontier (line 22) | struct GraphFrontier
type std (line 29) | typedef std::queue<GraphFrontier> FrontierQueue;
function class (line 31) | class StringGraphGenerator
FILE: src/SGA/Algorithm/StringThreader.cpp
function StringThreaderNode (line 66) | StringThreaderNode* StringThreaderNode::createChild(const std::string& l...
function StringThreaderResult (line 133) | StringThreaderResult StringThreaderNode::getAlignment() const
function StringVector (line 304) | StringVector StringThreader::getDeBruijnExtensions(StringThreaderNode* p...
FILE: src/SGA/Algorithm/StringThreader.h
type std (line 23) | typedef std::list<StringThreaderNode*> STNodePtrList;
type StringThreaderResult (line 26) | struct StringThreaderResult
type std (line 31) | typedef std::vector<StringThreaderResult> StringThreaderResultVector;
function getGlobalErrorRate (line 59) | double getGlobalErrorRate() const;
FILE: src/SGA/Algorithm/VariationBuilderCommon.h
type BubbleResultCode (line 21) | enum BubbleResultCode
type BubbleResult (line 35) | struct BubbleResult
type BuilderExtensionNode (line 46) | struct BuilderExtensionNode
type std (line 55) | typedef std::queue<BuilderExtensionNode> BuilderExtensionQueue;
type std (line 56) | typedef std::map<std::string, int> StrIntMap;
function namespace (line 59) | namespace VariationBuilderCommon
FILE: src/SGA/Bigraph/Bigraph.cpp
function Vertex (line 124) | Vertex* Bigraph::getVertex(VertexID id) const
function VertexIDVec (line 443) | VertexIDVec Bigraph::getNonBranchingVertices() const
function PathVector (line 464) | PathVector Bigraph::getLinearComponents()
function Path (line 485) | Path Bigraph::constructLinearPath(VertexID id)
function Path (line 524) | Path Bigraph::reversePath(const Path& path)
function Vertex (line 533) | Vertex* Bigraph::getFirstVertex() const
function VertexPtrVec (line 542) | VertexPtrVec Bigraph::getAllVertices() const
FILE: src/SGA/Bigraph/Bigraph.h
type std (line 29) | typedef std::unordered_map<VertexID, Vertex*> VertexPtrMap;
type VertexPtrMap (line 32) | typedef VertexPtrMap::iterator VertexPtrMapIter;
type VertexPtrMap (line 33) | typedef VertexPtrMap::const_iterator VertexPtrMapConstIter;
type EdgePtrVec (line 38) | typedef EdgePtrVec Path;
type std (line 39) | typedef std::vector<Path> PathVector;
type std (line 40) | typedef std::vector<VertexID> VertexIDVec;
type std (line 41) | typedef std::vector<Vertex*> VertexPtrVec;
function class (line 43) | class Bigraph
FILE: src/SGA/Bigraph/Edge.cpp
function EdgeDesc (line 13) | EdgeDesc Edge::getTwinDesc() const
function Match (line 61) | Match Edge::getMatch() const
function Overlap (line 69) | Overlap Edge::getOverlap() const
FILE: src/SGA/Bigraph/Edge.h
function setDir (line 26) | struct EdgeData
function setComp (line 40) | void setComp(EdgeComp comp)
function flipDir (line 48) | void flipDir() { m_data.flip(DIR_BIT); }
function flipComp (line 49) | void flipComp() { m_data.flip(COMP_BIT); }
function class (line 68) | class Edge
FILE: src/SGA/Bigraph/EdgeDesc.h
type EdgeDesc (line 16) | struct EdgeDesc
FILE: src/SGA/Bigraph/GraphCommon.h
type EdgeDir (line 18) | enum EdgeDir
type EdgeComp (line 28) | enum EdgeComp
type DotFlags (line 38) | enum DotFlags
type GraphColor (line 47) | typedef uint8_t GraphColor;
type std (line 56) | typedef std::string VertexID;
type std (line 57) | typedef std::vector<VertexID> VertexIDVec;
function EdgeDir (line 73) | inline EdgeDir correctDir(EdgeDir dir, EdgeComp comp)
FILE: src/SGA/Bigraph/Vertex.cpp
function MultiOverlap (line 158) | MultiOverlap Vertex::getMultiOverlap() const
function EdgePtrVecIter (line 272) | EdgePtrVecIter Vertex::findEdge(const EdgeDesc& ed)
function EdgePtrVecConstIter (line 283) | EdgePtrVecConstIter Vertex::findEdge(const EdgeDesc& ed) const
function Edge (line 317) | Edge* Vertex::getEdge(const EdgeDesc& ed)
function EdgePtrVec (line 325) | EdgePtrVec Vertex::findEdgesTo(VertexID id)
function Edge (line 340) | Edge* Vertex::getLongestOverlapEdge(EdgeDir dir) const
function EdgePtrVec (line 365) | EdgePtrVec Vertex::getEdges(EdgeDir dir) const
function EdgePtrVec (line 379) | EdgePtrVec Vertex::getEdges() const
FILE: src/SGA/Bigraph/Vertex.h
type EdgeIDComp (line 32) | struct EdgeIDComp
type EdgeLenComp (line 38) | struct EdgeLenComp
type std (line 45) | typedef std::map<EdgeDesc, Edge*> EdgePtrMap;
type std (line 46) | typedef std::vector<Edge*> EdgePtrVec;
type std (line 47) | typedef std::set<EdgeDesc> EdgeDescSet;
type std (line 48) | typedef std::list<Edge*> EdgePtrList;
type EdgePtrMap (line 49) | typedef EdgePtrMap::iterator EdgePtrMapIter;
type EdgePtrMap (line 50) | typedef EdgePtrMap::const_iterator EdgePtrMapConstIter;
type EdgePtrVec (line 51) | typedef EdgePtrVec::iterator EdgePtrVecIter;
type EdgePtrVec (line 52) | typedef EdgePtrVec::const_iterator EdgePtrVecConstIter;
type EdgePtrList (line 53) | typedef EdgePtrList::iterator EdgePtrListIter;
type EdgePtrList (line 54) | typedef EdgePtrList::const_iterator EdgePtrListConstIter;
function class (line 56) | class Vertex
FILE: src/SGA/SGA/OverlapCommon.h
function namespace (line 19) | namespace OverlapCommon
FILE: src/SGA/SGA/index.cpp
type opt (line 62) | namespace opt
type option (line 82) | struct option
function indexMain (line 98) | int indexMain(int argc, char** argv)
function indexInMemoryBCR (line 119) | void indexInMemoryBCR()
function indexInMemoryRopebwt (line 149) | void indexInMemoryRopebwt()
function indexInMemorySAIS (line 190) | void indexInMemorySAIS()
function indexOnDisk (line 219) | void indexOnDisk()
function buildIndexForTable (line 248) | void buildIndexForTable(std::string prefix, const ReadTable* pRT, bool i...
function parseIndexOptions (line 272) | void parseIndexOptions(int argc, char** argv)
FILE: src/SGA/SGA/overlap.cpp
type OutputType (line 28) | enum OutputType
type opt (line 84) | namespace opt
type option (line 107) | struct option
function overlapMain (line 128) | int overlapMain(int argc, char** argv)
function computeHitsSerial (line 213) | size_t computeHitsSerial(const std::string& prefix, const std::string& r...
function computeHitsParallel (line 236) | size_t computeHitsParallel(int numThreads, const std::string& prefix, co...
function convertHitsToASQG (line 267) | void convertHitsToASQG(const std::string& indexPrefix, const StringVecto...
function parseOverlapOptions (line 327) | void parseOverlapOptions(int argc, char** argv)
FILE: src/SGA/SQG/ASQG.cpp
type ASQG (line 13) | namespace ASQG
function RecordType (line 343) | RecordType getRecordType(const std::string& record)
function writeFields (line 376) | void writeFields(std::ostream& out, const StringVector& fields)
FILE: src/SGA/SQG/ASQG.h
function namespace (line 16) | namespace ASQG
type VertexRecord (line 63) | struct VertexRecord
type EdgeRecord (line 88) | struct EdgeRecord
FILE: src/SGA/SQG/SQG.cpp
type SQG (line 12) | namespace SQG
function StringVector (line 16) | StringVector tokenizeRecord(const std::string& record)
function StringVector (line 22) | StringVector tokenizeTagValue(const std::string& tagValue)
FILE: src/SGA/SQG/SQG.h
function namespace (line 17) | namespace SQG
FILE: src/SGA/StringGraph/CompleteOverlapSet.h
type ExploreElement (line 21) | struct ExploreElement
type CompareExploreElemOverlapLength (line 30) | struct CompareExploreElemOverlapLength
type std (line 39) | typedef std::priority_queue<ExploreElement,
type std (line 42) | typedef std::list<EdgeDesc> EdgeDescList;
type std (line 43) | typedef std::queue<ExploreElement> ExploreQueue;
function class (line 45) | class CompleteOverlapSet
FILE: src/SGA/StringGraph/GraphSearchTree.h
type std (line 27) | typedef std::vector<EDGE*> _EDGEPtrVector;
function GraphSearchNode (line 40) | GraphSearchNode* getParent() const { return m_pParent; }
function VERTEX (line 41) | VERTEX* getVertex() const { return m_pVertex; }
function EDGE (line 43) | EDGE* getEdgeFromParent() const { return m_pEdgeFromParent; }
type GraphSearchNode (line 62) | typedef GraphSearchNode<VERTEX,EDGE,DISTANCE> _SearchNode;
type typename (line 63) | typedef typename _SearchNode::GraphSearchNodePtrDeque _SearchNodePtrDeque;
type typename (line 64) | typedef typename std::set<_SearchNode*> _SearchNodePtrSet;
type std (line 65) | typedef std::vector<EDGE*> WALK;
type std (line 66) | typedef std::vector<WALK> WALKVector;
type std (line 68) | typedef std::vector<VERTEX*> VertexPtrVector;
type std (line 69) | typedef std::vector<VertexPtrVector> VertexPtrVectorVector;
FILE: src/SGA/StringGraph/RemovalAlgorithm.h
function namespace (line 21) | namespace RemovalAlgorithm
FILE: src/SGA/StringGraph/SGAlgorithms.cpp
function Edge (line 16) | Edge* SGAlgorithms::createEdgesFromOverlap(StringGraph* pGraph, const Ov...
function Overlap (line 181) | Overlap SGAlgorithms::inferTransitiveOverlap(const Overlap& ovrXY, const...
type EDOPairCompare (line 232) | struct EDOPairCompare
function EdgeDesc (line 329) | EdgeDesc SGAlgorithms::overlapToEdgeDesc(Vertex* pY, const Overlap& ovrXY)
function EdgeDesc (line 348) | EdgeDesc SGAlgorithms::getEdgeDescFromEdge(Edge* pEdge)
FILE: src/SGA/StringGraph/SGAlgorithms.h
function namespace (line 18) | namespace SGAlgorithms
FILE: src/SGA/StringGraph/SGSearch.h
type SGDistanceFunction (line 20) | struct SGDistanceFunction
type GraphSearchTree (line 29) | typedef GraphSearchTree<Vertex, Edge, SGDistanceFunction> SGSearchTree;
type SGWalkBuilder (line 32) | struct SGWalkBuilder
function namespace (line 53) | namespace SGSearch
FILE: src/SGA/StringGraph/SGUtil.cpp
function StringGraph (line 15) | StringGraph* SGUtil::loadASQG(const std::string& filename, const unsigne...
function StringGraph (line 147) | StringGraph* SGUtil::loadASQG(std::stringstream& pReader, const unsigned...
function StringGraph (line 279) | StringGraph* SGUtil::loadFASTA(const std::string& filename)
FILE: src/SGA/StringGraph/SGUtil.h
type Bigraph (line 17) | typedef Bigraph StringGraph;
function namespace (line 19) | namespace SGUtil
FILE: src/SGA/StringGraph/SGVisitors.h
function previsit (line 21) | struct SGFastaVisitor
function postvisit (line 30) | void postvisit(StringGraph* /*pGraph*/) {}
type SGVisitorContig (line 38) | struct SGVisitorContig
type SGTransitiveReductionVisitor (line 55) | struct SGTransitiveReductionVisitor
type SGIdenticalRemoveVisitor (line 67) | struct SGIdenticalRemoveVisitor
type SGContainRemoveVisitor (line 77) | struct SGContainRemoveVisitor
type SGSmallRepeatResolveVisitor (line 96) | struct SGSmallRepeatResolveVisitor
type SGOverlapRatioVisitor (line 108) | struct SGOverlapRatioVisitor
type SGTrimVisitor (line 120) | struct SGTrimVisitor
type SGDuplicateVisitor (line 133) | struct SGDuplicateVisitor
type SGSuperRepeatVisitor (line 145) | struct SGSuperRepeatVisitor
type SGSmoothingVisitor (line 156) | struct SGSmoothingVisitor
type SGGraphStatsVisitor (line 182) | struct SGGraphStatsVisitor
FILE: src/SGA/StringGraph/SGWalk.cpp
function SGWalk (line 69) | SGWalk& SGWalk::operator=(const SGWalk& other)
function Vertex (line 116) | Vertex* SGWalk::getStartVertex() const
function Edge (line 467) | Edge* SGWalk::getFirstEdge() const
function Edge (line 476) | Edge* SGWalk::getLastEdge() const
function Vertex (line 485) | Vertex* SGWalk::getLastVertex() const
function Edge (line 494) | Edge* SGWalk::getEdge(size_t idx) const
function Vertex (line 501) | Vertex* SGWalk::getVertex(size_t idx) const
function VertexPtrVec (line 511) | VertexPtrVec SGWalk::getVertices() const
FILE: src/SGA/StringGraph/SGWalk.h
type SGWalkType (line 18) | enum SGWalkType
type SGWalkVertexPlacement (line 26) | struct SGWalkVertexPlacement
type std (line 32) | typedef std::vector<SGWalkVertexPlacement> SGWalkVertexPlacementVector;
FILE: src/SGA/SuffixTools/BWT.h
type RLBWT (line 21) | typedef RLBWT BWT;
FILE: src/SGA/SuffixTools/BWTAlgorithms.cpp
function BWTInterval (line 14) | BWTInterval BWTAlgorithms::findInterval(const BWT* pBWT, const std::stri...
function BWTInterval (line 36) | BWTInterval BWTAlgorithms::findIntervalWithCache(const BWT* pBWT, const ...
function BWTInterval (line 67) | BWTInterval BWTAlgorithms::findInterval(const BWTIndexSet& indices, cons...
function BWTIntervalPair (line 78) | BWTIntervalPair BWTAlgorithms::findIntervalPair(const BWT* pBWT, const B...
function BWTIntervalPair (line 98) | BWTIntervalPair BWTAlgorithms::findIntervalPairWithCache(const BWT* pBWT,
function AlphaCount64 (line 181) | AlphaCount64 BWTAlgorithms::calculateExactExtensions(const unsigned int ...
function AlphaCount64 (line 219) | AlphaCount64 BWTAlgorithms::calculateDeBruijnExtensions(const std::strin...
function AlphaCount64 (line 287) | AlphaCount64 BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(const...
function _extractRankedPrefixes (line 456) | void _extractRankedPrefixes(const BWT* pBWT, BWTInterval interval, const...
function RankedPrefixVector (line 484) | RankedPrefixVector BWTAlgorithms::extractRankedPrefixes(const BWT* pBWT,...
FILE: src/SGA/SuffixTools/BWTAlgorithms.h
type RankedPrefix (line 27) | struct RankedPrefix
type std (line 32) | typedef std::vector<RankedPrefix> RankedPrefixVector;
function namespace (line 35) | namespace BWTAlgorithms
FILE: src/SGA/SuffixTools/BWTCABauerCoxRosone.h
function namespace (line 18) | namespace BWTCA
type std (line 45) | typedef std::vector<BCRElem> BCRVector;
FILE: src/SGA/SuffixTools/BWTCARopebwt.h
function namespace (line 15) | namespace BWTCA
FILE: src/SGA/SuffixTools/BWTDiskConstruction.cpp
type MergeItem (line 28) | struct MergeItem
function buildBWTDisk (line 78) | void buildBWTDisk(const BWTDiskParameters& parameters)
function MergeVector (line 165) | MergeVector computeInitialSAIS(const BWTDiskParameters& parameters)
function MergeVector (line 228) | MergeVector computeInitialBCR(const BWTDiskParameters& parameters)
function mergeIndependentIndices (line 281) | void mergeIndependentIndices(const std::string& readsFile1, const std::s...
function removeReadsFromIndices (line 314) | void removeReadsFromIndices(const std::string& allReadsPrefix, const std...
function mergeReadFiles (line 350) | void mergeReadFiles(const std::string& readsFile1, const std::string& re...
function computeGapArray (line 385) | void computeGapArray(SeqReader* pReader, size_t n, const BWT* pBWT, bool...
function merge (line 434) | int64_t merge(SeqReader* pReader,
function writeMergedIndex (line 475) | void writeMergedIndex(const BWT* pBWTInternal, const MergeItem& external...
function writeRemovalIndex (line 577) | void writeRemovalIndex(const BWT* pBWTInternal, const std::string& sai_i...
function makeTempName (line 696) | std::string makeTempName(const std::string& prefix, int id, const std::s...
function makeFilename (line 704) | std::string makeFilename(const std::string& prefix, const std::string& e...
FILE: src/SGA/SuffixTools/BWTDiskConstruction.h
type BWTDiskParameters (line 21) | struct BWTDiskParameters
FILE: src/SGA/SuffixTools/BWTIndexSet.h
type BWTIndexSet (line 24) | struct BWTIndexSet
FILE: src/SGA/SuffixTools/BWTInterval.h
type BWTInterval (line 19) | struct BWTInterval
function write (line 53) | void write(std::ostream& out)
function read (line 59) | void read(std::istream& in)
type BWTIntervalPair (line 71) | struct BWTIntervalPair
function sortFirstLower (line 84) | static bool sortFirstLower(const BWTIntervalPair& a, const BWTIntervalPa...
function sortSecondLower (line 90) | static bool sortSecondLower(const BWTIntervalPair& a, const BWTIntervalP...
FILE: src/SGA/SuffixTools/BWTIntervalCache.h
function class (line 16) | class BWTIntervalCache
FILE: src/SGA/SuffixTools/BWTReader.cpp
function IBWTReader (line 14) | IBWTReader* BWTReader::createReader(const std::string& filename)
FILE: src/SGA/SuffixTools/BWTReader.h
type BWIOStage (line 17) | enum BWIOStage
type BWFlag (line 27) | enum BWFlag
function class (line 38) | class IBWTReader
function namespace (line 51) | namespace BWTReader
FILE: src/SGA/SuffixTools/BWTReaderAscii.h
function class (line 21) | class BWTReaderAscii : public IBWTReader
FILE: src/SGA/SuffixTools/BWTReaderBinary.h
function class (line 22) | class BWTReaderBinary : public IBWTReader
FILE: src/SGA/SuffixTools/BWTTraverse.h
function getCurrChar (line 16) | struct TraverseElem
type std (line 59) | typedef std::stack<TraverseElem> TraverseStack;
type std (line 60) | typedef std::vector<bool> bool_vec;
function namespace (line 62) | namespace BWTTraverse
FILE: src/SGA/SuffixTools/BWTWriter.cpp
function IBWTWriter (line 35) | IBWTWriter* BWTWriter::createWriter(const std::string& filename)
FILE: src/SGA/SuffixTools/BWTWriter.h
function class (line 19) | class IBWTWriter
function namespace (line 33) | namespace BWTWriter
FILE: src/SGA/SuffixTools/BWTWriterAscii.h
function class (line 22) | class BWTWriterAscii : public IBWTWriter
FILE: src/SGA/SuffixTools/BWTWriterBinary.h
function class (line 23) | class BWTWriterBinary : public IBWTWriter
FILE: src/SGA/SuffixTools/FMMarkers.h
function getActualPosition (line 19) | struct LargeMarker
type std (line 62) | typedef std::vector<LargeMarker> LargeMarkerVector;
function getCountSum (line 68) | struct SmallMarker
type std (line 94) | typedef std::vector<SmallMarker> SmallMarkerVector;
FILE: src/SGA/SuffixTools/GapArray.cpp
function GapArray (line 48) | GapArray* createGapArray(int storage)
function updateGapArray (line 71) | void updateGapArray(const DNAString& w, const BWT* pBWTInternal, GapArra...
function analyzeGapArray (line 107) | void analyzeGapArray(GapArray* pGapArray)
FILE: src/SGA/SuffixTools/GapArray.h
function class (line 18) | class GapArray
function class (line 45) | class SimpleGapArray : public GapArray
FILE: src/SGA/SuffixTools/HitData.h
type Hit (line 17) | struct Hit
type std (line 61) | typedef std::vector<Hit> HitVector;
FILE: src/SGA/SuffixTools/InverseSuffixArray.h
type std (line 15) | typedef std::vector<uint64_t> RankVector;
type std (line 16) | typedef std::map<uint64_t, uint32_t> RankCountMap;
type std (line 17) | typedef std::map<uint64_t, RankVector> RankVectorMap;
function class (line 19) | class InverseSuffixArray
FILE: src/SGA/SuffixTools/Occurrence.h
function class (line 24) | class Occurrence
FILE: src/SGA/SuffixTools/PopulationIndex.cpp
function StringVector (line 79) | StringVector PopulationIndex::getSamples() const
function PopulationMember (line 123) | PopulationMember PopulationIndex::str2member(const std::string& line)
FILE: src/SGA/SuffixTools/PopulationIndex.h
type std (line 16) | typedef std::vector<std::string> StringVector;
type PopulationMember (line 18) | struct PopulationMember
function class (line 32) | class PopulationIndex
FILE: src/SGA/SuffixTools/QuickBWT.cpp
function createQuickBWT (line 13) | void createQuickBWT(const std::string& str, BWT*& pBWT, SuffixArray*& pSA)
FILE: src/SGA/SuffixTools/RLBWT.h
function class (line 28) | class RLBWT
FILE: src/SGA/SuffixTools/RLUnit.h
function addCount (line 22) | struct RLUnit
function subtractAlphaCount (line 76) | inline size_t subtractAlphaCount(AlphaCount64& ac, size_t max) const
function subtractCount (line 89) | inline size_t subtractCount(char b, size_t& base_count, size_t max) const
function incrementCount (line 101) | inline void incrementCount()
function decrementCount (line 110) | inline void decrementCount()
function setChar (line 127) | inline void setChar(char symbol)
FILE: src/SGA/SuffixTools/RankProcess.cpp
function RankResult (line 38) | RankResult RankProcess::process(const SequenceWorkItem& workItem)
FILE: src/SGA/SuffixTools/RankProcess.h
type std (line 18) | typedef std::vector<int64_t> RankVector;
type RankResult (line 19) | struct RankResult
FILE: src/SGA/SuffixTools/SACAInducedCopying.cpp
function saca_induced_copying (line 25) | void saca_induced_copying(SuffixArray* pSA, const ReadTable* pRT, int nu...
function induceSAl (line 141) | void induceSAl(const ReadTable* pRT, SuffixArray* pSA, char** p_array, i...
function induceSAs (line 161) | void induceSAs(const ReadTable* pRT, SuffixArray* pSA, char** p_array, i...
function countBuckets (line 183) | void countBuckets(const ReadTable* pRT, int64_t* counts, int K)
function getBuckets (line 200) | void getBuckets(int64_t* counts, int64_t* buckets, int K, bool end)
function setBit (line 214) | void setBit(char** p_array, size_t str_idx, size_t bit_idx, bool b)
function getBit (line 220) | bool getBit(char** p_array, size_t str_idx, size_t bit_idx)
function printType (line 225) | void printType(const ReadTable* pRT, char** p_array, size_t str_idx)
FILE: src/SGA/SuffixTools/SAReader.cpp
function SAElem (line 104) | SAElem SAReader::readElem()
FILE: src/SGA/SuffixTools/SAReader.h
type SAIOStage (line 18) | enum SAIOStage
function class (line 28) | class SAReader
FILE: src/SGA/SuffixTools/SAWriter.h
function class (line 19) | class SAWriter
FILE: src/SGA/SuffixTools/SBWT.h
function class (line 25) | class SBWT
function validate (line 71) | void validate() const;
FILE: src/SGA/SuffixTools/STCommon.cpp
function printMap (line 56) | void printMap(const std::map<K,V>& m)
function printVector (line 66) | void printVector(const std::vector<T>& v)
FILE: src/SGA/SuffixTools/STCommon.h
type SAElem (line 36) | struct SAElem
function setID (line 50) | inline void setID(uint64_t i)
function setPos (line 61) | void setPos(uint64_t i)
type std (line 104) | typedef std::vector<SAElem> SAElemVector;
type std (line 105) | typedef std::pair<SAElem, SAElem> SAElemPair;
type std (line 106) | typedef std::vector<SAElemPair> SAElemPairVec;
type std (line 107) | typedef std::set<uint64_t> NumericIDSet;
FILE: src/SGA/SuffixTools/STGlobals.h
type std (line 24) | typedef std::string BWStr;
type Label (line 25) | typedef uint32_t Label;
type AIdx (line 26) | typedef uint8_t AIdx;
type std (line 27) | typedef std::vector<int> IntVector;
type std (line 28) | typedef std::vector<std::string> StringVector;
FILE: src/SGA/SuffixTools/SampledSuffixArray.cpp
function SAElem (line 44) | SAElem SampledSuffixArray::calcSA(int64_t idx, const BWT* pBWT) const
FILE: src/SGA/SuffixTools/SampledSuffixArray.h
type SSA_INT_TYPE (line 19) | typedef uint32_t SSA_INT_TYPE;
type SSAFileType (line 21) | enum SSAFileType
function class (line 27) | class SampledSuffixArray
FILE: src/SGA/SuffixTools/SparseGapArray.h
function set (line 40) | inline void set(size_t i, IntType c)
function setCAS (line 47) | inline bool setCAS(size_t i, IntType oldV, IntType newV)
function IntType (line 55) | inline IntType get(size_t i) const
function getMax (line 60) | inline static size_t getMax()
function class (line 82) | class SparseBaseStorage4
function class (line 183) | class SparseBaseStorage1
function resize (line 261) | void resize(size_t n)
function attemptBaseIncrement (line 270) | bool attemptBaseIncrement(size_t i)
function incrementOverflowSerial (line 305) | void incrementOverflowSerial(size_t i)
function initOverflow (line 321) | void initOverflow(size_t i, OverflowStorage c)
function incrementOverflow (line 327) | void incrementOverflow(size_t i)
function get (line 335) | size_t get(size_t i) const
function getBaseMax (line 361) | size_t getBaseMax() const
type SparseGapArray (line 380) | typedef SparseGapArray<SparseBaseStorage1, size_t> SparseGapArray1;
type SparseGapArray (line 381) | typedef SparseGapArray<SparseBaseStorage4, size_t> SparseGapArray4;
type SparseGapArray (line 382) | typedef SparseGapArray<SparseBaseStorage<uint8_t>, size_t> SparseGapArray8;
type SparseGapArray (line 383) | typedef SparseGapArray<SparseBaseStorage<uint16_t>, size_t> SparseGapArr...
type SparseGapArray (line 384) | typedef SparseGapArray<SparseBaseStorage<uint32_t>, size_t> SparseGapArr...
FILE: src/SGA/SuffixTools/SuffixArray.h
function class (line 17) | class SuffixArray
FILE: src/SGA/SuffixTools/SuffixCompare.h
function class (line 16) | class SuffixCompareRadix
function class (line 95) | class SuffixCompareID
function class (line 113) | class SuffixCompareIndex
FILE: src/SGA/Util/Alphabet.h
type BaseCount (line 39) | typedef uint64_t BaseCount;
function namespace (line 41) | namespace DNA_ALPHABET
function namespace (line 76) | namespace BWT_ALPHABET
function namespace (line 111) | namespace IUPAC
function clear (line 140) | inline void clear()
function set (line 146) | inline void set(char b, Storage v)
function setByIdx (line 152) | inline void setByIdx(size_t i, Storage v)
function increment (line 159) | inline void increment(char b)
function add (line 169) | inline void add(char b, Storage v)
function subtract (line 179) | inline void subtract(char b, Storage v)
function Storage (line 189) | inline Storage get(char b) const
function Storage (line 195) | inline Storage getByIdx(const int i) const
function getBase (line 201) | static char getBase(size_t i)
function getMaxValue (line 207) | static size_t getMaxValue()
function complement (line 214) | inline void complement()
function getLessThan (line 230) | inline size_t getLessThan(char b) const
function getNumNonZero (line 240) | uint8_t getNumNonZero() const
function hasUniqueDNAChar (line 253) | inline bool hasUniqueDNAChar()
function hasDNAChar (line 273) | inline bool hasDNAChar()
function getMaxBase (line 285) | inline char getMaxBase() const
function Storage (line 311) | inline Storage getMaxCount() const
function getSum (line 337) | inline size_t getSum() const
function getUniqueDNAChar (line 361) | inline char getUniqueDNAChar()
type AlphaCountCompareDesc (line 450) | struct AlphaCountCompareDesc
type AlphaCount (line 463) | typedef AlphaCount<uint64_t> AlphaCount64;
type AlphaCount (line 464) | typedef AlphaCount<uint16_t> AlphaCount16;
type AlphaCount (line 465) | typedef AlphaCount<uint8_t> AlphaCount8;
function alphacount_add16 (line 468) | inline void alphacount_add16(AlphaCount64& lhs, const AlphaCount16& rhs)
function alphacount_subtract16 (line 477) | inline void alphacount_subtract16(AlphaCount64& lhs, const AlphaCount16&...
function alphacount_add (line 489) | inline void alphacount_add(AlphaCount64& lhs, const AlphaCount8& rhs)
function alphacount_subtract (line 498) | inline void alphacount_subtract(AlphaCount64& lhs, const AlphaCount8& rhs)
FILE: src/SGA/Util/BWT4Codec.h
function class (line 19) | class BWT4Codec
FILE: src/SGA/Util/BWTCodec.h
function class (line 19) | class BWTCodec
FILE: src/SGA/Util/BitChar.cpp
function printBinary (line 38) | void printBinary(std::ostream& out, const BitChar& bc)
FILE: src/SGA/Util/BitChar.h
type BitChar (line 14) | struct BitChar
FILE: src/SGA/Util/BitVector.h
function class (line 16) | class BitVector
FILE: src/SGA/Util/BloomFilter.h
function class (line 17) | class BloomFilter
FILE: src/SGA/Util/ClusterReader.h
type ClusterRecord (line 18) | struct ClusterRecord
type std (line 25) | typedef std::vector<ClusterRecord> ClusterVector;
function class (line 27) | class ClusterReader
FILE: src/SGA/Util/Contig.cpp
function SequenceVector (line 51) | SequenceVector Contig::getVariants() const
FILE: src/SGA/Util/Contig.h
type UniqueFlag (line 14) | enum UniqueFlag
function class (line 22) | class Contig
FILE: src/SGA/Util/CorrectionThresholds.cpp
function CorrectionThresholds (line 20) | CorrectionThresholds& CorrectionThresholds::Instance()
FILE: src/SGA/Util/CorrectionThresholds.h
function class (line 14) | class CorrectionThresholds
FILE: src/SGA/Util/DNACodec.h
function class (line 17) | class DNACodec
FILE: src/SGA/Util/DNADouble.h
function class (line 15) | class DNADouble
FILE: src/SGA/Util/DNAString.cpp
function DNAString (line 29) | DNAString& DNAString::operator=(const DNAString& dna)
function DNAString (line 40) | DNAString& DNAString::operator=(const std::string& str)
FILE: src/SGA/Util/DNAString.h
function class (line 17) | class DNAString
FILE: src/SGA/Util/EncodedString.h
type typename (line 31) | typedef typename Codec::UNIT_TYPE StorageUnit;
function resize (line 84) | void resize(size_t n)
function append (line 92) | void append(const std::string& str)
function append (line 102) | void append(const EncodedString& other)
function swap (line 112) | void swap(EncodedString& other)
function get (line 147) | inline char get(size_t idx) const
function set (line 154) | inline void set(size_t idx, char b)
function _copy (line 233) | void _copy(const EncodedString& other)
function _copyUnitData (line 243) | void _copyUnitData(StorageUnit* pData, size_t num_units)
function _append (line 251) | void _append(const char* pData, size_t n)
function _append (line 260) | void _append(const EncodedString& other)
function _alloc (line 270) | void _alloc(size_t n)
function _realloc (line 280) | void _realloc(size_t n)
function _dealloc (line 292) | void _dealloc()
type EncodedString (line 312) | typedef EncodedString<DNACodec> DNAEncodedString;
type EncodedString (line 313) | typedef EncodedString<BWT4Codec> BWTString;
type EncodedString (line 314) | typedef EncodedString<NoCodec> NoEncodingString;
type std (line 316) | typedef std::vector<DNAEncodedString> DNAEncodedStringVector;
FILE: src/SGA/Util/HashMap.h
type std (line 23) | typedef std::hash<std::string> StringHasher;
type std (line 29) | typedef std::tr1::hash<std::string> StringHasher;
type StringHasher (line 44) | struct StringHasher
FILE: src/SGA/Util/Interval.cpp
function Interval (line 11) | Interval Interval::intersect(const Interval& r1, const Interval& r2)
FILE: src/SGA/Util/Interval.h
function isIntersecting (line 15) | struct Interval
FILE: src/SGA/Util/IntervalTree.h
type vector (line 59) | typedef vector<interval> intervalVector;
type IntervalTTree (line 60) | typedef IntervalTTree<T,K> intervalTree;
function findOverlapping (line 172) | void findOverlapping(K start, K stop, intervalVector& overlapping) {
function findContained (line 192) | void findContained(K start, K stop, intervalVector& contained) {
FILE: src/SGA/Util/KmerDistribution.h
function class (line 17) | class KmerDistribution
FILE: src/SGA/Util/Match.cpp
function SeqCoord (line 87) | SeqCoord Match::translate(const SeqCoord& c) const
function SeqCoord (line 103) | SeqCoord Match::inverseTranslate(const SeqCoord& c) const
function Match (line 141) | Match Match::infer(const Match& match_xy, const Match& match_xz)
FILE: src/SGA/Util/Match.h
function getNumDiffs (line 16) | struct Match
function setNumDiffs (line 29) | void setNumDiffs(int n) { numDiff = n; }
function calculateTranslation (line 32) | int calculateTranslation() const;
type Overlap (line 83) | struct Overlap
FILE: src/SGA/Util/Metrics.h
type ErrorCount (line 15) | struct ErrorCount
type std (line 24) | typedef std::map<Key, ErrorCount> DataMap;
function incrementSample (line 30) | void incrementSample(const Key& key)
function incrementError (line 36) | void incrementError(const Key& key)
function write (line 42) | void write(std::ostream* pWriter, const std::string& leader, const std::...
FILE: src/SGA/Util/MultiAlignment.cpp
type CigarIter (line 17) | struct CigarIter
method updateAndEmit (line 23) | char updateAndEmit(char updateMode)
method getCigarSymbol (line 81) | char getCigarSymbol()
method getOutputBase (line 88) | char getOutputBase(char cigSym)
method sortPosition (line 96) | static bool sortPosition(const CigarIter& a, const CigarIter& b)
function MultiAlignment (line 465) | MultiAlignment MultiAlignmentTools::alignSequencesGlobal(const SeqItemVe...
function MultiAlignment (line 491) | MultiAlignment MultiAlignmentTools::alignSequencesLocal(const SeqItemVec...
FILE: src/SGA/Util/MultiAlignment.h
type MAlignData (line 17) | struct MAlignData
type std (line 33) | typedef std::vector<MAlignData> MAlignDataVector;
function class (line 35) | class MultiAlignment
function namespace (line 83) | namespace MultiAlignmentTools
FILE: src/SGA/Util/MultiOverlap.cpp
function Overlap (line 63) | Overlap MultiOverlap::getOverlap(size_t idx) const
function AlphaCount64 (line 444) | AlphaCount64 MultiOverlap::getAlphaCount(int idx) const
function Pileup (line 466) | Pileup MultiOverlap::getPileup(int idx) const
function Pileup (line 488) | Pileup MultiOverlap::getPileup(int idx, int numElems) const
function Pileup (line 509) | Pileup MultiOverlap::getSingletonPileup(int base_idx, int ovr_idx) const
function PileupVector (line 548) | PileupVector MultiOverlap::getPartitionedPileup(int idx, int num_parts) ...
FILE: src/SGA/Util/MultiOverlap.h
function class (line 17) | class MultiOverlap
FILE: src/SGA/Util/NoCodec.h
function class (line 14) | class NoCodec
FILE: src/SGA/Util/Pileup.cpp
function AlphaCount64 (line 30) | AlphaCount64 Pileup::getAlphaCount() const
function DNADouble (line 42) | DNADouble Pileup::calculateSimpleAlphaProb() const
function DNADouble (line 87) | DNADouble Pileup::calculateLikelihoodNoQuality(double p_error) const
FILE: src/SGA/Util/Pileup.h
function calculateSimpleConsensus (line 19) | struct PUElem
type std (line 56) | typedef std::vector<Pileup> PileupVector;
FILE: src/SGA/Util/PrimerScreen.h
function class (line 15) | class PrimerScreen
FILE: src/SGA/Util/Profiler.h
function class (line 24) | class TimeTracker
FILE: src/SGA/Util/Quality.cpp
function DoubleVector (line 12) | DoubleVector Quality::uniformLogProbVector(double p_error, size_t n)
FILE: src/SGA/Util/Quality.h
type std (line 22) | typedef std::vector<double> DoubleVector;
function namespace (line 23) | namespace Quality
FILE: src/SGA/Util/QualityCodec.h
type QualityStorageUnit (line 16) | typedef uint8_t QualityStorageUnit;
function encode (line 35) | inline uint8_t encode(char b) const
function decode (line 53) | inline char decode(uint8_t c) const
function getRequiredUnits (line 64) | inline size_t getRequiredUnits(size_t n) const
function getCapacity (line 71) | inline size_t getCapacity(size_t n) const
function getUnitIndex (line 78) | inline size_t getUnitIndex(size_t i) const
function getUnitOffset (line 85) | inline size_t getUnitOffset(size_t i) const
function getOffsetShift (line 93) | inline uint8_t getOffsetShift(size_t offset) const
function getOffsetMask (line 99) | inline uint8_t getOffsetMask(size_t offset) const
function store (line 105) | inline void store(UNIT_TYPE* pData, size_t i, char b) const
function get (line 133) | inline char get(const UNIT_TYPE* pData, size_t i) const
FILE: src/SGA/Util/QualityTable.h
type QualityString (line 16) | struct QualityString
type std (line 22) | typedef std::vector<QualityString> QualityStringVector;
function class (line 24) | class QualityTable
FILE: src/SGA/Util/QualityVector.cpp
function DNADouble (line 45) | DNADouble QualityVector::get(size_t idx) const
FILE: src/SGA/Util/QualityVector.h
type std (line 17) | typedef std::vector<DNADouble> APVec;
function class (line 19) | class QualityVector
FILE: src/SGA/Util/Read2Contig.h
type Read2Contig (line 10) | struct Read2Contig {
FILE: src/SGA/Util/ReadInfoTable.cpp
function ReadInfo (line 85) | const ReadInfo ReadInfoTable::getReadInfo(size_t idx) const
FILE: src/SGA/Util/ReadInfoTable.h
type ReadInfoOption (line 17) | enum ReadInfoOption
type ReadInfo (line 23) | struct ReadInfo
FILE: src/SGA/Util/ReadTable.cpp
function SeqItem (line 106) | const SeqItem& ReadTable::getRead(size_t idx) const
function SeqItem (line 117) | const SeqItem& ReadTable::getRead(const std::string& id) const
FILE: src/SGA/Util/ReadTable.h
type std (line 17) | typedef std::vector<SeqItem> ReadVector;
type std (line 18) | typedef std::map<std::string, SeqItem*> ReadIndex;
function class (line 20) | class ReadTable
FILE: src/SGA/Util/ReadTableNew.cpp
function SeqItem (line 75) | const SeqItem& ReadTable::getRead(size_t idx) const
function SeqItem (line 99) | const SeqItem& ReadTable::getRead(const std::string& id) const
FILE: src/SGA/Util/ReadTableNew.h
type std (line 16) | typedef std::vector<SeqItem> ReadVector;
type std (line 17) | typedef std::map<std::string, SeqItem*> ReadIndex;
function class (line 19) | class ReadTable
FILE: src/SGA/Util/ReadTableS.h
type std (line 18) | typedef std::vector<SeqItem> ReadVector;
type std (line 19) | typedef std::map<std::string, SeqItem*> ReadIndex;
type std (line 20) | typedef std::vector<BamTools::BamAlignment> BamAlignmentVector;
function class (line 22) | class ReadTable
FILE: src/SGA/Util/SGAStats.h
function namespace (line 14) | namespace SGAStats
FILE: src/SGA/Util/SeqCoord.cpp
function SeqCoord (line 16) | SeqCoord SeqCoord::complement() const
function QualityVector (line 55) | QualityVector SeqCoord::getSubvector(const QualityVector& vec) const
FILE: src/SGA/Util/SeqCoord.h
type SeqCoord (line 18) | struct SeqCoord
function setEmpty (line 65) | inline void setEmpty()
function setFull (line 72) | inline void setFull()
function flip (line 97) | inline void flip()
function flip (line 106) | static inline int flip(int p, int l)
FILE: src/SGA/Util/SeqReader.h
type RecordType (line 15) | enum RecordType
function class (line 26) | class SeqReader
FILE: src/SGA/Util/SimpleAllocator.h
type SimplePool (line 20) | typedef SimplePool<T> StorageType;
type std (line 21) | typedef std::list<StorageType* > StorageList;
function dealloc (line 47) | void dealloc(void* /*ptr*/)
FILE: src/SGA/Util/SimplePool.h
function dealloc (line 53) | void dealloc(void* /*ptr*/)
function isFull (line 58) | bool isFull()
FILE: src/SGA/Util/StdAlnTools.cpp
function LocalAlignmentResult (line 58) | LocalAlignmentResult StdAlnTools::localAlignment(const std::string& targ...
FILE: src/SGA/Util/StdAlnTools.h
function setDefaults (line 19) | struct GlobalAlnParams
function setPacBio (line 35) | void setPacBio()
type LocalAlignmentResult (line 54) | struct LocalAlignmentResult
type std (line 71) | typedef std::vector<LocalAlignmentResult> LocalAlignmentResultVector;
function namespace (line 73) | namespace StdAlnTools
FILE: src/SGA/Util/Timer.h
function class (line 16) | class Timer
FILE: src/SGA/Util/Util.cpp
function reverseComplement (line 19) | std::string reverseComplement(const std::string& seq)
function reverseComplementIUPAC (line 31) | std::string reverseComplementIUPAC(const std::string& seq)
function reverse (line 43) | std::string reverse(const std::string& seq)
function complement (line 49) | std::string complement(const std::string& seq)
function complementIUPAC (line 59) | std::string complementIUPAC(const std::string& seq)
function prefix (line 69) | std::string prefix(const std::string& seq, const unsigned int len)
function suffix (line 76) | std::string suffix(const std::string& seq, const unsigned int len)
function calculateDustScore (line 86) | double calculateDustScore(const std::string& seq)
function maxDustWindow (line 114) | double maxDustWindow(const std::string& seq, size_t windowSize, size_t m...
function countDifferences (line 132) | int countDifferences(const std::string& s1, const std::string& s2, size_...
function getDiffString (line 144) | std::string getDiffString(const std::string& s1, const std::string& s2)
function randomBase (line 154) | char randomBase()
function stripFilename (line 175) | std::string stripFilename(const std::string& filename)
function stripExtension (line 185) | std::string stripExtension(const std::string& filename)
function stripGzippedExtension (line 197) | std::string stripGzippedExtension(const std::string& filename)
function stripDirectories (line 207) | std::string stripDirectories(const std::string& filename)
function getFileExtension (line 218) | std::string getFileExtension(const std::string& filename)
function writeFastaRecord (line 229) | void writeFastaRecord(std::ostream* pWriter, const std::string& id, cons...
function isGzip (line 240) | bool isGzip(const std::string& filename)
function isFastq (line 253) | bool isFastq(const std::string& filename)
function getFilesize (line 260) | std::ifstream::pos_type getFilesize(const std::string& filename)
function assertFileOpen (line 305) | void assertFileOpen(std::ifstream& fh, const std::string& fn)
function assertFileOpen (line 315) | void assertFileOpen(std::ofstream& fh, const std::string& fn)
function assertGZOpen (line 325) | void assertGZOpen(gzstreambase& gh, const std::string& fn)
function StringVector (line 335) | StringVector split(std::string in, char delimiter)
function splitKeyValue (line 352) | void splitKeyValue(std::string in, std::string& key, std::string& value)
function getPairBasename (line 369) | std::string getPairBasename(const std::string& id)
function getPairID (line 381) | std::string getPairID(const std::string& id)
function getPairIndex (line 407) | int getPairIndex(const std::string& id)
function debug_getReadDistFromNames (line 426) | size_t debug_getReadDistFromNames(const std::string& name1, const std::s...
FILE: src/SGA/Util/Util.h
type std (line 40) | typedef std::string Sequence;
type std (line 41) | typedef std::string ContigID;
type std (line 42) | typedef std::vector<int> IntVec;
type std (line 43) | typedef std::vector<double> DoubleVec;
type std (line 44) | typedef std::vector<std::string> StringVector;
type std (line 45) | typedef std::list<std::string> StringList;
type std (line 46) | typedef std::vector<Sequence> SequenceVector;
type std (line 61) | typedef std::vector<SeqItem> SeqItemVector;
type SeqRecord (line 64) | struct SeqRecord
function getPhredScore (line 97) | int getPhredScore(size_t i) const
type std (line 115) | typedef std::vector<SeqRecord> SeqRecordVector;
function string (line 145) | string makeKeyValue(std::string key, C value)
function getBaseRank (line 187) | inline static uint8_t getBaseRank(char b)
function complement (line 218) | inline char complement(char base)
function complementIUPAC (line 240) | inline char complementIUPAC(char c)
function isErrorRateAcceptable (line 273) | inline bool isErrorRateAcceptable(double er, double threshold)
FILE: src/SGA/Util/VCFUtil.cpp
function VCFClassification (line 144) | VCFClassification VCFRecord::classify() const
function VCFReturnCode (line 174) | VCFReturnCode VCFUtil::generateVCFFromCancerVariant(const std::string& ref,
type tm (line 412) | struct tm
FILE: src/SGA/Util/VCFUtil.h
type VCFClassification (line 17) | enum VCFClassification
type VCFReturnCode (line 27) | enum VCFReturnCode
type VCFRecord (line 39) | struct VCFRecord
type std (line 84) | typedef std::vector<VCFRecord> VCFVector;
type std (line 85) | typedef std::vector<std::string> StringVector;
type VCFCollection (line 88) | struct VCFCollection
function namespace (line 94) | namespace VCFUtil
FILE: src/SGA/Util/VariantIndex.cpp
function VariantRecordVector (line 70) | VariantRecordVector VariantIndex::getNearVariants(const std::string& ref...
FILE: src/SGA/Util/VariantIndex.h
type VariantRecord (line 20) | struct VariantRecord
type std (line 28) | typedef std::vector<VariantRecord> VariantRecordVector;
type std (line 29) | typedef std::vector<int> IntVector;
type std (line 30) | typedef std::vector<IntVector> IntVectorVector;
type std (line 33) | typedef std::map<std::string, IntVectorVector> VariantIndexMap;
function class (line 35) | class VariantIndex
FILE: src/SGA/Util/Verbosity.h
function class (line 14) | class Verbosity
FILE: src/SGA/Util/bamreader.h
function class (line 8) | class SVBamReader {
FILE: src/SGA/Util/bucketSort.h
type typename (line 18) | typedef typename std::iterator_traits<IterType>::value_type base_value;
type std (line 19) | typedef std::list<base_value> base_list;
type typename (line 20) | typedef typename base_list::iterator list_iterator;
FILE: src/SGA/Util/gzstream.C
function namespace (line 40) | namespace GZSTREAM_NAMESPACE {
function underflow (line 84) | int gzstreambuf::underflow() { // used for input buffer only
function flush_buffer (line 109) | int gzstreambuf::flush_buffer() {
function overflow (line 119) | int gzstreambuf::overflow( int c) { // used for output buffer only
function sync (line 131) | int gzstreambuf::sync() {
function open (line 155) | void gzstreambase::open( const char* name, int open_mode) {
function close (line 160) | void gzstreambase::close() {
FILE: src/SGA/Util/gzstream.h
function namespace (line 45) | namespace GZSTREAM_NAMESPACE {
FILE: src/SGA/Util/mkqs.h
function vecswap (line 30) | void vecswap(int i, int j, int n, T* x)
function vecswap2 (line 45) | void vecswap2(T* a, T* b, int n)
function inssort (line 67) | void inssort(T* a, int n, int d, const PrimarySorter& primarySorter, con...
FILE: src/SGA/Util/old.AlignedContig.h
function class (line 6) | class AlignedContig {
FILE: src/SGA/Util/stdaln.c
function AlnAln (line 232) | AlnAln *aln_init_AlnAln()
function aln_free_AlnAln (line 241) | void aln_free_AlnAln(AlnAln *aa)
type dpcell_t (line 321) | typedef struct
type dpscore_t (line 326) | typedef struct
function aln_init_score_array (line 332) | void aln_init_score_array(unsigned char *seq, int len, int row, int *sco...
function aln_global_core (line 345) | int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, ...
function aln_local_core (line 528) | int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, i...
function AlnAln (line 777) | AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnPara...
function AlnAln (line 857) | AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *a...
function aln_extend_core (line 877) | int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, ...
function main (line 1058) | int main()
FILE: src/SGA/Util/stdaln.h
type AlnParam (line 86) | typedef struct
type path_t (line 97) | typedef struct
type AlnAln (line 103) | typedef struct
FILE: src/svaba/AlignedContig.cpp
function r2cEscape (line 198) | std::string r2cEscape(const std::string& s) {
function r2cCigarString (line 211) | std::string r2cCigarString(const SeqLib::Cigar& c) {
type BpSupportEntry (line 359) | struct BpSupportEntry {
function BreakPointPtrVector (line 615) | BreakPointPtrVector AlignedContig::getAllBreakPoints() const {
FILE: src/svaba/AlignedContig.h
function class (line 19) | class AlignedContig {
type std (line 167) | typedef std::unordered_map<std::string, AlignedContig> ContigMap;
type std (line 168) | typedef std::vector<AlignedContig> AlignedContigVec;
FILE: src/svaba/BreakPoint.cpp
function to_string (line 32) | static inline std::string to_string(SVType t) {
function to_string (line 45) | static inline std::string to_string(SomaticState s) {
function GenomicRegion (line 60) | GenomicRegion BreakPoint::BreakEndAsGenomicRegionLeft() const {
function GenomicRegion (line 64) | GenomicRegion BreakPoint::BreakEndAsGenomicRegionRight() const {
function HashVector (line 81) | HashVector BreakPoint::getBreakEndHashes() {
function __myround (line 138) | double __myround(double x) { return std:: floor(x * 10) / 10; }
function split_tabs (line 2440) | inline std::vector<std::string> split_tabs(const std::string& s) {
function to_long (line 2453) | inline long to_long(const std::string& s) {
function to_int (line 2457) | inline int to_int(const std::string& s) {
function to_double (line 2461) | inline double to_double(const std::string& s) {
function parse_strand (line 2465) | inline char parse_strand(const std::string& s) {
function upper (line 2469) | inline std::string upper(std::string v){
function SVType (line 2476) | inline SVType parse_svtype(const std::string& s_in) {
function SomaticState (line 2489) | inline SomaticState parse_somatic(const std::string& s_in) {
function stod_safe (line 2496) | inline double stod_safe(const std::string& s) {
FILE: src/svaba/BreakPoint.h
type class (line 16) | enum class
type class (line 23) | enum class
type class (line 33) | enum class
function namespace (line 42) | namespace SeqLib {
type BreakPoint (line 56) | struct BreakPoint
type BreakEnd (line 57) | struct BreakEnd
type std (line 59) | typedef std::vector<std::string> HashVector;
type std (line 60) | typedef std::vector<BreakPoint> BPVec;
type std (line 61) | typedef std::shared_ptr<BreakPoint> BreakPointPtr;
type std (line 62) | typedef std::vector<BreakPointPtr> BreakPointPtrVector;
type std (line 64) | typedef std::unordered_set<std::string> ReadNameSet;
type BreakEnd (line 66) | struct BreakEnd {
function class (line 105) | class BreakPoint {
FILE: src/svaba/ContigAlignmentScore.cpp
type svaba (line 28) | namespace svaba {
function T (line 33) | inline T clip_range(T v, T lo, T hi) {
function ContigAlignScore (line 39) | ContigAlignScore scoreContigAlignment(const SeqLib::BamRecord& r) {
function tagContigAlignment (line 144) | void tagContigAlignment(SeqLib::BamRecord& r, const ContigAlignScore& ...
function readContigConfTag (line 165) | double readContigConfTag(const SeqLib::BamRecord& r) {
FILE: src/svaba/ContigAlignmentScore.h
function namespace (line 24) | namespace svaba {
FILE: src/svaba/DBSnpFilter.h
function namespace (line 13) | namespace SeqLib {
function class (line 17) | class DBSnpSite: public SeqLib::GenomicRegion {
type SeqLib (line 31) | typedef SeqLib::GenomicRegionCollection<DBSnpSite> DBC;
function class (line 33) | class DBSnpFilter {
FILE: src/svaba/DiscordantCluster.cpp
type SeqLib (line 19) | namespace SeqLib {
class BamHeader (line 20) | class BamHeader
class GenomicRegion (line 21) | class GenomicRegion
function DiscordantClusterMap (line 26) | DiscordantClusterMap DiscordantCluster::clusterReads(svabaReadPtrVector&...
function GenomicRegion (line 613) | GenomicRegion DiscordantCluster::GetMateRegionOfOverlap(const GenomicReg...
FILE: src/svaba/DiscordantCluster.h
type std (line 14) | typedef std::unordered_map<std::string, svabaReadPtr> DiscordantReadMap;
type std (line 15) | typedef std::vector<svabaReadPtrVector> svabaReadClusterVector;
type std (line 16) | typedef std::vector<DiscordantCluster> DiscordantClusterVector;
type std (line 17) | typedef std::unordered_map<std::string, DiscordantCluster> DiscordantClu...
function isEmpty (line 46) | bool isEmpty() const;
FILE: src/svaba/DiscordantRealigner.h
function namespace (line 3) | namespace SeqLib {
function class (line 10) | class DiscordantRealigner {
FILE: src/svaba/Histogram.cpp
function Bin (line 207) | Bin& Bin::operator++()
function Bin (line 214) | Bin& Bin::operator--() {
FILE: src/svaba/Histogram.h
type SeqLib (line 16) | typedef SeqLib::TInterval<Bin> BinInterval;
type SeqLib (line 17) | typedef SeqLib::TIntervalTree<Bin> BinIntervalTree;
type std (line 18) | typedef std::vector<BinInterval> BinIntervalVector;
function class (line 26) | class Bin {
FILE: src/svaba/KmerFilter.h
type std (line 12) | typedef std::map<std::string, int> KmerCountMap;
function class (line 14) | class KmerFilter {
FILE: src/svaba/LearnBamParams.cpp
function extractReadGroups (line 24) | std::vector<std::string> extractReadGroups(const SeqLib::BamHeader& hdr) {
FILE: src/svaba/LearnBamParams.h
function namespace (line 13) | namespace SeqLib {
function class (line 23) | class BamReadGroup {
function class (line 76) | class LearnBamParams {
FILE: src/svaba/STCoverage.h
function namespace (line 20) | namespace SeqLib {
type std (line 24) | typedef std::unordered_map<int32_t,uint32_t> CovMap;
function class (line 30) | class STCoverage {
FILE: src/svaba/SvabaASQG.cpp
type svabaASQG (line 13) | namespace svabaASQG
function RecordType (line 343) | RecordType getRecordType(const std::string& record)
function writeFields (line 376) | void writeFields(std::ostream& out, const StringVector& fields)
FILE: src/svaba/SvabaASQG.h
function namespace (line 16) | namespace svabaASQG
type VertexRecord (line 63) | struct VertexRecord
type EdgeRecord (line 88) | struct EdgeRecord
FILE: src/svaba/SvabaAssemble.cpp
function StringGraph (line 26) | StringGraph* assemble(std::stringstream& asqg_stream, int minOverlap, in...
function walkExtra (line 115) | void walkExtra(StringGraph * pGraph, SGWalkVector& outWalks) {
FILE: src/svaba/SvabaAssemble.h
type AssemblyOptions (line 22) | struct AssemblyOptions {
FILE: src/svaba/SvabaAssemblerConfig.h
function namespace (line 38) | namespace svaba {
FILE: src/svaba/SvabaAssemblerEngine.cpp
function repeat (line 19) | static std::string repeat(char base, int count) {
function ReadTable (line 270) | ReadTable* svabaAssemblerEngine::removeDuplicates(ReadTable* pRT) {
FILE: src/svaba/SvabaAssemblerEngine.h
function class (line 12) | class svabaAssemblerEngine
FILE: src/svaba/SvabaBamWalker.cpp
function debug_read (line 17) | inline void debug_read(std::string_view msg,
function hasAdapter (line 36) | static bool hasAdapter(const svabaRead& r) {
FILE: src/svaba/SvabaBamWalker.h
function class (line 21) | class MateRegion: public SeqLib::GenomicRegion
type SeqLib (line 31) | typedef SeqLib::GenomicRegionCollection<MateRegion> MateRegionVector;
function class (line 33) | class svabaBamWalker: public SeqLib::BamReader {
FILE: src/svaba/SvabaDebug.h
function _svaba_trace_match (line 34) | inline bool _svaba_trace_match(const std::string& cname) {
function _svaba_read_trace_match (line 78) | inline bool _svaba_read_trace_match(const std::string& qname) {
function std (line 132) | inline std::string _svaba_revcomp(const std::string& s) {
function _svaba_kmer_match (line 146) | inline bool _svaba_kmer_match(const std::string& seq) {
FILE: src/svaba/SvabaFileLoader.h
function namespace (line 12) | namespace SeqLib {
function class (line 18) | class SvabaFileLoader {
FILE: src/svaba/SvabaLogger.h
function class (line 11) | class SvabaLogger {
FILE: src/svaba/SvabaModels.cpp
type SvabaModels (line 8) | namespace SvabaModels {
function LogLikelihood (line 11) | double LogLikelihood(double ref,
function SomaticLOD (line 70) | double SomaticLOD(double aN, double dN, double aT, double dT,
function SomaticLOD_withSplitErrors (line 86) | static inline double SomaticLOD_withSplitErrors(double aN, double dN, ...
function GenotypeQuality (line 338) | int GenotypeQuality(const std::vector<int>& PLs) {
function GenotypeLikelihoods (line 356) | double GenotypeLikelihoods(int g,
FILE: src/svaba/SvabaModels.h
function namespace (line 5) | namespace SvabaModels {
FILE: src/svaba/SvabaOptions.cpp
function SvabaOptions (line 113) | SvabaOptions SvabaOptions::parse(int argc, char** argv) {
FILE: src/svaba/SvabaOptions.h
function class (line 161) | class SvabaOptions {
FILE: src/svaba/SvabaOutputWriter.cpp
function make_header_with_hd (line 27) | static SeqLib::BamHeader make_header_with_hd(const SeqLib::BamHeader& src,
FILE: src/svaba/SvabaOutputWriter.h
type SvabaSharedConfig (line 15) | struct SvabaSharedConfig
type SvabaOutputWriter (line 17) | struct SvabaOutputWriter {
FILE: src/svaba/SvabaOverlapAlgorithm.cpp
function OverlapResult (line 21) | OverlapResult svabaOverlapAlgorithm::overlapRead(const SeqRecord& read, ...
function OverlapResult (line 35) | OverlapResult svabaOverlapAlgorithm::overlapReadInexact(const SeqRecord&...
function OverlapResult (line 169) | OverlapResult svabaOverlapAlgorithm::alignReadDuplicate(const SeqRecord&...
function OverlapResult (line 184) | OverlapResult svabaOverlapAlgorithm::overlapReadExact(const SeqRecord& r...
FILE: src/svaba/SvabaOverlapAlgorithm.h
function class (line 33) | class svabaOverlapAlgorithm
function setExactModeIrreducible (line 74) | void setExactModeIrreducible(bool b) { m_exactModeIrreducible = b; }
function BWT (line 77) | const BWT* getBWT() const { return m_pBWT; }
function BWT (line 78) | const BWT* getRBWT() const { return m_pRevBWT; }
FILE: src/svaba/SvabaPostprocess.cpp
type Opts (line 100) | struct Opts {
type option (line 112) | struct option
function printUsage (line 123) | void printUsage() {
function Opts (line 145) | Opts parseOpts(int argc, char** argv) {
function fileExists (line 177) | bool fileExists(const std::string& p) {
function off_t (line 182) | off_t fileSize(const std::string& p) {
function logLine (line 190) | void logLine(Args&&... args) {
function shQuote (line 200) | std::string shQuote(const std::string& s) {
type PgChainInfo (line 233) | struct PgChainInfo {
function PgChainInfo (line 238) | PgChainInfo scanPgChain(const std::string& hdr_text) {
function hasSvabaPostprocessPg (line 272) | bool hasSvabaPostprocessPg(const std::string& hdr_text) {
function uniquifyId (line 281) | std::string uniquifyId(const std::string& base,
function sanitizeHeaderValue (line 295) | std::string sanitizeHeaderValue(const std::string& s) {
function appendSvabaPostprocessPg (line 307) | std::string appendSvabaPostprocessPg(const std::string& hdr_text,
function stampedHeader (line 325) | SeqLib::BamHeader stampedHeader(const SeqLib::BamHeader& src,
function readHeaderOnly (line 332) | SeqLib::BamHeader readHeaderOnly(const std::string& bam) {
function isCoordinateSorted (line 351) | bool isCoordinateSorted(const std::string& bam) {
function reheaderBamWithPg (line 380) | int reheaderBamWithPg(const std::string& bam,
function indexBam (line 437) | int indexBam(const std::string& bam, const std::string& tag, int verbose) {
function runSort (line 459) | int runSort(const std::string& in_bam,
type DedupStats (line 492) | struct DedupStats {
function mergeCommaTokens (line 507) | bool mergeCommaTokens(std::string& cur, std::string_view incoming) {
function mergeZTagInto (line 541) | bool mergeZTagInto(SeqLib::BamRecord& existing,
function DedupStats (line 571) | DedupStats streamDedup(const std::string& in_bam,
function renameOrThrow (line 786) | void renameOrThrow(const std::string& from, const std::string& to) {
function isDedupSuffix (line 793) | bool isDedupSuffix(const std::string& s) {
function processSuffix (line 813) | void processSuffix(const std::string& id,
function runPostprocess (line 938) | void runPostprocess(int argc, char** argv) {
FILE: src/svaba/SvabaRead.cpp
function r2c (line 19) | r2c svabaRead::GetR2C(const std::string& contig_name) const {
FILE: src/svaba/SvabaRead.h
type std (line 44) | typedef std::unordered_map<std::string, r2c> R2CMap;
type std (line 47) | typedef std::shared_ptr<svabaRead> svabaReadPtr;
type std (line 48) | typedef std::vector<svabaReadPtr> svabaReadPtrVector;
function SetPrefix (line 80) | void SetPrefix(const std::string_view pref) { p = pref; }
function SetDD (line 88) | void SetDD(int d) { dd = d; }
function SeqLength (line 92) | int SeqLength() const;
FILE: src/svaba/SvabaRegionProcessor.cpp
function queryInterval (line 62) | inline std::pair<int,int> queryInterval(const SeqLib::BamRecord& r) {
function reciprocalOverlap (line 74) | inline double reciprocalOverlap(std::pair<int,int> a, std::pair<int,int>...
function preferStandardChromosomes (line 82) | size_t preferStandardChromosomes(BamRecordPtrVector& alns, int maxStdChr) {
function GRC (line 158) | GRC SvabaRegionProcessor::runMateCollectionLoop(const GenomicRegion& reg...
FILE: src/svaba/SvabaRegionProcessor.h
function namespace (line 18) | namespace SeqLib {
function class (line 24) | class SvabaRegionProcessor {
FILE: src/svaba/SvabaSharedConfig.h
function class (line 19) | class SvabaSharedConfig {
FILE: src/svaba/SvabaThreadUnit.cpp
function make_header_with_hd (line 12) | static SeqLib::BamHeader make_header_with_hd(const SeqLib::BamHeader& src,
FILE: src/svaba/SvabaThreadUnit.h
function namespace (line 29) | namespace SeqLib {
function class (line 35) | class svabaThreadUnit {
FILE: src/svaba/SvabaUtils.cpp
type svabaUtils (line 7) | namespace svabaUtils {
function fileDateString (line 64) | std::string fileDateString() {
function myreplace (line 129) | std::string myreplace(std::string &s,
function hasRepeat (line 136) | bool hasRepeat(const std::string& seq) {
function overlapSize (line 153) | int overlapSize(const SeqLib::BamRecord& query, const SeqLib::BamRecor...
function print (line 186) | void print(std::stringstream& s, std::ofstream& log, bool cerr) {
function runTimeString (line 194) | std::string runTimeString(int num_t_reads, int num_n_reads, int contig...
function countJobs (line 224) | int countJobs(const std::string& regionFile, SeqLib::GRC &file_regions...
function __bamOptParse (line 289) | std::string __bamOptParse(std::map<std::string, std::string>& obam, st...
function __openWriterBam (line 299) | bool __openWriterBam(const SeqLib::BamHeader& h, const std::string& na...
function CalcMHWScore (line 320) | double CalcMHWScore(std::vector<int>& scores)
function weightedRandom (line 339) | int weightedRandom(const std::vector<double>& cs) {
function tokenize_delimited (line 352) | std::vector<std::string> tokenize_delimited(const std::string& str, ch...
function checkHeaderCompatibility (line 370) | void checkHeaderCompatibility(const SeqLib::BamHeader& bamHeader,
function find_repeats (line 422) | std::vector<std::pair<int, int>> find_repeats(std::string_view seq, si...
function parsePLString (line 463) | std::vector<int> parsePLString(const std::string& pl_str) {
function SubstringList (line 486) | SubstringList find_long_homopolymers(const std::string& s) {
function SubstringList (line 516) | SubstringList find_long_dinuc_repeats(const std::string& s) {
FILE: src/svaba/SvabaUtils.h
type std (line 17) | typedef std::pair<size_t, size_t> CountPair;
type std (line 18) | typedef std::tuple<size_t, size_t, std::string> Substring;
type std (line 19) | typedef std::vector<Substring> SubstringList;
function namespace (line 23) | namespace svabaUtils {
FILE: src/svaba/refilter.cpp
type opt (line 45) | namespace opt {
type option (line 77) | struct option
function parseBreakOptions (line 113) | static void parseBreakOptions(int argc, char** argv) {
function splitSampleHeader (line 153) | static std::pair<std::string,std::string>
function runRefilterBreakpoints (line 160) | void runRefilterBreakpoints(int argc, char** argv) {
FILE: src/svaba/run_svaba.cpp
type SvabaBatchWorkItem (line 58) | struct SvabaBatchWorkItem {
method SvabaBatchWorkItem (line 64) | SvabaBatchWorkItem(const std::vector<std::pair<SeqLib::GenomicRegion,i...
function sendThreads (line 78) | void sendThreads(const SeqLib::GRC& regionsToRun,
function makeVCFs (line 114) | void makeVCFs(SvabaSharedConfig& sc) {
function runsvaba (line 159) | void runsvaba(int argc, char** argv) {
FILE: src/svaba/svaba.cpp
function printUsage (line 28) | static void printUsage() {
function main (line 45) | int main(int argc, char* argv[]) {
FILE: src/svaba/test_svaba.cpp
type TestOpts (line 68) | struct TestOpts {
type option (line 107) | struct option
function TestOpts (line 123) | TestOpts parseCli(int argc, char** argv) {
function readNReads (line 171) | SeqLib::BamRecordVector readNReads(const std::string& bam,
type Phase (line 203) | struct Phase {
method total (line 210) | double total() const { return walk + ec + realign + assem + align; }
function since (line 213) | double since(std::chrono::steady_clock::time_point t0) {
function Phase (line 217) | Phase oneTrial(const TestOpts& opts,
method total (line 210) | double total() const { return walk + ec + realign + assem + align; }
function median (line 332) | double median(std::vector<double> v) {
function printTable (line 342) | void printTable(const TestOpts& opts,
function runTest (line 412) | void runTest(int argc, char** argv) {
FILE: src/svaba/threadpool.h
function shutdown (line 39) | void shutdown(size_t numThreads) {
function submit (line 86) | void submit(std::unique_ptr<WorkItem> job){
function shutdown (line 91) | void shutdown(){
FILE: src/svaba/tovcf.cpp
type ToVcfOpts (line 59) | struct ToVcfOpts {
type option (line 119) | struct option
function QualMode (line 137) | QualMode parse_qual_mode(const std::string& s) {
function ToVcfOpts (line 146) | ToVcfOpts parse_cli(int argc, char** argv) {
function splitSampleHeader (line 206) | std::pair<std::string,std::string> splitSampleHeader(const std::string& ...
function runToVCF (line 215) | void runToVCF(int argc, char** argv) {
FILE: src/svaba/vcf.cpp
function hash_string32 (line 69) | inline uint32_t hash_string32(const std::string& s) {
function compareInfoFields (line 76) | bool compareInfoFields(const std::pair<std::string, std::string>& lhs,
function pairCompareDesc (line 86) | bool pairCompareDesc(
function svtype_to_string (line 93) | std::string svtype_to_string(SVType t) {
function max_allele_lo (line 108) | double max_allele_lo(const BreakPoint& bp) {
function populate_sv_header (line 541) | void populate_sv_header(VCFHeader& h) {
function populate_indel_header (line 609) | void populate_indel_header(VCFHeader& h) {
class GenomicRegionWithID (line 756) | class GenomicRegionWithID : public SeqLib::GenomicRegion {
method GenomicRegionWithID (line 758) | GenomicRegionWithID(int32_t c, uint32_t p1, uint32_t p2, int i, int p)
function classify_symbolic_kind (line 1050) | std::string classify_symbolic_kind(const BreakPoint& bp) {
function entry_lt_for_output (line 1065) | bool entry_lt_for_output(const VCFEntry& a, const VCFEntry& b) {
function open_vcf_out (line 1077) | std::unique_ptr<std::ostream> open_vcf_out(const std::string& path, bool...
FILE: src/svaba/vcf.h
function QualMode (line 60) | enum class QualMode { SUM_LO_PHRED, MAXLOD_PHRED, MISSING };
FILE: src/svabautils/AssemblyBamWalker.cpp
type timespec (line 28) | struct timespec
function __good_contig (line 32) | bool __good_contig(const SeqLib::BamRecordVector& brv, const SeqLib::Gen...
function runAC (line 52) | bool runAC(const ContigElement * c) {
FILE: src/svabautils/AssemblyBamWalker.h
function class (line 12) | class AssemblyBamWalker: public SeqLib::BamReader {
function AssemblyWalkerWorkItem (line 32) | struct ContigElement {
FILE: src/svabautils/BamSplitter.h
function class (line 8) | class BamSplitter: public SeqLib::BamReader
FILE: src/svabautils/Fractions.h
function class (line 12) | class FracRegion : public SeqLib::GenomicRegion {
function class (line 27) | class Fractions {
FILE: src/svabautils/PowerLawSim.cpp
function PowerLawSim (line 14) | void PowerLawSim(faidx_t* findex, int num_breaks, double power_law, SeqL...
function drawFromPower (line 203) | std::vector<int> drawFromPower(double x0, double x1, double power, int n...
function genRandomSequence (line 222) | void genRandomSequence(std::string& s, SeqLib::GenomicRegion& gr, int wi...
FILE: src/svabautils/PowerLawSim.h
type SVEvent (line 14) | struct SVEvent {
FILE: src/svabautils/ReadSim.cpp
function Indel (line 208) | Indel ReadSim::makeDelErrors(std::string& s, int del_size) {
function Indel (line 239) | Indel ReadSim::makeDelErrors(std::string& s, int sstart, const std::stri...
function Indel (line 279) | Indel ReadSim::makeInsErrors(std::string& s, bool keep_size, int indel_s...
FILE: src/svabautils/ReadSim.h
type Indel (line 10) | struct Indel {
function class (line 34) | class ReadSim {
FILE: src/svabautils/SeqFrag.h
function class (line 12) | class SeqFrag {
FILE: src/svabautils/SimGenome.h
function class (line 8) | class SimGenome {
FILE: src/svabautils/SimTrainerWalker.h
function class (line 7) | class SimTrainerWalker : public SeqLib::BamReader {
FILE: src/svabautils/assembly2vcf.cpp
type bidx_delete (line 21) | struct bidx_delete {
type option (line 26) | struct option
type opt (line 71) | namespace opt {
function runAssembly2VCF (line 92) | void runAssembly2VCF(int argc, char** argv)
function parseAssembly2VCFOptions (line 188) | void parseAssembly2VCFOptions(int argc, char** argv) {
FILE: src/svabautils/benchmark.cpp
type opt (line 45) | namespace opt {
type option (line 126) | struct option
function runBenchmark (line 155) | void runBenchmark(int argc, char** argv) {
function genBreaks (line 271) | std::string genBreaks() {
function splitBam (line 397) | void splitBam() {
function assemblyTest (line 427) | void assemblyTest() {
function parseBenchmarkOptions (line 593) | void parseBenchmarkOptions(int argc, char** argv) {
function parseErrorRates (line 680) | std::vector<double> parseErrorRates(const std::string& s) {
function errorRateString (line 698) | std::string errorRateString(const std::vector<double>& v, const std::str...
function realignBreaks (line 711) | void realignBreaks() {
function realignRandomSegments (line 780) | void realignRandomSegments() {
FILE: src/svabautils/snowmanutils.cpp
function main (line 21) | int main(int argc, char** argv) {
FILE: src/svabautils/snowtools.cpp
function ParseFilterObject (line 16) | bool ParseFilterObject(const string& filterName, const Json::Value& filt...
function string (line 47) | const string GetScriptContents(string script) {
function main (line 84) | int main(int argc, char** argv) {
FILE: src/svabautils/splitcounter.cpp
type opt (line 21) | namespace opt {
type option (line 30) | struct option
function runSplitFasta (line 68) | void runSplitFasta(int argc, char** argv) {
function runSplitCounter (line 129) | void runSplitCounter(int argc, char** argv) {
function parseSplitCounterOptions (line 218) | void parseSplitCounterOptions(int argc, char** argv) {
function parseFastaSplitOptions (line 249) | void parseFastaSplitOptions(int argc, char** argv) {
Condensed preview — 399 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (5,890K chars).
[
{
"path": "CLAUDE.md",
"chars": 46931,
"preview": "# CLAUDE.md — svaba working notes\n\nThis file captures conventions, file landmarks, and open investigations for the\nsvaba"
},
{
"path": "CMakeLists.txt",
"chars": 11247,
"preview": "cmake_minimum_required(VERSION 3.10)\n\n# Set the C++ standard required for the project\nproject(svaba) # LANGUAGES CXX)\nse"
},
{
"path": "Dockerfile",
"chars": 991,
"preview": "# Start with an Ubuntu image\nFROM ubuntu:20.04\n\n# Avoid prompts with tzdata (timezones)\nENV DEBIAN_FRONTEND=noninteracti"
},
{
"path": "LICENSE",
"chars": 35147,
"preview": " GNU GENERAL PUBLIC LICENSE\n Version 3, 29 June 2007\n\n Copyright (C) 2007 Free "
},
{
"path": "R/archive_non_functional/create-databases.R",
"chars": 5516,
"preview": "####################\n## load CCDS and name table\nccds <- fread(\"/xchip/gistic/Jeremiah/tracks/ccdsGene.hg19.txt\")\n\n## lo"
},
{
"path": "R/archive_non_functional/gen_quals.R",
"chars": 500,
"preview": "require(data.table)\n\nf <- fread(\"grep -v ^# /broad/broadsv/NA12878/GCAT/gcat_illumina_150x/snowman/v115/v115.snowman.ind"
},
{
"path": "R/archive_non_functional/svaba-annotate.R",
"chars": 15045,
"preview": "#!/usr/bin/env Rscript\n\nlibrary(optparse)\n\noption_list = list(\n make_option(c(\"-i\", \"--input\"), type = \"character\", de"
},
{
"path": "R/archive_non_functional/svaba-asqg2pdf.R",
"chars": 3151,
"preview": "#!/usr/bin/env Rscript\n\nlibrary(optparse)\n\noption_list = list(\n make_option(c(\"-i\", \"--input\"), type = \"character\", "
},
{
"path": "R/archive_non_functional/svaba-bam-qcplot.R",
"chars": 8955,
"preview": "#!/usr/bin/env Rscript\n\nlibrary(optparse)\n\noption_list = list(\n make_option(c(\"-i\", \"--input\"), type = \"character\", "
},
{
"path": "R/archive_non_functional/svaba-benchmark.R",
"chars": 7063,
"preview": "#!/use/bin/env/ Rscript\n\nrequire(ggplot2)\nrequire(data.table)\n\n.libPaths = c(\"/xchip/gistic/Jeremiah/R\", \"/broad/softwar"
},
{
"path": "R/archive_non_functional/svaba-bps-to-maflite.R",
"chars": 1292,
"preview": "#!/usr/bin/env Rscript\n\nlibrary(optparse)\nsuppressMessages(suppressWarnings(require(VariantAnnotation, quietly=TRUE)))\n\n"
},
{
"path": "R/archive_non_functional/svaba-circos.R",
"chars": 31819,
"preview": "#!/usr/bin/env Rscript\n\n## set the right library paths\n.libPaths = c(\"/xchip/gistic/Jeremiah/R\", \"/broad/software/free/L"
},
{
"path": "R/archive_non_functional/svaba-create-pon.R",
"chars": 1139,
"preview": "#!/usr/bin/env Rscript\n\nlibrary(optparse)\n\noption_list = list(\n make_option(c(\"-i\", \"--input\"), type = \"character\", "
},
{
"path": "R/archive_non_functional/svaba-event-plot.R",
"chars": 3397,
"preview": "#!/usr/bin/env Rscript\n\n## source all the required packages\nsource.all <- function() {\n\n githome <- Sys.getenv('GIT_HOM"
},
{
"path": "R/archive_non_functional/svaba-histogram.R",
"chars": 35953,
"preview": "#!/usr/bin/env Rscript\n\n## set the right library paths\n.libPaths = c(\"/xchip/gistic/Jeremiah/R\", \"/broad/software/free/L"
},
{
"path": "R/archive_non_functional/svaba-nozzle.R",
"chars": 21202,
"preview": "#!/usr/bin/env Rscript\n\n####\n# load the libraries\n###\n.nozzleLibraries <- function() {\n\n print('...loading libraries')\n"
},
{
"path": "R/archive_non_functional/svaba-sig.R",
"chars": 12924,
"preview": "#!/usr/bin/env Rscript\n\nlibrary(optparse)\nrequire(VariantAnnotation)\n\noption_list = list(\n make_option(c(\"-i\", \"--inp"
},
{
"path": "R/archive_non_functional/svaba-vcf-to-maflite.R",
"chars": 1684,
"preview": "#!/usr/bin/env Rscript\n\nsuppressMessages(suppressWarnings(require(optparse, quietly=TRUE)))\n\noption_list = list(\n mak"
},
{
"path": "README.md",
"chars": 20617,
"preview": "## *SvABA* — Structural variation and indel analysis by assembly\n\nSvABA (formerly *Snowman*) is an SV and indel caller f"
},
{
"path": "docs/README.md",
"chars": 1282,
"preview": "# SvABA BPS Viewer\n\nThis folder contains a standalone static viewer for SvABA `bps.txt` and `bps.txt.gz` files.\n\n## File"
},
{
"path": "docs/alignments_viewer.html",
"chars": 41166,
"preview": "<!doctype html>\n<html lang=\"en\">\n<head>\n<meta charset=\"utf-8\">\n<title>svaba alignments viewer</title>\n<style>\n :root {\n"
},
{
"path": "docs/app.js",
"chars": 28465,
"preview": "(function () {\n \"use strict\";\n\n const BASE_COLUMNS = [\n \"chr1\",\n \"pos1\",\n \"strand1\",\n \"chr2\",\n \"pos2\",\n"
},
{
"path": "docs/bps_explorer.html",
"chars": 54332,
"preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n<meta charset=\"UTF-8\">\n<meta name=\"viewport\" content=\"width=device-width, initia"
},
{
"path": "docs/bps_viewer.html",
"chars": 10436,
"preview": "<!DOCTYPE html>\n<html lang=\"en\">\n <head>\n <meta charset=\"utf-8\">\n <meta name=\"viewport\" content=\"width=device-wid"
},
{
"path": "docs/comparison.html",
"chars": 55385,
"preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n<meta charset=\"UTF-8\">\n<meta name=\"viewport\" content=\"width=device-width, initia"
},
{
"path": "docs/index.html",
"chars": 6610,
"preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n<meta charset=\"UTF-8\">\n<meta name=\"viewport\" content=\"width=device-width, initia"
},
{
"path": "docs/learn_explorer.html",
"chars": 20401,
"preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n<meta charset=\"UTF-8\">\n<meta name=\"viewport\" content=\"width=device-width, initia"
},
{
"path": "docs/r2c_explorer.html",
"chars": 41845,
"preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n<meta charset=\"UTF-8\">\n<meta name=\"viewport\" content=\"width=device-width, initia"
},
{
"path": "docs/runtime_explorer.html",
"chars": 32350,
"preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n<meta charset=\"UTF-8\">\n<meta name=\"viewport\" content=\"width=device-width, initia"
},
{
"path": "docs/styles.css",
"chars": 10053,
"preview": ":root {\n --bg: #f3ece1;\n --bg-deep: #d7e2df;\n --panel: rgba(255, 251, 246, 0.85);\n --panel-strong: rgba(255, 253, 25"
},
{
"path": "notes",
"chars": 1054,
"preview": "AlignedContig constructor from BamRecordPtrVector\n--- Makes the alignment fragments\n------ Makes the indel bps from each"
},
{
"path": "opt/jemalloc_test.sh",
"chars": 1004,
"preview": "REGION=chr12:10,000,000-12,000,000\nT=/Users/jeremiahwala/Desktop/svaba_compare/SG.wgs.UCLA.2025.01.tumor_cleaned.recal.b"
},
{
"path": "opt/memprof.sh",
"chars": 1780,
"preview": "#!/usr/bin/env bash\nset -euo pipefail\n\nif (( $# < 1 )); then\n echo \"Usage: $0 <cmd> [args]\"\n exit 1\nfi\n\nOUT_LOG=\"memlo"
},
{
"path": "opt/memprof_osx.sh",
"chars": 1492,
"preview": "#!/usr/bin/env bash\nset -euo pipefail\nif (( $# < 1 )); then\n echo \"Usage: $0 <cmd> [args]\"\n exit 1\nfi\nOUT_LOG=\"memlog."
},
{
"path": "opt/memusg.sh",
"chars": 1065,
"preview": "#!/usr/bin/env bash\n# memusg -- Measure memory usage of processes\n# Usage: memusg COMMAND [ARGS]...\n#\n# Author: Jaeho Sh"
},
{
"path": "opt/profiler.sh",
"chars": 656,
"preview": "#!/usr/bin/env bash\n\"$@\" & # Run the given command line in the background.\npid=$! peak=0\nrm mem.log\n\n## make the Rscript"
},
{
"path": "opt/runtime.R",
"chars": 1587,
"preview": "#!/usr/bin/env Rscript\n\nsuppressPackageStartupMessages({\n library(data.table)\n library(ggplot2)\n library(ggrepel)\n})\n"
},
{
"path": "scripts/combine_blacklists.sh",
"chars": 12182,
"preview": "#!/usr/bin/env bash\n#\n# combine_blacklists.sh — combine multiple BED blacklists into one file.\n#\n# \"Combining\" blacklist"
},
{
"path": "scripts/extract_by_qname.sh",
"chars": 2878,
"preview": "#!/bin/bash\n# ============================================================\n# Extract all reads from a target BAM whose Q"
},
{
"path": "scripts/extract_discordants.sh",
"chars": 543,
"preview": "#!/bin/bash\n\nprefix=$1\n##prefix=\"${1%%.*}\"\n/usr/bin/samtools view -h ${prefix}.weird.bam | grep -E '^@|DC:Z' | /usr/bin/"
},
{
"path": "scripts/extract_pairs_by_seq.sh",
"chars": 3683,
"preview": "#!/bin/bash\n# ============================================================\n# Extract read pairs from a BAM where either "
},
{
"path": "scripts/filter_contig_supporting_reads.sh",
"chars": 3349,
"preview": "#!/usr/bin/env bash\n#\n# filter_contig_supporting_reads.sh\n#\n# Filter a svaba *.corrected.bam down to just the reads whos"
},
{
"path": "scripts/gcloud_teardown.sh",
"chars": 1739,
"preview": "#!/usr/bin/env bash\n#\n# gcloud_teardown.sh — kill svaba cloud workers and clean up the bucket.\n#\n# Usage:\n# gcloud_tea"
},
{
"path": "scripts/mosdepth_lowmapq_blacklist.sh",
"chars": 9945,
"preview": "#!/usr/bin/env bash\n#\n# mosdepth_lowmapq_blacklist.sh — flag regions dominated by multi-mappers.\n#\n# Given paired mosdep"
},
{
"path": "scripts/plot_learn.sh",
"chars": 4413,
"preview": "#!/usr/bin/env bash\n# plot_learn.sh — plot per-RG insert-size distributions from svaba learn data\n#\n# Usage:\n# plot_le"
},
{
"path": "scripts/r2c_for_contig.sh",
"chars": 4597,
"preview": "#!/usr/bin/env bash\n# r2c_for_contig.sh\n#\n# Given a svaba contig name, reconstruct the per-contig read-to-contig\n# (r2c)"
},
{
"path": "scripts/search_sequence.sh",
"chars": 2629,
"preview": "#!/bin/bash\n# ============================================================\n# Search a BAM for reads matching any of the "
},
{
"path": "scripts/sort_and_dedupe_bps_old.sh",
"chars": 4563,
"preview": "#!/usr/bin/env bash\n#\n# sort_and_dedupe_bps_old.sh — sort + dedup + PASS-filter for legacy\n# (pre-2.0) svaba bps.txt.gz "
},
{
"path": "scripts/sort_bps.sh",
"chars": 6232,
"preview": "#!/usr/bin/env bash\n#\n# sort_bps.sh — sort a svaba bps.txt.gz file by (chr1, pos1, chr2, pos2, -somlod)\n#\n# Reads: ${I"
},
{
"path": "scripts/svaba_cloud.sh",
"chars": 14229,
"preview": "#!/usr/bin/env bash\n#\n# svaba_cloud.sh — scatter svaba across GCP VMs, one partition per VM.\n#\n# Takes the svaba run com"
},
{
"path": "scripts/svaba_local_function.sh",
"chars": 16482,
"preview": "# svaba_local_function.sh — sourceable bash helpers for common svaba tasks.\n#\n# Designed to be dot-sourced into an inter"
},
{
"path": "scripts/svaba_postprocess.sh",
"chars": 25875,
"preview": "#!/usr/bin/env bash\n#\n# svaba_postprocess.sh — one-stop post-processing of a svaba run.\n#\n# Replaces (and subsumes) the "
},
{
"path": "scripts/update_svaba_image.sh",
"chars": 1470,
"preview": "#!/usr/bin/env bash\n#\n# update_svaba_image.sh — rebuild the svaba worker image from the builder VM.\n#\n# Stops the builde"
},
{
"path": "src/SGA/Algorithm/ClusterProcess.cpp",
"chars": 6538,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/ClusterProcess.h",
"chars": 2041,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/ConnectProcess.cpp",
"chars": 2890,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/ConnectProcess.h",
"chars": 1497,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/DPAlignment.cpp",
"chars": 7351,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/DPAlignment.h",
"chars": 3977,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/ErrorCorrectProcess.cpp",
"chars": 19415,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/ErrorCorrectProcess.h",
"chars": 3699,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/ExtensionDP.cpp",
"chars": 16262,
"preview": "///----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/ExtensionDP.h",
"chars": 4654,
"preview": "///----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/FMMergeProcess.cpp",
"chars": 12792,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/FMMergeProcess.h",
"chars": 2657,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/GapFillProcess.cpp",
"chars": 9263,
"preview": "///----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/GapFillProcess.h",
"chars": 3193,
"preview": "///----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/HaplotypeBuilder.cpp",
"chars": 7574,
"preview": "///----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/HaplotypeBuilder.h",
"chars": 2856,
"preview": "///----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/KmerOverlaps.cpp",
"chars": 17718,
"preview": "///-----------------------------------------------\n// Copyright 2012 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/KmerOverlaps.h",
"chars": 2139,
"preview": "///-----------------------------------------------\n// Copyright 2012 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/LRAlignment.cpp",
"chars": 37983,
"preview": "//-----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/LRAlignment.h",
"chars": 6784,
"preview": "//-----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/Makefile.am",
"chars": 1248,
"preview": "noinst_LIBRARIES = libalgorithm.a\n\nlibalgorithm_a_CPPFLAGS = \\\n\t-I$(top_srcdir)/src/SGA/Bigraph \\\n\t-I$(top_srcdir)/src/S"
},
{
"path": "src/SGA/Algorithm/Makefile.in",
"chars": 29317,
"preview": "# Makefile.in generated by automake 1.16.1 from Makefile.am.\n# @configure_input@\n\n# Copyright (C) 1994-2018 Free Softwar"
},
{
"path": "src/SGA/Algorithm/OverlapAlgorithm.cpp",
"chars": 49116,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/OverlapAlgorithm.h",
"chars": 8583,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/OverlapBlock.cpp",
"chars": 17915,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/OverlapBlock.h",
"chars": 7656,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/OverlapTools.cpp",
"chars": 6399,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/OverlapTools.h",
"chars": 1793,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/QCProcess.cpp",
"chars": 14801,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/QCProcess.h",
"chars": 4291,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/ReadCluster.cpp",
"chars": 7399,
"preview": "///-----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/ReadCluster.h",
"chars": 2333,
"preview": "///-----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/SearchHistory.cpp",
"chars": 5754,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/SearchHistory.h",
"chars": 3999,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/SearchSeed.cpp",
"chars": 1174,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/SearchSeed.h",
"chars": 2717,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/StatsProcess.cpp",
"chars": 3564,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/StatsProcess.h",
"chars": 1752,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/StringGraphGenerator.cpp",
"chars": 8024,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/StringGraphGenerator.h",
"chars": 1994,
"preview": "///-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/Algorithm/StringThreader.cpp",
"chars": 10262,
"preview": "///----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/StringThreader.h",
"chars": 4507,
"preview": "///----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/VariationBuilderCommon.cpp",
"chars": 3037,
"preview": "///----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Algorithm/VariationBuilderCommon.h",
"chars": 2430,
"preview": "///----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Bigraph/Bigraph.cpp",
"chars": 20888,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Bigraph/Bigraph.h",
"chars": 6753,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Bigraph/Edge.cpp",
"chars": 3750,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Bigraph/Edge.h",
"chars": 5634,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Bigraph/EdgeDesc.cpp",
"chars": 1187,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Bigraph/EdgeDesc.h",
"chars": 924,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Bigraph/GraphCommon.h",
"chars": 1781,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Bigraph/Makefile.am",
"chars": 404,
"preview": "noinst_LIBRARIES = libbigraph.a\n\nlibbigraph_a_CPPFLAGS = \\\n\t-I$(top_srcdir)/src/SGA/Util \\\n\t-I$(top_srcdir)/src/SGA/Thir"
},
{
"path": "src/SGA/Bigraph/Makefile.in",
"chars": 25651,
"preview": "# Makefile.in generated by automake 1.16.1 from Makefile.am.\n# @configure_input@\n\n# Copyright (C) 1994-2018 Free Softwar"
},
{
"path": "src/SGA/Bigraph/Vertex.cpp",
"chars": 12339,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/Bigraph/Vertex.h",
"chars": 5788,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SGA/Makefile.am",
"chars": 1729,
"preview": "noinst_LIBRARIES = libsga.a\n\nlibsga_a_CPPFLAGS = \\\n\t-I$(top_srcdir)/src/SGA/Util \\\n\t-I$(top_srcdir)/src/SGA/Bigraph \\\n\t-"
},
{
"path": "src/SGA/SGA/Makefile.in",
"chars": 19184,
"preview": "# Makefile.in generated by automake 1.16.1 from Makefile.am.\n# @configure_input@\n\n# Copyright (C) 1994-2018 Free Softwar"
},
{
"path": "src/SGA/SGA/OverlapCommon.cpp",
"chars": 3367,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SGA/OverlapCommon.h",
"chars": 1052,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SGA/SGACommon.h",
"chars": 805,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SGA/index.cpp",
"chars": 11574,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SGA/index.h",
"chars": 664,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SGA/overlap.cpp",
"chars": 15729,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SGA/overlap.h",
"chars": 599,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SQG/ASQG.cpp",
"chars": 9219,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SQG/ASQG.h",
"chars": 3567,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SQG/Makefile.am",
"chars": 219,
"preview": "noinst_LIBRARIES = libsqg.a\n\nlibsqg_a_CPPFLAGS = \\\n\t-I$(top_srcdir)/src/SGA/Bigraph \\\n\t-I$(top_srcdir)/src/SGA/Thirdpart"
},
{
"path": "src/SGA/SQG/Makefile.in",
"chars": 20637,
"preview": "# Makefile.in generated by automake 1.16.1 from Makefile.am.\n# @configure_input@\n\n# Copyright (C) 1994-2018 Free Softwar"
},
{
"path": "src/SGA/SQG/SQG.cpp",
"chars": 547,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SQG/SQG.h",
"chars": 3046,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/CompleteOverlapSet.cpp",
"chars": 13187,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/CompleteOverlapSet.h",
"chars": 3243,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/GraphSearchTree.h",
"chars": 21278,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/Makefile.am",
"chars": 581,
"preview": "noinst_LIBRARIES = libstringgraph.a\n\nlibstringgraph_a_CPPFLAGS = \\\n\t-I$(top_srcdir)/src/SGA/Bigraph \\\n\t-I$(top_srcdir)/s"
},
{
"path": "src/SGA/StringGraph/Makefile.in",
"chars": 34276,
"preview": "# Makefile.in generated by automake 1.16.1 from Makefile.am.\n# @configure_input@\n\n# Copyright (C) 1994-2018 Free Softwar"
},
{
"path": "src/SGA/StringGraph/RemovalAlgorithm.cpp",
"chars": 8688,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/RemovalAlgorithm.h",
"chars": 1454,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/SGAlgorithms.cpp",
"chars": 13014,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/SGAlgorithms.h",
"chars": 2772,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/SGSearch.cpp",
"chars": 7595,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/SGSearch.h",
"chars": 2315,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/SGUtil.cpp",
"chars": 10213,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/SGUtil.h",
"chars": 1149,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/SGVisitors.cpp",
"chars": 27512,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/SGVisitors.h",
"chars": 5549,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/SGWalk.cpp",
"chars": 14662,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/StringGraph/SGWalk.h",
"chars": 4129,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWT.h",
"chars": 695,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTAlgorithms.cpp",
"chars": 16753,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTAlgorithms.h",
"chars": 8335,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTCABauerCoxRosone.cpp",
"chars": 8600,
"preview": "//-----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTCABauerCoxRosone.h",
"chars": 2987,
"preview": "//-----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTCARopebwt.cpp",
"chars": 2613,
"preview": "//-----------------------------------------------\n// Copyright 2012 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTCARopebwt.h",
"chars": 569,
"preview": "//-----------------------------------------------\n// Copyright 2012 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTDiskConstruction.cpp",
"chars": 26173,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTDiskConstruction.h",
"chars": 1898,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTIndexSet.h",
"chars": 1101,
"preview": "//-----------------------------------------------\n// Copyright 2012 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTInterval.h",
"chars": 3121,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTIntervalCache.cpp",
"chars": 1821,
"preview": "///-----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/SuffixTools/BWTIntervalCache.h",
"chars": 1536,
"preview": "///-----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/SuffixTools/BWTReader.cpp",
"chars": 479,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTReader.h",
"chars": 1100,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTReaderAscii.cpp",
"chars": 2781,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTReaderAscii.h",
"chars": 1031,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTReaderBinary.cpp",
"chars": 3252,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTReaderBinary.h",
"chars": 1117,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTTraverse.cpp",
"chars": 6793,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTTraverse.h",
"chars": 2294,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTWriter.cpp",
"chars": 1136,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTWriter.h",
"chars": 984,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTWriterAscii.cpp",
"chars": 2379,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTWriterAscii.h",
"chars": 1166,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTWriterBinary.cpp",
"chars": 2895,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/BWTWriterBinary.h",
"chars": 1221,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/FMMarkers.h",
"chars": 2889,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/GapArray.cpp",
"chars": 3408,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/GapArray.h",
"chars": 1992,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/HitData.h",
"chars": 1927,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/InverseSuffixArray.cpp",
"chars": 2130,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/InverseSuffixArray.h",
"chars": 884,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/Makefile.am",
"chars": 2043,
"preview": "noinst_LIBRARIES = libsuffixtools.a\n\nlibsuffixtools_a_CPPFLAGS = \\\n\t-I$(top_srcdir)/src/SGA/Util \\\n\t-I$(top_srcdir)/src/"
},
{
"path": "src/SGA/SuffixTools/Makefile.in",
"chars": 65370,
"preview": "# Makefile.in generated by automake 1.16.1 from Makefile.am.\n# @configure_input@\n\n# Copyright (C) 1994-2018 Free Softwar"
},
{
"path": "src/SGA/SuffixTools/Occurrence.cpp",
"chars": 2659,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/Occurrence.h",
"chars": 4311,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/PopulationIndex.cpp",
"chars": 4102,
"preview": "//-----------------------------------------------\n// Copyright 2012 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/PopulationIndex.h",
"chars": 1918,
"preview": "//-----------------------------------------------\n// Copyright 2012 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/QuickBWT.cpp",
"chars": 808,
"preview": "//-----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/QuickBWT.h",
"chars": 500,
"preview": "//-----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/RLBWT.cpp",
"chars": 13508,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/RLBWT.h",
"chars": 11493,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/RLUnit.h",
"chars": 3753,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/RankProcess.cpp",
"chars": 4382,
"preview": "///-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/SuffixTools/RankProcess.h",
"chars": 1560,
"preview": "///-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared"
},
{
"path": "src/SGA/SuffixTools/SACAInducedCopying.cpp",
"chars": 7690,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/SACAInducedCopying.h",
"chars": 1214,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/SAReader.cpp",
"chars": 2741,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/SAReader.h",
"chars": 1303,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/SAWriter.cpp",
"chars": 1230,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/SAWriter.h",
"chars": 845,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/SBWT.cpp",
"chars": 4684,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/SBWT.h",
"chars": 3066,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/STCommon.cpp",
"chars": 1280,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/STCommon.h",
"chars": 2765,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/STGlobals.h",
"chars": 723,
"preview": "//-----------------------------------------------\n// Copyright 2009 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/SampledSuffixArray.cpp",
"chars": 9914,
"preview": "//-----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/SampledSuffixArray.h",
"chars": 2573,
"preview": "//-----------------------------------------------\n// Copyright 2011 Wellcome Trust Sanger Institute\n// Written by Jared "
},
{
"path": "src/SGA/SuffixTools/SparseGapArray.h",
"chars": 11241,
"preview": "//-----------------------------------------------\n// Copyright 2010 Wellcome Trust Sanger Institute\n// Written by Jared "
}
]
// ... and 199 more files (download for full content)
About this extraction
This page contains the full source code of the walaj/svaba GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 399 files (131.2 MB), approximately 1.4M tokens, and a symbol index with 1041 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.