Repository: haowenz/chromap Branch: master Commit: 949043c782e6 Files: 64 Total size: 765.5 KB Directory structure: gitextract_5tsxk3xx/ ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ └── bug_report.md │ └── workflows/ │ └── ci.yml ├── LICENSE ├── Makefile ├── README.md ├── chromap.1 ├── docs/ │ ├── _config.yml │ ├── chromap.html │ └── index.md ├── src/ │ ├── alignment.cc │ ├── alignment.h │ ├── barcode_translator.h │ ├── bed_mapping.h │ ├── candidate.h │ ├── candidate_position_generating_config.h │ ├── candidate_processor.cc │ ├── candidate_processor.h │ ├── chromap.cc │ ├── chromap.h │ ├── chromap_driver.cc │ ├── chromap_driver.h │ ├── cxxopts.hpp │ ├── draft_mapping.h │ ├── draft_mapping_generator.cc │ ├── draft_mapping_generator.h │ ├── feature_barcode_matrix.cc │ ├── feature_barcode_matrix.h │ ├── feature_barcode_matrix_writer.h │ ├── hit_utils.h │ ├── index.cc │ ├── index.h │ ├── index_parameters.h │ ├── index_utils.h │ ├── khash.h │ ├── kseq.h │ ├── ksw.cc │ ├── ksw.h │ ├── mapping.h │ ├── mapping_generator.cc │ ├── mapping_generator.h │ ├── mapping_in_memory.h │ ├── mapping_metadata.h │ ├── mapping_parameters.h │ ├── mapping_processor.h │ ├── mapping_writer.cc │ ├── mapping_writer.h │ ├── minimizer.h │ ├── minimizer_generator.cc │ ├── minimizer_generator.h │ ├── mmcache.hpp │ ├── paf_mapping.h │ ├── paired_end_mapping_metadata.h │ ├── pairs_mapping.h │ ├── sam_mapping.h │ ├── sequence_batch.cc │ ├── sequence_batch.h │ ├── sequence_effective_range.h │ ├── strand.h │ ├── summary_metadata.h │ ├── temp_mapping.h │ └── utils.h └── test/ ├── read1.fq ├── read2.fq └── ref.fa ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Create a report to help us improve title: "[BUG] XXX" labels: bug assignees: '' --- **Describe the bug** A clear and 
concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: 1. Describe the data you are using and provide a sample of your data if possible. For example, the paired-end reads are generated by 10x scATAC-seq. The read length is 50bp and the barcode length is 16bp. 2. Get the Chromap version by running ```chromap -v``` and post it here. 3. Provide the full command line you used to run Chromap. 4. Provide the log output by Chromap and highlight the error message. **Expected behavior** A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. **Environment (please complete the following information):** - OS: [e.g. Ubuntu 22.10] - Way you install Chromap [e.g. use Bioconda, download binary, build from source] - If you compiled Chromap from source yourself, please provide the compiler version [e.g. GCC 7.4.0] **Additional context** Add any other context about the problem here. ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: push: branches: [ master ] pull_request: branches: [ master ] env: DEVELOPER_DIR: /Applications/Xcode.app/Contents/Developer jobs: ubuntu: runs-on: ubuntu-latest strategy: matrix: compiler: [g++, clang++] steps: - uses: actions/checkout@v2 - name: install-deps run: sudo apt-get update; sudo apt-get install -y clang libomp5 libomp-dev - name: build-chromap run: make CXX=${{ matrix.compiler }} - name: test-chromap run: ./chromap -h macos: runs-on: macos-latest strategy: matrix: compiler: [clang++] steps: - uses: actions/checkout@v2 - name: cache-openmp id: cache-openmp uses: actions/cache@v3 with: path: openmp-install key: openmp-macos-install - name: build-openmp if: steps.cache-openmp.outputs.cache-hit != 'true' run: | wget https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.0/openmp-14.0.0.src.tar.xz tar -xf 
openmp-14.0.0.src.tar.xz cd openmp-14.0.0.src sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S mkdir -p build && cd build cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \ -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. cmake --build . -j 3 cmake --build . --target install mkdir $GITHUB_WORKSPACE/openmp-install cp -r install/* $GITHUB_WORKSPACE/openmp-install - name: install-openmp run: | sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - name: build-chromap run: make CXX=${{ matrix.compiler }} CXXFLAGS="-arch x86_64 -isysroot $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk -std=c++11 -Wall -O3 -Xclang -fopenmp -msse4.1" LDFLAGS="-L$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib -rpath $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib -lm -lz -lomp" - name: test-chromap run: ./chromap -h ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2019 Haowen Zhang, Li Song, X. 
Shirley Liu, Heng Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: Makefile ================================================ CXX=g++ CXXFLAGS=-std=c++11 -Wall -O3 -fopenmp -msse4.1 LDFLAGS=-lm -lz cpp_source=sequence_batch.cc index.cc minimizer_generator.cc candidate_processor.cc alignment.cc feature_barcode_matrix.cc ksw.cc draft_mapping_generator.cc mapping_generator.cc mapping_writer.cc chromap.cc chromap_driver.cc src_dir=src objs_dir=objs objs+=$(patsubst %.cc,$(objs_dir)/%.o,$(cpp_source)) exec=chromap ifneq ($(asan),) CXXFLAGS+=-fsanitize=address -g LDFLAGS+=-fsanitize=address -ldl -g endif all: dir $(exec) dir: mkdir -p $(objs_dir) $(exec): $(objs) $(CXX) $(CXXFLAGS) $(objs) -o $(exec) $(LDFLAGS) $(objs_dir)/%.o: $(src_dir)/%.cc $(CXX) $(CXXFLAGS) -c $< -o $@ .PHONY: clean clean: -rm -rf $(exec) $(objs_dir) ================================================ FILE: README.md ================================================ [![GitHub build](https://github.com/haowenz/chromap/actions/workflows/ci.yml/badge.svg)](https://github.com/haowenz/chromap/actions/workflows/ci.yml) [![GitHub license](https://img.shields.io/github/license/haowenz/chromap)](https://github.com/haowenz/chromap/blob/master/LICENSE) [![Conda version](https://img.shields.io/conda/v/bioconda/chromap)](https://anaconda.org/bioconda/chromap) [![Conda platform](https://img.shields.io/conda/pn/bioconda/chromap)](https://anaconda.org/bioconda/chromap) [![Conda download](https://img.shields.io/conda/dn/bioconda/chromap)](https://anaconda.org/bioconda/chromap) ## Getting Started ```sh git clone https://github.com/haowenz/chromap.git cd chromap && make # create an index first and then map ./chromap -i -r test/ref.fa -o ref.index ./chromap -x ref.index -r test/ref.fa -1 test/read1.fq -2 test/read2.fq -o test.bed # use presets (no test data) ./chromap --preset atac -x index -r ref.fa -1 read1.fq -2 read2.fq -o aln.bed # ATAC-seq reads ./chromap --preset atac -x index -r ref.fa -1 read1.fq -2 read2.fq -o aln.bed \ -b 
barcode.fq.gz --barcode-whitelist whitelist.txt # scATAC-seq reads ./chromap --preset chip -x index -r ref.fa -1 read1.fq -2 read2.fq -o aln.bed # ChIP-seq reads ./chromap --preset hic -x index -r ref.fa -1 read1.fq -2 read2.fq -o aln.pairs # Hi-C reads and pairs output ./chromap --preset hic -x index -r ref.fa -1 read1.fq -2 read2.fq --SAM -o aln.sam # Hi-C reads and SAM output ``` ## Table of Contents - [Getting Started](#started) - [User Guide](#uguide) - [Installation](#install) - [General usage](#general) - [Use cases](#cases) - [Map ChIP-seq short reads](#map-chip) - [Map ATAC-seq/scATAC-seq short reads](#map-atac) - [Map Hi-C short reads](#map-hic) - [Summarizing mapping statistics/quality control](#atacseq-qc) - [Summary File](#summaryfile) - [Estimating FRiP](#estfrip) - [Features to assist in doublet detection](#doublet) - [Getting help](#help) - [Citing Chromap](#cite) ## User Guide Chromap is an ultrafast method for aligning and preprocessing high throughput chromatin profiles. Typical use cases include: (1) trimming sequencing adapters, mapping bulk ATAC-seq or ChIP-seq genomic reads to the human genome and removing duplicates; (2) trimming sequencing adapters, mapping single cell ATAC-seq genomic reads to the human genome, correcting barcodes, removing duplicates and performing Tn5 shift; (3) split alignment of Hi-C reads against a reference genome. In all these three cases, Chromap is 10-20 times faster while being accurate. ### Installation To compile from the source, you need to have the GCC compiler with version>=7.3.0, GNU make and zlib development files installed. Then type `make` in the source code directory to compile. Chromap is also available on [bioconda][bioconda]. 
Thus you can easily install Chromap with Conda ```sh conda install -c conda-forge -c bioconda chromap ``` ### General usage Before mapping, an index of the reference needs to be created and saved on the disk: ```sh chromap -i -r ref.fa -o index ``` The users can input the min fragment length expected in their sequencing experiments, e.g. read length, by **--min-frag-length**. Then Chromap will choose a proper k-mer length and window size to build the index. For the human genome, it only takes a few minutes to build the index. Without any preset parameters, Chromap takes a reference database and a query sequence file as input and produces approximate mapping, without base-level alignment in the [BED format][bed]: ```sh chromap -x index -r ref.fa -1 query.fq -o approx-mapping.bed ``` You may ask Chromap to output alignments in the [SAM format][sam]: ```sh chromap -x index -r ref.fa -1 query.fq --SAM -o alignment.sam ``` But note that the processing of SAM files is not fully optimized and can be slow. Thus generating the output in SAM format is not preferred and should be avoided when possible. Chromap can take multiple input read files: ```sh chromap -x index -r ref.fa -1 query1.fq,query2.fq,query3.fq --SAM -o alignment.sam ``` Chromap also supports wildcards in the read file names and will find all matched read files. To use this function, the read file names ***must*** be put in quotation marks: ```sh chromap -x index -r ref.fa -1 "query*.fq" --SAM -o alignment.sam ``` Chromap works with gzip'd FASTA and FASTQ formats as input. You don't need to convert between FASTA and FASTQ or decompress gzip'd files first. ***Importantly***, it should be noted that once you build the index, indexing parameters such as **-k**, **-w** and **--min-frag-length** can't be changed during mapping. If you are running Chromap for different data types, you will probably need to keep multiple indexes generated with different parameters. 
This makes Chromap different from BWA which always uses the same index regardless of query data types. Chromap can build the human genome index file in a few minutes. Detailed explanations for the options can be found at the [manpage][manpage]. ### Use cases To support different data types (e.g. ChIP-seq, Hi-C, ATAC-seq), Chromap needs to be tuned for optimal performance and accuracy. It is usually recommended to choose a preset with option **--preset**, which sets multiple parameters at the same time. #### Map ChIP-seq short reads ```sh chromap --preset chip -x index -r ref.fa -1 read1.fq.gz -2 read2.fq.gz -o aln.bed # ChIP-seq reads ``` This set of parameters is tuned for mapping ChIP-seq reads. Chromap will map the paired-end reads with max insert size up to 2000 (**-l 2000**) and then remove duplicates (**--remove-pcr-duplicates**) using the low memory mode (**--low-mem**). The output is in BED format (**--BED**). In the output BED file, each row is a mapping of a fragment (i.e., a read pair) and the columns are chrom chrom_start chrom_end N mapq strand The strand here is the strand of the first read in a read pair (specified by **-1**). If the mapping start and end locations of each read in a read pair are desired, **--TagAlign** should be used to override **--BED** in the preset parameters as follows ```sh chromap --preset chip -x index -r ref.fa -1 read1.fq.gz -2 read2.fq.gz --TagAlign -o aln.tagAlign # ChIP-seq reads ``` For each read pair, there will be two rows in the output file, one for each read in the pair respectively. The meaning of the columns remains the same. #### Map ATAC-seq/scATAC-seq short reads ```sh chromap --preset atac -x index -r ref.fa -1 read1.fq.gz -2 read2.fq.gz -o aln.bed # ATAC-seq reads chromap --preset atac -x index -r ref.fa -1 read1.fq.gz -2 read2.fq.gz -o aln.bed\ -b barcode.fq.gz --barcode-whitelist whitelist.txt # scATAC-seq reads ``` This set of parameters is tuned for mapping ATAC-seq/scATAC-seq reads. 
Chromap will trim the adapters on the 3' end (**--trim-adapters**), map the paired-end reads with max insert size up to 2000 (**-l 2000**) and then remove duplicates at cell level (**--remove-pcr-duplicates-at-cell-level**). Tn5 shift will also be applied to the fragments (**--Tn5-shift**). The forward mapping start positions are increased by 4bp and the reverse mapping end positions are decreased by 5bp. The processing is run in the low memory mode (**--low-mem**). If no barcode whitelist file is given, Chromap will skip barcode correction. When barcodes and a whitelist are given as input, by default Chromap will estimate barcode abundance and use this information to perform barcode correction with up to 1 Hamming distance from a whitelist barcode. By setting **--bc-error-threshold** to 2, Chromap is able to correct barcodes with up to 2 Hamming distance from a whitelist barcode. Users can also increase the probability threshold to make a correction by setting **--bc-probability-threshold** (set to 0.9 by default) to a large value (e.g., 0.975) to only make reliable corrections. For scATAC-seq data with multiple read and barcode files, you can use "," to concatenate multiple input files as in the example [above](#general). Chromap also supports user-defined barcode format, including mixed barcode and genomic data case. Users can specify the sequence structure through option **--read-format**. The value is a comma-separated string, and each field in the string is itself a colon-separated string [r1|r2|bc]:start:end:strand The start and end are inclusive and -1 means the end of the read. Users may use multiple fields to specify non-consecutive segments, e.g. bc:0:15,bc:32:-1. The strand is represented by the '+' and '-' symbols; if '-', the barcode will be reverse-complemented after extraction. The strand symbol can be omitted if it is '+' and is ignored on r1 and r2. 
For example, when the barcode is in the first 16bp of read1, one can use the option `-1 read1.fq.gz -2 read2.fq.gz --barcode read1.fq.gz --read-format bc:0:15,r1:16:-1`. The output file formats for bulk and single-cell data are different except for the first three columns. For bulk data, the columns are chrom chrom_start chrom_end N mapq strand duplicate_count For single-cell data, the columns are chrom chrom_start chrom_end barcode duplicate_count the same as the definition of the fragment file in [CellRanger][cellranger]. Note that chrom_end is open-end. This output fragment file can be used as input of downstream analysis tools such as [MAESTRO][MAESTRO], [ArchR][ArchR], [signac][signac] and etc. Besides, Chromap can translate input cell barcodes to another set of barcodes. Users can specify the translation file through the option **--barcode-translate**. The translation file is a two-column tsv/csv file with the translated barcode on the first column and the original barcode on the second column. This is useful for 10x Multiome data, where scATAC-seq and scRNA-seq data use different sets of barcodes. This option also supports combinatorial barcoding, such as SHARE-seq. Chromap can translate each barcode segment provided in the second column to the ID in the first column and add "-" to concatenate the IDs in the output. #### Map Hi-C short reads ```sh chromap --preset hic -x index -r ref.fa -1 read1.fa -2 read2.fa -o aln.pairs # Hi-C reads and pairs output ``` Chromap will perform split alignment (**--split-alignment**) on Hi-C reads and output mappings in [pairs][pairs] format (**--pairs**), which is used in [4DN Hi-C data processing pipeline][4DN]. Some Hi-C data analysis pipelines may require the reads are sorted in specific chromosome order other than the one in the index. Therefore, Chromap provides the option **--chr-order** to specify the alignment order, and **--pairs-natural-chr-order** for flipping the pair in the pairs format. 
### Summarizing mapping statistics/quality control Chromap allows you to summarize the dataset's mapping statistics as well as quality metrics at either a *bulk* or *single cell* level. To enable this feature, users can specify a file path using this option, **--summary [FILE]**, where a csv file will be saved. This summary file will output a series of metrics for each barcode (or the overall dataset if it is bulk). Here are the different columns contained within the summary file: ```sh barcode,total,duplicate,unmapped,lowmapq,cachehit,fric,estfrip,numcacheslots ``` - `barcode` - Barcode label for cell - `total` - Total number of fragments - `duplicate` - Number of duplicate fragments - `unmapped` - Number of unmapped fragments - `lowmapq` - Number of fragments with a low MAPQ - `cachehit` - Number of fragments that were found in the chromap cache during alignment - `fric` - Fraction of fragments in the chromap cache - `estfrip` - Estimated FRiP value based on a linear model ([See below for more details](#estfrip)) - `numcacheslots` - Number of unique associated cache slots for this barcode (Relevant feature for doublet detection, [see below for more](#doublet)) The summary contains metrics relevant to the mappability of fragments from each barcode. However, it also contains metrics (`estfrip` and `numcacheslots`) relevant to quality control for chromatin profiling assays like scATAC-seq. These cache-related metrics require overall deep sequencing depth, so it is more useful for single-cell data. The next two sections briefly describe these two metrics and how they can be useful for users. #### Estimating FRiP The `estfrip` column in Chromap's summary file represents an estimate of the FRiP score (Fraction of Reads in Peak Regions) computed by Chromap. Chromap uses a simple multi-variate linear model to estimate the FRiP for each barcode and the features used in this model are `fric`, `duplicate`, `unmapped` and `lowmapq`. 
Typically, the FRiP score is used to assess the quality of chromatin profiles, where typically the higher the FRiP score the better. For users, this `estfrip` can be used to quickly gauge the quality of the data by plotting all the values in a histogram and looking to see if you see a multi-modal distribution. In addition, when combining Chromap with downstream analysis tools such as [SnapATAC2](https://github.com/kaizhang/SnapATAC2) that perform clustering, the `estfrip` can be used to quickly identify any specific clusters that are lower quality than the rest. **An important note to users**, the `estfrip` values for every barcode should not be taken by themselves and used as the true FRiP score. These estimates are mainly intended to be used for quality control at a dataset level where we compare different `estfrip` values to each other. #### Features to assist in doublet detection The `numcacheslots` column in Chromap's summary file estimates the number of unique cache slots queried for each barcode during the alignment. This feature can be useful in assisting users for doublet detection/filtering. Typically for doublet detection in single-cell datasets, a simple and naive metric used to identify potential doublets is the number of fragments in cells (i.e. more reads, more likely a doublet). Chromap uses the simple intuition that barcodes with a higher number of peaks than usual could be doublets. The number of unique cache slots that are queried can be seen as a proxy for the number of peaks. In our experiments, using `numcacheslots` yields a larger AUC compared to using `total` for binary classification of doublets. Therefore, users can potentially use this metric as an additional check/feature along with other doublet-detection specific methods. ### Getting help Detailed description of Chromap command line options and optional tags can be displayed by running Chromap with **-h** or be found at the [manpage][manpage]. 
If you encounter bugs or have further questions or requests, you can raise an issue at the [issue page][issue]. ### Citing Chromap If you use Chromap, please cite: > Zhang, H., Song, L., Wang, X., Cheng, H., Wang, C., Meyer, C. A., ..., Liu, X. S., Li, H. (2021). Fast alignment and preprocessing of chromatin profiles with Chromap. Nature communications, 12(1), 1-6. > https://doi.org/10.1038/s41467-021-26865-w The summary file for QC is described in the manuscript: > Ahmed, O., Zhang, H., Langmead, B., Song, L. (2025). Quality control of single-cell ATAC-seq data without peak calling using Chromap. Biorxiv. > https://doi.org/10.1101/2025.07.15.664951 [bed]: https://genome.ucsc.edu/FAQ/FAQformat.html#format1 [paf]: https://github.com/lh3/miniasm/blob/master/PAF.md [sam]: https://samtools.github.io/hts-specs/SAMv1.pdf [pairs]: https://github.com/4dn-dcic/pairix/blob/master/pairs_format_specification.md [4DN]: https://data.4dnucleome.org/resources/data-analysis/hi_c-processing-pipeline [minimap]: https://github.com/lh3/minimap [release]: https://github.com/haowenz/chromap/releases [issue]: https://github.com/haowenz/chromap/issues [cellranger]: https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/output/fragments [manpage]: https://haowenz.github.io/chromap/chromap.html [bioconda]: https://anaconda.org/bioconda/chromap [ArchR]: https://www.archrproject.com/index.html [MAESTRO]: https://github.com/liulab-dfci/MAESTRO [signac]: https://satijalab.org/signac/articles/pbmc_vignette.html ================================================ FILE: chromap.1 ================================================ .TH chromap 1 "25 Jan 2024" "chromap-0.2.6 (r490)" "Bioinformatics tools" .SH NAME .PP chromap - fast alignment and preprocessing of chromatin profiles .SH SYNOPSIS * Indexing the reference genome: .RS 4 chromap .B -i .RB [ -k .IR kmer ] .RB [ -w .IR miniWinSize ] .B -r .I ref.fa .B -o .I ref.index .RE * Mapping (sc)ATAC-seq reads: .RS 4 chromap .B --preset 
.I atac .B -r .I ref.fa .B -x .I ref.index .B -1 .I read1.fq .B -2 .I read2.fq .B -o .I aln.bed .RB [ -b .IR barcode.fq.gz ] .RB [ --barcode-whitelist .IR whitelist.txt ] .RE * Mapping ChIP-seq reads: .RS 4 chromap .B --preset .I chip .B -r .I ref.fa .B -x .I ref.index .B -1 .I read1.fq .B -2 .I read2.fq .B -o .I aln.bed .RE * Mapping Hi-C reads: .RS 4 chromap .B --preset .I hic .B -r .I ref.fa .B -x .I ref.index .B -1 .I read1.fq .B -2 .I read2.fq .B -o .I aln.pairs .br chromap .B --preset .I hic .B -r .I ref.fa .B -x .I ref.index .B -1 .I read1.fq .B -2 .I read2.fq .B --SAM .B -o .I aln.sam .RE .SH DESCRIPTION .PP Chromap is an ultrafast method for aligning and preprocessing high throughput chromatin profiles. Typical use cases include: (1) trimming sequencing adapters, mapping bulk ATAC-seq or ChIP-seq genomic reads to the human genome and removing duplicates; (2) trimming sequencing adapters, mapping single cell ATAC-seq genomic reads to the human genome, correcting barcodes, removing duplicates and performing Tn5 shift; (3) split alignment of Hi-C reads against a reference genome. In all these three cases, Chromap is 10-20 times faster while being accurate. .SH OPTIONS .SS Indexing options .TP 10 .BI -k \ INT Minimizer k-mer length [17]. .TP .BI -w \ INT Minimizer window size [7]. A minimizer is the smallest k-mer in a window of w consecutive k-mers. .TP .B --min-frag-length Min fragment length for choosing k and w automatically [30]. Users can increase this value when the min length of the fragments of interest is long, which can increase the mapping speed. Note that the default value 30 is the min fragment length that chromap can map. .SS Mapping options .TP 10 .BI --split-alignment Allow split alignments. This option should be set only when mapping Hi-C reads. .TP .BI -e \ INT Max edit distance allowed to map a read [8]. .TP .BI -s \ INT Min number of minimizers required to map a read [2]. 
.TP .BI -f \ INT1 [, INT2 ] Ignore minimizers occuring more than .I INT1 [500] times. .I INT2 [1000] is the threshold for a second round of seeding. .TP .BI -l \ INT Max insert size, only for paired-end read mapping [1000]. .TP .BI -q \ INT Min MAPQ in range [0, 60] for mappings to be output [30]. .TP .BI --min-read-length \ INT Skip mapping the reads of length less than .I INT [30]. Note that this is different from the index option .BR --min-frag-length , which set .BR -k and .BR -w for indexing the genome. .TP .BI --trim-adapters Try to trim adapters on 3'. This only works for paired-end reads. When the fragment length indicated by the read pair is less than the length of the reads, the two mates are overlapped with each other. Then the regions outside the overlap are regarded as adapters and trimmed. .TP .BI --remove-pcr-duplicates Remove PCR duplicates. .TP .BI --remove-pcr-duplicates-at-bulk-level Remove PCR duplicates at bulk level for single cell data. .TP .BI --remove-pcr-duplicates-at-cell-level Remove PCR duplicates at cell level for single cell data. .TP .BI --Tn5-shift Perform Tn5 shift. When this option is turned on, the forward mapping start positions are increased by 4bp and the reverse mapping end positions are decreased by 5bp. Note that this works only when .BR --SAM is NOT set. .TP .BI --low-mem Use low memory mode. When this option is set, multiple temporary intermediate mapping files might be generated on disk and they are merged at the end of processing to reduce memory usage. When this is NOT set, all the mapping results are kept in the memory before they are saved on disk, which works more efficiently for datasets that are not too large. .TP .BI --bc-error-threshold \ INT Max Hamming distance allowed to correct a barcode [1]. Note that the max supported threshold is 2. .TP .BI --bc-probability-threshold \ FLT Min probability to correct a barcode [0.9]. 
When there are multiple whitelisted barcodes with the same Hamming distance to the barcode to correct, chromap will process the base quality of the mismatched bases, and compute a probability that the correction is right. .TP .BI -t \ INT The number of threads for mapping [1]. .SS Input options .TP 10 .BI -r \ FILE Reference file. .TP .BI -x \ FILE Index file. .TP .BI -1 \ FILE Single-end read files or paired-end read files 1. Chromap supports mulitple input files concatenate by ",". For example, setting this option to "Library1_R1.fastq.gz,Library2_R1.fastq.gz,Library3_R1.fastq.gz" will make all three files as input and map them in this order. Similarly, .BR -2 and .BR -b also support multiple input files. And the ordering of the input files for all the three options should match. .TP .BI -2 \ FILE Paired-end read files 2. .TP .BI -b \ FILE Cell barcode files. .TP .BI --barcode-whitelist \ FILE Cell barcode whitelist file. This is supposed to be a txt file where each line is a whitelisted barcode. .TP .BI --read-format \ STR Format for read files and barcode files ["r1:0:-1,bc:0:-1"] as 10x Genomics single-end format. .SS Output options .TP 10 .BR -o \ FILE Output file. .TP .BR --output-mappings-not-in-whitelist Output mappings with barcode not in the whitelist. .TP .BR --chr-order \ FILE Custom chromosome order file. If not specified, the order of reference sequences will be used. .TP .BR --BED Output mappings in BED/BEDPE format. Note that only one of the formats should be set. .TP .BR --TagAlign Output mappings in TagAlign/PairedTagAlign format. .TP .BR --SAM Output mappings in SAM format. .TP .BR --pairs Output mappings in pairs format (defined by 4DN for HiC data). .TP .BR --pairs-natural-chr-order \ FILE Custom chromosome order file for pairs flipping. If not specified, the custom chromosome order will be used. .TP .BR --barcode-translate \ FILE Convert input barcodes to another set of barcodes in the output. 
.TP .BR --summary \ FILE Summarize the mapping statistics at bulk or barcode level. .TP .B -v Print version number to stdout. .SS Preset options .TP 10 .BI --preset \ STR Preset []. This option applies multiple options at the same time. It should be applied before other options because options applied later will overwrite the values set by .BR --preset . Available .I STR are: .RS .TP 10 .B chip Mapping ChIP-seq reads .RB ( -l .I 2000 .B --remove-pcr-duplicates --low-mem .BR --BED ). .TP .B atac Mapping ATAC-seq/scATAC-seq reads .RB ( -l .I 2000 .B --remove-pcr-duplicates --low-mem --trim-adapters --Tn5-shift .B --remove-pcr-duplicates-at-cell-level .BR --BED ). .TP .B hic Mapping Hi-C reads .RB ( -e .I 4 .B -q .I 1 .B --low-mem --split-alignment .BR --pairs ). ================================================ FILE: docs/_config.yml ================================================ theme: jekyll-theme-modernist ================================================ FILE: docs/chromap.html ================================================ chromap

chromap

NAME
SYNOPSIS
DESCRIPTION
OPTIONS

NAME

chromap - fast alignment and preprocessing of chromatin profiles

SYNOPSIS

* Indexing the reference genome:

chromap -i [-k kmer] [-w miniWinSize] -r ref.fa -o ref.index

* Mapping (sc)ATAC-seq reads:

chromap --preset atac -r ref.fa -x ref.index -1 read1.fq -2 read2.fq -o aln.bed [-b barcode.fq.gz] [--barcode-whitelist whitelist.txt]

* Mapping ChIP-seq reads:

chromap --preset chip -r ref.fa -x ref.index -1 read1.fq -2 read2.fq -o aln.bed

* Mapping Hi-C reads:

chromap --preset hic -r ref.fa -x ref.index -1 read1.fq -2 read2.fq -o aln.pairs
chromap --preset hic -r ref.fa -x ref.index -1 read1.fq -2 read2.fq --SAM -o aln.sam

DESCRIPTION

Chromap is an ultrafast method for aligning and preprocessing high throughput chromatin profiles. Typical use cases include: (1) trimming sequencing adapters, mapping bulk ATAC-seq or ChIP-seq genomic reads to the human genome and removing duplicates; (2) trimming sequencing adapters, mapping single cell ATAC-seq genomic reads to the human genome, correcting barcodes, removing duplicates and performing Tn5 shift; (3) split alignment of Hi-C reads against a reference genome. In all these three cases, Chromap is 10-20 times faster while being accurate.

OPTIONS

Indexing options

-k INT

Minimizer k-mer length [17].

-w INT

Minimizer window size [7]. A minimizer is the smallest k-mer in a window of w consecutive k-mers.

--min-frag-length

Min fragment length for choosing k and w automatically [30]. Users can increase this value when the min length of the fragments of interest is long, which can increase the mapping speed. Note that the default value 30 is the min fragment length that chromap can map.

Mapping options
--split-alignment

Allow split alignments. This option should be set only when mapping Hi-C reads.

-e INT

Max edit distance allowed to map a read [8].

-s INT

Min number of minimizers required to map a read [2].

-f INT1[,INT2]

Ignore minimizers occurring more than INT1 [500] times. INT2 [1000] is the threshold for a second round of seeding.

-l INT

Max insert size, only for paired-end read mapping [1000].

-q INT

Min MAPQ in range [0, 60] for mappings to be output [30].

--min-read-length INT

Skip mapping the reads of length less than INT [30]. Note that this is different from the index option --min-frag-length, which sets -k and -w for indexing the genome.

--trim-adapters

Try to trim adapters on the 3’ end. This only works for paired-end reads. When the fragment length indicated by the read pair is less than the length of the reads, the two mates overlap with each other. Then the regions outside the overlap are regarded as adapters and trimmed.

--remove-pcr-duplicates

Remove PCR duplicates.

--remove-pcr-duplicates-at-bulk-level

Remove PCR duplicates at bulk level for single cell data.

--remove-pcr-duplicates-at-cell-level

Remove PCR duplicates at cell level for single cell data.

--Tn5-shift

Perform Tn5 shift. When this option is turned on, the forward mapping start positions are increased by 4bp and the reverse mapping end positions are decreased by 5bp. Note that this works only when --SAM is NOT set.

--low-mem

Use low memory mode. When this option is set, multiple temporary intermediate mapping files might be generated on disk and they are merged at the end of processing to reduce memory usage. When this is NOT set, all the mapping results are kept in the memory before they are saved on disk, which works more efficiently for datasets that are not too large.

--bc-error-threshold INT

Max Hamming distance allowed to correct a barcode [1]. Note that the max supported threshold is 2.

--bc-probability-threshold FLT

Min probability to correct a barcode [0.9]. When there are multiple whitelisted barcodes with the same Hamming distance to the barcode to correct, chromap will process the base quality of the mismatched bases, and compute a probability that the correction is right.

-t INT

The number of threads for mapping [1].

Input options

-r FILE

Reference file.

-x FILE

Index file.

-1 FILE

Single-end read files or paired-end read files 1. Chromap supports multiple input files concatenated by ",". For example, setting this option to "read11.fq,read12.fq,read13.fq" will use all three files as input and map them in this order. Similarly, -2 and -b also support multiple input files, and the ordering of the input files for all three options should match.

-2 FILE

Paired-end read files 2.

-b FILE

Cell barcode files.

--barcode-whitelist FILE

Cell barcode whitelist file. This is supposed to be a txt file where each line is a whitelisted barcode.

--read-format STR

Format for read files and barcode files ["r1:0:-1,bc:0:-1"]. The default corresponds to the 10x Genomics single-end format.

Output options

-o FILE

Output file.

--output-mappings-not-in-whitelist

Output mappings with barcode not in the whitelist.

--chr-order FILE

Customized chromosome order.

--BED

Output mappings in BED/BEDPE format. Note that only one of the output format options (--BED, --TagAlign, --SAM, --pairs) should be set.

--TagAlign

Output mappings in TagAlign/PairedTagAlign format.

--SAM

Output mappings in SAM format.

--pairs

Output mappings in pairs format (defined by 4DN for HiC data).

--pairs-natural-chr-order FILE

Natural chromosome order for pairs flipping.

-v

Print version number to stdout.

Preset options
--preset STR

Preset []. This option applies multiple options at the same time. It should be applied before other options because options applied later will overwrite the values set by --preset. Available STR are:

chip

Mapping ChIP-seq reads (-l 2000 --remove-pcr-duplicates --low-mem --BED).

atac

Mapping ATAC-seq/scATAC-seq reads (-l 2000 --remove-pcr-duplicates --low-mem --trim-adapters --Tn5-shift --remove-pcr-duplicates-at-cell-level --BED).

hic

Mapping Hi-C reads (-e 4 -q 1 --low-mem --split-alignment --pairs).


================================================ FILE: docs/index.md ================================================ ## Getting help * [README][doc]: general documentation * [Manpage](chromap.html): explanation of command-line options * [Preprint][biorxiv]: free of charge preprint that describes the method * [GitHub Issues page][issue]: report bugs, request features and ask questions ## Acquiring Chromap * `git clone https://github.com/haowenz/chromap.git` * [GitHub Release page][release]: versioned packages * Also [available from BioConda][bioconda] [doc]: https://github.com/haowenz/chromap/blob/master/README.md [biorxiv]: https://www.biorxiv.org/content/10.1101/2021.06.18.448995v1 [bioconda]: https://anaconda.org/bioconda/chromap [release]: https://github.com/haowenz/chromap/releases [issue]: https://github.com/haowenz/chromap/issues ================================================ FILE: src/alignment.cc ================================================ #include "alignment.h" #include namespace chromap { int GetLongestMatchLength(const char *pattern, const char *text, const int read_length) { int max_match = 0; int tmp = 0; for (int i = 0; i < read_length; ++i) { if (CharToUint8(pattern[i]) == CharToUint8(text[i])) { ++tmp; } else if (tmp > max_match) { max_match = tmp; } } if (tmp > max_match) { max_match = tmp; } return max_match; } int AdjustGapBeginning(const Strand mapping_strand, const char *ref, const char *read, int *gap_beginning, int read_end, int ref_start_position, int ref_end_position, int *n_cigar, uint32_t **cigar) { int i, j; if (mapping_strand == kPositive) { if (*gap_beginning <= 0) { return ref_start_position; } // printf("%d\n", *gap_beginning); for (i = *gap_beginning - 1, j = ref_start_position - 1; i >= 0 && j >= 0; --i, --j) { // printf("%c %c\n", read[i], ref[j]); if (read[i] != ref[j] && read[i] != ref[j] - 'a' + 'A') { break; } } *gap_beginning = i + 1; // TODO: add soft clip in cigar if (n_cigar && *n_cigar > 0) { if (((*cigar)[0] & 
0xf) == BAM_CMATCH) { (*cigar)[0] += (ref_start_position - 1 - j) << 4; } } return j + 1; } if (*gap_beginning <= 0) { return ref_end_position; } // printf("%d\n", *gap_beginning); /*char *tmp = new char[255] ; strncpy(tmp, ref + ref_start_position, ref_end_position - ref_start_position + 1 + 10) ; printf("%s %d. %d %d\n", tmp, strlen(tmp), ref_end_position - ref_start_position + 1 + 10, strlen(ref)) ; delete[] tmp;*/ for (i = read_end + 1, j = ref_end_position + 1; read[i] && ref[j]; ++i, ++j) { // printf("%c %c %c %c %c %c\n", read[i], ref[j - 1], ref[j], ref[j + 1], // ref[j + 2], ref[j + 3]); if (read[i] != ref[j] && read[i] != ref[j] - 'a' + 'A') { break; } } *gap_beginning = *gap_beginning + i - (read_end + 1); if (n_cigar && *n_cigar > 0) { if (((*cigar)[*n_cigar - 1] & 0xf) == BAM_CMATCH) { (*cigar)[*n_cigar - 1] += (j - (ref_end_position + 1)) << 4; } } return j - 1; } void GenerateNMAndMDTag(const char *pattern, const char *text, int mapping_start_position, MappingInMemory &mapping_in_memory) { const char *read = text; const char *reference = pattern + mapping_start_position; const uint32_t *cigar = mapping_in_memory.cigar; const int n_cigar = mapping_in_memory.n_cigar; mapping_in_memory.NM = 0; mapping_in_memory.MD_tag.clear(); int num_matches = 0; int read_position = 0; int reference_position = 0; for (int ci = 0; ci < n_cigar; ++ci) { uint32_t current_cigar_uint = cigar[ci]; uint8_t cigar_operation = bam_cigar_op(current_cigar_uint); int num_cigar_operations = bam_cigar_oplen(current_cigar_uint); if (cigar_operation == BAM_CMATCH) { for (int opi = 0; opi < num_cigar_operations; ++opi) { if (reference[reference_position] == read[read_position] || reference[reference_position] - 'a' + 'A' == read[read_position]) { // a match ++num_matches; } else { // a mismatch ++mapping_in_memory.NM; mapping_in_memory.MD_tag.append(std::to_string(num_matches)); num_matches = 0; mapping_in_memory.MD_tag.push_back(reference[reference_position]); } ++reference_position; 
++read_position; } } else if (cigar_operation == BAM_CINS) { mapping_in_memory.NM += num_cigar_operations; read_position += num_cigar_operations; } else if (cigar_operation == BAM_CDEL) { mapping_in_memory.NM += num_cigar_operations; mapping_in_memory.MD_tag.append(std::to_string(num_matches)); num_matches = 0; mapping_in_memory.MD_tag.push_back('^'); for (int opi = 0; opi < num_cigar_operations; ++opi) { mapping_in_memory.MD_tag.push_back(reference[reference_position]); ++reference_position; } } else { std::cerr << "Unexpected cigar op: " << (int)cigar_operation << "\n"; } } mapping_in_memory.MD_tag.append(std::to_string(num_matches)); } int BandedAlignPatternToText(int error_threshold, const char *pattern, const char *text, const int read_length, int *mapping_end_position) { uint32_t Peq[5] = {0, 0, 0, 0, 0}; for (int i = 0; i < 2 * error_threshold; i++) { uint8_t base = CharToUint8(pattern[i]); Peq[base] = Peq[base] | (1 << i); } uint32_t highest_bit_in_band_mask = 1 << (2 * error_threshold); uint32_t lowest_bit_in_band_mask = 1; uint32_t VP = 0; uint32_t VN = 0; uint32_t X = 0; uint32_t D0 = 0; uint32_t HN = 0; uint32_t HP = 0; int num_errors_at_band_start_position = 0; for (int i = 0; i < read_length; i++) { uint8_t pattern_base = CharToUint8(pattern[i + 2 * error_threshold]); Peq[pattern_base] = Peq[pattern_base] | highest_bit_in_band_mask; X = Peq[CharToUint8(text[i])] | VN; D0 = ((VP + (X & VP)) ^ VP) | X; HN = VP & D0; HP = VN | ~(VP | D0); X = D0 >> 1; VN = X & HP; VP = HN | ~(X | HP); num_errors_at_band_start_position += 1 - (D0 & lowest_bit_in_band_mask); if (num_errors_at_band_start_position > 3 * error_threshold) { return error_threshold + 1; } for (int ai = 0; ai < 5; ai++) { Peq[ai] >>= 1; } } int band_start_position = read_length - 1; int min_num_errors = num_errors_at_band_start_position; *mapping_end_position = band_start_position; for (int i = 0; i < 2 * error_threshold; i++) { num_errors_at_band_start_position = 
num_errors_at_band_start_position + ((VP >> i) & (uint32_t)1); num_errors_at_band_start_position = num_errors_at_band_start_position - ((VN >> i) & (uint32_t)1); if (num_errors_at_band_start_position < min_num_errors || (num_errors_at_band_start_position == min_num_errors && i + 1 == error_threshold)) { min_num_errors = num_errors_at_band_start_position; *mapping_end_position = band_start_position + 1 + i; } } return min_num_errors; } // Return negative number if the termination are deemed at the beginning of the // read mappping_end_position is relative to pattern (reference) // read_mapping_length is for text (read) int BandedAlignPatternToTextWithDropOff(int error_threshold, const char *pattern, const char *text, const int read_length, int *mapping_end_position, int *read_mapping_length) { uint32_t Peq[5] = {0, 0, 0, 0, 0}; for (int i = 0; i < 2 * error_threshold; i++) { uint8_t base = CharToUint8(pattern[i]); Peq[base] = Peq[base] | (1 << i); } uint32_t highest_bit_in_band_mask = 1 << (2 * error_threshold); uint32_t lowest_bit_in_band_mask = 1; uint32_t VP = 0; uint32_t VN = 0; uint32_t X = 0; uint32_t D0 = 0; uint32_t HN = 0; uint32_t HP = 0; uint32_t prev_VP = 0; uint32_t prev_VN = 0; int num_errors_at_band_start_position = 0; int i = 0; int fail_beginning = 0; // the alignment failed at the beginning part int prev_num_errors_at_band_start_position = 0; for (; i < read_length; i++) { uint8_t pattern_base = CharToUint8(pattern[i + 2 * error_threshold]); Peq[pattern_base] = Peq[pattern_base] | highest_bit_in_band_mask; X = Peq[CharToUint8(text[i])] | VN; D0 = ((VP + (X & VP)) ^ VP) | X; HN = VP & D0; HP = VN | ~(VP | D0); X = D0 >> 1; prev_VN = VN; prev_VP = VP; VN = X & HP; VP = HN | ~(X | HP); prev_num_errors_at_band_start_position = num_errors_at_band_start_position; num_errors_at_band_start_position += 1 - (D0 & lowest_bit_in_band_mask); if (num_errors_at_band_start_position > 2 * error_threshold) { // return error_threshold + 1; // the min error in this 
band could be still less than the // error_threshold, and could but this should be fine since it does not // affect the 5' end of the read. if (i < 4 * error_threshold && i < read_length / 2) { fail_beginning = 1; } break; } for (int ai = 0; ai < 5; ai++) { Peq[ai] >>= 1; } } /*char tmp[255] ; strncpy(tmp, pattern, read_length + 2 * error_threshold); printf("%s\n%s\n", tmp, text); printf("%d\n", i) ; fflush(stdout);*/ if (i < read_length) { num_errors_at_band_start_position = prev_num_errors_at_band_start_position; VN = prev_VN; VP = prev_VP; } int band_start_position = i - 1; int min_num_errors = num_errors_at_band_start_position; *read_mapping_length = i; *mapping_end_position = band_start_position; for (i = 0; i < 2 * error_threshold; i++) { num_errors_at_band_start_position = num_errors_at_band_start_position + ((VP >> i) & (uint32_t)1); num_errors_at_band_start_position = num_errors_at_band_start_position - ((VN >> i) & (uint32_t)1); if (num_errors_at_band_start_position < min_num_errors || (num_errors_at_band_start_position == min_num_errors && i + 1 == error_threshold)) { min_num_errors = num_errors_at_band_start_position; *mapping_end_position = band_start_position + 1 + i; } } if (fail_beginning || (read_length > 60 && *mapping_end_position + 1 - error_threshold - min_num_errors < 30)) { *mapping_end_position = -*mapping_end_position; } return min_num_errors; } int BandedAlignPatternToTextWithDropOffFrom3End(int error_threshold, const char *pattern, const char *text, const int read_length, int *mapping_end_position, int *read_mapping_length) { uint32_t Peq[5] = {0, 0, 0, 0, 0}; for (int i = 0; i < 2 * error_threshold; i++) { uint8_t base = CharToUint8(pattern[read_length + 2 * error_threshold - 1 - i]); Peq[base] = Peq[base] | (1 << i); } uint32_t highest_bit_in_band_mask = 1 << (2 * error_threshold); uint32_t lowest_bit_in_band_mask = 1; uint32_t VP = 0; uint32_t VN = 0; uint32_t X = 0; uint32_t D0 = 0; uint32_t HN = 0; uint32_t HP = 0; uint32_t prev_VP = 
0; uint32_t prev_VN = 0; int num_errors_at_band_start_position = 0; int i = 0; int fail_beginning = 0; // the alignment failed at the beginning part int prev_num_errors_at_band_start_position = 0; for (; i < read_length; i++) { // printf("%c %c %d\n", pattern[read_length - 1 - i], pattern[read_length - // 1 - i + error_threshold], text[read_length - 1 - i]); uint8_t pattern_base = CharToUint8(pattern[read_length - 1 - i]); Peq[pattern_base] = Peq[pattern_base] | highest_bit_in_band_mask; X = Peq[CharToUint8(text[read_length - 1 - i])] | VN; D0 = ((VP + (X & VP)) ^ VP) | X; HN = VP & D0; HP = VN | ~(VP | D0); X = D0 >> 1; prev_VN = VN; prev_VP = VP; VN = X & HP; VP = HN | ~(X | HP); prev_num_errors_at_band_start_position = num_errors_at_band_start_position; num_errors_at_band_start_position += 1 - (D0 & lowest_bit_in_band_mask); /*printf("->%d %d %c %c", i, num_errors_at_band_start_position, pattern[read_length - 1 - i], text[read_length - 1 - i]) ; int tmp = num_errors_at_band_start_position; for (int j = 0; j < 2 * error_threshold; j++) { tmp = tmp + ((VP >> j) & (uint32_t) 1); tmp = tmp - ((VN >> j) & (uint32_t) 1); printf(" %d", tmp); } printf("\n");*/ if (num_errors_at_band_start_position > 2 * error_threshold) { // return error_threshold + 1; if (i < 4 * error_threshold && i < read_length / 2) { fail_beginning = 1; } break; } for (int ai = 0; ai < 5; ai++) { Peq[ai] >>= 1; } } // printf("li %d: %d %d %d\n", fail_beginning, i, error_threshold, // read_length); if (i < read_length) { num_errors_at_band_start_position = prev_num_errors_at_band_start_position; VN = prev_VN; VP = prev_VP; } int band_start_position = i - 1; int min_num_errors = num_errors_at_band_start_position; *read_mapping_length = i; *mapping_end_position = band_start_position; // printf("-1: %d\n", num_errors_at_band_start_position); for (i = 0; i < 2 * error_threshold; i++) { num_errors_at_band_start_position = num_errors_at_band_start_position + ((VP >> i) & (uint32_t)1); 
num_errors_at_band_start_position = num_errors_at_band_start_position - ((VN >> i) & (uint32_t)1); // printf("%d: %d\n", i, num_errors_at_band_start_position); if (num_errors_at_band_start_position < min_num_errors || (num_errors_at_band_start_position == min_num_errors && i + 1 == error_threshold)) { min_num_errors = num_errors_at_band_start_position; *mapping_end_position = band_start_position + (1 + i); } } if (fail_beginning || (read_length > 60 && *mapping_end_position + 1 - error_threshold - min_num_errors < 30)) { *mapping_end_position = -*mapping_end_position; } return min_num_errors; } void BandedAlign4PatternsToText(int error_threshold, const char **patterns, const char *text, int read_length, int32_t *mapping_edit_distances, int32_t *mapping_end_positions) { int ALPHABET_SIZE = 5; const char *reference_sequence0 = patterns[0]; const char *reference_sequence1 = patterns[1]; const char *reference_sequence2 = patterns[2]; const char *reference_sequence3 = patterns[3]; uint32_t highest_bit_in_band_mask = 1 << (2 * error_threshold); __m128i highest_bit_in_band_mask_vpu0 = _mm_set_epi32(0, 0, 0, highest_bit_in_band_mask); __m128i highest_bit_in_band_mask_vpu1 = _mm_set_epi32(0, 0, highest_bit_in_band_mask, 0); __m128i highest_bit_in_band_mask_vpu2 = _mm_set_epi32(0, highest_bit_in_band_mask, 0, 0); __m128i highest_bit_in_band_mask_vpu3 = _mm_set_epi32(highest_bit_in_band_mask, 0, 0, 0); // Init Peq __m128i Peq[ALPHABET_SIZE]; for (int ai = 0; ai < ALPHABET_SIZE; ai++) { Peq[ai] = _mm_setzero_si128(); } for (int i = 0; i < 2 * error_threshold; i++) { uint8_t base0 = CharToUint8(reference_sequence0[i]); uint8_t base1 = CharToUint8(reference_sequence1[i]); uint8_t base2 = CharToUint8(reference_sequence2[i]); uint8_t base3 = CharToUint8(reference_sequence3[i]); Peq[base0] = _mm_or_si128(highest_bit_in_band_mask_vpu0, Peq[base0]); Peq[base1] = _mm_or_si128(highest_bit_in_band_mask_vpu1, Peq[base1]); Peq[base2] = _mm_or_si128(highest_bit_in_band_mask_vpu2, 
Peq[base2]); Peq[base3] = _mm_or_si128(highest_bit_in_band_mask_vpu3, Peq[base3]); for (int ai = 0; ai < ALPHABET_SIZE; ai++) { Peq[ai] = _mm_srli_epi32(Peq[ai], 1); } } uint32_t lowest_bit_in_band_mask = 1; __m128i lowest_bit_in_band_mask_vpu = _mm_set1_epi32(lowest_bit_in_band_mask); __m128i VP = _mm_setzero_si128(); __m128i VN = _mm_setzero_si128(); __m128i X = _mm_setzero_si128(); __m128i D0 = _mm_setzero_si128(); __m128i HN = _mm_setzero_si128(); __m128i HP = _mm_setzero_si128(); __m128i max_mask_vpu = _mm_set1_epi32(0xffffffff); __m128i num_errors_at_band_start_position_vpu = _mm_setzero_si128(); __m128i early_stop_threshold_vpu = _mm_set1_epi32(error_threshold * 3); for (int i = 0; i < read_length; i++) { uint8_t base0 = CharToUint8(reference_sequence0[i + 2 * error_threshold]); uint8_t base1 = CharToUint8(reference_sequence1[i + 2 * error_threshold]); uint8_t base2 = CharToUint8(reference_sequence2[i + 2 * error_threshold]); uint8_t base3 = CharToUint8(reference_sequence3[i + 2 * error_threshold]); Peq[base0] = _mm_or_si128(highest_bit_in_band_mask_vpu0, Peq[base0]); Peq[base1] = _mm_or_si128(highest_bit_in_band_mask_vpu1, Peq[base1]); Peq[base2] = _mm_or_si128(highest_bit_in_band_mask_vpu2, Peq[base2]); Peq[base3] = _mm_or_si128(highest_bit_in_band_mask_vpu3, Peq[base3]); X = _mm_or_si128(Peq[CharToUint8(text[i])], VN); D0 = _mm_and_si128(X, VP); D0 = _mm_add_epi32(D0, VP); D0 = _mm_xor_si128(D0, VP); D0 = _mm_or_si128(D0, X); HN = _mm_and_si128(VP, D0); HP = _mm_or_si128(VP, D0); HP = _mm_xor_si128(HP, max_mask_vpu); HP = _mm_or_si128(HP, VN); X = _mm_srli_epi32(D0, 1); VN = _mm_and_si128(X, HP); VP = _mm_or_si128(X, HP); VP = _mm_xor_si128(VP, max_mask_vpu); VP = _mm_or_si128(VP, HN); __m128i E = _mm_and_si128(D0, lowest_bit_in_band_mask_vpu); E = _mm_xor_si128(E, lowest_bit_in_band_mask_vpu); num_errors_at_band_start_position_vpu = _mm_add_epi32(num_errors_at_band_start_position_vpu, E); __m128i early_stop = 
_mm_cmpgt_epi32(num_errors_at_band_start_position_vpu, early_stop_threshold_vpu); int tmp = _mm_movemask_epi8(early_stop); if (tmp == 0xffff) { _mm_store_si128((__m128i *)mapping_edit_distances, num_errors_at_band_start_position_vpu); return; } for (int ai = 0; ai < ALPHABET_SIZE; ai++) { Peq[ai] = _mm_srli_epi32(Peq[ai], 1); } } int band_start_position = read_length - 1; __m128i min_num_errors_vpu = num_errors_at_band_start_position_vpu; for (int i = 0; i < 2 * error_threshold; i++) { __m128i lowest_bit_in_VP_vpu = _mm_and_si128(VP, lowest_bit_in_band_mask_vpu); __m128i lowest_bit_in_VN_vpu = _mm_and_si128(VN, lowest_bit_in_band_mask_vpu); num_errors_at_band_start_position_vpu = _mm_add_epi32( num_errors_at_band_start_position_vpu, lowest_bit_in_VP_vpu); num_errors_at_band_start_position_vpu = _mm_sub_epi32( num_errors_at_band_start_position_vpu, lowest_bit_in_VN_vpu); __m128i mapping_end_positions_update_mask_vpu = _mm_cmplt_epi32( num_errors_at_band_start_position_vpu, min_num_errors_vpu); __m128i mapping_end_positions_update_mask_vpu1 = _mm_cmpeq_epi32( num_errors_at_band_start_position_vpu, min_num_errors_vpu); int mapping_end_positions_update_mask = _mm_movemask_epi8(mapping_end_positions_update_mask_vpu); int mapping_end_positions_update_mask1 = _mm_movemask_epi8(mapping_end_positions_update_mask_vpu1); for (int li = 0; li < 4; ++li) { if ((mapping_end_positions_update_mask & 1) == 1 || ((mapping_end_positions_update_mask1 & 1) == 1 && i + 1 == error_threshold)) { mapping_end_positions[li] = band_start_position + 1 + i; } mapping_end_positions_update_mask = mapping_end_positions_update_mask >> 4; mapping_end_positions_update_mask1 = mapping_end_positions_update_mask1 >> 4; } min_num_errors_vpu = _mm_min_epi32(min_num_errors_vpu, num_errors_at_band_start_position_vpu); VP = _mm_srli_epi32(VP, 1); VN = _mm_srli_epi32(VN, 1); } _mm_store_si128((__m128i *)mapping_edit_distances, min_num_errors_vpu); } void BandedAlign8PatternsToText(int error_threshold, const 
char **patterns, const char *text, int read_length, int16_t *mapping_edit_distances, int16_t *mapping_end_positions) { int ALPHABET_SIZE = 5; const char *reference_sequence0 = patterns[0]; const char *reference_sequence1 = patterns[1]; const char *reference_sequence2 = patterns[2]; const char *reference_sequence3 = patterns[3]; const char *reference_sequence4 = patterns[4]; const char *reference_sequence5 = patterns[5]; const char *reference_sequence6 = patterns[6]; const char *reference_sequence7 = patterns[7]; uint16_t highest_bit_in_band_mask = 1 << (2 * error_threshold); __m128i highest_bit_in_band_mask_vpu0 = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, highest_bit_in_band_mask); __m128i highest_bit_in_band_mask_vpu1 = _mm_set_epi16(0, 0, 0, 0, 0, 0, highest_bit_in_band_mask, 0); __m128i highest_bit_in_band_mask_vpu2 = _mm_set_epi16(0, 0, 0, 0, 0, highest_bit_in_band_mask, 0, 0); __m128i highest_bit_in_band_mask_vpu3 = _mm_set_epi16(0, 0, 0, 0, highest_bit_in_band_mask, 0, 0, 0); __m128i highest_bit_in_band_mask_vpu4 = _mm_set_epi16(0, 0, 0, highest_bit_in_band_mask, 0, 0, 0, 0); __m128i highest_bit_in_band_mask_vpu5 = _mm_set_epi16(0, 0, highest_bit_in_band_mask, 0, 0, 0, 0, 0); __m128i highest_bit_in_band_mask_vpu6 = _mm_set_epi16(0, highest_bit_in_band_mask, 0, 0, 0, 0, 0, 0); __m128i highest_bit_in_band_mask_vpu7 = _mm_set_epi16(highest_bit_in_band_mask, 0, 0, 0, 0, 0, 0, 0); // Init Peq __m128i Peq[ALPHABET_SIZE]; for (int ai = 0; ai < ALPHABET_SIZE; ai++) { Peq[ai] = _mm_setzero_si128(); } for (int i = 0; i < 2 * error_threshold; i++) { uint8_t base0 = CharToUint8(reference_sequence0[i]); uint8_t base1 = CharToUint8(reference_sequence1[i]); uint8_t base2 = CharToUint8(reference_sequence2[i]); uint8_t base3 = CharToUint8(reference_sequence3[i]); uint8_t base4 = CharToUint8(reference_sequence4[i]); uint8_t base5 = CharToUint8(reference_sequence5[i]); uint8_t base6 = CharToUint8(reference_sequence6[i]); uint8_t base7 = CharToUint8(reference_sequence7[i]); Peq[base0] 
= _mm_or_si128(highest_bit_in_band_mask_vpu0, Peq[base0]); Peq[base1] = _mm_or_si128(highest_bit_in_band_mask_vpu1, Peq[base1]); Peq[base2] = _mm_or_si128(highest_bit_in_band_mask_vpu2, Peq[base2]); Peq[base3] = _mm_or_si128(highest_bit_in_band_mask_vpu3, Peq[base3]); Peq[base4] = _mm_or_si128(highest_bit_in_band_mask_vpu4, Peq[base4]); Peq[base5] = _mm_or_si128(highest_bit_in_band_mask_vpu5, Peq[base5]); Peq[base6] = _mm_or_si128(highest_bit_in_band_mask_vpu6, Peq[base6]); Peq[base7] = _mm_or_si128(highest_bit_in_band_mask_vpu7, Peq[base7]); for (int ai = 0; ai < ALPHABET_SIZE; ai++) { Peq[ai] = _mm_srli_epi16(Peq[ai], 1); } } uint16_t lowest_bit_in_band_mask = 1; __m128i lowest_bit_in_band_mask_vpu = _mm_set1_epi16(lowest_bit_in_band_mask); __m128i VP = _mm_setzero_si128(); __m128i VN = _mm_setzero_si128(); __m128i X = _mm_setzero_si128(); __m128i D0 = _mm_setzero_si128(); __m128i HN = _mm_setzero_si128(); __m128i HP = _mm_setzero_si128(); __m128i max_mask_vpu = _mm_set1_epi16(0xffff); __m128i num_errors_at_band_start_position_vpu = _mm_setzero_si128(); __m128i early_stop_threshold_vpu = _mm_set1_epi16(error_threshold * 3); for (int i = 0; i < read_length; i++) { uint8_t base0 = CharToUint8(reference_sequence0[i + 2 * error_threshold]); uint8_t base1 = CharToUint8(reference_sequence1[i + 2 * error_threshold]); uint8_t base2 = CharToUint8(reference_sequence2[i + 2 * error_threshold]); uint8_t base3 = CharToUint8(reference_sequence3[i + 2 * error_threshold]); uint8_t base4 = CharToUint8(reference_sequence4[i + 2 * error_threshold]); uint8_t base5 = CharToUint8(reference_sequence5[i + 2 * error_threshold]); uint8_t base6 = CharToUint8(reference_sequence6[i + 2 * error_threshold]); uint8_t base7 = CharToUint8(reference_sequence7[i + 2 * error_threshold]); Peq[base0] = _mm_or_si128(highest_bit_in_band_mask_vpu0, Peq[base0]); Peq[base1] = _mm_or_si128(highest_bit_in_band_mask_vpu1, Peq[base1]); Peq[base2] = _mm_or_si128(highest_bit_in_band_mask_vpu2, Peq[base2]); 
Peq[base3] = _mm_or_si128(highest_bit_in_band_mask_vpu3, Peq[base3]); Peq[base4] = _mm_or_si128(highest_bit_in_band_mask_vpu4, Peq[base4]); Peq[base5] = _mm_or_si128(highest_bit_in_band_mask_vpu5, Peq[base5]); Peq[base6] = _mm_or_si128(highest_bit_in_band_mask_vpu6, Peq[base6]); Peq[base7] = _mm_or_si128(highest_bit_in_band_mask_vpu7, Peq[base7]); X = _mm_or_si128(Peq[CharToUint8(text[i])], VN); D0 = _mm_and_si128(X, VP); D0 = _mm_add_epi16(D0, VP); D0 = _mm_xor_si128(D0, VP); D0 = _mm_or_si128(D0, X); HN = _mm_and_si128(VP, D0); HP = _mm_or_si128(VP, D0); HP = _mm_xor_si128(HP, max_mask_vpu); HP = _mm_or_si128(HP, VN); X = _mm_srli_epi16(D0, 1); VN = _mm_and_si128(X, HP); VP = _mm_or_si128(X, HP); VP = _mm_xor_si128(VP, max_mask_vpu); VP = _mm_or_si128(VP, HN); __m128i E = _mm_and_si128(D0, lowest_bit_in_band_mask_vpu); E = _mm_xor_si128(E, lowest_bit_in_band_mask_vpu); num_errors_at_band_start_position_vpu = _mm_add_epi16(num_errors_at_band_start_position_vpu, E); __m128i early_stop = _mm_cmpgt_epi16(num_errors_at_band_start_position_vpu, early_stop_threshold_vpu); int tmp = _mm_movemask_epi8(early_stop); if (tmp == 0xffff) { _mm_store_si128((__m128i *)mapping_edit_distances, num_errors_at_band_start_position_vpu); return; } for (int ai = 0; ai < ALPHABET_SIZE; ai++) { Peq[ai] = _mm_srli_epi16(Peq[ai], 1); } } int band_start_position = read_length - 1; __m128i min_num_errors_vpu = num_errors_at_band_start_position_vpu; for (int i = 0; i < 2 * error_threshold; i++) { __m128i lowest_bit_in_VP_vpu = _mm_and_si128(VP, lowest_bit_in_band_mask_vpu); __m128i lowest_bit_in_VN_vpu = _mm_and_si128(VN, lowest_bit_in_band_mask_vpu); num_errors_at_band_start_position_vpu = _mm_add_epi16( num_errors_at_band_start_position_vpu, lowest_bit_in_VP_vpu); num_errors_at_band_start_position_vpu = _mm_sub_epi16( num_errors_at_band_start_position_vpu, lowest_bit_in_VN_vpu); __m128i mapping_end_positions_update_mask_vpu = _mm_cmplt_epi16( num_errors_at_band_start_position_vpu, 
min_num_errors_vpu); __m128i mapping_end_positions_update_mask_vpu1 = _mm_cmpeq_epi16( num_errors_at_band_start_position_vpu, min_num_errors_vpu); int mapping_end_positions_update_mask = _mm_movemask_epi8(mapping_end_positions_update_mask_vpu); int mapping_end_positions_update_mask1 = _mm_movemask_epi8(mapping_end_positions_update_mask_vpu1); for (int li = 0; li < 8; ++li) { if ((mapping_end_positions_update_mask & 1) == 1 || ((mapping_end_positions_update_mask1 & 1) == 1 && i + 1 == error_threshold)) { mapping_end_positions[li] = band_start_position + 1 + i; } mapping_end_positions_update_mask = mapping_end_positions_update_mask >> 2; mapping_end_positions_update_mask1 = mapping_end_positions_update_mask1 >> 2; } min_num_errors_vpu = _mm_min_epi16(min_num_errors_vpu, num_errors_at_band_start_position_vpu); VP = _mm_srli_epi16(VP, 1); VN = _mm_srli_epi16(VN, 1); } _mm_store_si128((__m128i *)mapping_edit_distances, min_num_errors_vpu); } void BandedTraceback(int error_threshold, int min_num_errors, const char *pattern, const char *text, const int read_length, int *mapping_start_position) { // fisrt calculate the hamming distance and see whether it's equal to # errors if (min_num_errors == 0) { *mapping_start_position = error_threshold; return; } int error_count = 0; for (int i = 0; i < read_length; ++i) { if (pattern[i + error_threshold] != text[i]) { ++error_count; } } if (error_count == min_num_errors) { *mapping_start_position = error_threshold; return; } // if not then there are gaps so that we have to traceback with edit distance. 
uint32_t Peq[5] = {0, 0, 0, 0, 0}; for (int i = 0; i < 2 * error_threshold; i++) { uint8_t base = CharToUint8(pattern[read_length - 1 + 2 * error_threshold - i]); Peq[base] = Peq[base] | (1 << i); } uint32_t highest_bit_in_band_mask = 1 << (2 * error_threshold); uint32_t lowest_bit_in_band_mask = 1; uint32_t VP = 0; uint32_t VN = 0; uint32_t X = 0; uint32_t D0 = 0; uint32_t HN = 0; uint32_t HP = 0; int num_errors_at_band_start_position = 0; for (int i = 0; i < read_length; i++) { uint8_t pattern_base = CharToUint8(pattern[read_length - 1 - i]); Peq[pattern_base] = Peq[pattern_base] | highest_bit_in_band_mask; X = Peq[CharToUint8(text[read_length - 1 - i])] | VN; D0 = ((VP + (X & VP)) ^ VP) | X; HN = VP & D0; HP = VN | ~(VP | D0); X = D0 >> 1; VN = X & HP; VP = HN | ~(X | HP); num_errors_at_band_start_position += 1 - (D0 & lowest_bit_in_band_mask); for (int ai = 0; ai < 5; ai++) { Peq[ai] >>= 1; } } *mapping_start_position = 2 * error_threshold; for (int i = 0; i < 2 * error_threshold; i++) { num_errors_at_band_start_position = num_errors_at_band_start_position + ((VP >> i) & (uint32_t)1); num_errors_at_band_start_position = num_errors_at_band_start_position - ((VN >> i) & (uint32_t)1); if (num_errors_at_band_start_position == min_num_errors) { *mapping_start_position = 2 * error_threshold - (1 + i); if (i + 1 == error_threshold) { return; } } } } void BandedTracebackToEnd(int error_threshold, int min_num_errors, const char *pattern, const char *text, const int read_length, int *mapping_end_position) { // fisrt calculate the hamming distance and see whether it's equal to # errors if (min_num_errors == 0) { *mapping_end_position = read_length + error_threshold; return; } int error_count = 0; for (int i = 0; i < read_length; ++i) { if (pattern[i + error_threshold] != text[i]) { ++error_count; } } if (error_count == min_num_errors) { *mapping_end_position = read_length + error_threshold; return; } // if not then there are gaps so that we have to traceback with edit 
distance. uint32_t Peq[5] = {0, 0, 0, 0, 0}; for (int i = 0; i < 2 * error_threshold; i++) { uint8_t base = CharToUint8(pattern[i]); Peq[base] = Peq[base] | (1 << i); } uint32_t highest_bit_in_band_mask = 1 << (2 * error_threshold); uint32_t lowest_bit_in_band_mask = 1; uint32_t VP = 0; uint32_t VN = 0; uint32_t X = 0; uint32_t D0 = 0; uint32_t HN = 0; uint32_t HP = 0; int num_errors_at_band_start_position = 0; for (int i = 0; i < read_length; i++) { // printf("=>%d %d %c %c\n", i, num_errors_at_band_start_position, pattern[i // + 2 * error_threshold], text[i]) ; uint8_t pattern_base = CharToUint8(pattern[i + 2 * error_threshold]); Peq[pattern_base] = Peq[pattern_base] | highest_bit_in_band_mask; X = Peq[CharToUint8(text[i])] | VN; D0 = ((VP + (X & VP)) ^ VP) | X; HN = VP & D0; HP = VN | ~(VP | D0); X = D0 >> 1; VN = X & HP; VP = HN | ~(X | HP); num_errors_at_band_start_position += 1 - (D0 & lowest_bit_in_band_mask); for (int ai = 0; ai < 5; ai++) { Peq[ai] >>= 1; } } int band_start_position = read_length; *mapping_end_position = band_start_position + 1; for (int i = 0; i < 2 * error_threshold; i++) { num_errors_at_band_start_position = num_errors_at_band_start_position + ((VP >> i) & (uint32_t)1); num_errors_at_band_start_position = num_errors_at_band_start_position - ((VN >> i) & (uint32_t)1); if (num_errors_at_band_start_position == min_num_errors) { *mapping_end_position = band_start_position + (i + 1); if (i + 1 == error_threshold) { return; } } } } } // namespace chromap ================================================ FILE: src/alignment.h ================================================ #ifndef ALIGNMENT_H_ #define ALIGNMENT_H_ #include "mapping_in_memory.h" #include "sam_mapping.h" #include "sequence_batch.h" #include "utils.h" namespace chromap { int GetLongestMatchLength(const char *pattern, const char *text, const int read_length); // Return newly adjusted reference start/end position for kPositive/kNegative // mappings. 
int AdjustGapBeginning(const Strand mapping_strand, const char *ref,
                       const char *read, int *gap_beginning, int read_end,
                       int ref_start_position, int ref_end_position,
                       int *n_cigar, uint32_t **cigar);

// Reference (pattern) mapping start position and cigar must be computed before
// calling this function. Read (text) must be already at the start position.
void GenerateNMAndMDTag(const char *pattern, const char *text,
                        int mapping_start_position,
                        MappingInMemory &mapping_in_memory);

int BandedAlignPatternToText(int error_threshold, const char *pattern,
                             const char *text, const int read_length,
                             int *mapping_end_position);

// Return a negative number if the termination is deemed to be at the beginning
// of the read. mapping_end_position is relative to pattern (reference);
// read_mapping_length is for text (read).
int BandedAlignPatternToTextWithDropOff(int error_threshold,
                                        const char *pattern, const char *text,
                                        const int read_length,
                                        int *mapping_end_position,
                                        int *read_mapping_length);

int BandedAlignPatternToTextWithDropOffFrom3End(
    int error_threshold, const char *pattern, const char *text,
    const int read_length, int *mapping_end_position,
    int *read_mapping_length);

void BandedAlign4PatternsToText(int error_threshold, const char **patterns,
                                const char *text, int read_length,
                                int32_t *mapping_edit_distances,
                                int32_t *mapping_end_positions);

void BandedAlign8PatternsToText(int error_threshold, const char **patterns,
                                const char *text, int read_length,
                                int16_t *mapping_edit_distances,
                                int16_t *mapping_end_positions);

// Writes to *mapping_start_position an offset within the band at which the
// alignment has exactly min_num_errors errors.
// NOTE(review): presumably the start-side counterpart of BandedTracebackToEnd
// -- confirm against the full definition in alignment.cc.
void BandedTraceback(int error_threshold, int min_num_errors,
                     const char *pattern, const char *text,
                     const int read_length, int *mapping_start_position);

// Writes to *mapping_end_position the offset in pattern at which an alignment
// of text with exactly min_num_errors errors ends. The result is
// read_length + error_threshold when min_num_errors is 0 or when the ungapped
// comparison starting at pattern + error_threshold has exactly
// min_num_errors mismatches.
void BandedTracebackToEnd(int error_threshold, int min_num_errors,
                          const char *pattern, const char *text,
                          const int read_length, int *mapping_end_position);
================================================ #ifndef BARCODETRANSLATOR_H_ #define BARCODETRANSLATOR_H_ #include #include #include #include #include #include #include #include #include "khash.h" #include "utils.h" namespace chromap { KHASH_INIT(k64_str, uint64_t, char *, 1, kh_int64_hash_func, kh_int64_hash_equal); // The class for handling barcode convertion. class BarcodeTranslator { public: BarcodeTranslator() { barcode_translate_table_ = NULL; from_bc_length_ = -1; } ~BarcodeTranslator() { if (barcode_translate_table_ != NULL) { khiter_t k; for (k = kh_begin(barcode_translate_table_); k != kh_end(barcode_translate_table_); ++k) { if (kh_exist(barcode_translate_table_, k)) free(kh_value(barcode_translate_table_, k)); } kh_destroy(k64_str, barcode_translate_table_); } } void SetTranslateTable(const std::string &file) { barcode_translate_table_ = kh_init(k64_str); if (1) { gzFile barcode_translate_file = gzopen(file.c_str(), "r"); const uint32_t line_buffer_size = 512; char file_line[line_buffer_size]; while (gzgets(barcode_translate_file, file_line, line_buffer_size) != NULL) { int line_len = strlen(file_line); if (file_line[line_len - 1] == '\n') { file_line[line_len - 1] = '\0'; } std::string tmp_string(file_line); ProcessTranslateFileLine(tmp_string); } } else { // Old implementation, which does not support gzipped input. 
std::ifstream file_stream(file); std::string file_line; while (getline(file_stream, file_line)) { ProcessTranslateFileLine(file_line); } } mask_ = (1ull << (2 * from_bc_length_)) - 1; /*for (int i = 0; i < from_bc_length_; ++i) { mask_ |= (3ull << (2*i)); }*/ } std::string Translate(uint64_t bc, uint32_t bc_length) { if (barcode_translate_table_ == NULL) { return Seed2Sequence(bc, bc_length); } std::string ret; uint64_t i; for (i = 0; i < bc_length / from_bc_length_; ++i) { uint64_t seed = (bc << (2 * i * from_bc_length_)) >> (2 * (bc_length / from_bc_length_ - 1) * from_bc_length_); seed &= mask_; khiter_t barcode_translate_table_iter = kh_get(k64_str, barcode_translate_table_, seed); if (barcode_translate_table_iter == kh_end(barcode_translate_table_)) { std::cerr << "Barcode does not exist in the translation table." << std::endl; exit(-1); } std::string bc_to( kh_value(barcode_translate_table_, barcode_translate_table_iter)); if (i == 0) { ret = bc_to; } else { ret += "-" + bc_to; } } return ret; } private: khash_t(k64_str) * barcode_translate_table_; int from_bc_length_; uint64_t mask_; std::string Seed2Sequence(uint64_t seed, uint32_t seed_length) const { std::string sequence; sequence.reserve(seed_length); uint64_t mask_ = 3; for (uint32_t i = 0; i < seed_length; ++i) { sequence.push_back( Uint8ToChar((seed >> ((seed_length - 1 - i) * 2)) & mask_)); } return sequence; } void ProcessTranslateFileLine(std::string &line) { int i; int len = line.length(); std::string to; for (i = 0; i < len; ++i) { if (line[i] == ',' || line[i] == '\t') break; } to = line.substr(0, i); // from = line.substr(i + 1, len - i - 1); from_bc_length_ = len - i - 1; uint64_t from_seed = GenerateSeedFromSequence(line.c_str(), len, i + 1, from_bc_length_); int khash_return_code; khiter_t barcode_translate_table_iter = kh_put( k64_str, barcode_translate_table_, from_seed, &khash_return_code); kh_value(barcode_translate_table_, barcode_translate_table_iter) = strdup(to.c_str()); } }; } // 
namespace chromap #endif ================================================ FILE: src/bed_mapping.h ================================================ #ifndef BEDMAPPING_H_ #define BEDMAPPING_H_ #include #include "mapping.h" namespace chromap { class MappingWithBarcode : public Mapping { public: uint32_t read_id_; uint64_t cell_barcode_; uint32_t fragment_start_position_; uint16_t fragment_length_; uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1; uint8_t num_dups_; // uint8_t mapq; MappingWithBarcode() : num_dups_(0) {} MappingWithBarcode(uint32_t read_id, uint64_t cell_barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups) : read_id_(read_id), cell_barcode_(cell_barcode), fragment_start_position_(fragment_start_position), fragment_length_(fragment_length), mapq_(mapq), direction_(direction), is_unique_(is_unique), num_dups_(num_dups) {} bool operator<(const MappingWithBarcode &m) const { return std::tie(fragment_start_position_, fragment_length_, cell_barcode_, mapq_, direction_, is_unique_, read_id_) < std::tie(m.fragment_start_position_, m.fragment_length_, m.cell_barcode_, m.mapq_, m.direction_, m.is_unique_, m.read_id_); } bool operator==(const MappingWithBarcode &m) const { return std::tie(cell_barcode_, fragment_start_position_) == std::tie(m.cell_barcode_, m.fragment_start_position_); } bool IsSamePosition(const MappingWithBarcode &m) const { return std::tie(fragment_start_position_) == std::tie(m.fragment_start_position_); } uint64_t GetBarcode() const { return cell_barcode_; } void Tn5Shift() { if (direction_ == 1) { fragment_start_position_ += 4; } else { fragment_length_ -= 5; } } bool IsPositiveStrand() const { return direction_ > 0 ? 
true : false; } uint32_t GetStartPosition() const { // inclusive return fragment_start_position_; } uint32_t GetEndPosition() const { // exclusive return fragment_start_position_ + fragment_length_; } }; class MappingWithoutBarcode : public Mapping { public: uint32_t read_id_; uint32_t fragment_start_position_; uint16_t fragment_length_; // uint8_t mapq; uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1; uint16_t num_dups_; // Need higher limit in bulk setting MappingWithoutBarcode() : num_dups_(0) {} MappingWithoutBarcode(uint32_t read_id, uint32_t fragment_start_position, uint16_t fragment_length, uint16_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups) : read_id_(read_id), fragment_start_position_(fragment_start_position), fragment_length_(fragment_length), mapq_(mapq), direction_(direction), is_unique_(is_unique), num_dups_(num_dups) {} bool operator<(const MappingWithoutBarcode &m) const { return std::tie(fragment_start_position_, fragment_length_, mapq_, direction_, is_unique_, read_id_) < std::tie(m.fragment_start_position_, m.fragment_length_, m.mapq_, m.direction_, m.is_unique_, m.read_id_); } bool operator==(const MappingWithoutBarcode &m) const { return std::tie(fragment_start_position_) == std::tie(m.fragment_start_position_); } bool IsSamePosition(const MappingWithoutBarcode &m) const { return std::tie(fragment_start_position_) == std::tie(m.fragment_start_position_); } uint64_t GetBarcode() const { return 0; } void Tn5Shift() { if (direction_ == 1) { fragment_start_position_ += 4; } else { fragment_length_ -= 5; } } bool IsPositiveStrand() const { return direction_ > 0 ? 
true : false; } uint32_t GetStartPosition() const { // inclusive return fragment_start_position_; } uint32_t GetEndPosition() const { // exclusive return fragment_start_position_ + fragment_length_; } }; class PairedEndMappingWithBarcode : public Mapping { public: uint32_t read_id_; uint64_t cell_barcode_; uint32_t fragment_start_position_; uint16_t fragment_length_; uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1; uint8_t num_dups_; // uint8_t mapq; uint16_t positive_alignment_length_; uint16_t negative_alignment_length_; PairedEndMappingWithBarcode() : num_dups_(0) {} PairedEndMappingWithBarcode(uint32_t read_id, uint64_t cell_barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups, uint16_t positive_alignment_length, uint16_t negative_alignment_length) : read_id_(read_id), cell_barcode_(cell_barcode), fragment_start_position_(fragment_start_position), fragment_length_(fragment_length), mapq_(mapq), direction_(direction), is_unique_(is_unique), num_dups_(num_dups), positive_alignment_length_(positive_alignment_length), negative_alignment_length_(negative_alignment_length) {} bool operator<(const PairedEndMappingWithBarcode &m) const { return std::tie(fragment_start_position_, fragment_length_, cell_barcode_, mapq_, direction_, is_unique_, read_id_, positive_alignment_length_, negative_alignment_length_) < std::tie(m.fragment_start_position_, m.fragment_length_, m.cell_barcode_, m.mapq_, m.direction_, m.is_unique_, m.read_id_, m.positive_alignment_length_, m.negative_alignment_length_); } bool operator==(const PairedEndMappingWithBarcode &m) const { return std::tie(cell_barcode_, fragment_start_position_, fragment_length_) == std::tie(m.cell_barcode_, m.fragment_start_position_, m.fragment_length_); } bool IsSamePosition(const PairedEndMappingWithBarcode &m) const { return std::tie(fragment_start_position_, fragment_length_) == std::tie(m.fragment_start_position_, 
m.fragment_length_); } uint64_t GetBarcode() const { return cell_barcode_; } void Tn5Shift() { fragment_start_position_ += 4; positive_alignment_length_ -= 4; fragment_length_ -= 9; negative_alignment_length_ -= 5; } bool IsPositiveStrand() const { return direction_ > 0 ? true : false; } uint32_t GetStartPosition() const { // inclusive return fragment_start_position_; } uint32_t GetEndPosition() const { // exclusive return fragment_start_position_ + fragment_length_; } }; class PairedEndMappingWithoutBarcode : public Mapping { public: uint32_t read_id_; uint32_t fragment_start_position_; uint16_t fragment_length_; uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1; uint8_t num_dups_; // uint8_t mapq; uint16_t positive_alignment_length_; uint16_t negative_alignment_length_; PairedEndMappingWithoutBarcode() : num_dups_(0) {} PairedEndMappingWithoutBarcode(uint32_t read_id, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint16_t num_dups, uint16_t positive_alignment_length, uint16_t negative_alignment_length) : read_id_(read_id), fragment_start_position_(fragment_start_position), fragment_length_(fragment_length), mapq_(mapq), direction_(direction), is_unique_(is_unique), num_dups_(num_dups), positive_alignment_length_(positive_alignment_length), negative_alignment_length_(negative_alignment_length) {} bool operator<(const PairedEndMappingWithoutBarcode &m) const { return std::tie(fragment_start_position_, fragment_length_, mapq_, direction_, is_unique_, read_id_, positive_alignment_length_, negative_alignment_length_) < std::tie(m.fragment_start_position_, m.fragment_length_, m.mapq_, m.direction_, m.is_unique_, m.read_id_, m.positive_alignment_length_, m.negative_alignment_length_); } bool operator==(const PairedEndMappingWithoutBarcode &m) const { return std::tie(fragment_start_position_, fragment_length_) == std::tie(m.fragment_start_position_, m.fragment_length_); } bool IsSamePosition(const 
// A candidate mapping location together with the number of minimizers that
// support it.
struct Candidate {
  // The high 32 bits save the reference sequence index in the reference
  // sequence batch. The low 32 bits save the reference position on that
  // sequence.
  uint64_t position = 0;

  // The number of minimizers supports the position.
  uint8_t count = 0;

  // Returns the high 32 bits of `position`.
  inline uint32_t GetReferenceSequenceIndex() const {
    return static_cast<uint32_t>(position >> 32);
  }

  // Returns the low 32 bits of `position`.
  inline uint32_t GetReferenceSequencePosition() const {
    return static_cast<uint32_t>(position);
  }

  // const like the other accessors, so it can be called through a
  // `const Candidate &`.
  inline uint8_t GetCount() const { return count; }

  // Candidates with more supporting minimizers sort first; ties are broken by
  // ascending position.
  inline bool operator<(const Candidate &c) const {
    if (count > c.count) {
      return true;
    }

    if (count < c.count) {
      return false;
    }

    return position < c.position;
  }
};
class CandidatePositionGeneratingConfig {
 public:
  // All three settings are fixed at construction and cannot change afterwards.
  CandidatePositionGeneratingConfig(uint32_t max_seed_frequency,
                                    uint32_t repetitive_seed_frequency,
                                    bool use_heap_merge)
      : max_seed_frequency_{max_seed_frequency},
        repetitive_seed_frequency_{repetitive_seed_frequency},
        use_heap_merge_{use_heap_merge} {}

  // A configuration without explicit thresholds is not allowed.
  CandidatePositionGeneratingConfig() = delete;

  ~CandidatePositionGeneratingConfig() = default;

  // True when the seed occurs at least `max_seed_frequency` times.
  bool IsFrequentSeed(uint32_t seed_frequency) const {
    return !(seed_frequency < max_seed_frequency_);
  }

  // True when the seed occurs at least `repetitive_seed_frequency` times.
  bool IsRepetitiveSeed(uint32_t seed_frequency) const {
    return !(seed_frequency < repetitive_seed_frequency_);
  }

  // Whether sorted candidate lists should be merged with a heap.
  bool UseHeapMerge() const { return use_heap_merge_; }

  // The frequency at or above which a seed counts as frequent.
  uint32_t GetMaxSeedFrequency() const { return max_seed_frequency_; }

 private:
  // Upper bound (exclusive) on the frequency of seeds that are used.
  const uint32_t max_seed_frequency_;

  // Lower bound (inclusive) on the frequency of seeds treated as repetitive.
  const uint32_t repetitive_seed_frequency_;

  // Heap merge is meant for very large numbers of candidate positions.
  const bool use_heap_merge_;
};
num_required_seeds : 1; num_required_seeds = num_required_seeds > min_num_seeds_required_for_mapping_ ? min_num_seeds_required_for_mapping_ : num_required_seeds; if (use_high_frequency_minimizers) { num_required_seeds = min_num_seeds_required_for_mapping_; } // std::cerr << "Normal positive gen on one dir\n"; GenerateCandidatesOnOneStrand(error_threshold, num_required_seeds, minimizers.size(), positive_hits, positive_candidates); // std::cerr << "Normal negative gen on one dir\n"; GenerateCandidatesOnOneStrand(error_threshold, num_required_seeds, minimizers.size(), negative_hits, negative_candidates); // fprintf(stderr, "p+n: %d\n", positive_candidates->size() + // negative_candidates->size()) ; } // Return 0 if it supplements normally. Return 1 if the supplement could be too // aggressive, and MAPQ needs setting to 0. int CandidateProcessor::SupplementCandidates( int error_threshold, uint32_t search_range, const Index &index, PairedEndMappingMetadata &paired_end_mapping_metadata) const { std::vector augment_positive_candidates1; std::vector augment_positive_candidates2; std::vector augment_negative_candidates1; std::vector augment_negative_candidates2; int ret = 0; for (int mate = 0; mate <= 1; ++mate) { std::vector *minimizers; std::vector *positive_hits; std::vector *negative_hits; std::vector *positive_candidates; std::vector *negative_candidates; std::vector *mate_positive_candidates; std::vector *mate_negative_candidates; std::vector *augment_positive_candidates; std::vector *augment_negative_candidates; uint32_t *repetitive_seed_length; if (mate == 0) { minimizers = &paired_end_mapping_metadata.mapping_metadata1_.minimizers_; positive_hits = &paired_end_mapping_metadata.mapping_metadata1_.positive_hits_; negative_hits = &paired_end_mapping_metadata.mapping_metadata1_.negative_hits_; positive_candidates = &paired_end_mapping_metadata.mapping_metadata1_.positive_candidates_; negative_candidates = 
&paired_end_mapping_metadata.mapping_metadata1_.negative_candidates_; mate_positive_candidates = &paired_end_mapping_metadata.mapping_metadata2_.positive_candidates_; mate_negative_candidates = &paired_end_mapping_metadata.mapping_metadata2_.negative_candidates_; augment_positive_candidates = &augment_positive_candidates1; augment_negative_candidates = &augment_negative_candidates1; repetitive_seed_length = &paired_end_mapping_metadata.mapping_metadata1_ .repetitive_seed_length_; } else { minimizers = &paired_end_mapping_metadata.mapping_metadata2_.minimizers_; positive_hits = &paired_end_mapping_metadata.mapping_metadata2_.positive_hits_; negative_hits = &paired_end_mapping_metadata.mapping_metadata2_.negative_hits_; positive_candidates = &paired_end_mapping_metadata.mapping_metadata2_.positive_candidates_; negative_candidates = &paired_end_mapping_metadata.mapping_metadata2_.negative_candidates_; mate_positive_candidates = &paired_end_mapping_metadata.mapping_metadata1_.positive_candidates_; mate_negative_candidates = &paired_end_mapping_metadata.mapping_metadata1_.negative_candidates_; augment_positive_candidates = &augment_positive_candidates2; augment_negative_candidates = &augment_negative_candidates2; repetitive_seed_length = &paired_end_mapping_metadata.mapping_metadata2_ .repetitive_seed_length_; } uint32_t mm_count = minimizers->size(); bool augment_flag = true; uint32_t candidate_num = positive_candidates->size(); for (uint32_t i = 0; i < candidate_num; ++i) { if ((*positive_candidates)[i].count >= mm_count / 2) { augment_flag = false; break; } } candidate_num = negative_candidates->size(); if (augment_flag) { for (uint32_t i = 0; i < candidate_num; ++i) { if ((*negative_candidates)[i].count >= mm_count / 2) { augment_flag = false; break; } } } if (augment_flag) { positive_hits->clear(); negative_hits->clear(); positive_hits->reserve(max_seed_frequencies_[0]); negative_hits->reserve(max_seed_frequencies_[0]); int positive_rescue_result = 0; int 
negative_rescue_result = 0; if (mate_positive_candidates->size() > 0) { positive_rescue_result = GenerateCandidatesFromRepetitiveReadWithMateInfoOnOneStrand( kNegative, search_range, error_threshold, index, *minimizers, *mate_positive_candidates, *repetitive_seed_length, *negative_hits, *augment_negative_candidates); } if (mate_negative_candidates->size() > 0) { negative_rescue_result = GenerateCandidatesFromRepetitiveReadWithMateInfoOnOneStrand( kPositive, search_range, error_threshold, index, *minimizers, *mate_negative_candidates, *repetitive_seed_length, *positive_hits, *augment_positive_candidates); } // If one of the strand did not supplement due to too many best candidate, // and the filtered strand have better best candidates, // and there is no candidate directly from minimizers, // then we remove the supplement if (((positive_rescue_result < 0 && negative_rescue_result > 0 && -positive_rescue_result >= negative_rescue_result) || (positive_rescue_result > 0 && negative_rescue_result < 0 && positive_rescue_result <= -negative_rescue_result)) && positive_candidates->size() + negative_candidates->size() == 0) { // augment_positive_candidates->clear(); // augment_negative_candidates->clear(); ret = 1; } } } if (augment_positive_candidates1.size() > 0) { MergeCandidates( error_threshold, paired_end_mapping_metadata.mapping_metadata1_.positive_candidates_, augment_positive_candidates1, paired_end_mapping_metadata.mapping_metadata1_ .positive_candidates_buffer_); } if (augment_negative_candidates1.size() > 0) { MergeCandidates( error_threshold, paired_end_mapping_metadata.mapping_metadata1_.negative_candidates_, augment_negative_candidates1, paired_end_mapping_metadata.mapping_metadata1_ .negative_candidates_buffer_); } if (augment_positive_candidates2.size() > 0) { MergeCandidates( error_threshold, paired_end_mapping_metadata.mapping_metadata2_.positive_candidates_, augment_positive_candidates2, paired_end_mapping_metadata.mapping_metadata2_ 
.positive_candidates_buffer_); } if (augment_negative_candidates2.size() > 0) { MergeCandidates( error_threshold, paired_end_mapping_metadata.mapping_metadata2_.negative_candidates_, augment_negative_candidates2, paired_end_mapping_metadata.mapping_metadata2_ .negative_candidates_buffer_); } return ret; } void CandidateProcessor::ReduceCandidatesForPairedEndRead( uint32_t mapping_positions_distance, PairedEndMappingMetadata &paired_end_mapping_metadata) const { const std::vector &positive_candidates1 = paired_end_mapping_metadata.mapping_metadata1_ .positive_candidates_buffer_; const std::vector &negative_candidates1 = paired_end_mapping_metadata.mapping_metadata1_ .negative_candidates_buffer_; const std::vector &positive_candidates2 = paired_end_mapping_metadata.mapping_metadata2_ .positive_candidates_buffer_; const std::vector &negative_candidates2 = paired_end_mapping_metadata.mapping_metadata2_ .negative_candidates_buffer_; std::vector &filtered_positive_candidates1 = paired_end_mapping_metadata.mapping_metadata1_.positive_candidates_; std::vector &filtered_negative_candidates1 = paired_end_mapping_metadata.mapping_metadata1_.negative_candidates_; std::vector &filtered_positive_candidates2 = paired_end_mapping_metadata.mapping_metadata2_.positive_candidates_; std::vector &filtered_negative_candidates2 = paired_end_mapping_metadata.mapping_metadata2_.negative_candidates_; ReduceCandidatesForPairedEndReadOnOneDirection( mapping_positions_distance, positive_candidates1, negative_candidates2, filtered_positive_candidates1, filtered_negative_candidates2); ReduceCandidatesForPairedEndReadOnOneDirection( mapping_positions_distance, negative_candidates1, positive_candidates2, filtered_negative_candidates1, filtered_positive_candidates2); } int CandidateProcessor:: GenerateCandidatesFromRepetitiveReadWithMateInfoOnOneStrand( const Strand strand, uint32_t search_range, int error_threshold, const Index &index, const std::vector &minimizers, const std::vector 
&mate_candidates, uint32_t &repetitive_seed_length, std::vector &hits, std::vector &candidates) const { int max_seed_count = index.GenerateCandidatePositionsFromRepetitiveReadWithMateInfoOnOneStrand( strand, search_range, min_num_seeds_required_for_mapping_, max_seed_frequencies_[0], error_threshold, minimizers, mate_candidates, repetitive_seed_length, hits); GenerateCandidatesOnOneStrand(error_threshold, /*num_seeds_required=*/1, minimizers.size(), hits, candidates); return max_seed_count; } void CandidateProcessor::GenerateCandidatesOnOneStrand( int error_threshold, int num_seeds_required, uint32_t num_minimizers, std::vector &hits, std::vector &candidates) const { hits.emplace_back(UINT64_MAX); if (hits.size() > 0) { int minimizer_count = 1; // The number of seeds with the exact same reference position. int equal_count = 1; int best_equal_count = 1; uint64_t previous_hit = hits[0]; uint32_t previous_reference_id = previous_hit >> 32; uint32_t previous_reference_position = previous_hit; uint64_t best_local_hit = hits[0]; for (uint32_t pi = 1; pi < hits.size(); ++pi) { uint32_t current_reference_id = hits[pi] >> 32; uint32_t current_reference_position = hits[pi]; #ifdef LI_DEBUG printf("%s: %d %d\n", __func__, current_reference_id, current_reference_position); #endif if (current_reference_id != previous_reference_id || current_reference_position > previous_reference_position + error_threshold || ((uint32_t)minimizer_count >= num_minimizers && current_reference_position > (uint32_t)best_local_hit + error_threshold)) { if (minimizer_count >= num_seeds_required) { Candidate candidate; candidate.position = best_local_hit; candidate.count = best_equal_count; candidates.push_back(candidate); } minimizer_count = 1; equal_count = 1; best_equal_count = 1; best_local_hit = hits[pi]; } else { if (hits[pi] == best_local_hit) { ++equal_count; ++best_equal_count; } else if (hits[pi] == previous_hit) { ++equal_count; if (equal_count > best_equal_count) { best_local_hit = 
previous_hit; best_equal_count = equal_count; } } else { equal_count = 1; } ++minimizer_count; } previous_hit = hits[pi]; previous_reference_id = current_reference_id; previous_reference_position = current_reference_position; } } } // Merge c1 and c2 into buffer and then swap the results into c1. void CandidateProcessor::MergeCandidates(int error_threshold, std::vector &c1, std::vector &c2, std::vector &buffer) const { if (c1.size() == 0) { c1.swap(c2); return; } uint32_t i, j; uint32_t size1, size2; size1 = c1.size(); size2 = c2.size(); buffer.clear(); #ifdef LI_DEBUG for (i = 0; i < size1; ++i) printf("c1: %d %d %d\n", (int)(c1[i].position >> 32), (int)c1[i].position, c1[i].count); for (i = 0; i < size2; ++i) printf("c2: %d %d %d\n", (int)(c2[i].position >> 32), (int)c2[i].position, c2[i].count); #endif i = 0; j = 0; while (i < size1 && j < size2) { if (c1[i].position == c2[j].position) { if (buffer.empty() || c1[i].position > buffer.back().position + error_threshold) { if (c1[i].count > c2[j].count) { buffer.push_back(c1[i]); } else { buffer.push_back(c2[j]); } } ++i, ++j; } else if (c1[i].position < c2[j].position) { if (buffer.empty() || c1[i].position > buffer.back().position + error_threshold) { buffer.push_back(c1[i]); } ++i; } else { if (buffer.empty() || c2[j].position > buffer.back().position + error_threshold) { buffer.push_back(c2[j]); } ++j; } } while (i < size1) { if (buffer.empty() || c1[i].position > buffer.back().position + error_threshold) { buffer.push_back(c1[i]); } ++i; } while (j < size2) { if (buffer.empty() || c2[j].position > buffer.back().position + error_threshold) { buffer.push_back(c2[j]); } ++j; } c1.swap(buffer); } void CandidateProcessor::ReduceCandidatesForPairedEndReadOnOneDirection( uint32_t mapping_positions_distance, const std::vector &candidates1, const std::vector &candidates2, std::vector &filtered_candidates1, std::vector &filtered_candidates2) const { uint32_t i1 = 0; uint32_t i2 = 0; int num_unpaired_candidate1 = 0; int 
num_unpaired_candidate2 = 0; int num_unpaired_candidate_threshold = 5; int max_candidate_count1 = 6; int max_candidate_count2 = 6; uint32_t previous_end_i2 = i2; #ifdef LI_DEBUG for (uint32_t i = 0; i < candidates1.size(); ++i) printf("%s 0: %d %d:%d\n", __func__, i, (int)(candidates1[i].position >> 32), (int)candidates1[i].position); for (uint32_t i = 0; i < candidates2.size(); ++i) printf("%s 1: %d %d:%d\n", __func__, i, (int)(candidates2[i].position >> 32), (int)candidates2[i].position); #endif while (i1 < candidates1.size() && i2 < candidates2.size()) { if (candidates1[i1].position > candidates2[i2].position + mapping_positions_distance) { if (i2 >= previous_end_i2 && num_unpaired_candidate2 < num_unpaired_candidate_threshold && (candidates1[i1].position >> 32) == (candidates2[i2].position >> 32) && candidates2[i2].count >= max_candidate_count2) { filtered_candidates2.emplace_back(candidates2[i2]); ++num_unpaired_candidate2; } ++i2; } else if (candidates2[i2].position > candidates1[i1].position + mapping_positions_distance) { if (num_unpaired_candidate1 < num_unpaired_candidate_threshold && (candidates1[i1].position >> 32) == (candidates2[i2].position >> 32) && candidates1[i1].count >= max_candidate_count1) { filtered_candidates1.emplace_back(candidates1[i1]); ++num_unpaired_candidate1; } ++i1; } else { // ok, find a pair, we store current ni2 somewhere and keep looking until // we go out of the range, then we go back and then move to next pi1 and // keep doing the similar thing. 
filtered_candidates1.emplace_back(candidates1[i1]); if (candidates1[i1].count > max_candidate_count1) { max_candidate_count1 = candidates1[i1].count; } uint32_t current_i2 = i2; while (current_i2 < candidates2.size() && candidates2[current_i2].position <= candidates1[i1].position + mapping_positions_distance) { if (current_i2 >= previous_end_i2) { filtered_candidates2.emplace_back(candidates2[current_i2]); if (candidates2[current_i2].count > max_candidate_count2) { max_candidate_count2 = candidates2[current_i2].count; } } ++current_i2; } previous_end_i2 = current_i2; ++i1; } } } } // namespace chromap ================================================ FILE: src/candidate_processor.h ================================================ #ifndef CANDIDATE_PROCESSOR_H_ #define CANDIDATE_PROCESSOR_H_ #include #include #include #include #include #include #include "candidate.h" #include "index.h" #include "mapping_metadata.h" #include "paired_end_mapping_metadata.h" #include "sequence_batch.h" #include "utils.h" namespace chromap { class CandidateProcessor { public: CandidateProcessor() = delete; CandidateProcessor(int min_num_seeds_required_for_mapping, const std::vector max_seed_frequencies) : min_num_seeds_required_for_mapping_(min_num_seeds_required_for_mapping), max_seed_frequencies_(max_seed_frequencies) {} ~CandidateProcessor() = default; void GenerateCandidates(int error_threshold, const Index &index, MappingMetadata &mapping_metadata) const; int SupplementCandidates( int error_threshold, uint32_t search_range, const Index &index, PairedEndMappingMetadata &paired_end_mapping_metadata) const; void ReduceCandidatesForPairedEndRead( uint32_t mapping_positions_distance, PairedEndMappingMetadata &paired_end_mapping_metadata) const; private: void GenerateCandidatesOnOneStrand(int error_threshold, int num_seeds_required, uint32_t num_minimizers, std::vector &hits, std::vector &candidates) const; int GenerateCandidatesFromRepetitiveReadWithMateInfoOnOneStrand( const Strand 
strand, uint32_t search_range, int error_threshold, const Index &index, const std::vector &minimizers, const std::vector &mate_candidates, uint32_t &repetitive_seed_length, std::vector &hits, std::vector &candidates) const; void MergeCandidates(int error_threshold, std::vector &c1, std::vector &c2, std::vector &buffer) const; void ReduceCandidatesForPairedEndReadOnOneDirection( uint32_t mapping_positions_distance, const std::vector &candidates1, const std::vector &candidates2, std::vector &filtered_candidates1, std::vector &filtered_candidates2) const; const int min_num_seeds_required_for_mapping_; // Vector of size 2. The first element is the frequency threshold, and the // second element is the frequency threshold to run rescue. The second element // should always larger than the first one. // TODO(Haowen): add an error check. const std::vector max_seed_frequencies_; }; } // namespace chromap #endif // CANDIDATE_PROCESSOR_H_ ================================================ FILE: src/chromap.cc ================================================ #include "chromap.h" #include #include #include #include #include #include #include #include #include #include namespace chromap { void Chromap::ConstructIndex() { // TODO(Haowen): Need a faster algorithm // Load all sequences in the reference into one batch SequenceBatch reference; reference.InitializeLoading(index_parameters_.reference_file_path); reference.LoadAllSequences(); const uint32_t num_sequences = reference.GetNumSequences(); Index index(index_parameters_); index.Construct(num_sequences, reference); index.Statistics(num_sequences, reference); index.Save(); reference.FinalizeLoading(); } uint32_t Chromap::LoadSingleEndReadsWithBarcodes(SequenceBatch &read_batch, SequenceBatch &barcode_batch, bool parallel_parsing) { //double real_start_time = GetRealTime(); uint32_t num_loaded_reads = 0; if (!parallel_parsing || mapping_parameters_.is_bulk_data) { while (num_loaded_reads < read_batch_size_) { bool no_more_read = 
read_batch.LoadOneSequenceAndSaveAt(num_loaded_reads); bool no_more_barcode = no_more_read; if (!mapping_parameters_.is_bulk_data) { no_more_barcode = barcode_batch.LoadOneSequenceAndSaveAt(num_loaded_reads); } if (no_more_read && no_more_barcode) { break; } else if (no_more_read || no_more_barcode){ ExitWithMessage("Numbers of reads and barcodes don't match!"); } ++num_loaded_reads; } } else { uint32_t num_loaded_barcode = 0 ; #pragma omp task shared(num_loaded_reads, read_batch) { uint32_t i = 0 ; for (i = 0 ; i < read_batch_size_; ++i) { if (read_batch.LoadOneSequenceAndSaveAt(i) == true) { // true: no more read break ; } } num_loaded_reads = i ; } #pragma omp task shared(num_loaded_barcode, barcode_batch) { // bulk data will go to the other big branch uint32_t i = 0 ; for (i = 0 ; i < read_batch_size_; ++i) { if (barcode_batch.LoadOneSequenceAndSaveAt(i) == true) { // true: no more read break ; } } num_loaded_barcode = i ; } #pragma omp taskwait if (num_loaded_reads != num_loaded_barcode) { ExitWithMessage("Numbers of reads and barcodes don't match!"); } } /*if (num_loaded_reads > 0) { std::cerr << "Loaded " << num_loaded_reads << " reads in " << GetRealTime() - real_start_time << "s.\n"; } else { std::cerr << "No more reads.\n"; }*/ return num_loaded_reads; } uint32_t Chromap::LoadPairedEndReadsWithBarcodes(SequenceBatch &read_batch1, SequenceBatch &read_batch2, SequenceBatch &barcode_batch, bool parallel_parsing) { // double real_start_time = Chromap<>::GetRealTime(); uint32_t num_loaded_pairs = 0; if (!parallel_parsing) { while (num_loaded_pairs < read_batch_size_) { bool no_more_read1 = read_batch1.LoadOneSequenceAndSaveAt(num_loaded_pairs); bool no_more_read2 = read_batch2.LoadOneSequenceAndSaveAt(num_loaded_pairs); bool no_more_barcode = no_more_read2; if (!mapping_parameters_.is_bulk_data) { no_more_barcode = barcode_batch.LoadOneSequenceAndSaveAt(num_loaded_pairs); } if (no_more_read1 && no_more_read2 && no_more_barcode) { break; } else if 
(no_more_read1 || no_more_read2 || no_more_barcode){ ExitWithMessage("Numbers of reads and barcodes don't match!"); } ++num_loaded_pairs; } } else { uint32_t num_loaded_read1 = 0; uint32_t num_loaded_read2 = 0; uint32_t num_loaded_barcode = 0; #pragma omp task shared(num_loaded_read1, read_batch1) { uint32_t i = 0 ; for (i = 0 ; i < read_batch_size_; ++i) { if (read_batch1.LoadOneSequenceAndSaveAt(i) == true) { // true: no more read break ; } } num_loaded_read1 = i ; } #pragma omp task shared(num_loaded_read2, read_batch2) { uint32_t i = 0 ; for (i = 0 ; i < read_batch_size_; ++i) { if (read_batch2.LoadOneSequenceAndSaveAt(i) == true) { // true: no more read break ; } } num_loaded_read2 = i ; } #pragma omp task shared(num_loaded_barcode, barcode_batch) { if (!mapping_parameters_.is_bulk_data) { uint32_t i = 0 ; for (i = 0 ; i < read_batch_size_; ++i) { if (barcode_batch.LoadOneSequenceAndSaveAt(i) == true) { // true: no more read break ; } } num_loaded_barcode = i ; } } #pragma omp taskwait if (mapping_parameters_.is_bulk_data) { num_loaded_barcode = num_loaded_read2; } if (num_loaded_read1 != num_loaded_read2 || num_loaded_read2 != num_loaded_barcode) { ExitWithMessage("Numbers of reads and barcodes don't match!"); } num_loaded_pairs = num_loaded_read1 ; } // if (num_loaded_pairs > 0) { // std::cerr << "Loaded " << num_loaded_pairs << " pairs in "<< // Chromap<>::GetRealTime() - real_start_time << "s. 
"; //} else { // std::cerr << "No more reads.\n"; //} return num_loaded_pairs; } void Chromap::TrimAdapterForPairedEndRead(uint32_t pair_index, SequenceBatch &read_batch1, SequenceBatch &read_batch2) { const uint32_t raw_read1_length = read_batch1.GetSequenceLengthAt(pair_index); const uint32_t raw_read2_length = read_batch2.GetSequenceLengthAt(pair_index); const char *raw_read1 = read_batch1.GetSequenceAt(pair_index); const char *raw_read2 = read_batch2.GetSequenceAt(pair_index); const std::string &raw_negative_read1 = read_batch1.GetNegativeSequenceAt(pair_index); const std::string &raw_negative_read2 = read_batch2.GetNegativeSequenceAt(pair_index); // In the actual adaptor trimming, we assuem length(read1)<=length(read2). So // we can have the case that read1 is a subset of read2. const char *read1 = raw_read1_length <= raw_read2_length ? raw_read1 : raw_read2; const std::string &negative_read2 = raw_read1_length <= raw_read2_length ? raw_negative_read2 : raw_negative_read1; const uint32_t read1_length = raw_read1_length <= raw_read2_length ? raw_read1_length : raw_read2_length; const uint32_t read2_length = raw_read1_length <= raw_read2_length ? 
raw_read2_length : raw_read1_length; const int min_overlap_length = mapping_parameters_.min_read_length; const int seed_length = min_overlap_length / 2; const int error_threshold_for_merging = 1; bool is_merged = false; for (int si = 0; si < error_threshold_for_merging + 1; ++si) { size_t seed_start_position = negative_read2.find(read1 + si * seed_length, 0, seed_length); while (seed_start_position != std::string::npos) { const bool before_seed_is_enough_long = seed_start_position >= (size_t)(si * seed_length); const bool overlap_is_enough_long = (int)(read2_length - seed_start_position + seed_length * si) >= min_overlap_length; if (!before_seed_is_enough_long || !overlap_is_enough_long) { seed_start_position = negative_read2.find( read1 + si * seed_length, seed_start_position + 1, seed_length); continue; } bool can_merge = true; int num_errors = 0; // The bases before the seed. for (int i = 0; i < seed_length * si; ++i) { if (negative_read2[seed_start_position - si * seed_length + i] != read1[i]) { ++num_errors; } if (num_errors > error_threshold_for_merging) { can_merge = false; break; } } // The bases after the seed. for (uint32_t i = seed_length; i + seed_start_position < read2_length && si * seed_length + i < read1_length; ++i) { if (negative_read2[seed_start_position + i] != read1[si * seed_length + i]) { ++num_errors; } if (num_errors > error_threshold_for_merging) { can_merge = false; break; } } if (can_merge) { // Trim adapters and TODO: fix sequencing errors int overlap_length = read2_length - seed_start_position + si * seed_length; int read2_offset = 0; // The case that read1 is strictly contained in read2. overlap_length is // inferred from the longer read2, which could be longer than read1. In // that case, we don't trim read1 (make overlap length equal to read1 // length) and trim read2 as the original plan. 
if (overlap_length > (int)read1_length) { read2_offset = overlap_length - read1_length; overlap_length = read1_length; } if (raw_read1_length <= raw_read2_length) { read_batch1.TrimSequenceAt(pair_index, overlap_length); read_batch2.TrimSequenceAt(pair_index, overlap_length + read2_offset); } else { read_batch1.TrimSequenceAt(pair_index, overlap_length + read2_offset); read_batch2.TrimSequenceAt(pair_index, overlap_length); } is_merged = true; // std::cerr << "Trimed! overlap length: " << overlap_length << ", " << // read1.GetLength() << " " << read2.GetLength() << "\n"; break; } seed_start_position = negative_read2.find( read1 + si * seed_length, seed_start_position + 1, seed_length); } if (is_merged) { break; } } } bool Chromap::PairedEndReadWithBarcodeIsDuplicate( uint32_t pair_index, const SequenceBatch &barcode_batch, const SequenceBatch &read_batch1, const SequenceBatch &read_batch2) { int dedupe_seed_length = 16; uint32_t barcode_length = barcode_batch.GetSequenceLengthAt(pair_index); uint64_t barcode_key = barcode_batch.GenerateSeedFromSequenceAt(pair_index, 0, barcode_length); uint64_t read1_seed1 = read_batch1.GenerateSeedFromSequenceAt(pair_index, 0, dedupe_seed_length); uint64_t read2_seed1 = read_batch2.GenerateSeedFromSequenceAt(pair_index, 0, dedupe_seed_length); uint64_t read_seed_key = (read1_seed1 << (dedupe_seed_length * 2)) | read2_seed1; uint64_t read1_seed2 = read_batch1.GenerateSeedFromSequenceAt( pair_index, dedupe_seed_length, dedupe_seed_length * 2); uint64_t read2_seed2 = read_batch2.GenerateSeedFromSequenceAt( pair_index, dedupe_seed_length, dedupe_seed_length * 2); khiter_t barcode_table_iterator = kh_get(k64_seq, barcode_lookup_table_, barcode_key); if (barcode_table_iterator != kh_end(barcode_lookup_table_)) { uint32_t read_lookup_table_index = kh_value(barcode_lookup_table_, barcode_table_iterator); // std::cerr << "Have barcode, try to check read. 
" << // read_lookup_table_index << "\n"; khash_t(k128) *read_lookup_table = read_lookup_tables_[read_lookup_table_index]; khiter_t read_lookup_table_iterator = kh_get(k128, read_lookup_table, read_seed_key); if (read_lookup_table_iterator != kh_end(read_lookup_table)) { // std::cerr << "Have barcode, have read, try whether match.\n"; uint128_t read_seeds = kh_value(read_lookup_table, read_lookup_table_iterator); if (read_seeds.first == read1_seed2 && read_seeds.second == read2_seed2) { // std::cerr << "Have barcode, have read, and match.\n"; return true; } else { // std::cerr << "Have barcode, have read, but don't match.\n"; return false; } } else { // std::cerr << "Have barcode, no read.\n"; uint128_t read_seeds = {.first = read1_seed2, .second = read2_seed2}; int khash_return_code; khiter_t read_lookup_table_insert_iterator = kh_put(k128, read_lookup_table, read_seed_key, &khash_return_code); assert(khash_return_code != -1 && khash_return_code != 0); kh_value(read_lookup_table, read_lookup_table_insert_iterator) = read_seeds; // std::cerr << "Have barcode, no read.\n"; return false; } } else { // insert the barcode and append a new read hash table to tables and then // insert the reads // std::cerr << "No barcode, no read.\n"; int khash_return_code; khiter_t barcode_table_insert_iterator = kh_put(k64_seq, barcode_lookup_table_, barcode_key, &khash_return_code); assert(khash_return_code != -1 && khash_return_code != 0); kh_value(barcode_lookup_table_, barcode_table_insert_iterator) = read_lookup_tables_.size(); khash_t(k128) *read_lookup_table = kh_init(k128); khiter_t read_lookup_table_iterator = kh_put(k128, read_lookup_table, read_seed_key, &khash_return_code); assert(khash_return_code != -1 && khash_return_code != 0); uint128_t read_seeds = {.first = read1_seed2, .second = read2_seed2}; kh_value(read_lookup_table, read_lookup_table_iterator) = read_seeds; read_lookup_tables_.push_back(read_lookup_table); // std::cerr << "No barcode, no read.\n"; return false; 
} } uint32_t Chromap::SampleInputBarcodesAndExamineLength() { if (mapping_parameters_.is_bulk_data) { return 0; } uint32_t sample_batch_size = 1000; SequenceBatch barcode_batch(sample_batch_size, barcode_effective_range_); barcode_batch.InitializeLoading(mapping_parameters_.barcode_file_paths[0]); uint32_t num_loaded_barcodes = barcode_batch.LoadBatch(); uint32_t cell_barcode_length = barcode_batch.GetSequenceLengthAt(0); for (uint32_t i = 1; i < num_loaded_barcodes; ++i) { if (barcode_batch.GetSequenceLengthAt(i) != cell_barcode_length) { ExitWithMessage("ERROR: barcode lengths are not equal in the sample!"); } } barcode_batch.FinalizeLoading(); return cell_barcode_length; } void Chromap::LoadBarcodeWhitelist() { double real_start_time = GetRealTime(); int num_barcodes = 0; if (1) { gzFile barcode_whitelist_file = gzopen(mapping_parameters_.barcode_whitelist_file_path.c_str(), "r"); const uint32_t barcode_buffer_size = 256; char barcode[barcode_buffer_size]; while (gzgets(barcode_whitelist_file, barcode, barcode_buffer_size) != NULL) { size_t barcode_length = strlen(barcode); if (barcode[barcode_length - 1] == '\n') { barcode[barcode_length - 1] = '\0'; --barcode_length; } if (barcode_length > 32) { ExitWithMessage("ERROR: barcode length is greater than 32!"); } if (barcode_length != barcode_length_) { if (num_barcodes == 0) { ExitWithMessage( "ERROR: whitelist and input barcode lengths are not equal!"); } else { ExitWithMessage( "ERROR: barcode lengths are not equal in the whitelist!"); } } uint64_t barcode_key = GenerateSeedFromSequence( barcode, barcode_length, 0, barcode_length); int khash_return_code; khiter_t barcode_whitelist_lookup_table_iterator = kh_put(k64_seq, barcode_whitelist_lookup_table_, barcode_key, &khash_return_code); kh_value(barcode_whitelist_lookup_table_, barcode_whitelist_lookup_table_iterator) = 0; assert(khash_return_code != -1 && khash_return_code != 0); ++num_barcodes; } if (!gzeof(barcode_whitelist_file)) { ExitWithMessage("ERROR: 
barcode whitelist file does not exist or is truncated!"); } gzclose(barcode_whitelist_file); } else { std::ifstream barcode_whitelist_file_stream( mapping_parameters_.barcode_whitelist_file_path); std::string barcode_whitelist_file_line; // bool first_line = true; while (getline(barcode_whitelist_file_stream, barcode_whitelist_file_line)) { std::stringstream barcode_whitelist_file_line_string_stream( barcode_whitelist_file_line); //// skip the header // if (barcode_whitelist_file_line[0] == '#' || // barcode_whitelist_file_line.find("kmer") == 0) { // continue; //} std::string barcode; barcode_whitelist_file_line_string_stream >> barcode; size_t barcode_length = barcode.length(); if (barcode_length > 32) { ExitWithMessage("ERROR: barcode length is greater than 32!"); } if (barcode_length != barcode_length_) { if (num_barcodes == 0) { ExitWithMessage( "ERROR: whitelist and input barcode lengths are not equal!"); } else { ExitWithMessage( "ERROR: barcode lengths are not equal in the whitelist!"); } } // if (first_line) { // //size_t barcode_length = kmer.length(); // // Allocate memory to save pore model parameters // //size_t num_pore_models = 1 << (kmer_size_ * 2); // //pore_models_.assign(num_pore_models, PoreModelParameters()); // //first_line = false; //} // assert(kmer.length() == (size_t)kmer_size_); uint64_t barcode_key = GenerateSeedFromSequence( barcode.data(), barcode_length, 0, barcode_length); // PoreModelParameters &pore_model_parameters = // pore_models_[kmer_hash_value]; barcode_whitelist_file_line_string_stream // >> pore_model_parameters.level_mean >> pore_model_parameters.level_stdv // >> pore_model_parameters.sd_mean >> pore_model_parameters.sd_stdv; int khash_return_code; khiter_t barcode_whitelist_lookup_table_iterator = kh_put(k64_seq, barcode_whitelist_lookup_table_, barcode_key, &khash_return_code); kh_value(barcode_whitelist_lookup_table_, barcode_whitelist_lookup_table_iterator) = 0; assert(khash_return_code != -1 && khash_return_code != 
0); ++num_barcodes; } barcode_whitelist_file_stream.close(); } std::cerr << "Loaded " << num_barcodes << " barcodes in " << GetRealTime() - real_start_time << "s.\n"; } void Chromap::ComputeBarcodeAbundance(uint64_t max_num_sample_barcodes) { double real_start_time = GetRealTime(); SequenceBatch barcode_batch(read_batch_size_, barcode_effective_range_); for (size_t read_file_index = 0; read_file_index < mapping_parameters_.read_file1_paths.size(); ++read_file_index) { barcode_batch.InitializeLoading( mapping_parameters_.barcode_file_paths[read_file_index]); uint32_t num_loaded_barcodes = barcode_batch.LoadBatch(); while (num_loaded_barcodes > 0) { for (uint32_t barcode_index = 0; barcode_index < num_loaded_barcodes; ++barcode_index) { std::vector N_pos; // position of Ns barcode_batch.GetSequenceNsAt(barcode_index, /*little_endian=*/true, N_pos); if (N_pos.size() > 0) continue; uint32_t barcode_length = barcode_batch.GetSequenceLengthAt(barcode_index); uint64_t barcode_key = barcode_batch.GenerateSeedFromSequenceAt( barcode_index, 0, barcode_length); khiter_t barcode_whitelist_lookup_table_iterator = kh_get(k64_seq, barcode_whitelist_lookup_table_, barcode_key); if (barcode_whitelist_lookup_table_iterator != kh_end(barcode_whitelist_lookup_table_)) { // Correct barcode kh_value(barcode_whitelist_lookup_table_, barcode_whitelist_lookup_table_iterator) += 1; ++num_sample_barcodes_; } } if (!mapping_parameters_.skip_barcode_check && num_sample_barcodes_ * 20 < num_loaded_barcodes) { // Since num_loaded_pairs is a constant, this if is actuaclly only // effective in the first iteration ExitWithMessage( "Less than 5\% barcodes can be found or corrected based on the " "barcode whitelist.\nPlease check whether the barcode whitelist " "matches the data, e.g. length, reverse-complement. 
If this is a " "false warning, please run Chromap with the option " "--skip-barcode-check."); } if (num_sample_barcodes_ >= max_num_sample_barcodes) { break; } num_loaded_barcodes = barcode_batch.LoadBatch(); } barcode_batch.FinalizeLoading(); if (num_sample_barcodes_ >= max_num_sample_barcodes) { break; } } std::cerr << "Compute barcode abundance using " << num_sample_barcodes_ << " in " << GetRealTime() - real_start_time << "s.\n"; } void Chromap::UpdateBarcodeAbundance(uint32_t num_loaded_barcodes, const SequenceBatch &barcode_batch) { double real_start_time = GetRealTime(); for (uint32_t barcode_index = 0; barcode_index < num_loaded_barcodes; ++barcode_index) { uint32_t barcode_length = barcode_batch.GetSequenceLengthAt(barcode_index); uint64_t barcode_key = barcode_batch.GenerateSeedFromSequenceAt( barcode_index, 0, barcode_length); khiter_t barcode_whitelist_lookup_table_iterator = kh_get(k64_seq, barcode_whitelist_lookup_table_, barcode_key); if (barcode_whitelist_lookup_table_iterator != kh_end(barcode_whitelist_lookup_table_)) { // Correct barcode kh_value(barcode_whitelist_lookup_table_, barcode_whitelist_lookup_table_iterator) += 1; ++num_sample_barcodes_; } } std::cerr << "Update barcode abundance using " << num_sample_barcodes_ << " in " << GetRealTime() - real_start_time << "s.\n"; } bool Chromap::CorrectBarcodeAt(uint32_t barcode_index, SequenceBatch &barcode_batch, uint64_t &num_barcode_in_whitelist, uint64_t &num_corrected_barcode) { const uint32_t barcode_length = barcode_batch.GetSequenceLengthAt(barcode_index); const uint64_t barcode_key = barcode_batch.GenerateSeedFromSequenceAt( barcode_index, 0, barcode_length); khiter_t barcode_whitelist_lookup_table_iterator = kh_get(k64_seq, barcode_whitelist_lookup_table_, barcode_key); std::vector N_pos; // position of Ns barcode_batch.GetSequenceNsAt(barcode_index, /*little_endian=*/true, N_pos); if (N_pos.size() > (uint32_t)mapping_parameters_.barcode_correction_error_threshold) return false; if 
(N_pos.size() == 0 && barcode_whitelist_lookup_table_iterator !=
                              kh_end(barcode_whitelist_lookup_table_)) {
    // Correct barcode: no N and an exact whitelist hit.
    ++num_barcode_in_whitelist;
    return true;
  } else if (mapping_parameters_.barcode_correction_error_threshold > 0) {
    // Need to correct this barcode.
    //
    // The key stores one base per two bits (mask 3 shifted by 2 * i). Every
    // candidate key that differs from the original in one position (or two,
    // when the threshold is 2) is looked up in the whitelist; each hit is
    // recorded together with a score.
    // const char *barcode = barcode_batch->GetSequenceAt(barcode_index);
    // std::cerr << barcode_index << " barcode " << barcode << " needs
    // correction\n";
    const char *barcode_qual = barcode_batch.GetSequenceQualAt(barcode_index);
    std::vector corrected_barcodes_with_quals;
    uint64_t mask = (uint64_t)3;
    // Without Ns every position is tried with the 3 other bases. With an N,
    // only that position is tried, with all 4 bases.
    // NOTE(review): presumably N_pos holds key positions (little_endian was
    // requested) -- confirm against SequenceBatch::GetSequenceNsAt.
    uint32_t i_start = 0;
    uint32_t i_end = barcode_length;
    uint32_t ti_limit = 3;
    if (N_pos.size() > 0) {
      i_start = N_pos[0];
      i_end = N_pos[0] + 1;
      ti_limit = 4;
    }
    for (uint32_t i = i_start; i < i_end; ++i) {
      // Key with the two bits of position i cleared.
      uint64_t barcode_key_to_change = mask << (2 * i);
      barcode_key_to_change = ~barcode_key_to_change;
      barcode_key_to_change &= barcode_key;
      uint64_t base_to_change1 = (barcode_key >> (2 * i)) & mask;
      for (uint32_t ti = 0; ti < ti_limit; ++ti) {
        // change the base (cycles through the 2-bit codes modulo 4)
        base_to_change1 += 1;
        base_to_change1 &= mask;
        // generate the corrected key
        uint64_t corrected_barcode_key =
            barcode_key_to_change | (base_to_change1 << (2 * i));
        barcode_whitelist_lookup_table_iterator = kh_get(
            k64_seq, barcode_whitelist_lookup_table_, corrected_barcode_key);
        if (barcode_whitelist_lookup_table_iterator !=
            kh_end(barcode_whitelist_lookup_table_)) {
          // find one possible corrected barcode
          double barcode_abundance =
              kh_value(barcode_whitelist_lookup_table_,
                       barcode_whitelist_lookup_table_iterator) /
              (double)num_sample_barcodes_;
          // Key position i is paired with sequence index
          // barcode_length - 1 - i. The quality character is shifted by 33
          // and clamped to [3, 40].
          int qual_offset = 33;
          int adjusted_qual =
              barcode_qual[barcode_length - 1 - i] - qual_offset;
          adjusted_qual = adjusted_qual > 40 ? 40 : adjusted_qual;
          adjusted_qual = adjusted_qual < 3 ? 3 : adjusted_qual;
          // score = 10^(-Q/10) * relative abundance of the candidate.
          double score =
              pow(10.0, ((-adjusted_qual) / 10.0)) * barcode_abundance;
          // Second base fields are 0 for a single-substitution candidate.
          corrected_barcodes_with_quals.emplace_back(
              BarcodeWithQual{barcode_length - 1 - i,
                              Uint8ToChar(base_to_change1), 0, 0, score});
          // std::cerr << "1score: " << score << " pos1: " << barcode_length - 1
          // - i << " b1: " << base_to_change1 << " pos2: " << 0 << " b2: " <<
          // (char)0 << "\n";
        }
        if (mapping_parameters_.barcode_correction_error_threshold == 2) {
          // Second substitution at a key position j > i (or at the second N
          // when the barcode has exactly two Ns), applied on top of the
          // single-substitution key.
          uint32_t j_start = i + 1;
          uint32_t j_end = barcode_length;
          uint32_t ti2_limit = 3;
          if (N_pos.size() == 2) {
            j_start = N_pos[1];
            j_end = N_pos[1] + 1;
            ti2_limit = 4;
          }
          for (uint32_t j = j_start; j < j_end; ++j) {
            // NOTE(review): the first assignment is overwritten by the next
            // statement and has no effect.
            uint64_t barcode_key_to_change2 = mask << (2 * i);
            barcode_key_to_change2 = mask << (2 * j);
            barcode_key_to_change2 = ~barcode_key_to_change2;
            barcode_key_to_change2 &= corrected_barcode_key;
            uint64_t base_to_change2 =
                (corrected_barcode_key >> (2 * j)) & mask;
            for (uint32_t ti2 = 0; ti2 < ti2_limit; ++ti2) {
              // change the base
              base_to_change2 += 1;
              base_to_change2 &= mask;
              // generate the corrected key
              uint64_t corrected_barcode_key2 =
                  barcode_key_to_change2 | (base_to_change2 << (2 * j));
              barcode_whitelist_lookup_table_iterator =
                  kh_get(k64_seq, barcode_whitelist_lookup_table_,
                         corrected_barcode_key2);
              if (barcode_whitelist_lookup_table_iterator !=
                  kh_end(barcode_whitelist_lookup_table_)) {
                // find one possible corrected barcode
                double barcode_abundance =
                    kh_value(barcode_whitelist_lookup_table_,
                             barcode_whitelist_lookup_table_iterator) /
                    (double)num_sample_barcodes_;
                int qual_offset = 33;
                int adjusted_qual =
                    barcode_qual[barcode_length - 1 - j] - qual_offset;
                adjusted_qual = adjusted_qual > 40 ? 40 : adjusted_qual;
                adjusted_qual = adjusted_qual < 3 ? 3 : adjusted_qual;
                int adjusted_qual1 =
                    barcode_qual[barcode_length - 1 - i] - qual_offset;
                adjusted_qual1 = adjusted_qual1 > 40 ? 40 : adjusted_qual1;
                adjusted_qual1 = adjusted_qual1 < 3 ? 3 : adjusted_qual1;
                // The clamped qualities of both positions are summed before
                // scoring.
                adjusted_qual += adjusted_qual1;
                double score =
                    pow(10.0, ((-adjusted_qual) / 10.0)) * barcode_abundance;
                corrected_barcodes_with_quals.emplace_back(BarcodeWithQual{
                    barcode_length - 1 - i, Uint8ToChar(base_to_change1),
                    barcode_length - 1 - j, Uint8ToChar(base_to_change2),
                    score});
                // std::cerr << "2score: " << score << " pos1: " <<
                // barcode_length - 1 - i << " b1: " << base_to_change1 << "
                // pos2: " << barcode_length - 1 -j << " b2: " <<
                // base_to_change2
                // << "\n";
              }
            }
          }
        }
      }
    }
    size_t num_possible_corrected_barcodes =
        corrected_barcodes_with_quals.size();
    if (num_possible_corrected_barcodes == 0) {
      // Barcode cannot be corrected, leave it for downstream
      return false;
    } else if (num_possible_corrected_barcodes == 1) {
      // Just correct it
      // std::cerr << "Corrected the barcode from " << barcode << " to ";
      barcode_batch.CorrectBaseAt(
          barcode_index, corrected_barcodes_with_quals[0].corrected_base_index1,
          corrected_barcodes_with_quals[0].correct_base1);
      // A non-zero second base marks a two-substitution candidate.
      if (corrected_barcodes_with_quals[0].correct_base2 != 0) {
        barcode_batch.CorrectBaseAt(
            barcode_index,
            corrected_barcodes_with_quals[0].corrected_base_index2,
            corrected_barcodes_with_quals[0].correct_base2);
      }
      // std::cerr << barcode << "\n";
      // std::cerr << "score: " << corrected_barcodes_with_quals[0].score <<
      // "\n"; std::cerr << "score: " << corrected_barcodes_with_quals[0].score
      // << " pos1: " << corrected_barcodes_with_quals[0].corrected_base_index1
      // << " b1: " << corrected_barcodes_with_quals[0].correct_base1 << " pos2:
      // " << corrected_barcodes_with_quals[0].corrected_base_index2 << " b2: "
      // << corrected_barcodes_with_quals[0].correct_base2 << "\n";
      ++num_corrected_barcode;
      return true;
    } else {
      // Select the best correction: after the descending sort the candidate
      // at index 0 is applied only if its share of the total score exceeds
      // barcode_correction_probability_threshold.
      std::sort(corrected_barcodes_with_quals.begin(),
                corrected_barcodes_with_quals.end(), std::greater());
      // int num_ties = 0;
      double sum_score = 0;
      for (size_t ci = 0; ci < num_possible_corrected_barcodes; ++ci) {
        sum_score += corrected_barcodes_with_quals[ci].score;
        // std::cerr << ci << " score: " <<
        // corrected_barcodes_with_quals[ci].score << " pos1: " <<
        // corrected_barcodes_with_quals[ci].corrected_base_index1 << " b1: " <<
        // corrected_barcodes_with_quals[ci].correct_base1 << " pos2: " <<
        // corrected_barcodes_with_quals[ci].corrected_base_index2 << " b2: " <<
        // corrected_barcodes_with_quals[ci].correct_base2 << "\n"; if
        // (corrected_barcodes_with_quals[ci].qual ==
        // corrected_barcodes_with_quals[0].qual) {
        //  ++num_ties;
        //}
      }
      int best_corrected_barcode_index = 0;
      // if (num_ties > 0) {
      //  std::mt19937 tmp_generator(11);
      //  std::uniform_int_distribution distribution(0, num_ties);
      //  // important: inclusive range best_corrected_barcode_index =
      //  distribution(tmp_generator);
      //}
      // std::cerr << "Corrected the barcode from " << barcode << " to ";
      double confidence_threshold =
          mapping_parameters_.barcode_correction_probability_threshold;
      if (corrected_barcodes_with_quals[best_corrected_barcode_index].score /
              sum_score >
          confidence_threshold) {
        barcode_batch.CorrectBaseAt(
            barcode_index,
            corrected_barcodes_with_quals[best_corrected_barcode_index]
                .corrected_base_index1,
            corrected_barcodes_with_quals[best_corrected_barcode_index]
                .correct_base1);
        if (corrected_barcodes_with_quals[best_corrected_barcode_index]
                .correct_base2 != 0) {
          barcode_batch.CorrectBaseAt(
              barcode_index,
              corrected_barcodes_with_quals[best_corrected_barcode_index]
                  .corrected_base_index2,
              corrected_barcodes_with_quals[best_corrected_barcode_index]
                  .correct_base2);
        }
        // std::cerr << barcode << "\n";
        // std::cerr << "score: " <<
        // corrected_barcodes_with_quals[best_corrected_barcode_index].score <<
        // "\n"; std::cerr << "best score: " <<
        // corrected_barcodes_with_quals[best_corrected_barcode_index].score <<
        // " sum score: " << sum_score << "\n";
        ++num_corrected_barcode;
        return true;
      } else {
        // The best candidate is not dominant enough; leave the barcode as is.
        // std::cerr << "Didnt pass filter: " <<
        // corrected_barcodes_with_quals[best_corrected_barcode_index].score /
        // sum_score << "\n"; std::cerr << "best score: " <<
        // corrected_barcodes_with_quals[best_corrected_barcode_index].score <<
        // " sum score: " << sum_score << "\n";
        return false;
      }
    }
  } else {
    // Correction is disabled (threshold is 0) and there was no exact match.
    return false;
  }
}

// Prints the barcode whitelist/correction counters to stderr.
void Chromap::OutputBarcodeStatistics() {
  std::cerr << "Number of barcodes in whitelist: " << num_barcode_in_whitelist_
            << ".\n";
  std::cerr << "Number of corrected barcodes: " << num_corrected_barcode_
            << ".\n";
}

// Prints the read/candidate/mapping counters to stderr. The multi-mapping
// figures are derived by subtracting the uniquely-mapped read count.
void Chromap::OutputMappingStatistics() {
  std::cerr << "Number of reads: " << num_reads_ << ".\n";
  // std::cerr << "Number of duplicated reads: " << num_duplicated_reads_ <<
  // ".\n";
  std::cerr << "Number of mapped reads: " << num_mapped_reads_ << ".\n";
  std::cerr << "Number of uniquely mapped reads: "
            << num_uniquely_mapped_reads_ << ".\n";
  std::cerr << "Number of reads have multi-mappings: "
            << num_mapped_reads_ - num_uniquely_mapped_reads_ << ".\n";
  std::cerr << "Number of candidates: " << num_candidates_ << ".\n";
  std::cerr << "Number of mappings: " << num_mappings_ << ".\n";
  std::cerr << "Number of uni-mappings: " << num_uniquely_mapped_reads_
            << ".\n";
  std::cerr << "Number of multi-mappings: "
            << num_mappings_ - num_uniquely_mapped_reads_ << ".\n";
}

// Parses a comma-separated read format string. Each field must start with
// "r1", "r2" or "bc" and is handed to the matching effective-range parser;
// any other field, or a field its parser rejects, ends the program through
// ExitWithMessage(). An empty format string leaves all ranges untouched.
void Chromap::ParseReadFormat(const std::string &read_format) {
  if (read_format.empty()) {
    return;
  }
  read1_effective_range_.InitializeParsing();
  read2_effective_range_.InitializeParsing();
  barcode_effective_range_.InitializeParsing();
  uint32_t i, j;
  for (i = 0; i < read_format.size();) {
    // Advance j to the next ',' (or the end); [i, j) is one field.
    for (j = i + 1; j < read_format.size() && read_format[j] != ','; ++j)
      ;
    bool parse_success = true;
    if (read_format[i] == 'r' && read_format[i + 1] == '1') {
      parse_success =
          read1_effective_range_.ParseFormatStringAndAppendEffectiveRange(
              read_format.c_str() + i, j - i);
    } else if (read_format[i] == 'r' && read_format[i + 1] == '2') {
      parse_success =
          read2_effective_range_.ParseFormatStringAndAppendEffectiveRange(
              read_format.c_str() + i, j - i);
    } else if (read_format[i] == 'b' && read_format[i + 1] == 'c') {
parse_success = barcode_effective_range_.ParseFormatStringAndAppendEffectiveRange( read_format.c_str() + i, j - i); } else { parse_success = false; } if (!parse_success) { ExitWithMessage("Unknown read format: " + read_format + "\n"); } i = j + 1; } read1_effective_range_.FinalizeParsing(); read2_effective_range_.FinalizeParsing(); barcode_effective_range_.FinalizeParsing(); } void Chromap::GenerateCustomRidRanks( const std::string &custom_rid_order_file_path, uint32_t num_reference_sequences, const SequenceBatch &reference, std::vector &rid_ranks) { for (uint32_t i = 0; i < num_reference_sequences; ++i) { rid_ranks.emplace_back(i); } if (custom_rid_order_file_path.empty()) { return; } std::unordered_map ref_name_to_rank; std::ifstream custom_rid_order_file_stream(custom_rid_order_file_path); std::string ref_name; uint32_t ref_rank = 0; while (getline(custom_rid_order_file_stream, ref_name)) { ref_name_to_rank[ref_name] = ref_rank; ref_rank += 1; } custom_rid_order_file_stream.close(); // First, rank the chromosomes in the custom order provided by users. for (uint32_t i = 0; i < num_reference_sequences; ++i) { std::string ref_name(reference.GetSequenceNameAt(i)); if (ref_name_to_rank.find(ref_name) != ref_name_to_rank.end()) { rid_ranks[i] = ref_name_to_rank[ref_name]; } else { rid_ranks[i] = -1; } } // There might be some rids without any custom order. We just order them based // on their original order in the reference file. uint32_t k = ref_name_to_rank.size(); // Rank the remaining chromosomes. 
for (uint32_t i = 0; i < num_reference_sequences; ++i) { if (rid_ranks[i] == -1) { rid_ranks[i] = k; ++k; } } if (k > num_reference_sequences) { ExitWithMessage( "ERROR: unknown chromsome names found in chromosome order file."); } } void Chromap::RerankCandidatesRid(std::vector &candidates) { for (size_t i = 0; i < candidates.size(); ++i) { uint64_t rid = (uint32_t)(candidates[i].position >> 32); rid = custom_rid_rank_[rid]; candidates[i].position = (candidates[i].position & (uint64_t)0xffffffff) | (rid << 32); } } } // namespace chromap ================================================ FILE: src/chromap.h ================================================ #ifndef CHROMAP_H_ #define CHROMAP_H_ #include #include #include #include #include #include #include // Used these two for k-minhash #include #include // Used for frip est params splitting #include "candidate_processor.h" #include "cxxopts.hpp" #include "draft_mapping_generator.h" #include "feature_barcode_matrix.h" #include "index.h" #include "index_parameters.h" #include "khash.h" #include "mapping_generator.h" #include "mapping_metadata.h" #include "mapping_parameters.h" #include "mapping_processor.h" #include "mapping_writer.h" #include "minimizer_generator.h" #include "mmcache.hpp" #include "paired_end_mapping_metadata.h" #include "sequence_batch.h" #include "sequence_effective_range.h" #include "temp_mapping.h" #include "utils.h" #define CHROMAP_VERSION "0.3.3-r521" namespace chromap { class K_MinHash { public: /* * MinHash Class - used to estimate the number of unique cache slots * hit by each barcode * * @param k - size of MinHash sketch * @param range - range of possible cache ids */ K_MinHash(size_t k, size_t range) : k_(k), range_(range) {} inline void add(size_t num) { /* If num is not present in queue, we will add it */ if (unique_slots_.find(num) == unique_slots_.end()) { unique_slots_.insert(num); pq_.push(num); // only keep smallest k numbers if (pq_.size() > k_) { unique_slots_.erase(pq_.top()); 
pq_.pop(); } } } inline size_t compute_cardinality() { /* Use k-MinHash estimator to return estimated cardinality */ if (pq_.size() < k_) {return 0;} size_t cardinality = (k_ * range_)/pq_.top() - 1; return cardinality; } private: size_t k_; size_t range_; /* Uses an unordered set to have O(1) find queries*/ std::priority_queue pq_; // max-heap std::unordered_set unique_slots_; // keep track of unique values }; class Chromap { public: Chromap() = delete; // For index construction Chromap(const IndexParameters &index_parameters) : index_parameters_(index_parameters) { barcode_lookup_table_ = NULL; barcode_whitelist_lookup_table_ = NULL; } // For mapping Chromap(const MappingParameters &mapping_parameters) : mapping_parameters_(mapping_parameters) { barcode_lookup_table_ = kh_init(k64_seq); barcode_whitelist_lookup_table_ = kh_init(k64_seq); ParseReadFormat(mapping_parameters.read_format); } ~Chromap() { if (barcode_whitelist_lookup_table_ != NULL) { kh_destroy(k64_seq, barcode_whitelist_lookup_table_); } if (barcode_lookup_table_ != NULL) { kh_destroy(k64_seq, barcode_lookup_table_); } if (read_lookup_tables_.size() > 0) { for (uint32_t i = 0; i < read_lookup_tables_.size(); ++i) { kh_destroy(k128, read_lookup_tables_[i]); } } } void ConstructIndex(); template void MapSingleEndReads(); template void MapPairedEndReads(); private: uint32_t LoadSingleEndReadsWithBarcodes(SequenceBatch &read_batch, SequenceBatch &barcode_batch, bool parallel_parsing); uint32_t LoadPairedEndReadsWithBarcodes(SequenceBatch &read_batch1, SequenceBatch &read_batch2, SequenceBatch &barcode_batch, bool parallel_parsing); void TrimAdapterForPairedEndRead(uint32_t pair_index, SequenceBatch &read_batch1, SequenceBatch &read_batch2); bool PairedEndReadWithBarcodeIsDuplicate(uint32_t pair_index, const SequenceBatch &barcode_batch, const SequenceBatch &read_batch1, const SequenceBatch &read_batch2); uint32_t SampleInputBarcodesAndExamineLength(); void LoadBarcodeWhitelist(); void 
ComputeBarcodeAbundance(uint64_t max_num_sample_barcodes);
  void UpdateBarcodeAbundance(uint32_t num_loaded_barcodes,
                              const SequenceBatch &barcode_batch);
  // Returns true when the barcode at barcode_index is in the whitelist or was
  // corrected in place; the two counters are incremented accordingly.
  bool CorrectBarcodeAt(uint32_t barcode_index, SequenceBatch &barcode_batch,
                        uint64_t &num_barcode_in_whitelist,
                        uint64_t &num_corrected_barcode);
  void OutputBarcodeStatistics();
  void OutputMappingStatistics();
  void ParseReadFormat(const std::string &read_format);

  // User custom rid order file contains a column of reference sequence names
  // and there is one name on each row. The reference sequence name on the ith
  // row means the rank of this sequence is i. This function loads the custom
  // rid order file and generates a mapping from the original rids to their
  // custom ranks, e.g., rid_ranks[i] is the custom rank of the ith rid in the
  // reference.
  void GenerateCustomRidRanks(const std::string &custom_rid_order_file_path,
                              uint32_t num_reference_sequences,
                              const SequenceBatch &reference,
                              std::vector &rid_ranks);

  // TODO: generate reranked candidates directly.
  void RerankCandidatesRid(std::vector &candidates);

  // Parameters. Only the set matching the constructor that was used is
  // initialised from the caller; the other one is default-constructed.
  const IndexParameters index_parameters_;
  const MappingParameters mapping_parameters_;

  // Default batch size, # reads for single-end reads, # read pairs for
  // paired-end reads.
  const uint32_t read_batch_size_ = 500000;

  // 0-start, 1-end (inclusive), 2-strand(-1:minus, 1:plus)
  SequenceEffectiveRange barcode_effective_range_;
  SequenceEffectiveRange read1_effective_range_;
  SequenceEffectiveRange read2_effective_range_;

  // Custom reference-sequence ranks; custom_rid_rank_ is indexed by the
  // original rid.
  std::vector custom_rid_rank_;
  std::vector pairs_custom_rid_rank_;

  // Whitelisted barcode keys; the stored value is used as an abundance count.
  khash_t(k64_seq) * barcode_whitelist_lookup_table_;
  // For identical read dedupe
  khash_t(k64_seq) * barcode_lookup_table_;
  std::vector read_lookup_tables_;

  // For mapping.
  const int min_unique_mapping_mapq_ = 4;

  // For mapping stats.
  uint64_t num_candidates_ = 0;
  uint64_t num_mappings_ = 0;
  uint64_t num_mapped_reads_ = 0;
  uint64_t num_uniquely_mapped_reads_ = 0;
  uint64_t num_reads_ = 0;
  // # identical reads.
// uint64_t num_duplicated_reads_ = 0; // For barcode stats. const uint64_t initial_num_sample_barcodes_ = 20000000; uint64_t num_sample_barcodes_ = 0; uint64_t num_barcode_in_whitelist_ = 0; uint64_t num_corrected_barcode_ = 0; uint32_t barcode_length_ = 0; }; template void Chromap::MapSingleEndReads() { double real_start_time = GetRealTime(); SequenceBatch reference; reference.InitializeLoading(mapping_parameters_.reference_file_path); reference.LoadAllSequences(); uint32_t num_reference_sequences = reference.GetNumSequences(); if (mapping_parameters_.custom_rid_order_file_path.length() > 0) { GenerateCustomRidRanks(mapping_parameters_.custom_rid_order_file_path, num_reference_sequences, reference, custom_rid_rank_); reference.ReorderSequences(custom_rid_rank_); } Index index(mapping_parameters_.index_file_path); index.Load(); const int kmer_size = index.GetKmerSize(); const int window_size = index.GetWindowSize(); // index.Statistics(num_sequences, reference); SequenceBatch read_batch(read_batch_size_, read1_effective_range_); SequenceBatch read_batch_for_loading(read_batch_size_, read1_effective_range_); SequenceBatch barcode_batch(read_batch_size_, barcode_effective_range_); SequenceBatch barcode_batch_for_loading(read_batch_size_, barcode_effective_range_); std::vector> mappings_on_diff_ref_seqs; mappings_on_diff_ref_seqs.reserve(num_reference_sequences); for (uint32_t i = 0; i < num_reference_sequences; ++i) { mappings_on_diff_ref_seqs.emplace_back(std::vector()); } std::vector> temp_mapping_file_handles; // Preprocess barcodes for single cell data if (!mapping_parameters_.is_bulk_data) { barcode_length_ = SampleInputBarcodesAndExamineLength(); if (!mapping_parameters_.barcode_whitelist_file_path.empty()) { LoadBarcodeWhitelist(); ComputeBarcodeAbundance(initial_num_sample_barcodes_); } } MinimizerGenerator minimizer_generator(kmer_size, window_size); CandidateProcessor candidate_processor( mapping_parameters_.min_num_seeds_required_for_mapping, 
mapping_parameters_.max_seed_frequencies); MappingProcessor mapping_processor(mapping_parameters_, min_unique_mapping_mapq_); DraftMappingGenerator draft_mapping_generator(mapping_parameters_); MappingGenerator mapping_generator(mapping_parameters_, pairs_custom_rid_rank_); MappingWriter mapping_writer( mapping_parameters_, barcode_length_, pairs_custom_rid_rank_); mapping_writer.OutputHeader(num_reference_sequences, reference); uint32_t num_mappings_in_mem = 0; uint64_t max_num_mappings_in_mem = 1 * ((uint64_t)1 << 30) / sizeof(MappingRecord); if (mapping_parameters_.mapping_output_format == MAPPINGFORMAT_SAM || mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAF || mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAIRS) { max_num_mappings_in_mem = 1 * ((uint64_t)1 << 29) / sizeof(MappingRecord); } mm_cache mm_to_candidates_cache(2000003); mm_to_candidates_cache.SetKmerLength(kmer_size); struct _mm_history *mm_history = new struct _mm_history[read_batch_size_]; // Use bit encoding to represent mapping results // bit 0: is barcode in whitelist uint8_t *read_map_summary = NULL ; if (!mapping_parameters_.summary_metadata_file_path.empty()) { read_map_summary = new uint8_t[read_batch_size_]; memset(read_map_summary, 1, sizeof(*read_map_summary)*read_batch_size_); } static uint64_t thread_num_candidates = 0; static uint64_t thread_num_mappings = 0; static uint64_t thread_num_mapped_reads = 0; static uint64_t thread_num_uniquely_mapped_reads = 0; static uint64_t thread_num_barcode_in_whitelist = 0; static uint64_t thread_num_corrected_barcode = 0; #pragma omp threadprivate( \ thread_num_candidates, thread_num_mappings, thread_num_mapped_reads, \ thread_num_uniquely_mapped_reads, thread_num_barcode_in_whitelist, \ thread_num_corrected_barcode) double real_start_mapping_time = GetRealTime(); for (size_t read_file_index = 0; read_file_index < mapping_parameters_.read_file1_paths.size(); ++read_file_index) { read_batch_for_loading.InitializeLoading( 
mapping_parameters_.read_file1_paths[read_file_index]); if (!mapping_parameters_.is_bulk_data) { barcode_batch_for_loading.InitializeLoading( mapping_parameters_.barcode_file_paths[read_file_index]); } uint32_t num_loaded_reads_for_loading = 0; uint32_t num_loaded_reads = LoadSingleEndReadsWithBarcodes( read_batch_for_loading, barcode_batch_for_loading, mapping_parameters_.num_threads >= 3 ? true : false); read_batch_for_loading.SwapSequenceBatch(read_batch); if (!mapping_parameters_.is_bulk_data) { barcode_batch_for_loading.SwapSequenceBatch(barcode_batch); } std::vector>> mappings_on_diff_ref_seqs_for_diff_threads; std::vector>> mappings_on_diff_ref_seqs_for_diff_threads_for_saving; mappings_on_diff_ref_seqs_for_diff_threads.reserve( mapping_parameters_.num_threads); mappings_on_diff_ref_seqs_for_diff_threads_for_saving.reserve( mapping_parameters_.num_threads); for (int ti = 0; ti < mapping_parameters_.num_threads; ++ti) { mappings_on_diff_ref_seqs_for_diff_threads.emplace_back( std::vector>(num_reference_sequences)); mappings_on_diff_ref_seqs_for_diff_threads_for_saving.emplace_back( std::vector>(num_reference_sequences)); for (uint32_t i = 0; i < num_reference_sequences; ++i) { mappings_on_diff_ref_seqs_for_diff_threads[ti][i].reserve( (num_loaded_reads + num_loaded_reads / 1000 * mapping_parameters_.max_num_best_mappings) / mapping_parameters_.num_threads / num_reference_sequences); mappings_on_diff_ref_seqs_for_diff_threads_for_saving[ti][i].reserve( (num_loaded_reads + num_loaded_reads / 1000 * mapping_parameters_.max_num_best_mappings) / mapping_parameters_.num_threads / num_reference_sequences); } } #pragma omp parallel shared(num_reads_, mm_history, read_map_summary, reference, index, read_batch, barcode_batch, read_batch_for_loading, barcode_batch_for_loading, std::cerr, num_loaded_reads_for_loading, num_loaded_reads, num_reference_sequences, mappings_on_diff_ref_seqs_for_diff_threads, mappings_on_diff_ref_seqs_for_diff_threads_for_saving, 
mappings_on_diff_ref_seqs, temp_mapping_file_handles, mm_to_candidates_cache, mapping_writer, minimizer_generator, candidate_processor, mapping_processor, draft_mapping_generator, mapping_generator, num_mappings_in_mem, max_num_mappings_in_mem) num_threads(mapping_parameters_.num_threads) reduction(+:num_candidates_, num_mappings_, num_mapped_reads_, num_uniquely_mapped_reads_, num_barcode_in_whitelist_, num_corrected_barcode_) { thread_num_candidates = 0; thread_num_mappings = 0; thread_num_mapped_reads = 0; thread_num_uniquely_mapped_reads = 0; thread_num_barcode_in_whitelist = 0; thread_num_corrected_barcode = 0; MappingMetadata mapping_metadata; #pragma omp single { while (num_loaded_reads > 0) { double real_batch_start_time = GetRealTime(); num_reads_ += num_loaded_reads; #pragma omp task { num_loaded_reads_for_loading = LoadSingleEndReadsWithBarcodes( read_batch_for_loading, barcode_batch_for_loading, mapping_parameters_.num_threads >= 12 ? true : false); } // end of openmp loading task uint32_t history_update_threshold = mm_to_candidates_cache.GetUpdateThreshold(num_loaded_reads, num_reads_, false, 0.01); // int grain_size = 10000; //#pragma omp taskloop grainsize(grain_size) //num_tasks(num_threads_* 50) #pragma omp taskloop num_tasks( \ mapping_parameters_.num_threads *mapping_parameters_.num_threads) for (uint32_t read_index = 0; read_index < num_loaded_reads; ++read_index) { bool current_barcode_is_whitelisted = true; if (!mapping_parameters_.barcode_whitelist_file_path.empty()) { current_barcode_is_whitelisted = CorrectBarcodeAt( read_index, barcode_batch, thread_num_barcode_in_whitelist, thread_num_corrected_barcode); } if (!(current_barcode_is_whitelisted || mapping_parameters_.output_mappings_not_in_whitelist)) { if (read_map_summary != NULL) read_map_summary[read_index] = 0; continue; } if (read_batch.GetSequenceLengthAt(read_index) < (uint32_t)mapping_parameters_.min_read_length) { continue; // reads are too short, just drop. 
} read_batch.PrepareNegativeSequenceAt(read_index); mapping_metadata.PrepareForMappingNextRead( mapping_parameters_.max_seed_frequencies[0]); minimizer_generator.GenerateMinimizers( read_batch, read_index, mapping_metadata.minimizers_); if (mapping_metadata.minimizers_.size() > 0) { if (mapping_parameters_.custom_rid_order_file_path.length() > 0) { RerankCandidatesRid(mapping_metadata.positive_candidates_); RerankCandidatesRid(mapping_metadata.negative_candidates_); } if (mm_to_candidates_cache.Query( mapping_metadata, read_batch.GetSequenceLengthAt(read_index)) == -1) { candidate_processor.GenerateCandidates( mapping_parameters_.error_threshold, index, mapping_metadata); } if (read_index < history_update_threshold) { mm_history[read_index].timestamp = num_reads_; mm_history[read_index].minimizers = mapping_metadata.minimizers_; mm_history[read_index].positive_candidates = mapping_metadata.positive_candidates_; mm_history[read_index].negative_candidates = mapping_metadata.negative_candidates_; mm_history[read_index].repetitive_seed_length = mapping_metadata.repetitive_seed_length_; } size_t current_num_candidates = mapping_metadata.GetNumCandidates(); if (current_num_candidates > 0) { thread_num_candidates += current_num_candidates; draft_mapping_generator.GenerateDraftMappings( read_batch, read_index, reference, mapping_metadata); const size_t current_num_draft_mappings = mapping_metadata.GetNumDraftMappings(); if (current_num_draft_mappings > 0) { std::vector> &mappings_on_diff_ref_seqs = mappings_on_diff_ref_seqs_for_diff_threads [omp_get_thread_num()]; mapping_generator.GenerateBestMappingsForSingleEndRead( read_batch, read_index, reference, barcode_batch, mapping_metadata, mappings_on_diff_ref_seqs); thread_num_mappings += std::min(mapping_metadata.GetNumBestMappings(), mapping_parameters_.max_num_best_mappings); ++thread_num_mapped_reads; if (mapping_metadata.GetNumBestMappings() == 1) { ++thread_num_uniquely_mapped_reads; } } } } } #pragma omp taskwait for 
(uint32_t read_index = 0; read_index < history_update_threshold; ++read_index) { if (mm_history[read_index].timestamp != num_reads_) continue; mm_to_candidates_cache.Update( mm_history[read_index].minimizers, mm_history[read_index].positive_candidates, mm_history[read_index].negative_candidates, mm_history[read_index].repetitive_seed_length); if (mm_history[read_index].positive_candidates.size() < mm_history[read_index].positive_candidates.capacity() / 2) { std::vector().swap( mm_history[read_index].positive_candidates); } if (mm_history[read_index].negative_candidates.size() < mm_history[read_index].negative_candidates.capacity() / 2) { std::vector().swap( mm_history[read_index].negative_candidates); } } // std::cerr<<"cache memusage: " << // mm_to_candidates_cache.GetMemoryBytes() <<"\n" ; if (!mapping_parameters_.summary_metadata_file_path.empty()) { if (mapping_parameters_.is_bulk_data) mapping_writer.UpdateSummaryMetadata(0, SUMMARY_METADATA_TOTAL, num_loaded_reads) ; else { uint32_t nonwhitelist_count = 0; for (uint32_t read_index = 0; read_index < num_loaded_reads; ++read_index) if (read_map_summary[read_index] & 1) { mapping_writer.UpdateSummaryMetadata( barcode_batch.GenerateSeedFromSequenceAt(read_index, 0, barcode_length_), SUMMARY_METADATA_TOTAL, 1); } else { ++nonwhitelist_count; } mapping_writer.UpdateSpeicalCategorySummaryMetadata(/*nonwhitelist*/0, SUMMARY_METADATA_TOTAL, nonwhitelist_count); } // By default, set the lowest bit to 1 (whether the barcode is in the whitelist) memset(read_map_summary, 1, sizeof(*read_map_summary)*read_batch_size_); } num_loaded_reads = num_loaded_reads_for_loading; read_batch_for_loading.SwapSequenceBatch(read_batch); barcode_batch_for_loading.SwapSequenceBatch(barcode_batch); mappings_on_diff_ref_seqs_for_diff_threads.swap( mappings_on_diff_ref_seqs_for_diff_threads_for_saving); #pragma omp task { num_mappings_in_mem += mapping_processor.MoveMappingsInBuffersToMappingContainer( num_reference_sequences, 
mappings_on_diff_ref_seqs_for_diff_threads_for_saving, mappings_on_diff_ref_seqs); if (mapping_parameters_.low_memory_mode && num_mappings_in_mem > max_num_mappings_in_mem) { mapping_processor.ParallelSortOutputMappings(num_reference_sequences, mappings_on_diff_ref_seqs, 0); mapping_writer.OutputTempMappings(num_reference_sequences, mappings_on_diff_ref_seqs, temp_mapping_file_handles); if (temp_mapping_file_handles.size() > 850 && temp_mapping_file_handles.size() % 10 == 1) { // every 10 temp files, double the temp file size max_num_mappings_in_mem <<= 1; std::cerr << "Used " << temp_mapping_file_handles.size() << "temp files. Double the temp file volume to " << max_num_mappings_in_mem << "\n" ; } num_mappings_in_mem = 0; } } std::cerr << "Mapped " << num_loaded_reads << " reads in " << GetRealTime() - real_batch_start_time << "s.\n"; } } // end of openmp single { num_barcode_in_whitelist_ += thread_num_barcode_in_whitelist; num_corrected_barcode_ += thread_num_corrected_barcode; num_candidates_ += thread_num_candidates; num_mappings_ += thread_num_mappings; num_mapped_reads_ += thread_num_mapped_reads; num_uniquely_mapped_reads_ += thread_num_uniquely_mapped_reads; } // end of updating shared mapping stats } // end of openmp parallel region read_batch_for_loading.FinalizeLoading(); if (!mapping_parameters_.is_bulk_data) { barcode_batch_for_loading.FinalizeLoading(); } } std::cerr << "Mapped all reads in " << GetRealTime() - real_start_mapping_time << "s.\n"; delete[] mm_history; if (read_map_summary != NULL) delete[] read_map_summary; OutputMappingStatistics(); if (!mapping_parameters_.is_bulk_data) { OutputBarcodeStatistics(); } index.Destroy(); if (mapping_parameters_.low_memory_mode) { // First, process the remaining mappings in the memory and save them on // disk. 
if (num_mappings_in_mem > 0) { mapping_processor.SortOutputMappings(num_reference_sequences, mappings_on_diff_ref_seqs); mapping_writer.OutputTempMappings(num_reference_sequences, mappings_on_diff_ref_seqs, temp_mapping_file_handles); num_mappings_in_mem = 0; } mapping_writer.ProcessAndOutputMappingsInLowMemory( num_mappings_in_mem, num_reference_sequences, reference, barcode_whitelist_lookup_table_, temp_mapping_file_handles); } else { if (mapping_parameters_.Tn5_shift) { mapping_processor.ApplyTn5ShiftOnMappings(num_reference_sequences, mappings_on_diff_ref_seqs); } if (mapping_parameters_.remove_pcr_duplicates) { mapping_processor.RemovePCRDuplicate(num_reference_sequences, mappings_on_diff_ref_seqs, mapping_parameters_.num_threads); std::cerr << "After removing PCR duplications, "; mapping_processor.OutputMappingStatistics(num_reference_sequences, mappings_on_diff_ref_seqs); } else { mapping_processor.ParallelSortOutputMappings(num_reference_sequences, mappings_on_diff_ref_seqs, mapping_parameters_.num_threads); } if (mapping_parameters_.allocate_multi_mappings) { const uint64_t num_multi_mappings = num_mapped_reads_ - num_uniquely_mapped_reads_; mapping_processor.AllocateMultiMappings( num_reference_sequences, num_multi_mappings, mapping_parameters_.multi_mapping_allocation_distance, mappings_on_diff_ref_seqs); std::cerr << "After allocating multi-mappings, "; mapping_processor.OutputMappingStatistics(num_reference_sequences, mappings_on_diff_ref_seqs); mapping_processor.SortOutputMappings(num_reference_sequences, mappings_on_diff_ref_seqs); } mapping_writer.OutputMappings(num_reference_sequences, reference, mappings_on_diff_ref_seqs); } mapping_writer.OutputSummaryMetadata(); reference.FinalizeLoading(); std::cerr << "Total time: " << GetRealTime() - real_start_time << "s.\n"; } template void Chromap::MapPairedEndReads() { double real_start_time = GetRealTime(); // Load reference SequenceBatch reference; 
reference.InitializeLoading(mapping_parameters_.reference_file_path); reference.LoadAllSequences(); uint32_t num_reference_sequences = reference.GetNumSequences(); // Debugging Info (printing out reference information) if (mapping_parameters_.debug_cache) { for (size_t i = 0; i < num_reference_sequences; i++){ std::cout << "[DEBUG][INDEX] seq_i = " << i << " , seq_i_name = " << reference.GetSequenceNameAt(i) << std::endl; } } if (mapping_parameters_.custom_rid_order_file_path.length() > 0) { GenerateCustomRidRanks(mapping_parameters_.custom_rid_order_file_path, num_reference_sequences, reference, custom_rid_rank_); reference.ReorderSequences(custom_rid_rank_); } if (mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAIRS) { GenerateCustomRidRanks( mapping_parameters_.pairs_flipping_custom_rid_order_file_path, num_reference_sequences, reference, pairs_custom_rid_rank_); } // Load index Index index(mapping_parameters_.index_file_path); index.Load(); const int kmer_size = index.GetKmerSize(); const int window_size = index.GetWindowSize(); // index.Statistics(num_sequences, reference); // Initialize read batches SequenceBatch read_batch1(read_batch_size_, read1_effective_range_); SequenceBatch read_batch2(read_batch_size_, read2_effective_range_); SequenceBatch barcode_batch(read_batch_size_, barcode_effective_range_); SequenceBatch read_batch1_for_loading(read_batch_size_, read1_effective_range_); SequenceBatch read_batch2_for_loading(read_batch_size_, read2_effective_range_); SequenceBatch barcode_batch_for_loading(read_batch_size_, barcode_effective_range_); // Check cache-related parameters std::cerr << "Cache Size: " << mapping_parameters_.cache_size << std::endl; std::cerr << "Cache Update Param: " << mapping_parameters_.cache_update_param << std::endl; std::vector seeds_for_batch(500000, 0); // Variables used for counting number of associated cache slots bool output_num_cache_slots_info = mapping_parameters_.output_num_uniq_cache_slots; if 
(mapping_parameters_.summary_metadata_file_path.empty()) { output_num_cache_slots_info = false; } const size_t k_for_minhash = mapping_parameters_.k_for_minhash; std::cerr << "Output number of associated cache slots: " << output_num_cache_slots_info << std::endl; std::cerr << "K for MinHash: " << k_for_minhash << std::endl; int num_locks_for_map = 1000; omp_lock_t map_locks[num_locks_for_map]; for (int i = 0; i < num_locks_for_map; ++i) {omp_init_lock(&map_locks[i]);} std::vector> barcode_peak_map(num_locks_for_map); // Parse out the parameters for chromap score (const, fric, dup, unmapped, lowmapq) std::vector frip_est_params; std::stringstream ss(mapping_parameters_.frip_est_params); std::string token; while(std::getline(ss, token, ';')) { try { auto curr_param = std::stod(token); frip_est_params.push_back(curr_param); } catch(...) { chromap::ExitWithMessage( "\nException occurred while processing chromap score parameters\n" ); } } if (frip_est_params.size() != 5) { chromap::ExitWithMessage( "\nInvalid number of parameters, expecting 5 parameters but found " + std::to_string(frip_est_params.size()) + " parameters\n" ); } // Initialize vector to keep track of cache hits for each thread std::vector cache_hits_per_thread(mapping_parameters_.num_threads, 0); // Initialize cache mm_cache mm_to_candidates_cache(mapping_parameters_.cache_size); mm_to_candidates_cache.SetKmerLength(kmer_size); struct _mm_history *mm_history1 = new struct _mm_history[read_batch_size_]; struct _mm_history *mm_history2 = new struct _mm_history[read_batch_size_]; // The explanation for read_map_summary is in the single-end mapping function uint8_t *read_map_summary = NULL ; if (!mapping_parameters_.summary_metadata_file_path.empty()) { read_map_summary = new uint8_t[read_batch_size_]; memset(read_map_summary, 1, sizeof(*read_map_summary)*read_batch_size_); } std::vector> mappings_on_diff_ref_seqs; // Initialize mapping container mappings_on_diff_ref_seqs.reserve(num_reference_sequences); for 
(uint32_t i = 0; i < num_reference_sequences; ++i) { mappings_on_diff_ref_seqs.emplace_back(std::vector()); } std::vector> temp_mapping_file_handles; // Preprocess barcodes for single cell data if (!mapping_parameters_.is_bulk_data) { barcode_length_ = SampleInputBarcodesAndExamineLength(); if (!mapping_parameters_.barcode_whitelist_file_path.empty()) { LoadBarcodeWhitelist(); ComputeBarcodeAbundance(initial_num_sample_barcodes_); } } MinimizerGenerator minimizer_generator(kmer_size, window_size); CandidateProcessor candidate_processor( mapping_parameters_.min_num_seeds_required_for_mapping, mapping_parameters_.max_seed_frequencies); MappingProcessor mapping_processor(mapping_parameters_, min_unique_mapping_mapq_); DraftMappingGenerator draft_mapping_generator(mapping_parameters_); MappingGenerator mapping_generator(mapping_parameters_, pairs_custom_rid_rank_); MappingWriter mapping_writer( mapping_parameters_, barcode_length_, pairs_custom_rid_rank_); mapping_writer.OutputHeader(num_reference_sequences, reference); uint32_t num_mappings_in_mem = 0; uint64_t max_num_mappings_in_mem = 1 * ((uint64_t)1 << 30) / sizeof(MappingRecord); if (mapping_parameters_.mapping_output_format == MAPPINGFORMAT_SAM || mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAF || mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAIRS) { max_num_mappings_in_mem = 1 * ((uint64_t)1 << 29) / sizeof(MappingRecord); } static uint64_t thread_num_candidates = 0; static uint64_t thread_num_mappings = 0; static uint64_t thread_num_mapped_reads = 0; static uint64_t thread_num_uniquely_mapped_reads = 0; static uint64_t thread_num_barcode_in_whitelist = 0; static uint64_t thread_num_corrected_barcode = 0; #pragma omp threadprivate( \ thread_num_candidates, thread_num_mappings, thread_num_mapped_reads, \ thread_num_uniquely_mapped_reads, thread_num_barcode_in_whitelist, \ thread_num_corrected_barcode) double real_start_mapping_time = GetRealTime(); for (size_t read_file_index = 0; 
read_file_index < mapping_parameters_.read_file1_paths.size(); ++read_file_index) { // Set read batches to the current read files. read_batch1_for_loading.InitializeLoading( mapping_parameters_.read_file1_paths[read_file_index]); read_batch2_for_loading.InitializeLoading( mapping_parameters_.read_file2_paths[read_file_index]); if (!mapping_parameters_.is_bulk_data) { barcode_batch_for_loading.InitializeLoading( mapping_parameters_.barcode_file_paths[read_file_index]); } // Load the first batches. uint32_t num_loaded_pairs_for_loading = 0; uint32_t num_loaded_pairs = LoadPairedEndReadsWithBarcodes( read_batch1_for_loading, read_batch2_for_loading, barcode_batch_for_loading, mapping_parameters_.num_threads >= 3 ? true : false); read_batch1_for_loading.SwapSequenceBatch(read_batch1); read_batch2_for_loading.SwapSequenceBatch(read_batch2); if (!mapping_parameters_.is_bulk_data) { barcode_batch_for_loading.SwapSequenceBatch(barcode_batch); } // Setup thread private vectors to save mapping results. 
std::vector>> mappings_on_diff_ref_seqs_for_diff_threads; std::vector>> mappings_on_diff_ref_seqs_for_diff_threads_for_saving; mappings_on_diff_ref_seqs_for_diff_threads.reserve( mapping_parameters_.num_threads); mappings_on_diff_ref_seqs_for_diff_threads_for_saving.reserve( mapping_parameters_.num_threads); for (int ti = 0; ti < mapping_parameters_.num_threads; ++ti) { mappings_on_diff_ref_seqs_for_diff_threads.emplace_back( std::vector>(num_reference_sequences)); mappings_on_diff_ref_seqs_for_diff_threads_for_saving.emplace_back( std::vector>(num_reference_sequences)); for (uint32_t i = 0; i < num_reference_sequences; ++i) { mappings_on_diff_ref_seqs_for_diff_threads[ti][i].reserve( (num_loaded_pairs + num_loaded_pairs / 1000 * mapping_parameters_.max_num_best_mappings) / mapping_parameters_.num_threads / num_reference_sequences); mappings_on_diff_ref_seqs_for_diff_threads_for_saving[ti][i].reserve( (num_loaded_pairs + num_loaded_pairs / 1000 * mapping_parameters_.max_num_best_mappings) / mapping_parameters_.num_threads / num_reference_sequences); } } #pragma omp parallel shared(num_reads_, num_reference_sequences, reference, index, read_batch1, read_batch2, barcode_batch, read_batch1_for_loading, read_batch2_for_loading, barcode_batch_for_loading, minimizer_generator, candidate_processor, mapping_processor, draft_mapping_generator, mapping_generator, mapping_writer, std::cerr, num_loaded_pairs_for_loading, num_loaded_pairs, mappings_on_diff_ref_seqs_for_diff_threads, mappings_on_diff_ref_seqs_for_diff_threads_for_saving, mappings_on_diff_ref_seqs, num_mappings_in_mem, max_num_mappings_in_mem, temp_mapping_file_handles, mm_to_candidates_cache, mm_history1, mm_history2, read_map_summary) num_threads(mapping_parameters_.num_threads) reduction(+:num_candidates_, num_mappings_, num_mapped_reads_, num_uniquely_mapped_reads_, num_barcode_in_whitelist_, num_corrected_barcode_) { thread_num_candidates = 0; thread_num_mappings = 0; thread_num_mapped_reads = 0; 
thread_num_uniquely_mapped_reads = 0; thread_num_barcode_in_whitelist = 0; thread_num_corrected_barcode = 0; PairedEndMappingMetadata paired_end_mapping_metadata; std::vector best_mapping_indices( mapping_parameters_.max_num_best_mappings); std::mt19937 generator(11); #pragma omp single { double real_batch_start_time = GetRealTime(); while (num_loaded_pairs > 0) { num_reads_ += num_loaded_pairs; num_reads_ += num_loaded_pairs; #pragma omp task { num_loaded_pairs_for_loading = LoadPairedEndReadsWithBarcodes( read_batch1_for_loading, read_batch2_for_loading, barcode_batch_for_loading, mapping_parameters_.num_threads >= 12 ? true : false); } // end of openmp loading task int grain_size = 5000; uint32_t history_update_threshold = mm_to_candidates_cache.GetUpdateThreshold(num_loaded_pairs, num_reads_, true, mapping_parameters_.cache_update_param ); std::fill(cache_hits_per_thread.begin(), cache_hits_per_thread.end(), 0); if (mapping_parameters_.debug_cache) { std::cout << "[DEBUG][UPDATE] update_threshold = " << history_update_threshold << std::endl; } #pragma omp taskloop grainsize(grain_size) for (uint32_t pair_index = 0; pair_index < num_loaded_pairs; ++pair_index) { int thread_id = omp_get_thread_num(); bool current_barcode_is_whitelisted = true; if (!mapping_parameters_.barcode_whitelist_file_path.empty()) { current_barcode_is_whitelisted = CorrectBarcodeAt( pair_index, barcode_batch, thread_num_barcode_in_whitelist, thread_num_corrected_barcode); } // calculate seed value for each barcode to use later (below and summary update) size_t curr_seed_val = barcode_batch.GenerateSeedFromSequenceAt(pair_index, 0, barcode_length_); seeds_for_batch[pair_index] = curr_seed_val; if (current_barcode_is_whitelisted || mapping_parameters_.output_mappings_not_in_whitelist) { if (read_batch1.GetSequenceLengthAt(pair_index) < (uint32_t)mapping_parameters_.min_read_length || read_batch2.GetSequenceLengthAt(pair_index) < (uint32_t)mapping_parameters_.min_read_length) { continue; // 
reads are too short, just drop. } read_batch1.PrepareNegativeSequenceAt(pair_index); read_batch2.PrepareNegativeSequenceAt(pair_index); if (mapping_parameters_.trim_adapters) { TrimAdapterForPairedEndRead(pair_index, read_batch1, read_batch2); } paired_end_mapping_metadata.PreparedForMappingNextReadPair( mapping_parameters_.max_seed_frequencies[0]); minimizer_generator.GenerateMinimizers( read_batch1, pair_index, paired_end_mapping_metadata.mapping_metadata1_.minimizers_); minimizer_generator.GenerateMinimizers( read_batch2, pair_index, paired_end_mapping_metadata.mapping_metadata2_.minimizers_); if (paired_end_mapping_metadata.BothEndsHaveMinimizers()) { // declare temp local variable for cache result int cache_query_result1 = 0; int cache_query_result2 = 0; int cache_miss = 0; cache_query_result1 = mm_to_candidates_cache.Query(paired_end_mapping_metadata.mapping_metadata1_, read_batch1.GetSequenceLengthAt(pair_index)); if (cache_query_result1 == -1) { candidate_processor.GenerateCandidates( mapping_parameters_.error_threshold, index, paired_end_mapping_metadata.mapping_metadata1_ ); ++cache_miss; } size_t current_num_candidates1 = paired_end_mapping_metadata.mapping_metadata1_.GetNumCandidates(); cache_query_result2 = mm_to_candidates_cache.Query(paired_end_mapping_metadata.mapping_metadata2_, read_batch2.GetSequenceLengthAt(pair_index)); if (cache_query_result2 == -1) { candidate_processor.GenerateCandidates( mapping_parameters_.error_threshold, index, paired_end_mapping_metadata.mapping_metadata2_ ); ++cache_miss; } size_t current_num_candidates2 = paired_end_mapping_metadata.mapping_metadata2_.GetNumCandidates(); // increment variable for cache_hits bool curr_read_hit_cache = false; if (cache_query_result1 >= 0 || cache_query_result2 >= 0) { cache_hits_per_thread[thread_id]++; curr_read_hit_cache = true; } // update the peak counting data-structure if (output_num_cache_slots_info && curr_read_hit_cache) { // calculate which map this barcode is in size_t map_id 
= curr_seed_val % num_locks_for_map; // grab lock for this map, and add to the K-MinHash for this particular barcode omp_set_lock(&map_locks[map_id]); auto it = barcode_peak_map[map_id].emplace(curr_seed_val, K_MinHash(k_for_minhash, mapping_parameters_.cache_size)).first; if (cache_query_result1 >= 0) {it->second.add(cache_query_result1);} if (cache_query_result2 >= 0) {it->second.add(cache_query_result2);} omp_unset_lock(&map_locks[map_id]); } if (pair_index < history_update_threshold) { mm_history1[pair_index].timestamp = mm_history2[pair_index].timestamp = num_reads_; mm_history1[pair_index].minimizers = paired_end_mapping_metadata.mapping_metadata1_ .minimizers_; mm_history1[pair_index].positive_candidates = paired_end_mapping_metadata.mapping_metadata1_ .positive_candidates_; mm_history1[pair_index].negative_candidates = paired_end_mapping_metadata.mapping_metadata1_ .negative_candidates_; mm_history1[pair_index].repetitive_seed_length = paired_end_mapping_metadata.mapping_metadata1_ .repetitive_seed_length_; mm_history2[pair_index].minimizers = paired_end_mapping_metadata.mapping_metadata2_ .minimizers_; mm_history2[pair_index].positive_candidates = paired_end_mapping_metadata.mapping_metadata2_ .positive_candidates_; mm_history2[pair_index].negative_candidates = paired_end_mapping_metadata.mapping_metadata2_ .negative_candidates_; mm_history2[pair_index].repetitive_seed_length = paired_end_mapping_metadata.mapping_metadata2_ .repetitive_seed_length_; } // Test whether we need to augment the candidate list with mate // information. 
int supplementCandidateResult = 0; if (!mapping_parameters_.split_alignment) { supplementCandidateResult = candidate_processor.SupplementCandidates( mapping_parameters_.error_threshold, /*search_range=*/2 * mapping_parameters_.max_insert_size, index, paired_end_mapping_metadata); current_num_candidates1 = paired_end_mapping_metadata.mapping_metadata1_ .GetNumCandidates(); current_num_candidates2 = paired_end_mapping_metadata.mapping_metadata2_ .GetNumCandidates(); } if (current_num_candidates1 > 0 && current_num_candidates2 > 0 && !mapping_parameters_.split_alignment) { paired_end_mapping_metadata.MoveCandidiatesToBuffer(); // Paired-end filter candidate_processor.ReduceCandidatesForPairedEndRead( mapping_parameters_.max_insert_size, paired_end_mapping_metadata); current_num_candidates1 = paired_end_mapping_metadata.mapping_metadata1_ .GetNumCandidates(); current_num_candidates2 = paired_end_mapping_metadata.mapping_metadata2_ .GetNumCandidates(); } // Verify candidates if (current_num_candidates1 > 0 && current_num_candidates2 > 0) { thread_num_candidates += current_num_candidates1 + current_num_candidates2; if (mapping_parameters_.custom_rid_order_file_path.length() > 0) { RerankCandidatesRid( paired_end_mapping_metadata.mapping_metadata1_ .positive_candidates_); RerankCandidatesRid( paired_end_mapping_metadata.mapping_metadata1_ .negative_candidates_); RerankCandidatesRid( paired_end_mapping_metadata.mapping_metadata2_ .positive_candidates_); RerankCandidatesRid( paired_end_mapping_metadata.mapping_metadata2_ .negative_candidates_); } draft_mapping_generator.GenerateDraftMappings( read_batch1, pair_index, reference, paired_end_mapping_metadata.mapping_metadata1_); const size_t current_num_draft_mappings1 = paired_end_mapping_metadata.mapping_metadata1_ .GetNumDraftMappings(); draft_mapping_generator.GenerateDraftMappings( read_batch2, pair_index, reference, paired_end_mapping_metadata.mapping_metadata2_); const size_t current_num_draft_mappings2 = 
paired_end_mapping_metadata.mapping_metadata2_ .GetNumDraftMappings(); if (current_num_draft_mappings1 > 0 && current_num_draft_mappings2 > 0) { std::vector> &mappings_on_diff_ref_seqs = mappings_on_diff_ref_seqs_for_diff_threads [omp_get_thread_num()]; if (!mapping_parameters_.split_alignment) { // GenerateBestMappingsForPairedEndRead assumes the // mappings are sorted by coordinate for non split // alignments. In split alignment, we don't want to sort // and this keeps mapping and split_sites vectors // consistent. paired_end_mapping_metadata.SortMappingsByPositions(); } int force_mapq = -1; if (supplementCandidateResult != 0) { force_mapq = 0; } mapping_generator.GenerateBestMappingsForPairedEndRead( pair_index, read_batch1, read_batch2, barcode_batch, reference, best_mapping_indices, generator, force_mapq, paired_end_mapping_metadata, mappings_on_diff_ref_seqs); if (paired_end_mapping_metadata.GetNumBestMappings() == 1) { ++thread_num_uniquely_mapped_reads; ++thread_num_uniquely_mapped_reads; } thread_num_mappings += std::min( paired_end_mapping_metadata.GetNumBestMappings(), mapping_parameters_.max_num_best_mappings); thread_num_mappings += std::min( paired_end_mapping_metadata.GetNumBestMappings(), mapping_parameters_.max_num_best_mappings); if (paired_end_mapping_metadata.GetNumBestMappings() > 0) { ++thread_num_mapped_reads; ++thread_num_mapped_reads; if (read_map_summary != NULL) read_map_summary[pair_index] |= (cache_miss < 2 ? 
2 : 0) ; } } } // verify candidate } } else { if (read_map_summary != NULL) read_map_summary[pair_index] = 0 ; } } // end of for pair_index // if (num_reads_ / 2 > initial_num_sample_barcodes_) { // if (!is_bulk_data_) { // if (!barcode_whitelist_file_path_.empty()) { // UpdateBarcodeAbundance(num_loaded_pairs, barcode_batch); // } // } //} #pragma omp taskloop grainsize( std::max(history_update_threshold / mapping_parameters_.num_threads, (unsigned int)grain_size) ) // Update cache for (uint32_t pair_index = 0; pair_index < history_update_threshold; ++pair_index) { if (mm_history1[pair_index].timestamp != num_reads_) continue; mm_to_candidates_cache.Update( mm_history1[pair_index].minimizers, mm_history1[pair_index].positive_candidates, mm_history1[pair_index].negative_candidates, mm_history1[pair_index].repetitive_seed_length, mapping_parameters_.debug_cache); mm_to_candidates_cache.Update( mm_history2[pair_index].minimizers, mm_history2[pair_index].positive_candidates, mm_history2[pair_index].negative_candidates, mm_history2[pair_index].repetitive_seed_length, mapping_parameters_.debug_cache); if (mm_history1[pair_index].positive_candidates.size() > 50) { std::vector().swap( mm_history1[pair_index].positive_candidates); } if (mm_history1[pair_index].negative_candidates.size() > 50) { std::vector().swap( mm_history1[pair_index].negative_candidates); } if (mm_history2[pair_index].positive_candidates.size() > 50) { std::vector().swap( mm_history2[pair_index].positive_candidates); } if (mm_history2[pair_index].negative_candidates.size() > 50) { std::vector().swap( mm_history2[pair_index].negative_candidates); } } #pragma omp taskwait if (!mapping_parameters_.summary_metadata_file_path.empty()) { // Update total read count and number of cache hits if (mapping_parameters_.is_bulk_data) { // Sum up cache hits for each thread int cache_hits_for_batch = 0; for (int hits: cache_hits_per_thread) { cache_hits_for_batch += hits; } mapping_writer.UpdateSummaryMetadata(0, 
SUMMARY_METADATA_TOTAL, num_loaded_pairs); mapping_writer.UpdateSummaryMetadata(0, SUMMARY_METADATA_CACHEHIT, cache_hits_for_batch); } else { uint32_t nonwhitelist_count = 0; for (uint32_t pair_index = 0; pair_index < num_loaded_pairs; ++pair_index) { uint64_t pair_seed = seeds_for_batch[pair_index]; if (read_map_summary[pair_index] & 1) { mapping_writer.UpdateSummaryMetadata( pair_seed, SUMMARY_METADATA_TOTAL, 1); } else { ++nonwhitelist_count ; } if (read_map_summary[pair_index] & 2) { mapping_writer.UpdateSummaryMetadata( pair_seed, SUMMARY_METADATA_CACHEHIT, 1); } } mapping_writer.UpdateSpeicalCategorySummaryMetadata(/*nonwhitelist*/0, SUMMARY_METADATA_TOTAL, nonwhitelist_count); } memset(read_map_summary, 1, sizeof(*read_map_summary)*read_batch_size_); } std::cerr << "Mapped " << num_loaded_pairs << " read pairs in " << GetRealTime() - real_batch_start_time << "s.\n"; real_batch_start_time = GetRealTime(); // Swap to next batch num_loaded_pairs = num_loaded_pairs_for_loading; read_batch1_for_loading.SwapSequenceBatch(read_batch1); read_batch2_for_loading.SwapSequenceBatch(read_batch2); barcode_batch_for_loading.SwapSequenceBatch(barcode_batch); mappings_on_diff_ref_seqs_for_diff_threads.swap( mappings_on_diff_ref_seqs_for_diff_threads_for_saving); // Reset for next batch std::fill(seeds_for_batch.begin(), seeds_for_batch.end(), 0); #pragma omp task { // Handle output num_mappings_in_mem += mapping_processor.MoveMappingsInBuffersToMappingContainer( num_reference_sequences, mappings_on_diff_ref_seqs_for_diff_threads_for_saving, mappings_on_diff_ref_seqs); if (mapping_parameters_.low_memory_mode && num_mappings_in_mem > max_num_mappings_in_mem) { mapping_processor.ParallelSortOutputMappings(num_reference_sequences, mappings_on_diff_ref_seqs, 0); mapping_writer.OutputTempMappings(num_reference_sequences, mappings_on_diff_ref_seqs, temp_mapping_file_handles); if (temp_mapping_file_handles.size() > 850 && temp_mapping_file_handles.size() % 10 == 1) { // every 10 
temp files, double the temp file size max_num_mappings_in_mem <<= 1; std::cerr << "Used " << temp_mapping_file_handles.size() << "temp files. Double the temp file volume to " << max_num_mappings_in_mem << "\n" ; } num_mappings_in_mem = 0; } } // end of omp task to handle output } // end of while num_loaded_pairs } // end of openmp single num_barcode_in_whitelist_ += thread_num_barcode_in_whitelist; num_corrected_barcode_ += thread_num_corrected_barcode; num_candidates_ += thread_num_candidates; num_mappings_ += thread_num_mappings; num_mapped_reads_ += thread_num_mapped_reads; num_uniquely_mapped_reads_ += thread_num_uniquely_mapped_reads; } // end of openmp parallel region read_batch1_for_loading.FinalizeLoading(); read_batch2_for_loading.FinalizeLoading(); if (!mapping_parameters_.is_bulk_data) { barcode_batch_for_loading.FinalizeLoading(); } } // end of for read_file_index std::cerr << "Mapped all reads in " << GetRealTime() - real_start_mapping_time << "s.\n"; delete[] mm_history1; delete[] mm_history2; if (read_map_summary != NULL) delete[] read_map_summary; OutputMappingStatistics(); if (!mapping_parameters_.is_bulk_data) { OutputBarcodeStatistics(); } index.Destroy(); if (mapping_parameters_.low_memory_mode) { // First, process the remaining mappings in the memory and save them on // disk. 
if (num_mappings_in_mem > 0) { mapping_processor.SortOutputMappings(num_reference_sequences, mappings_on_diff_ref_seqs); mapping_writer.OutputTempMappings(num_reference_sequences, mappings_on_diff_ref_seqs, temp_mapping_file_handles); num_mappings_in_mem = 0; } mapping_writer.ProcessAndOutputMappingsInLowMemory( num_mappings_in_mem, num_reference_sequences, reference, barcode_whitelist_lookup_table_, temp_mapping_file_handles); } else { if (mapping_parameters_.Tn5_shift) { mapping_processor.ApplyTn5ShiftOnMappings(num_reference_sequences, mappings_on_diff_ref_seqs); } if (mapping_parameters_.remove_pcr_duplicates) { mapping_processor.RemovePCRDuplicate(num_reference_sequences, mappings_on_diff_ref_seqs, mapping_parameters_.num_threads); std::cerr << "After removing PCR duplications, "; mapping_processor.OutputMappingStatistics(num_reference_sequences, mappings_on_diff_ref_seqs); } else { mapping_processor.ParallelSortOutputMappings(num_reference_sequences, mappings_on_diff_ref_seqs, mapping_parameters_.num_threads); } if (mapping_parameters_.allocate_multi_mappings) { const uint64_t num_multi_mappings = num_mapped_reads_ - num_uniquely_mapped_reads_; mapping_processor.AllocateMultiMappings( num_reference_sequences, num_multi_mappings, mapping_parameters_.multi_mapping_allocation_distance, mappings_on_diff_ref_seqs); std::cerr << "After allocating multi-mappings, "; mapping_processor.OutputMappingStatistics(num_reference_sequences, mappings_on_diff_ref_seqs); mapping_processor.SortOutputMappings(num_reference_sequences, mappings_on_diff_ref_seqs); } mapping_writer.OutputMappings(num_reference_sequences, reference, mappings_on_diff_ref_seqs); // Temporarily disable feature matrix output. Do not delete the following // commented code. 
// if (!is_bulk_data_ && !matrix_output_prefix_.empty()) { // if constexpr (std::is_same::value) { // FeatureBarcodeMatrix feature_barcode_matrix( // cell_by_bin_, bin_size_, multi_mapping_allocation_distance_, // depth_cutoff_to_call_peak_); // std::vector> &mappings = // allocate_multi_mappings_ // ? allocated_mappings_on_diff_ref_seqs // : (remove_pcr_duplicates_ ? deduped_mappings_on_diff_ref_seqs // : mappings_on_diff_ref_seqs); // feature_barcode_matrix.OutputFeatureMatrix(num_reference_sequences, // reference, mappings, // matrix_output_prefix_); // } //} } if (mapping_parameters_.mapping_output_format == MAPPINGFORMAT_SAM) mapping_writer.AdjustSummaryPairedEndOverCount() ; // Destory the locks used for map for (int i = 0; i < num_locks_for_map; ++i) { omp_destroy_lock(&map_locks[i]); } // Add cardinality information to summary metadata if (output_num_cache_slots_info) { for (auto curr_map: barcode_peak_map) { for (auto &pair: curr_map) { size_t curr_seed = pair.first; size_t est_num_slots = pair.second.compute_cardinality(); mapping_writer.UpdateSummaryMetadata( curr_seed, SUMMARY_METADATA_CARDINALITY, est_num_slots); } } } mapping_writer.OutputSummaryMetadata(frip_est_params, output_num_cache_slots_info); reference.FinalizeLoading(); if (mapping_parameters_.debug_cache) {mm_to_candidates_cache.PrintStats();} std::cerr << "Total time: " << GetRealTime() - real_start_time << "s.\n"; } } // namespace chromap #endif // CHROMAP_H_ ================================================ FILE: src/chromap_driver.cc ================================================ #include "chromap_driver.h" #include #include #include #include #include #include "chromap.h" #include "cxxopts.hpp" namespace chromap { namespace { void AddIndexingOptions(cxxopts::Options &options) { options.add_options("Indexing")("i,build-index", "Build index")( "min-frag-length", "Min fragment length for choosing k and w automatically [30]", cxxopts::value(), "INT")("k,kmer", "Kmer length [17]", 
cxxopts::value(), "INT")( "w,window", "Window size [7]", cxxopts::value(), "INT"); } void AddMappingOptions(cxxopts::Options &options) { options.set_width(120).add_options("Mapping")( "preset", "Preset parameters for mapping reads (always applied before other " "options) []\natac: mapping ATAC-seq/scATAC-seq reads\nchip: mapping " "ChIP-seq reads\nhic: mapping Hi-C reads", cxxopts::value(), "STR")("split-alignment", "Allow split alignments")( "e,error-threshold", "Max # errors allowed to map a read [8]", cxxopts::value(), "INT") //("A,match-score", "Match score [1]", cxxopts::value(), "INT") //("B,mismatch-penalty", "Mismatch penalty [4]", cxxopts::value(), //"INT") //("O,gap-open-penalties", "Gap open penalty [6,6]", // cxxopts::value>(), "INT[,INT]") //("E,gap-extension-penalties", "Gap extension penalty [1,1]", // cxxopts::value>(), "INT[,INT]") ("s,min-num-seeds", "Min # seeds to try to map a read [2]", cxxopts::value(), "INT")("f,max-seed-frequencies", "Max seed frequencies for a seed to be selected [500,1000]", cxxopts::value>(), "INT[,INT]") //("n,max-num-best-mappings", "Only report n best mappings [1]", // cxxopts::value(), "INT") ("l,max-insert-size", "Max insert size, only for paired-end read mapping [1000]", cxxopts::value(), "INT")("q,MAPQ-threshold", "Min MAPQ in range [0, 60] for mappings to be output [30]", cxxopts::value(), "INT")("min-read-length", "Min read length [30]", cxxopts::value(), "INT") //("multi-mapping-allocation-distance", "Uni-mappings within this distance // from any end of multi-mappings are used for allocation [0]", // cxxopts::value(), "INT") //("multi-mapping-allocation-seed", "Seed for random number generator in // multi-mapping allocation [11]", cxxopts::value(), "INT") //("drop-repetitive-reads", "Drop reads with too many best mappings //[500000]", cxxopts::value(), "INT") ("trim-adapters", "Try to trim adapters on 3'")("remove-pcr-duplicates", "Remove PCR duplicates")( "remove-pcr-duplicates-at-bulk-level", "Remove PCR 
duplicates at bulk level for single cell data")( "remove-pcr-duplicates-at-cell-level", "Remove PCR duplicates at cell level for single cell data") //("allocate-multi-mappings", "Allocate multi-mappings") ("Tn5-shift", "Perform Tn5 shift")("low-mem", "Use low memory mode")( "bc-error-threshold", "Max Hamming distance allowed to correct a barcode [1]", cxxopts::value(), "INT")("bc-probability-threshold", "Min probability to correct a barcode [0.9]", cxxopts::value(), "FLT")("t,num-threads", "# threads for mapping [1]", cxxopts::value(), "INT") ("frip-est-params", "coefficients used for frip est calculation, separated by semi-colons", cxxopts::value(), "STR") ("turn-off-num-uniq-cache-slots", "turn off the output of number of cache slots in summary file"); } void AddInputOptions(cxxopts::Options &options) { options.add_options("Input")("r,ref", "Reference file", cxxopts::value(), "FILE")( "x,index", "Index file", cxxopts::value(), "FILE")( "1,read1", "Single-end read files or paired-end read files 1", cxxopts::value>(), "FILE")("2,read2", "Paired-end read files 2", cxxopts::value>(), "FILE")("b,barcode", "Cell barcode files", cxxopts::value>(), "FILE")( "barcode-whitelist", "Cell barcode whitelist file", cxxopts::value(), "FILE")("read-format", "Format for read files and barcode files [\"r1:0:-1,bc:0:-1\" " "as 10x Genomics single-end format]", cxxopts::value(), "STR"); } void AddOutputOptions(cxxopts::Options &options) { options.add_options("Output")("o,output", "Output file", cxxopts::value(), "FILE") //("p,matrix-output-prefix", "Prefix of matrix output files", // cxxopts::value(), "FILE") ("output-mappings-not-in-whitelist", "Output mappings with barcode not in the whitelist")( "chr-order", "Custom chromosome order file. 
If not specified, the order of " "reference sequences will be used", cxxopts::value(), "FILE")("BED", "Output mappings in BED/BEDPE format")( "TagAlign", "Output mappings in TagAlign/PairedTagAlign format")( "SAM", "Output mappings in SAM format")( "pairs", "Output mappings in pairs format (defined by 4DN for HiC data)")( "pairs-natural-chr-order", "Custom chromosome order file for pairs flipping. If not specified, " "the custom chromosome order will be used", cxxopts::value(), "FILE")("barcode-translate", "Convert barcode to the specified sequences during output", cxxopts::value(), "FILE")( "summary", "Summarize the mapping statistics at bulk or barcode level", cxxopts::value(), "FILE"); //("PAF", "Output mappings in PAF format (only for test)"); } void AddDevelopmentOptions(cxxopts::Options &options) { options.add_options("Development options")("A,match-score", "Match score [1]", cxxopts::value(), "INT")( "B,mismatch-penalty", "Mismatch penalty [4]", cxxopts::value(), "INT")("O,gap-open-penalties", "Gap open penalty [6,6]", cxxopts::value>(), "INT[,INT]")( "E,gap-extension-penalties", "Gap extension penalty [1,1]", cxxopts::value>(), "INT[,INT]")("n,max-num-best-mappings", "Only report n best mappings [1]", cxxopts::value(), "INT")("multi-mapping-allocation-distance", "Uni-mappings within this distance from any end of " "multi-mappings are used for allocation [0]", cxxopts::value(), "INT")( "multi-mapping-allocation-seed", "Seed for random number generator in multi-mapping allocation [11]", cxxopts::value(), "INT")( "drop-repetitive-reads", "Drop reads with too many best mappings [500000]", cxxopts::value(), "INT")("allocate-multi-mappings", "Allocate multi-mappings")( "PAF", "Output mappings in PAF format (only for test)")( "skip-barcode-check", "Do not check whether too few barcodes are in the whitelist") ("cache-size", "number of cache entries [4000003]", cxxopts::value(), "INT") ("cache-update-param", "value used to control number of reads sampled [0.01]", 
cxxopts::value(), "FLT") ("debug-cache", "verbose output for debugging cache used in chromap") ("k-for-minhash", "number of values stored in each MinHash sketch [250]", cxxopts::value(), "INT"); } void AddPeakOptions(cxxopts::Options &options) { options.add_options("Peak")("cell-by-bin", "Generate cell-by-bin matrix")( "bin-size", "Bin size to generate cell-by-bin matrix [5000]", cxxopts::value(), "INT")("depth-cutoff", "Depth cutoff for peak calling [3]", cxxopts::value(), "INT")("peak-min-length", "Min length of peaks to report [30]", cxxopts::value(), "INT")( "peak-merge-max-length", "Peaks within this length will be merged [30]", cxxopts::value(), "INT"); } // Return all file paths that match the input pattern. std::vector GetMatchedFilePaths(const std::string &pattern) { glob_t glob_result; memset(&glob_result, 0, sizeof(glob_result)); const int return_value = glob(pattern.c_str(), GLOB_TILDE, NULL, &glob_result); if (return_value != 0) { globfree(&glob_result); chromap::ExitWithMessage("glob() failed with return value " + std::to_string(return_value) + "\n"); } std::vector matched_file_paths; matched_file_paths.reserve(glob_result.gl_pathc); for (size_t i = 0; i < glob_result.gl_pathc; ++i) { matched_file_paths.push_back(std::string(glob_result.gl_pathv[i])); std::cerr << matched_file_paths.back() << "\n"; } globfree(&glob_result); return matched_file_paths; } // Return all file paths that match the input patterns. 
std::vector GetMatchedFilePaths( const std::vector &patterns) { std::vector all_matched_file_paths; for (const auto &pattern : patterns) { std::vector matched_file_paths = GetMatchedFilePaths(pattern); all_matched_file_paths.reserve(all_matched_file_paths.size() + matched_file_paths.size()); all_matched_file_paths.insert( std::end(all_matched_file_paths), std::make_move_iterator(std::begin(matched_file_paths)), std::make_move_iterator(std::end(matched_file_paths))); } return all_matched_file_paths; } } // namespace void ChromapDriver::ParseArgsAndRun(int argc, char *argv[]) { cxxopts::Options options( "chromap", "Fast alignment and preprocessing of chromatin profiles"); options.add_options()("v,version", "Print version")("h,help", "Print help"); AddIndexingOptions(options); AddMappingOptions(options); // We don't support peak options for now. // AddPeakOptions(options); AddInputOptions(options); AddOutputOptions(options); AddDevelopmentOptions(options); auto result = options.parse(argc, argv); if (result.count("h")) { std::cerr << options.help( {"", "Indexing", "Mapping", "Peak", "Input", "Output"}); return; } if (result.count("v")) { std::cerr << CHROMAP_VERSION << "\n"; return; } // Parameters and their default IndexParameters index_parameters; MappingParameters mapping_parameters; if (result.count("preset")) { std::string read_type = result["preset"].as(); if (read_type == "atac") { std::cerr << "Preset parameters for ATAC-seq/scATAC-seq are used.\n"; mapping_parameters.max_insert_size = 2000; mapping_parameters.trim_adapters = true; mapping_parameters.remove_pcr_duplicates = true; mapping_parameters.remove_pcr_duplicates_at_bulk_level = false; mapping_parameters.Tn5_shift = true; mapping_parameters.mapping_output_format = MAPPINGFORMAT_BED; mapping_parameters.low_memory_mode = true; } else if (read_type == "chip") { std::cerr << "Preset parameters for ChIP-seq are used.\n"; mapping_parameters.max_insert_size = 2000; mapping_parameters.remove_pcr_duplicates = 
true; mapping_parameters.low_memory_mode = true; mapping_parameters.mapping_output_format = MAPPINGFORMAT_BED; } else if (read_type == "hic") { std::cerr << "Preset parameters for Hi-C are used.\n"; mapping_parameters.error_threshold = 4; mapping_parameters.mapq_threshold = 1; mapping_parameters.split_alignment = true; mapping_parameters.low_memory_mode = true; mapping_parameters.mapping_output_format = MAPPINGFORMAT_PAIRS; } else { chromap::ExitWithMessage("Unrecognized preset parameters " + read_type + "\n"); } } // Optional parameters if (result.count("min-frag-length")) { int min_fragment_length = result["min-frag-length"].as(); if (min_fragment_length <= 60) { index_parameters.kmer_size = 17; index_parameters.window_size = 7; } else if (min_fragment_length <= 80) { index_parameters.kmer_size = 19; index_parameters.window_size = 10; } else { index_parameters.kmer_size = 23; index_parameters.window_size = 11; } } if (result.count("k")) { index_parameters.kmer_size = result["kmer"].as(); } if (result.count("w")) { index_parameters.window_size = result["window"].as(); } if (result.count("e")) { mapping_parameters.error_threshold = result["error-threshold"].as(); } if (result.count("A")) { mapping_parameters.match_score = result["match-score"].as(); } if (result.count("B")) { mapping_parameters.mismatch_penalty = result["mismatch-penalty"].as(); } if (result.count("O")) { mapping_parameters.gap_open_penalties = result["gap-open-penalties"].as>(); } if (result.count("E")) { mapping_parameters.gap_extension_penalties = result["gap-extension-penalties"].as>(); } if (result.count("s")) { mapping_parameters.min_num_seeds_required_for_mapping = result["min-num-seeds"].as(); } if (result.count("f")) { mapping_parameters.max_seed_frequencies = result["max-seed-frequencies"].as>(); } if (result.count("n")) { mapping_parameters.max_num_best_mappings = result["max-num-best-mappings"].as(); } if (result.count("l")) { mapping_parameters.max_insert_size = 
result["max-insert-size"].as(); } if (result.count("q")) { mapping_parameters.mapq_threshold = result["MAPQ-threshold"].as(); } if (result.count("t")) { mapping_parameters.num_threads = result["num-threads"].as(); } // check cache-related parameters if (result.count("cache-update-param")) { mapping_parameters.cache_update_param = result["cache-update-param"].as(); if (mapping_parameters.cache_update_param < 0.0 || mapping_parameters.cache_update_param > 1.0){ chromap::ExitWithMessage("cache update param is not approriate, must be in this range (0, 1]"); } } if (result.count("cache-size")) { mapping_parameters.cache_size = result["cache-size"].as(); if (mapping_parameters.cache_size < 2000000 || mapping_parameters.cache_size > 15000000) { chromap::ExitWithMessage("cache size is not in appropriate range\n"); } } if (result.count("debug-cache")) { mapping_parameters.debug_cache = true; } if (result.count("frip-est-params")) { mapping_parameters.frip_est_params = result["frip-est-params"].as(); } if (result.count("turn-off-num-uniq-cache-slots")) { mapping_parameters.output_num_uniq_cache_slots = false; } if (result.count("k-for-minhash")) { mapping_parameters.k_for_minhash = result["k-for-minhash"].as(); if (mapping_parameters.k_for_minhash < 1 || mapping_parameters.k_for_minhash >= 2000) { chromap::ExitWithMessage("Invalid paramter for size of MinHash sketch (--k-for-minhash)"); } } if (result.count("min-read-length")) { mapping_parameters.min_read_length = result["min-read-length"].as(); } if (result.count("bc-error-threshold")) { mapping_parameters.barcode_correction_error_threshold = result["bc-error-threshold"].as(); } if (result.count("bc-probability-threshold")) { mapping_parameters.barcode_correction_probability_threshold = result["bc-probability-threshold"].as(); } if (result.count("multi-mapping-allocation-distance")) { mapping_parameters.multi_mapping_allocation_distance = result["multi-mapping-allocation-distance"].as(); } if 
(result.count("multi-mapping-allocation-seed")) { mapping_parameters.multi_mapping_allocation_seed = result["multi-mapping-allocation-seed"].as(); } if (result.count("drop-repetitive-reads")) { mapping_parameters.drop_repetitive_reads = result["drop-repetitive-reads"].as(); } if (result.count("trim-adapters")) { mapping_parameters.trim_adapters = true; } if (result.count("remove-pcr-duplicates")) { mapping_parameters.remove_pcr_duplicates = true; } if (result.count("remove-pcr-duplicates-at-bulk-level")) { mapping_parameters.remove_pcr_duplicates_at_bulk_level = true; } if (result.count("remove-pcr-duplicates-at-cell-level")) { mapping_parameters.remove_pcr_duplicates_at_bulk_level = false; } if (result.count("allocate-multi-mappings")) { mapping_parameters.allocate_multi_mappings = true; mapping_parameters.only_output_unique_mappings = false; } if (result.count("Tn5-shift")) { mapping_parameters.Tn5_shift = true; } if (result.count("split-alignment")) { mapping_parameters.split_alignment = true; } if (result.count("output-mappings-not-in-whitelist")) { mapping_parameters.output_mappings_not_in_whitelist = true; } if (result.count("BED")) { mapping_parameters.mapping_output_format = MAPPINGFORMAT_BED; } if (result.count("TagAlign")) { mapping_parameters.mapping_output_format = MAPPINGFORMAT_TAGALIGN; } if (result.count("PAF")) { mapping_parameters.mapping_output_format = MAPPINGFORMAT_PAF; } if (result.count("pairs")) { mapping_parameters.mapping_output_format = MAPPINGFORMAT_PAIRS; } if (result.count("SAM")) { mapping_parameters.mapping_output_format = MAPPINGFORMAT_SAM; } if (result.count("low-mem")) { mapping_parameters.low_memory_mode = true; } if (result.count("cell-by-bin")) { mapping_parameters.cell_by_bin = true; } if (result.count("bin-size")) { mapping_parameters.bin_size = result["bin-size"].as(); } if (result.count("depth-cutoff")) { mapping_parameters.depth_cutoff_to_call_peak = result["depth-cutoff"].as(); } if (result.count("peak-min-length")) { 
mapping_parameters.peak_min_length = result["peak-min-length"].as(); } if (result.count("peak-merge-max-length")) { mapping_parameters.peak_merge_max_length = result["peak-merge-max-length"].as(); } std::cerr << std::setprecision(2) << std::fixed; if (result.count("i")) { if (result.count("r")) { index_parameters.reference_file_path = result["ref"].as(); } else { chromap::ExitWithMessage("No reference specified!"); } if (result.count("o")) { index_parameters.index_output_file_path = result["output"].as(); } else { chromap::ExitWithMessage("No output file specified!"); } std::cerr << "Build index for the reference.\n"; std::cerr << "Kmer length: " << index_parameters.kmer_size << ", window size: " << index_parameters.window_size << "\n"; std::cerr << "Reference file: " << index_parameters.reference_file_path << "\n"; std::cerr << "Output file: " << index_parameters.index_output_file_path << "\n"; chromap::Chromap chromap_for_indexing(index_parameters); chromap_for_indexing.ConstructIndex(); } else if (result.count("1")) { std::cerr << "Start to map reads.\n"; if (result.count("r")) { mapping_parameters.reference_file_path = result["ref"].as(); } else { chromap::ExitWithMessage("No reference specified!"); } if (result.count("o")) { mapping_parameters.mapping_output_file_path = result["output"].as(); } else { chromap::ExitWithMessage("No output file specified!"); } if (result.count("x")) { mapping_parameters.index_file_path = result["index"].as(); } else { chromap::ExitWithMessage("No index file specified!"); } if (result.count("1")) { mapping_parameters.read_file1_paths = GetMatchedFilePaths(result["read1"].as>()); } else { chromap::ExitWithMessage("No read file specified!"); } if (result.count("2")) { mapping_parameters.read_file2_paths = GetMatchedFilePaths(result["read2"].as>()); } if (result.count("b")) { mapping_parameters.is_bulk_data = false; mapping_parameters.barcode_file_paths = GetMatchedFilePaths(result["barcode"].as>()); if 
(result.count("barcode-whitelist") == 0) { std::cerr << "WARNING: there are input barcode files but a barcode " "whitelist file is missing!\n"; } } if (result.count("barcode-whitelist")) { if (mapping_parameters.is_bulk_data) { chromap::ExitWithMessage( "No barcode file specified but the barcode whitelist file is " "given!"); } mapping_parameters.barcode_whitelist_file_path = result["barcode-whitelist"].as(); } if (result.count("p")) { mapping_parameters.matrix_output_prefix = result["matrix-output-prefix"].as(); if (mapping_parameters.is_bulk_data) { chromap::ExitWithMessage( "No barcode file specified but asked to output matrix files!"); } } if (result.count("read-format")) { mapping_parameters.read_format = result["read-format"].as(); } if (result.count("chr-order")) { mapping_parameters.custom_rid_order_file_path = result["chr-order"].as(); } if (result.count("pairs-natural-chr-order")) { mapping_parameters.pairs_flipping_custom_rid_order_file_path = result["pairs-natural-chr-order"].as(); } if (result.count("barcode-translate")) { mapping_parameters.barcode_translate_table_file_path = result["barcode-translate"].as(); } if (result.count("summary")) { mapping_parameters.summary_metadata_file_path = result["summary"].as(); } if (result.count("skip-barcode-check")) { mapping_parameters.skip_barcode_check = true; } // std::cerr << "Parameters: error threshold: " << error_threshold << ", // match score: " << match_score << ", mismatch_penalty: " << // mismatch_penalty << ", gap open penalties for deletions and insertions: " // << gap_open_penalties[0] << "," << gap_open_penalties[1] << ", gap // extension penalties for deletions and insertions: " << // gap_extension_penalties[0] << "," << gap_extension_penalties[1] << ", // min-num-seeds: " << min_num_seeds_required_for_mapping << ", // max-seed-frequency: " << max_seed_frequencies[0] << "," << // max_seed_frequencies[1] << ", max-num-best-mappings: " << // max_num_best_mappings << ", max-insert-size: " << 
max_insert_size << ", // MAPQ-threshold: " << (int)mapq_threshold << ", min-read-length: " << // min_read_length << ", multi-mapping-allocation-distance: " << // multi_mapping_allocation_distance << ", multi-mapping-allocation-seed: " // << multi_mapping_allocation_seed << ", drop-repetitive-reads: " << // drop_repetitive_reads << "\n"; std::cerr << "Parameters: error threshold: " << mapping_parameters.error_threshold << ", min-num-seeds: " << mapping_parameters.min_num_seeds_required_for_mapping << ", max-seed-frequency: " << mapping_parameters.max_seed_frequencies[0] << "," << mapping_parameters.max_seed_frequencies[1] << ", max-num-best-mappings: " << mapping_parameters.max_num_best_mappings << ", max-insert-size: " << mapping_parameters.max_insert_size << ", MAPQ-threshold: " << (int)mapping_parameters.mapq_threshold << ", min-read-length: " << mapping_parameters.min_read_length << ", bc-error-threshold: " << mapping_parameters.barcode_correction_error_threshold << ", bc-probability-threshold: " << mapping_parameters.barcode_correction_probability_threshold << "\n"; std::cerr << "Number of threads: " << mapping_parameters.num_threads << "\n"; if (mapping_parameters.is_bulk_data) { std::cerr << "Analyze bulk data.\n"; } else { std::cerr << "Analyze single-cell data.\n"; } if (mapping_parameters.trim_adapters) { std::cerr << "Will try to remove adapters on 3'.\n"; } else { std::cerr << "Won't try to remove adapters on 3'.\n"; } if (mapping_parameters.remove_pcr_duplicates) { std::cerr << "Will remove PCR duplicates after mapping.\n"; } else { std::cerr << "Won't remove PCR duplicates after mapping.\n"; } if (mapping_parameters.remove_pcr_duplicates_at_bulk_level) { std::cerr << "Will remove PCR duplicates at bulk level.\n"; } else { std::cerr << "Will remove PCR duplicates at cell level.\n"; } if (mapping_parameters.allocate_multi_mappings) { std::cerr << "Will allocate multi-mappings after mapping.\n"; } else { std::cerr << "Won't allocate multi-mappings after 
mapping.\n"; } if (mapping_parameters.only_output_unique_mappings) { std::cerr << "Only output unique mappings after mapping.\n"; } if (!mapping_parameters.output_mappings_not_in_whitelist) { std::cerr << "Only output mappings of which barcodes are in whitelist.\n"; } else { std::cerr << "No filtering of mappings based on whether their barcodes " "are in whitelist.\n"; } // if (allocate_multi_mappings && only_output_unique_mappings) { // std::cerr << "WARNING: you want to output unique mappings only but you // ask to allocate multi-mappings! In this case, it won't allocate // multi-mappings and will only output unique mappings.\n"; // allocate_multi_mappings = false; //} if (mapping_parameters.max_num_best_mappings > mapping_parameters.drop_repetitive_reads) { std::cerr << "WARNING: you want to drop mapped reads with more than " << mapping_parameters.drop_repetitive_reads << " mappings. But you want to output top " << mapping_parameters.max_num_best_mappings << " best mappings. In this case, only reads with <=" << mapping_parameters.drop_repetitive_reads << " best mappings will be output.\n"; mapping_parameters.max_num_best_mappings = mapping_parameters.drop_repetitive_reads; } if (mapping_parameters.Tn5_shift) { std::cerr << "Perform Tn5 shift.\n"; } if (mapping_parameters.split_alignment) { std::cerr << "Allow split alignment.\n"; } switch (mapping_parameters.mapping_output_format) { case MAPPINGFORMAT_BED: std::cerr << "Output mappings in BED/BEDPE format.\n"; break; case MAPPINGFORMAT_TAGALIGN: std::cerr << "Output mappings in TagAlign/PairedTagAlign format.\n"; break; case MAPPINGFORMAT_PAF: std::cerr << "Output mappings in PAF format.\n"; break; case MAPPINGFORMAT_SAM: std::cerr << "Output mappings in SAM format.\n"; break; case MAPPINGFORMAT_PAIRS: std::cerr << "Output mappings in pairs format.\n"; break; default: chromap::ExitWithMessage("Unknown mapping output format!"); break; } std::cerr << "Reference file: " << mapping_parameters.reference_file_path << 
"\n"; std::cerr << "Index file: " << mapping_parameters.index_file_path << "\n"; for (size_t i = 0; i < mapping_parameters.read_file1_paths.size(); ++i) { std::cerr << i + 1 << "th read 1 file: " << mapping_parameters.read_file1_paths[i] << "\n"; } if (result.count("2") != 0) { for (size_t i = 0; i < mapping_parameters.read_file2_paths.size(); ++i) { std::cerr << i + 1 << "th read 2 file: " << mapping_parameters.read_file2_paths[i] << "\n"; } } if (result.count("b") != 0) { for (size_t i = 0; i < mapping_parameters.barcode_file_paths.size(); ++i) { std::cerr << i + 1 << "th cell barcode file: " << mapping_parameters.barcode_file_paths[i] << "\n"; } } if (result.count("barcode-whitelist") != 0) { std::cerr << "Cell barcode whitelist file: " << mapping_parameters.barcode_whitelist_file_path << "\n"; } std::cerr << "Output file: " << mapping_parameters.mapping_output_file_path << "\n"; if (result.count("matrix-output-prefix") != 0) { std::cerr << "Matrix output prefix: " << mapping_parameters.matrix_output_prefix << "\n"; } chromap::Chromap chromap_for_mapping(mapping_parameters); if (result.count("2") == 0) { // Single-end reads. switch (mapping_parameters.mapping_output_format) { case MAPPINGFORMAT_PAF: { chromap_for_mapping.MapSingleEndReads(); break; } case MAPPINGFORMAT_SAM: { chromap_for_mapping.MapSingleEndReads(); break; } case MAPPINGFORMAT_PAIRS: chromap::ExitWithMessage("No support for single-end HiC yet!"); break; case MAPPINGFORMAT_BED: case MAPPINGFORMAT_TAGALIGN: if (result.count("b") != 0) { chromap_for_mapping .MapSingleEndReads(); } else { chromap_for_mapping .MapSingleEndReads(); } break; default: chromap::ExitWithMessage("Unknown mapping output format!"); break; } } else { // Paired-end reads. 
switch (mapping_parameters.mapping_output_format) { case MAPPINGFORMAT_PAF: { chromap_for_mapping.MapPairedEndReads(); break; } case MAPPINGFORMAT_SAM: { chromap_for_mapping.MapPairedEndReads(); break; } case MAPPINGFORMAT_PAIRS: { chromap_for_mapping.MapPairedEndReads(); break; } case MAPPINGFORMAT_BED: case MAPPINGFORMAT_TAGALIGN: if (result.count("b") != 0) { chromap_for_mapping .MapPairedEndReads(); } else { chromap_for_mapping .MapPairedEndReads(); } break; default: chromap::ExitWithMessage("Unknown mapping output format!"); break; } } } else { std::cerr << options.help( {"", "Indexing", "Mapping", "Peak", "Input", "Output"}); } } } // namespace chromap int main(int argc, char *argv[]) { chromap::ChromapDriver chromap_driver; chromap_driver.ParseArgsAndRun(argc, argv); return 0; } ================================================ FILE: src/chromap_driver.h ================================================ #ifndef CHROMAP_DRIVER_H_ #define CHROMAP_DRIVER_H_ namespace chromap { class ChromapDriver { public: ChromapDriver() = default; ~ChromapDriver() = default; void ParseArgsAndRun(int argc, char *argv[]); }; } // namespace chromap #endif // CHROMAP_DRIVER_H_ ================================================ FILE: src/cxxopts.hpp ================================================ /* Copyright (c) 2014, 2015, 2016, 2017 Jarryd Beck Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef CXXOPTS_HPP_INCLUDED #define CXXOPTS_HPP_INCLUDED #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __cpp_lib_optional #include #define CXXOPTS_HAS_OPTIONAL #endif #if __cplusplus >= 201603L #define CXXOPTS_NODISCARD [[nodiscard]] #else #define CXXOPTS_NODISCARD #endif #ifndef CXXOPTS_VECTOR_DELIMITER #define CXXOPTS_VECTOR_DELIMITER ',' #endif #define CXXOPTS__VERSION_MAJOR 3 #define CXXOPTS__VERSION_MINOR 0 #define CXXOPTS__VERSION_PATCH 0 namespace cxxopts { static constexpr struct { uint8_t major, minor, patch; } version = { CXXOPTS__VERSION_MAJOR, CXXOPTS__VERSION_MINOR, CXXOPTS__VERSION_PATCH }; } // namespace cxxopts //when we ask cxxopts to use Unicode, help strings are processed using ICU, //which results in the correct lengths being computed for strings when they //are formatted for the help output //it is necessary to make sure that can be found by the //compiler, and that icu-uc is linked in to the binary. 
// Unicode build: help strings are held as ICU strings so that their lengths
// are computed correctly when the help output is formatted.
#ifdef CXXOPTS_USE_UNICODE
// NOTE(review): the header name after #include is missing in this copy;
// presumably the ICU <unicode/unistr.h> header -- confirm against upstream.
#include

namespace cxxopts
{
  using String = icu::UnicodeString;

  // Converts a UTF-8 std::string into the library's String type.
  inline
  String
  toLocalString(std::string s)
  {
    return icu::UnicodeString::fromUTF8(std::move(s));
  }

  // Iterator over an icu::UnicodeString, identified by (string pointer,
  // index). Dereferencing calls char32At(index) on the string.
  class UnicodeStringIterator : public
    std::iterator
  {
    public:

    UnicodeStringIterator(const icu::UnicodeString* string, int32_t pos)
    : s(string)
    , i(pos)
    {
    }

    value_type
    operator*() const
    {
      return s->char32At(i);
    }

    // Two iterators are equal when they refer to the same string object and
    // hold the same index.
    bool
    operator==(const UnicodeStringIterator& rhs) const
    {
      return s == rhs.s && i == rhs.i;
    }

    bool
    operator!=(const UnicodeStringIterator& rhs) const
    {
      return !(*this == rhs);
    }

    UnicodeStringIterator&
    operator++()
    {
      ++i;
      return *this;
    }

    // Returns a new iterator advanced by v positions; *this is unchanged.
    UnicodeStringIterator
    operator+(int32_t v)
    {
      return UnicodeStringIterator(s, i + v);
    }

    private:
    const icu::UnicodeString* s;
    int32_t i;
  };

  // Appends string `a` to `s` and returns `s`.
  inline
  String&
  stringAppend(String&s, String a)
  {
    return s.append(std::move(a));
  }

  // Appends `n` copies of code point `c` to `s` and returns `s`.
  inline
  String&
  stringAppend(String& s, size_t n, UChar32 c)
  {
    for (size_t i = 0; i != n; ++i)
    {
      s.append(c);
    }

    return s;
  }

  // Appends every element of [begin, end) to `s`, one at a time.
  template
  String&
  stringAppend(String& s, Iterator begin, Iterator end)
  {
    while (begin != end)
    {
      s.append(*begin);
      ++begin;
    }

    return s;
  }

  inline
  size_t
  stringLength(const String& s)
  {
    return s.length();
  }

  // Converts the library String back to a UTF-8 std::string.
  inline
  std::string
  toUTF8String(const String& s)
  {
    std::string result;
    s.toUTF8String(result);

    return result;
  }

  inline
  bool
  empty(const String& s)
  {
    return s.isEmpty();
  }
}

// begin()/end() overloads for icu::UnicodeString, returning
// UnicodeStringIterator at index 0 and at s.length() respectively.
namespace std
{
  inline
  cxxopts::UnicodeStringIterator
  begin(const icu::UnicodeString& s)
  {
    return cxxopts::UnicodeStringIterator(&s, 0);
  }

  inline
  cxxopts::UnicodeStringIterator
  end(const icu::UnicodeString& s)
  {
    return cxxopts::UnicodeStringIterator(&s, s.length());
  }
}

//ifdef CXXOPTS_USE_UNICODE
#else

// Non-Unicode build: String is plain std::string and the helpers below are
// thin wrappers around std::string operations.
namespace cxxopts
{
  using String = std::string;

  // Identity conversion: forwards the argument unchanged.
  template
  T
  toLocalString(T&& t)
  {
    return std::forward(t);
  }

  inline
  size_t
  stringLength(const String& s)
  {
    return s.length();
  }

  inline
  String&
  stringAppend(String&s, const String& a)
  {
    return s.append(a);
  }

  // Appends `n` copies of `c` to `s` (the closing brace of this function is
  // at the start of the next line of the file).
  inline
  String&
  stringAppend(String& s, size_t n, char c)
  {
    return s.append(n, c);
  // (closes the stringAppend(String&, size_t, char) overload that begins on
  // the preceding line of the file)
  }

  // Appends the range [begin, end) to `s` via std::string::append.
  template
  String&
  stringAppend(String& s, Iterator begin, Iterator end)
  {
    return s.append(begin, end);
  }

  // Identity conversion in the non-Unicode build: forwards the argument.
  template
  std::string
  toUTF8String(T&& t)
  {
    return std::forward(t);
  }

  inline
  bool
  empty(const std::string& s)
  {
    return s.empty();
  }
} // namespace cxxopts

//ifdef CXXOPTS_USE_UNICODE
#endif

namespace cxxopts
{
  namespace
  {
    // Quote characters placed around option/argument names in error
    // messages: plain apostrophes on _WIN32, typographic quotes elsewhere.
#ifdef _WIN32
    const std::string LQUOTE("\'");
    const std::string RQUOTE("\'");
#else
    const std::string LQUOTE("‘");
    const std::string RQUOTE("’");
#endif
  } // namespace

#if defined(__GNUC__)
// GNU GCC with -Weffc++ will issue a warning regarding the upcoming class, we want to silence it:
// warning: base class 'class std::enable_shared_from_this' has accessible non-virtual destructor
// NOTE(review): `ignored` is issued before `push`, so the state saved by
// `push` (and restored by the later `pop`) already has the warning disabled
// -- confirm this ordering is intended.
#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
#pragma GCC diagnostic push
// This will be ignored under other compilers like LLVM clang.
#endif
  // Abstract interface for an option's value: parsing from text, default and
  // implicit values, and type queries. All members are pure virtual.
  class Value : public std::enable_shared_from_this
  {
    public:

    virtual ~Value() = default;

    virtual
    std::shared_ptr
    clone() const = 0;

    // Parse the given text into the stored value.
    virtual void
    parse(const std::string& text) const = 0;

    // Parse with no argument text (implementations in this file parse the
    // default value string).
    virtual void
    parse() const = 0;

    virtual bool
    has_default() const = 0;

    virtual bool
    is_container() const = 0;

    virtual bool
    has_implicit() const = 0;

    virtual std::string
    get_default_value() const = 0;

    virtual std::string
    get_implicit_value() const = 0;

    virtual std::shared_ptr
    default_value(const std::string& value) = 0;

    virtual std::shared_ptr
    implicit_value(const std::string& value) = 0;

    virtual std::shared_ptr
    no_implicit_value() = 0;

    virtual bool
    is_boolean() const = 0;
  };
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif
  // Root of the library's exception hierarchy; stores the message returned
  // by what().
  class OptionException : public std::exception
  {
    public:
    explicit OptionException(std::string  message)
    : m_message(std::move(message))
    {
    }

    CXXOPTS_NODISCARD
    const char*
    what() const noexcept override
    {
      return m_message.c_str();
    }

    private:
    std::string m_message;
  };

  // Base class for errors in how options were specified (the closing `};`
  // of this class is at the start of the next line of the file).
  class OptionSpecException : public OptionException
  {
    public:

    explicit OptionSpecException(const std::string& message)
    : OptionException(message)
    {
    }
  // (closes OptionSpecException, which begins on the preceding line of the
  // file)
  };

  // Base class for the parse-time exceptions defined below.
  class OptionParseException : public OptionException
  {
    public:
    explicit OptionParseException(const std::string& message)
    : OptionException(message)
    {
    }
  };

  // Message: "Option <quoted option> already exists".
  class option_exists_error : public OptionSpecException
  {
    public:
    explicit option_exists_error(const std::string& option)
    : OptionSpecException("Option " + LQUOTE + option + RQUOTE + " already exists")
    {
    }
  };

  // Message: "Invalid option format <quoted format>".
  class invalid_option_format_error : public OptionSpecException
  {
    public:
    explicit invalid_option_format_error(const std::string& format)
    : OptionSpecException("Invalid option format " + LQUOTE + format + RQUOTE)
    {
    }
  };

  // Message for an argument that starts with '-' but is malformed.
  class option_syntax_exception : public OptionParseException {
    public:
    explicit option_syntax_exception(const std::string& text)
    : OptionParseException("Argument " + LQUOTE + text + RQUOTE +
        " starts with a - but has incorrect syntax")
    {
    }
  };

  // Message: "Option <quoted option> does not exist".
  class option_not_exists_exception : public OptionParseException
  {
    public:
    explicit option_not_exists_exception(const std::string& option)
    : OptionParseException("Option " + LQUOTE + option + RQUOTE + " does not exist")
    {
    }
  };

  // Message: "Option <quoted option> is missing an argument".
  class missing_argument_exception : public OptionParseException
  {
    public:
    explicit missing_argument_exception(const std::string& option)
    : OptionParseException(
        "Option " + LQUOTE + option + RQUOTE + " is missing an argument"
      )
    {
    }
  };

  // Message: "Option <quoted option> requires an argument".
  class option_requires_argument_exception : public OptionParseException
  {
    public:
    explicit option_requires_argument_exception(const std::string& option)
    : OptionParseException(
        "Option " + LQUOTE + option + RQUOTE + " requires an argument"
      )
    {
    }
  };

  // Message for an option that takes no argument but was given `arg`.
  class option_not_has_argument_exception : public OptionParseException
  {
    public:
    option_not_has_argument_exception
    (
      const std::string& option,
      const std::string& arg
    )
    : OptionParseException(
        "Option " + LQUOTE + option + RQUOTE +
        " does not take an argument, but argument " +
        LQUOTE + arg + RQUOTE + " given"
      )
    {
    }
  };

  // The base-class initializer and body of this class continue on the next
  // line of the file.
  class option_not_present_exception : public OptionParseException
  {
    public:
    explicit option_not_present_exception(const std::string& option)
    :
OptionParseException("Option " + LQUOTE + option + RQUOTE + " not present") { } }; class option_has_no_value_exception : public OptionException { public: explicit option_has_no_value_exception(const std::string& option) : OptionException( option.empty() ? ("Option " + LQUOTE + option + RQUOTE + " has no value") : "Option has no value") { } }; class argument_incorrect_type : public OptionParseException { public: explicit argument_incorrect_type ( const std::string& arg ) : OptionParseException( "Argument " + LQUOTE + arg + RQUOTE + " failed to parse" ) { } }; class option_required_exception : public OptionParseException { public: explicit option_required_exception(const std::string& option) : OptionParseException( "Option " + LQUOTE + option + RQUOTE + " is required but not present" ) { } }; template void throw_or_mimic(const std::string& text) { static_assert(std::is_base_of::value, "throw_or_mimic only works on std::exception and " "deriving classes"); #ifndef CXXOPTS_NO_EXCEPTIONS // If CXXOPTS_NO_EXCEPTIONS is not defined, just throw throw T{text}; #else // Otherwise manually instantiate the exception, print what() to stderr, // and exit T exception{text}; std::cerr << exception.what() << std::endl; std::exit(EXIT_FAILURE); #endif } namespace values { namespace { std::basic_regex integer_pattern ("(-)?(0x)?([0-9a-zA-Z]+)|((0x)?0)"); std::basic_regex truthy_pattern ("(t|T)(rue)?|1"); std::basic_regex falsy_pattern ("(f|F)(alse)?|0"); } // namespace namespace detail { template struct SignedCheck; template struct SignedCheck { template void operator()(bool negative, U u, const std::string& text) { if (negative) { if (u > static_cast((std::numeric_limits::min)())) { throw_or_mimic(text); } } else { if (u > static_cast((std::numeric_limits::max)())) { throw_or_mimic(text); } } } }; template struct SignedCheck { template void operator()(bool, U, const std::string&) const {} }; template void check_signed_range(bool negative, U value, const std::string& text) { 
        // (body of check_signed_range, whose signature is on the preceding
        // line of the file)
        SignedCheck::is_signed>()(negative, value, text);
      }
    } // namespace detail

    // Negates `t` into `r`. Selected through the std::true_type tag argument.
    template
    void
    checked_negate(R& r, T&& t, const std::string&, std::true_type)
    {
      // if we got to here, then `t` is a positive number that fits into
      // `R`. So to avoid MSVC C4146, we first cast it to `R`.
      // See https://github.com/jarro2783/cxxopts/issues/62 for more details.
      r = static_cast(-static_cast(t-1)-1);
    }

    // std::false_type overload: negation is rejected via throw_or_mimic.
    template
    void
    checked_negate(R&, T&&, const std::string& text, std::false_type)
    {
      throw_or_mimic(text);
    }

    // Parses `text` as an integer into `value`.
    //
    // The text must fully match integer_pattern; otherwise throw_or_mimic is
    // called. A bare zero (pattern group 4) yields 0 immediately. Otherwise
    // group 1 selects a negative result, group 2 ("0x") selects base 16
    // instead of 10, and the characters of group 3 are accumulated as digits.
    template
    void
    integer_parser(const std::string& text, T& value)
    {
      std::smatch match;
      std::regex_match(text, match, integer_pattern);

      if (match.length() == 0)
      {
        throw_or_mimic(text);
      }

      if (match.length(4) > 0)
      {
        value = 0;
        return;
      }

      using US = typename std::make_unsigned::type;

      constexpr bool is_signed = std::numeric_limits::is_signed;
      const bool negative = match.length(1) > 0;
      const uint8_t base = match.length(2) > 0 ? 16 : 10;

      auto value_match = match[3];

      // Accumulate the magnitude in the unsigned type US.
      US result = 0;

      for (auto iter = value_match.first; iter != value_match.second; ++iter)
      {
        US digit = 0;

        // The pattern admits any alphanumeric character, so each one is
        // validated here: letters a-f/A-F are digits only in base 16.
        if (*iter >= '0' && *iter <= '9')
        {
          digit = static_cast(*iter - '0');
        }
        else if (base == 16 && *iter >= 'a' && *iter <= 'f')
        {
          digit = static_cast(*iter - 'a' + 10);
        }
        else if (base == 16 && *iter >= 'A' && *iter <= 'F')
        {
          digit = static_cast(*iter - 'A' + 10);
        }
        else
        {
          throw_or_mimic(text);
        }

        // A new accumulated value smaller than the previous one is treated
        // as an error (presumably unsigned wrap-around, i.e. overflow).
        const US next = static_cast(result * base + digit);
        if (result > next)
        {
          throw_or_mimic(text);
        }

        result = next;
      }

      detail::check_signed_range(negative, result, text);

      if (negative)
      {
        checked_negate(value, result, text, std::integral_constant());
      }
      else
      {
        value = static_cast(result);
      }
    }

    // Parses `text` with operator>> on a std::stringstream; a failed stream
    // state after extraction is reported through throw_or_mimic.
    template
    void stringstream_parser(const std::string& text, T& value)
    {
      std::stringstream in(text);
      in >> value;
      if (!in) {
        throw_or_mimic(text);
      }
    }

    // parse_value overloads for the fixed-width integer types all delegate
    // to integer_parser.
    inline
    void
    parse_value(const std::string& text, uint8_t& value)
    {
      integer_parser(text, value);
    }

    inline
    void
    parse_value(const std::string& text, int8_t& value)
    {
      integer_parser(text, value);
    }

    // (the declarator of the next overload continues on the next line of the
    // file)
    inline
    void
    // (continues the `inline void` declaration begun at the end of the
    // preceding line of the file) -- integer overloads, each delegating to
    // integer_parser.
    parse_value(const std::string& text, uint16_t& value)
    {
      integer_parser(text, value);
    }

    inline
    void
    parse_value(const std::string& text, int16_t& value)
    {
      integer_parser(text, value);
    }

    inline
    void
    parse_value(const std::string& text, uint32_t& value)
    {
      integer_parser(text, value);
    }

    inline
    void
    parse_value(const std::string& text, int32_t& value)
    {
      integer_parser(text, value);
    }

    inline
    void
    parse_value(const std::string& text, uint64_t& value)
    {
      integer_parser(text, value);
    }

    inline
    void
    parse_value(const std::string& text, int64_t& value)
    {
      integer_parser(text, value);
    }

    // Boolean overload: a full match of truthy_pattern yields true, a full
    // match of falsy_pattern yields false; anything else is reported through
    // throw_or_mimic.
    inline
    void
    parse_value(const std::string& text, bool& value)
    {
      std::smatch result;
      std::regex_match(text, result, truthy_pattern);

      if (!result.empty())
      {
        value = true;
        return;
      }

      std::regex_match(text, result, falsy_pattern);
      if (!result.empty())
      {
        value = false;
        return;
      }

      throw_or_mimic(text);
    }

    // String overload: the text is stored verbatim.
    inline
    void
    parse_value(const std::string& text, std::string& value)
    {
      value = text;
    }

    // The fallback parser. It uses the stringstream parser to parse all types
    // that have not been overloaded explicitly.  It has to be placed in the
    // source code before all other more specialized templates.
    // Generic parse_value: delegates to stringstream_parser.
    template
    void
    parse_value(const std::string& text, T& value) {
      stringstream_parser(text, value);
    }

    // Vector overload: splits `text` on CXXOPTS_VECTOR_DELIMITER, parses
    // each token with parse_value and appends the results to `value`
    // (existing elements are kept).
    template
    void
    parse_value(const std::string& text, std::vector& value)
    {
      std::stringstream in(text);
      std::string token;
      while(!in.eof() && std::getline(in, token, CXXOPTS_VECTOR_DELIMITER)) {
        T v;
        parse_value(token, v);
        value.emplace_back(std::move(v));
      }
    }

#ifdef CXXOPTS_HAS_OPTIONAL
    // std::optional overload: parses into a temporary and moves it into
    // `value`.
    template
    void
    parse_value(const std::string& text, std::optional& value)
    {
      T result;
      parse_value(text, result);
      value = std::move(result);
    }
#endif

    // char overload: the text must be exactly one character long.
    inline
    void parse_value(const std::string& text, char& c)
    {
      if (text.length() != 1)
      {
        throw_or_mimic(text);
      }

      c = text[0];
    }

    // Trait: `value` is false in the primary template and true in the
    // specialization below.
    template
    struct type_is_container
    {
      static constexpr bool value = false;
    };

    template
    struct type_is_container>
    {
      static constexpr bool value = true;
    };

    // Value implementation. m_store points at the object that parse()
    // writes to: either storage owned through m_result, or a caller-supplied
    // object.
    template
    class abstract_value : public Value
    {
      using Self = abstract_value;

      public:
      // Owns its storage: allocates m_result and points m_store at it.
      abstract_value()
      : m_result(std::make_shared())
      , m_store(m_result.get())
      {
      }

      // Writes into the caller-supplied object `t`; m_result stays empty.
      explicit abstract_value(T* t)
      : m_store(t)
      {
      }

      ~abstract_value() override = default;

      abstract_value& operator=(const abstract_value&) = default;

      // Copy: if the source owns its storage, the copy gets freshly
      // allocated storage of its own (the stored value is not copied);
      // otherwise both point at the same external object. Default/implicit
      // settings are copied.
      abstract_value(const abstract_value& rhs)
      {
        if (rhs.m_result)
        {
          m_result = std::make_shared();
          m_store = m_result.get();
        }
        else
        {
          m_store = rhs.m_store;
        }

        m_default = rhs.m_default;
        m_implicit = rhs.m_implicit;
        m_default_value = rhs.m_default_value;
        m_implicit_value = rhs.m_implicit_value;
      }

      void
      parse(const std::string& text) const override
      {
        parse_value(text, *m_store);
      }

      bool
      is_container() const override
      {
        return type_is_container::value;
      }

      // Parses the default value string into the store.
      void
      parse() const override
      {
        parse_value(m_default_value, *m_store);
      }

      bool
      has_default() const override
      {
        return m_default;
      }

      bool
      has_implicit() const override
      {
        return m_implicit;
      }

      // Records a default value and returns this object for chaining.
      std::shared_ptr
      default_value(const std::string& value) override
      {
        m_default = true;
        m_default_value = value;
        return shared_from_this();
      }

      // Records an implicit value (the assignment and the rest of this
      // method continue on the next line of the file).
      std::shared_ptr
      implicit_value(const std::string& value) override
      {
        m_implicit = true;
        m_implicit_value =
        // (completes abstract_value::implicit_value, which begins on the
        // preceding line of the file)
        value;
        return shared_from_this();
      }

      // Clears the implicit flag and returns this object for chaining.
      std::shared_ptr
      no_implicit_value() override
      {
        m_implicit = false;
        return shared_from_this();
      }

      std::string
      get_default_value() const override
      {
        return m_default_value;
      }

      std::string
      get_implicit_value() const override
      {
        return m_implicit_value;
      }

      bool
      is_boolean() const override
      {
        return std::is_same::value;
      }

      // Returns the parsed value: *m_store when set, otherwise *m_result.
      const T&
      get() const
      {
        if (m_store == nullptr)
        {
          return *m_result;
        }
        return *m_store;
      }

      protected:
      std::shared_ptr m_result{};
      T* m_store{};

      bool m_default = false;
      bool m_implicit = false;

      std::string m_default_value{};
      std::string m_implicit_value{};
    };

    // Concrete value type: inherits abstract_value's constructors and
    // implements clone() by copy-constructing itself.
    template
    class standard_value : public abstract_value
    {
      public:
      using abstract_value::abstract_value;

      CXXOPTS_NODISCARD
      std::shared_ptr
      clone() const override
      {
        return std::make_shared>(*this);
      }
    };

    // Explicit specialization: every constructor sets the default value to
    // "false" and the implicit value to "true".
    template <>
    class standard_value : public abstract_value
    {
      public:
      ~standard_value() override = default;

      standard_value()
      {
        set_default_and_implicit();
      }

      explicit standard_value(bool* b)
      : abstract_value(b)
      {
        set_default_and_implicit();
      }

      std::shared_ptr
      clone() const override
      {
        return std::make_shared>(*this);
      }

      private:

      void
      set_default_and_implicit()
      {
        m_default = true;
        m_default_value = "false";
        m_implicit = true;
        m_implicit_value = "true";
      }
    };
  } // namespace values

  // Factory: a value object that owns its storage.
  template
  std::shared_ptr
  value()
  {
    return std::make_shared>();
  }

  // Factory: a value object that writes into the caller's variable `t`.
  template
  std::shared_ptr
  value(T& t)
  {
    return std::make_shared>(&t);
  }

  class OptionAdder;

  // Describes one option: short name, long name, description and value
  // object, plus a hash computed from the names.
  class OptionDetails
  {
    public:
    OptionDetails
    (
      std::string short_,
      std::string long_,
      String desc,
      std::shared_ptr val
    )
    : m_short(std::move(short_))
    , m_long(std::move(long_))
    , m_desc(std::move(desc))
    , m_value(std::move(val))
    , m_count(0)
    {
      // Hash of the long name concatenated with the short name.
      m_hash = std::hash{}(m_long + m_short);
    }

    // Copies the description and count and clones the value object.
    // NOTE(review): m_short, m_long and m_hash are not copied here and keep
    // their in-class initial values -- confirm this is intended.
    OptionDetails(const OptionDetails& rhs)
    : m_desc(rhs.m_desc)
    , m_value(rhs.m_value->clone())
    , m_count(rhs.m_count)
    {
    }

    OptionDetails(OptionDetails&& rhs) = default;

    CXXOPTS_NODISCARD
    const String&
    description() const
    {
      return m_desc;
    }

    // (the `const` qualifier and body of value() continue on the next line
    // of the file)
    CXXOPTS_NODISCARD
    const Value&
    value()
    // (completes OptionDetails::value(), declared at the end of the preceding
    // line of the file)
    const {
        return *m_value;
    }

    // Returns a clone of the value object.
    CXXOPTS_NODISCARD
    std::shared_ptr
    make_storage() const
    {
      return m_value->clone();
    }

    CXXOPTS_NODISCARD
    const std::string&
    short_name() const
    {
      return m_short;
    }

    CXXOPTS_NODISCARD
    const std::string&
    long_name() const
    {
      return m_long;
    }

    size_t
    hash() const
    {
      return m_hash;
    }

    private:
    std::string m_short{};
    std::string m_long{};
    String m_desc{};
    std::shared_ptr m_value{};
    int m_count;

    size_t m_hash{};
  };

  // Plain record describing one option for help output.
  struct HelpOptionDetails
  {
    std::string s;
    std::string l;
    String desc;
    bool has_default;
    std::string default_value;
    bool has_implicit;
    std::string implicit_value;
    std::string arg_help;
    bool is_container;
    bool is_boolean;
  };

  // A named group of options for help output.
  struct HelpGroupDetails
  {
    std::string name{};
    std::string description{};
    std::vector options{};
  };

  // Parsed state of one option: its value object, how many times it was
  // given, and whether its default was applied.
  class OptionValue
  {
    public:
    // Records one occurrence of the option and parses `text` into the value.
    void
    parse
    (
      const std::shared_ptr& details,
      const std::string& text
    )
    {
      ensure_value(details);
      ++m_count;
      m_value->parse(text);
      m_long_name = &details->long_name();
    }

    // Applies the option's default value; m_count is not incremented.
    void
    parse_default(const std::shared_ptr& details)
    {
      ensure_value(details);
      m_default = true;
      m_long_name = &details->long_name();
      m_value->parse();
    }

#if defined(__GNUC__)
#if __GNUC__ <= 10 && __GNUC_MINOR__ <= 1
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Werror=null-dereference"
#endif
#endif

    // Number of times the option was parsed from input (see parse()).
    CXXOPTS_NODISCARD
    size_t
    count() const noexcept
    {
      return m_count;
    }

#if defined(__GNUC__)
#if __GNUC__ <= 10 && __GNUC_MINOR__ <= 1
#pragma GCC diagnostic pop
#endif
#endif

    // TODO: maybe default options should count towards the number of arguments
    CXXOPTS_NODISCARD
    bool
    has_default() const noexcept
    {
      return m_default;
    }

    // Returns the stored value. If no value object exists, reports the error
    // through throw_or_mimic, passing the option's long name when known and
    // an empty string otherwise.
    template
    const T&
    as() const
    {
      if (m_value == nullptr) {
          throw_or_mimic(
              m_long_name == nullptr ? "" : *m_long_name);
      }

      // Without RTTI the downcast is unchecked; with RTTI it is a
      // dynamic_cast to a reference type.
#ifdef CXXOPTS_NO_RTTI
      return static_cast&>(*m_value).get();
#else
      return dynamic_cast&>(*m_value).get();
#endif
    }

    private:
    // Creates the value storage from `details` on first use.
    void
    ensure_value(const std::shared_ptr& details)
    {
      if (m_value == nullptr)
      {
        m_value = details->make_storage();
      }
    }


    const std::string* m_long_name = nullptr;
    // Holding this pointer is safe, since OptionValue's only exist in key-value pairs,
    // where the key has the string we point to.
    std::shared_ptr m_value{};
    size_t m_count = 0;
    bool m_default = false;
  };

  // A key/value pair of strings; as() parses the value string on demand.
  class KeyValue
  {
    public:
    KeyValue(std::string key_, std::string value_)
    : m_key(std::move(key_))
    , m_value(std::move(value_))
    {
    }

    CXXOPTS_NODISCARD
    const std::string&
    key() const
    {
      return m_key;
    }

    CXXOPTS_NODISCARD
    const std::string&
    value() const
    {
      return m_value;
    }

    // Parses the stored string with values::parse_value and returns the
    // result by value.
    template
    T
    as() const
    {
      T result;
      values::parse_value(m_value, result);
      return result;
    }

    private:
    std::string m_key;
    std::string m_value;
  };

  using ParsedHashMap = std::unordered_map;
  using NameHashMap = std::unordered_map;

  // Result of a parse. Lookups are two-step: an option name is resolved
  // through m_keys, and the mapped key is then looked up in m_values.
  class ParseResult
  {
    public:

    ParseResult() = default;
    ParseResult(const ParseResult&) = default;

    ParseResult(NameHashMap&& keys, ParsedHashMap&& values, std::vector sequential, std::vector&& unmatched_args)
    : m_keys(std::move(keys))
    , m_values(std::move(values))
    , m_sequential(std::move(sequential))
    , m_unmatched(std::move(unmatched_args))
    {
    }

    ParseResult& operator=(ParseResult&&) = default;
    ParseResult& operator=(const ParseResult&) = default;

    // Occurrence count for option `o`; returns 0 when the name is unknown or
    // has no parsed entry (this lookup never reports an error).
    size_t
    count(const std::string& o) const
    {
      auto iter = m_keys.find(o);
      if (iter == m_keys.end())
      {
        return 0;
      }

      auto viter = m_values.find(iter->second);

      if (viter == m_values.end())
      {
        return 0;
      }

      return viter->second.count();
    }

    // Returns the OptionValue for `option`; unlike count(), a failed lookup
    // at either step is reported through throw_or_mimic.
    const OptionValue&
    operator[](const std::string& option) const
    {
      auto iter = m_keys.find(option);

      if (iter == m_keys.end())
      {
        throw_or_mimic(option);
      }

      auto viter = m_values.find(iter->second);

      if (viter == m_values.end())
      {
        throw_or_mimic(option);
      }

      return viter->second;
    }

    // (the return expression of arguments() continues on the next line of
    // the file)
    const std::vector&
    arguments() const
    {
      return
      // (completes ParseResult::arguments(), which begins on the preceding
      // line of the file)
      m_sequential;
    }

    const std::vector&
    unmatched() const
    {
      return m_unmatched;
    }

    private:
    NameHashMap m_keys{};
    ParsedHashMap m_values{};
    std::vector m_sequential{};
    std::vector m_unmatched{};
  };

  // Bundles the pieces of one option declaration: option string,
  // description, value object and argument help text.
  struct Option
  {
    Option
    (
      std::string opts,
      std::string desc,
      std::shared_ptr  value = ::cxxopts::value(),
      std::string arg_help = ""
    )
    : opts_(std::move(opts))
    , desc_(std::move(desc))
    , value_(std::move(value))
    , arg_help_(std::move(arg_help))
    {
    }

    std::string opts_;
    std::string desc_;
    std::shared_ptr value_;
    std::string arg_help_;
  };

  using OptionMap = std::unordered_map>;
  using PositionalList = std::vector;
  using PositionalListIterator = PositionalList::const_iterator;

  // Parser over a set of option definitions. Member functions are only
  // declared here; their definitions are outside this part of the file.
  class OptionParser
  {
    public:
    // Stores references to `options` and `positional` (see the reference
    // members below), so both must outlive the parser.
    OptionParser(const OptionMap& options, const PositionalList& positional, bool allow_unrecognised)
    : m_options(options)
    , m_positional(positional)
    , m_allow_unrecognised(allow_unrecognised)
    {
    }

    ParseResult
    parse(int argc, const char* const* argv);

    bool
    consume_positional(const std::string& a, PositionalListIterator& next);

    void
    checked_parse_arg
    (
      int argc,
      const char* const* argv,
      int& current,
      const std::shared_ptr& value,
      const std::string& name
    );

    void
    add_to_option(OptionMap::const_iterator iter, const std::string& option, const std::string& arg);

    void
    parse_option
    (
      const std::shared_ptr& value,
      const std::string& name,
      const std::string& arg = ""
    );

    void
    parse_default(const std::shared_ptr& details);

    private:

    void finalise_aliases();

    const OptionMap& m_options;
    const PositionalList& m_positional;

    std::vector m_sequential{};
    bool m_allow_unrecognised;

    ParsedHashMap m_parsed{};
    NameHashMap m_keys{};
  };

  // User-facing option set. The constructor's initial settings are visible
  // in the initializer list below; its body and the rest of the class
  // continue on the next line of the file.
  class Options
  {
    public:

    explicit Options(std::string program, std::string help_string = "")
    : m_program(std::move(program))
    , m_help_string(toLocalString(std::move(help_string)))
    , m_custom_help("[OPTION...]")
    , m_positional_help("positional parameters")
    , m_show_positional(false)
    , m_allow_unrecognised(false)
    , m_width(76)
    , m_tab_expansion(false)
    , m_options(std::make_shared())
    {
} Options& positional_help(std::string help_text) { m_positional_help = std::move(help_text); return *this; } Options& custom_help(std::string help_text) { m_custom_help = std::move(help_text); return *this; } Options& show_positional_help() { m_show_positional = true; return *this; } Options& allow_unrecognised_options() { m_allow_unrecognised = true; return *this; } Options& set_width(size_t width) { m_width = width; return *this; } Options& set_tab_expansion(bool expansion=true) { m_tab_expansion = expansion; return *this; } ParseResult parse(int argc, const char* const* argv); OptionAdder add_options(std::string group = ""); void add_options ( const std::string& group, std::initializer_list