Repository: haowenz/chromap
Branch: master
Commit: 949043c782e6
Files: 64
Total size: 765.5 KB
Directory structure:
gitextract_5tsxk3xx/
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ └── bug_report.md
│ └── workflows/
│ └── ci.yml
├── LICENSE
├── Makefile
├── README.md
├── chromap.1
├── docs/
│ ├── _config.yml
│ ├── chromap.html
│ └── index.md
├── src/
│ ├── alignment.cc
│ ├── alignment.h
│ ├── barcode_translator.h
│ ├── bed_mapping.h
│ ├── candidate.h
│ ├── candidate_position_generating_config.h
│ ├── candidate_processor.cc
│ ├── candidate_processor.h
│ ├── chromap.cc
│ ├── chromap.h
│ ├── chromap_driver.cc
│ ├── chromap_driver.h
│ ├── cxxopts.hpp
│ ├── draft_mapping.h
│ ├── draft_mapping_generator.cc
│ ├── draft_mapping_generator.h
│ ├── feature_barcode_matrix.cc
│ ├── feature_barcode_matrix.h
│ ├── feature_barcode_matrix_writer.h
│ ├── hit_utils.h
│ ├── index.cc
│ ├── index.h
│ ├── index_parameters.h
│ ├── index_utils.h
│ ├── khash.h
│ ├── kseq.h
│ ├── ksw.cc
│ ├── ksw.h
│ ├── mapping.h
│ ├── mapping_generator.cc
│ ├── mapping_generator.h
│ ├── mapping_in_memory.h
│ ├── mapping_metadata.h
│ ├── mapping_parameters.h
│ ├── mapping_processor.h
│ ├── mapping_writer.cc
│ ├── mapping_writer.h
│ ├── minimizer.h
│ ├── minimizer_generator.cc
│ ├── minimizer_generator.h
│ ├── mmcache.hpp
│ ├── paf_mapping.h
│ ├── paired_end_mapping_metadata.h
│ ├── pairs_mapping.h
│ ├── sam_mapping.h
│ ├── sequence_batch.cc
│ ├── sequence_batch.h
│ ├── sequence_effective_range.h
│ ├── strand.h
│ ├── summary_metadata.h
│ ├── temp_mapping.h
│ └── utils.h
└── test/
├── read1.fq
├── read2.fq
└── ref.fa
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve
title: "[BUG] XXX"
labels: bug
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
1. Describe the data you are using and provide a sample of your data if possible. For example, the paired-end reads are generated by 10x scATAC-seq. The read length is 50bp and the barcode length is 16bp.
2. Get the Chromap version by running ```chromap -v``` and post it here.
3. Provide the full command line you used to run Chromap.
4. Provide the log output by Chromap and highlight the error message.
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Environment (please complete the following information):**
- OS: [e.g. Ubuntu 22.10]
- How you installed Chromap [e.g. Bioconda, downloaded binary, built from source]
- If you compiled Chromap from source yourself, please provide the compiler version [e.g. GCC 7.4.0]
**Additional context**
Add any other context about the problem here.
================================================
FILE: .github/workflows/ci.yml
================================================
# Continuous integration: builds chromap and runs `./chromap -h` on Ubuntu and macOS.
name: CI
# Run for pushes to master and for pull requests targeting master.
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
# Xcode developer directory, used by the macOS job below to locate the MacOSX SDK.
env:
DEVELOPER_DIR: /Applications/Xcode.app/Contents/Developer
jobs:
# Linux job, repeated once per compiler listed in the matrix.
ubuntu:
runs-on: ubuntu-latest
strategy:
matrix:
compiler: [g++, clang++]
steps:
- uses: actions/checkout@v2
# Install clang together with the OpenMP runtime and development packages.
- name: install-deps
run:
sudo apt-get update; sudo apt-get install -y clang libomp5 libomp-dev
# Build with the matrix compiler by overriding the Makefile's CXX variable.
- name: build-chromap
run:
make CXX=${{ matrix.compiler }}
# Smoke test: only checks that the freshly built binary runs with -h.
- name: test-chromap
run:
./chromap -h
# macOS job: OpenMP is built from source as a static library and copied into the Xcode SDK.
macos:
runs-on: macos-latest
strategy:
matrix:
compiler: [clang++]
steps:
- uses: actions/checkout@v2
# Cache the OpenMP install tree so the source build below only runs on a cache miss.
- name: cache-openmp
id: cache-openmp
uses: actions/cache@v3
with:
path: openmp-install
key: openmp-macos-install
# Skipped when the cache-openmp step reported a hit.
- name: build-openmp
if: steps.cache-openmp.outputs.cache-hit != 'true'
run: |
wget https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.0/openmp-14.0.0.src.tar.xz
tar -xf openmp-14.0.0.src.tar.xz
cd openmp-14.0.0.src
sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S
sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S
mkdir -p build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \
-DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF ..
cmake --build . -j 3
cmake --build . --target install
mkdir $GITHUB_WORKSPACE/openmp-install
cp -r install/* $GITHUB_WORKSPACE/openmp-install
# Copy the OpenMP headers and libomp.a into the SDK's usr/include and usr/lib.
- name: install-openmp
run: |
sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include
sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib
# Override CXXFLAGS/LDFLAGS to build for x86_64 against the Xcode SDK and link with -lomp.
- name: build-chromap
run:
make CXX=${{ matrix.compiler }} CXXFLAGS="-arch x86_64 -isysroot $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk -std=c++11 -Wall -O3 -Xclang -fopenmp -msse4.1" LDFLAGS="-L$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib -rpath $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib -lm -lz -lomp"
# Smoke test: only checks that the freshly built binary runs with -h.
- name: test-chromap
run:
./chromap -h
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2019 Haowen Zhang, Li Song, X. Shirley Liu, Heng Li
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: Makefile
================================================
# Build rules for the chromap executable.
# Compiler, compile flags and link flags.
CXX=g++
CXXFLAGS=-std=c++11 -Wall -O3 -fopenmp -msse4.1
LDFLAGS=-lm -lz
# Source files to compile, given relative to $(src_dir).
cpp_source=sequence_batch.cc index.cc minimizer_generator.cc candidate_processor.cc alignment.cc feature_barcode_matrix.cc ksw.cc draft_mapping_generator.cc mapping_generator.cc mapping_writer.cc chromap.cc chromap_driver.cc
src_dir=src
objs_dir=objs
# Map every foo.cc in cpp_source to $(objs_dir)/foo.o.
objs+=$(patsubst %.cc,$(objs_dir)/%.o,$(cpp_source))
exec=chromap
# When asan is set to any non-empty value, add AddressSanitizer and debug flags.
ifneq ($(asan),)
CXXFLAGS+=-fsanitize=address -g
LDFLAGS+=-fsanitize=address -ldl -g
endif
# Default target: create the object directory, then link the executable.
all: dir $(exec)
dir:
mkdir -p $(objs_dir)
# Link all object files into the final binary.
$(exec): $(objs)
$(CXX) $(CXXFLAGS) $(objs) -o $(exec) $(LDFLAGS)
# Pattern rule: compile one source file from $(src_dir) into its object file.
$(objs_dir)/%.o: $(src_dir)/%.cc
$(CXX) $(CXXFLAGS) -c $< -o $@
# Remove the executable and the object directory; the leading '-' ignores rm errors.
.PHONY: clean
clean:
-rm -rf $(exec) $(objs_dir)
================================================
FILE: README.md
================================================
[](https://github.com/haowenz/chromap/actions/workflows/ci.yml) [](https://github.com/haowenz/chromap/blob/master/LICENSE) [](https://anaconda.org/bioconda/chromap) [](https://anaconda.org/bioconda/chromap) [](https://anaconda.org/bioconda/chromap)
## Getting Started
```sh
git clone https://github.com/haowenz/chromap.git
cd chromap && make
# create an index first and then map
./chromap -i -r test/ref.fa -o ref.index
./chromap -x ref.index -r test/ref.fa -1 test/read1.fq -2 test/read2.fq -o test.bed
# use presets (no test data)
./chromap --preset atac -x index -r ref.fa -1 read1.fq -2 read2.fq -o aln.bed # ATAC-seq reads
./chromap --preset atac -x index -r ref.fa -1 read1.fq -2 read2.fq -o aln.bed \
-b barcode.fq.gz --barcode-whitelist whitelist.txt # scATAC-seq reads
./chromap --preset chip -x index -r ref.fa -1 read1.fq -2 read2.fq -o aln.bed # ChIP-seq reads
./chromap --preset hic -x index -r ref.fa -1 read1.fq -2 read2.fq -o aln.pairs # Hi-C reads and pairs output
./chromap --preset hic -x index -r ref.fa -1 read1.fq -2 read2.fq --SAM -o aln.sam # Hi-C reads and SAM output
```
## Table of Contents
- [Getting Started](#started)
- [User Guide](#uguide)
- [Installation](#install)
- [General usage](#general)
- [Use cases](#cases)
- [Map ChIP-seq short reads](#map-chip)
- [Map ATAC-seq/scATAC-seq short reads](#map-atac)
- [Map Hi-C short reads](#map-hic)
- [Summarizing mapping statistics/quality control](#atacseq-qc)
- [Summary File](#summaryfile)
- [Estimating FRiP](#estfrip)
- [Features to assist in doublet detection](#doublet)
- [Getting help](#help)
- [Citing Chromap](#cite)
## User Guide
Chromap is an ultrafast method for aligning and preprocessing high throughput chromatin profiles. Typical use cases include: (1) trimming sequencing adapters, mapping bulk ATAC-seq or ChIP-seq genomic reads to the human genome and removing duplicates; (2) trimming sequencing adapters, mapping single cell ATAC-seq genomic reads to the human genome, correcting barcodes, removing duplicates and performing Tn5 shift; (3) split alignment of Hi-C reads against a reference genome. In all these three cases, Chromap is 10-20 times faster while being accurate.
### Installation
To compile from the source, you need to have the GCC compiler with version>=7.3.0, GNU make and zlib development files installed. Then type `make` in the source code directory to compile.
Chromap is also available on [bioconda][bioconda]. Thus you can easily install Chromap with Conda
```sh
conda install -c conda-forge -c bioconda chromap
```
### General usage
Before mapping, an index of the reference needs to be created and saved on the disk:
```sh
chromap -i -r ref.fa -o index
```
Users can specify the minimum fragment length expected in their sequencing experiments, e.g. the read length, with **--min-frag-length**. Chromap will then choose a proper k-mer length and window size to build the index. For the human genome, it only takes a few minutes to build the index. Without any preset parameters, Chromap takes a reference database and a query sequence file as input and produces approximate mappings, without base-level alignment, in the [BED format][bed]:
```sh
chromap -x index -r ref.fa -1 query.fq -o approx-mapping.bed
```
You may ask Chromap to output alignments in the [SAM format][sam]:
```sh
chromap -x index -r ref.fa -1 query.fq --SAM -o alignment.sam
```
But note that the processing of SAM files is not fully optimized and can be slow. Thus generating the output in SAM format is not preferred and should be avoided when possible. Chromap can take multiple input read files:
```sh
chromap -x index -r ref.fa -1 query1.fq,query2.fq,query3.fq --SAM -o alignment.sam
```
Chromap also supports wildcards in the read file names and will find all matched read files. To use this function, the read file names ***must*** be put in quotation marks:
```sh
chromap -x index -r ref.fa -1 "query*.fq" --SAM -o alignment.sam
```
Chromap works with gzip'd FASTA and FASTQ formats as input. You don't need to convert between FASTA and FASTQ or decompress gzip'd files first.
***Importantly***, it should be noted that once you build the index, indexing parameters such as **-k**, **-w** and **--min-frag-length** can't be changed during mapping. If you are running Chromap for different data types, you will probably need to keep multiple indexes generated with different parameters.
This makes Chromap different from BWA which always uses the same index regardless of query data types. Chromap can build the human genome index file in a few minutes.
Detailed explanations for the options can be found at the [manpage][manpage].
### Use cases
To support different data types (e.g. ChIP-seq, Hi-C, ATAC-seq), Chromap needs to be tuned for optimal performance and accuracy. It is usually recommended to choose a preset with option **--preset**, which sets multiple parameters at the same time.
#### Map ChIP-seq short reads
```sh
chromap --preset chip -x index -r ref.fa -1 read1.fq.gz -2 read2.fq.gz -o aln.bed # ChIP-seq reads
```
This set of parameters is tuned for mapping ChIP-seq reads. Chromap will map the paired-end reads with max insert size up to 2000 (**-l 2000**) and then remove duplicates (**--remove-pcr-duplicates**) using the low memory mode (**--low-mem**). The output is in BED format (**--BED**). In the output BED file, each row is a mapping of a fragment (i.e., a read pair) and the columns are
chrom chrom_start chrom_end N mapq strand
The strand here is the strand of the first read in a read pair (specified by **-1**). If the mapping start and end locations of each read in a read pair are desired, **--TagAlign** should be used to override **--BED** in the preset parameters as follows:
```sh
chromap --preset chip -x index -r ref.fa -1 read1.fq.gz -2 read2.fq.gz --TagAlign -o aln.tagAlign # ChIP-seq reads
```
For each read pair, there will be two rows in the output file, one for each read in the pair respectively. The meaning of the columns remains the same.
#### Map ATAC-seq/scATAC-seq short reads
```sh
chromap --preset atac -x index -r ref.fa -1 read1.fq.gz -2 read2.fq.gz -o aln.bed # ATAC-seq reads
chromap --preset atac -x index -r ref.fa -1 read1.fq.gz -2 read2.fq.gz -o aln.bed\
-b barcode.fq.gz --barcode-whitelist whitelist.txt # scATAC-seq reads
```
This set of parameters is tuned for mapping ATAC-seq/scATAC-seq reads. Chromap will trim the adapters on 3' end (**--trim-adapters**), map the paired-end reads with max insert size up to 2000 (**-l 2000**) and then remove duplicates at cell level (**--remove-pcr-duplicates-at-cell-level**). Tn5 shift will also be applied to the fragments (**--Tn5-shift**). The forward mapping start positions are increased by 4bp and the reverse mapping end positions are decreased by 5bp. The processing is run in the low memory mode (**--low-mem**).
If no barcode whitelist file is given, Chromap will skip barcode correction. When barcodes and a whitelist are given as input, by default Chromap will estimate barcode abundance and use this information to perform barcode correction with up to 1 Hamming distance from a whitelist barcode. By setting **--bc-error-threshold** to 2, Chromap is able to correct barcodes with up to 2 Hamming distance from a whitelist barcode. User can also increase the probability threshold to make a correction by setting **--bc-probability-threshold** (set to 0.9 by default) to a large value (e.g., 0.975) to only make reliable corrections. For scATAC-seq data with multiple read and barcode files, you can use "," to concatenate multiple input files as the example [above](#general).
Chromap also supports user-defined barcode formats, including the case where barcode and genomic data are mixed in the same read. Users can specify the sequence structure through the option **--read-format**. The value is a comma-separated string, and each field in the string is itself a colon-separated string of the form
[r1|r2|bc]:start:end:strand
The start and end are inclusive and -1 means the end of the read. Users may use multiple fields to specify non-consecutive segments, e.g. bc:0:15,bc:32:-1. The strand is represented by the '+' or '-' symbol; if it is '-', the barcode will be reverse-complemented after extraction. The strand symbol can be omitted if it is '+' and is ignored on r1 and r2. For example, when the barcode is in the first 16bp of read1, one can use the option `-1 read1.fq.gz -2 read2.fq.gz --barcode read1.fq.gz --read-format bc:0:15,r1:16:-1`.
The output file formats for bulk and single-cell data are different except for the first three columns. For bulk data, the columns are
chrom chrom_start chrom_end N mapq strand duplicate_count
For single-cell data, the columns are
chrom chrom_start chrom_end barcode duplicate_count
the same as the definition of the fragment file in [CellRanger][cellranger]. Note that chrom_end is open-ended. This output fragment file can be used as input to downstream analysis tools such as [MAESTRO][MAESTRO], [ArchR][ArchR], [signac][signac], etc.
Besides, Chromap can translate input cell barcodes to another set of barcodes. Users can specify the translation file through the option **--barcode-translate**. The translation file is a two-column tsv/csv file with the translated barcode on the first column and the original barcode on the second column. This is useful for 10x Multiome data, where scATAC-seq and scRNA-seq data use different sets of barcodes. This option also supports combinatorial barcoding, such as SHARE-seq. Chromap can translate each barcode segment provided in the second column to the ID in the first column and add "-" to concatenate the IDs in the output.
#### Map Hi-C short reads
```sh
chromap --preset hic -x index -r ref.fa -1 read1.fa -2 read2.fa -o aln.pairs # Hi-C reads and pairs output
```
Chromap will perform split alignment (**--split-alignment**) on Hi-C reads and output mappings in [pairs][pairs] format (**--pairs**), which is used in [4DN Hi-C data processing pipeline][4DN]. Some Hi-C data analysis pipelines may require that the reads be sorted in a specific chromosome order other than the one in the index. Therefore, Chromap provides the option **--chr-order** to specify the alignment order, and **--pairs-natural-chr-order** for flipping the pair in the pairs format.
### Summarizing mapping statistics/quality control
Chromap allows you to summarize the dataset's mapping statistics as well as quality metrics at either a *bulk* or *single cell* level. To enable this feature, users can specify a file path using this option, **--summary [FILE]**, where a csv file will be saved.
This summary file will output a series of metrics for each barcode (or the overall dataset if it is bulk). Here are the different columns contained within the summary file:
```sh
barcode,total,duplicate,unmapped,lowmapq,cachehit,fric,estfrip,numcacheslots
```
- `barcode` - Barcode label for cell
- `total` - Total number of fragments
- `duplicate` - Number of duplicate fragments
- `unmapped` - Number of unmapped fragments
- `lowmapq` - Number of fragments with a low MAPQ
- `cachehit` - Number of fragments that were found in the chromap cache during alignment
- `fric` - Fraction of fragments in the chromap cache
- `estfrip` - Estimated FRiP value based on a linear model ([See below for more details](#estfrip))
- `numcacheslots` - Number of unique associated cache slots for this barcode (Relevant feature for doublet detection, [see below for more](#doublet))
The summary contains metrics relevant to the mappability of fragments from each barcode.
However, it also contains metrics (`estfrip` and `numcacheslots`) relevant to quality control for chromatin profiling assays like scATAC-seq. These cache-related metrics require overall deep sequencing depth, so it is more useful for single-cell data.
The next two sections briefly describe these two metrics and how they can be useful for users.
#### Estimating FRiP
The `estfrip` column in Chromap's summary file represents an estimate of the FRiP score (Fraction of Reads in Peak Regions) computed by Chromap.
Chromap uses a simple multi-variate linear model to estimate the FRiP for each barcode and the features used in this model are `fric`, `duplicate`, `unmapped` and `lowmapq`.
Typically, the FRiP score is used to assess the quality of chromatin profiles, where typically the higher the FRiP score the better.
For users, this `estfrip` can be used to quickly gauge the quality of the data by plotting all the values in a histogram and looking to see if you get a multi-modal distribution.
In addition, when combining Chromap with downstream analysis tools such as [SnapATAC2](https://github.com/kaizhang/SnapATAC2) that perform clustering, the `estfrip` can be used to quickly identify any specific clusters that are lower quality than the rest.
**An important note to users**, the `estfrip` values for every barcode should not be taken by themselves and used as the true FRiP score.
These estimates are mainly intended to be used for quality control at a dataset level where we compare different `estfrip` values to each other.
#### Features to assist in doublet detection
The `numcacheslots` column in Chromap's summary file estimates the number of unique cache slots queried for each barcode during the alignment. This feature can be useful in assisting users for doublet detection/filtering.
Typically for doublet detection in single-cell datasets, a simple and naive metric used to identify potential doublets is the number of fragments in cells (i.e. more reads, more likely a doublet).
Chromap uses the simple intuition that barcodes with a higher number of peaks than usual could be doublets. The number of unique cache slots that are queried can be seen as a proxy for the number of peaks. In our experiments, using `numcacheslots` yields a larger AUC compared to using `total` for binary classification of doublets. Therefore, users can potentially use this metric as an additional check/feature along with other doublet-detection specific methods.
### Getting help
Detailed description of Chromap command line options and optional tags can be displayed by running Chromap with **-h** or be found at the [manpage][manpage]. If you encounter bugs or have further questions or requests, you can raise an issue at the [issue page][issue].
### Citing Chromap
If you use Chromap, please cite:
> Zhang, H., Song, L., Wang, X., Cheng, H., Wang, C., Meyer, C. A., ..., Liu, X. S., Li, H. (2021). Fast alignment and preprocessing of chromatin profiles with Chromap. Nature communications, 12(1), 1-6.
> https://doi.org/10.1038/s41467-021-26865-w
The summary file for QC is described in the manuscript:
> Ahmed, O., Zhang, H., Langmead, B., Song, L. (2025). Quality control of single-cell ATAC-seq data without peak calling using Chromap. Biorxiv.
> https://doi.org/10.1101/2025.07.15.664951
[bed]: https://genome.ucsc.edu/FAQ/FAQformat.html#format1
[paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
[sam]: https://samtools.github.io/hts-specs/SAMv1.pdf
[pairs]: https://github.com/4dn-dcic/pairix/blob/master/pairs_format_specification.md
[4DN]: https://data.4dnucleome.org/resources/data-analysis/hi_c-processing-pipeline
[minimap]: https://github.com/lh3/minimap
[release]: https://github.com/haowenz/chromap/releases
[issue]: https://github.com/haowenz/chromap/issues
[cellranger]: https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/output/fragments
[manpage]: https://haowenz.github.io/chromap/chromap.html
[bioconda]: https://anaconda.org/bioconda/chromap
[ArchR]: https://www.archrproject.com/index.html
[MAESTRO]: https://github.com/liulab-dfci/MAESTRO
[signac]: https://satijalab.org/signac/articles/pbmc_vignette.html
================================================
FILE: chromap.1
================================================
.TH chromap 1 "25 Jan 2024" "chromap-0.2.6 (r490)" "Bioinformatics tools"
.SH NAME
.PP
chromap - fast alignment and preprocessing of chromatin profiles
.SH SYNOPSIS
* Indexing the reference genome:
.RS 4
chromap
.B -i
.RB [ -k
.IR kmer ]
.RB [ -w
.IR miniWinSize ]
.B -r
.I ref.fa
.B -o
.I ref.index
.RE
* Mapping (sc)ATAC-seq reads:
.RS 4
chromap
.B --preset
.I atac
.B -r
.I ref.fa
.B -x
.I ref.index
.B -1
.I read1.fq
.B -2
.I read2.fq
.B -o
.I aln.bed
.RB [ -b
.IR barcode.fq.gz ]
.RB [ --barcode-whitelist
.IR whitelist.txt ]
.RE
* Mapping ChIP-seq reads:
.RS 4
chromap
.B --preset
.I chip
.B -r
.I ref.fa
.B -x
.I ref.index
.B -1
.I read1.fq
.B -2
.I read2.fq
.B -o
.I aln.bed
.RE
* Mapping Hi-C reads:
.RS 4
chromap
.B --preset
.I hic
.B -r
.I ref.fa
.B -x
.I ref.index
.B -1
.I read1.fq
.B -2
.I read2.fq
.B -o
.I aln.pairs
.br
chromap
.B --preset
.I hic
.B -r
.I ref.fa
.B -x
.I ref.index
.B -1
.I read1.fq
.B -2
.I read2.fq
.B --SAM
.B -o
.I aln.sam
.RE
.SH DESCRIPTION
.PP
Chromap is an ultrafast method for aligning and preprocessing high throughput
chromatin profiles. Typical use cases include: (1) trimming sequencing adapters,
mapping bulk ATAC-seq or ChIP-seq genomic reads to the human genome and removing
duplicates; (2) trimming sequencing adapters, mapping single cell ATAC-seq
genomic reads to the human genome, correcting barcodes, removing duplicates and
performing Tn5 shift; (3) split alignment of Hi-C reads against a reference
genome. In all these three cases, Chromap is 10-20 times faster while being
accurate.
.SH OPTIONS
.SS Indexing options
.TP 10
.BI -k \ INT
Minimizer k-mer length [17].
.TP
.BI -w \ INT
Minimizer window size [7]. A minimizer is the smallest k-mer
in a window of w consecutive k-mers.
.TP
.B --min-frag-length
Min fragment length for choosing k and w automatically [30]. Users can increase
this value when the min length of the fragments of interest is long, which can
increase the mapping speed. Note that the default value 30 is the min fragment
length that chromap can map.
.SS Mapping options
.TP 10
.BI --split-alignment
Allow split alignments. This option should be set only when mapping Hi-C reads.
.TP
.BI -e \ INT
Max edit distance allowed to map a read [8].
.TP
.BI -s \ INT
Min number of minimizers required to map a read [2].
.TP
.BI -f \ INT1 [, INT2 ]
Ignore minimizers occurring more than
.I INT1
[500] times.
.I INT2
[1000] is the threshold for a second round of seeding.
.TP
.BI -l \ INT
Max insert size, only for paired-end read mapping [1000].
.TP
.BI -q \ INT
Min MAPQ in range [0, 60] for mappings to be output [30].
.TP
.BI --min-read-length \ INT
Skip mapping the reads of length less than
.I INT
[30]. Note that this is different from the index option
.BR --min-frag-length ,
which sets
.BR -k
and
.BR -w
for indexing the genome.
.TP
.BI --trim-adapters
Try to trim adapters on 3'. This only works for paired-end reads. When the
fragment length indicated by the read pair is less than the length of the reads,
the two mates are overlapped with each other. Then the regions outside the
overlap are regarded as adapters and trimmed.
.TP
.BI --remove-pcr-duplicates
Remove PCR duplicates.
.TP
.BI --remove-pcr-duplicates-at-bulk-level
Remove PCR duplicates at bulk level for single cell data.
.TP
.BI --remove-pcr-duplicates-at-cell-level
Remove PCR duplicates at cell level for single cell data.
.TP
.BI --Tn5-shift
Perform Tn5 shift. When this option is turned on, the forward mapping start
positions are increased by 4bp and the reverse mapping end positions are
decreased by 5bp. Note that this works only when
.BR --SAM
is NOT set.
.TP
.BI --low-mem
Use low memory mode. When this option is set, multiple temporary intermediate
mapping files might be generated on disk and they are merged at the end of
processing to reduce memory usage. When this is NOT set, all the mapping results
are kept in the memory before they are saved on disk, which works more
efficiently for datasets that are not too large.
.TP
.BI --bc-error-threshold \ INT
Max Hamming distance allowed to correct a barcode [1]. Note that the max
supported threshold is 2.
.TP
.BI --bc-probability-threshold \ FLT
Min probability to correct a barcode [0.9]. When there are multiple whitelisted
barcodes with the same Hamming distance to the barcode to correct, chromap will
process the base quality of the mismatched bases, and compute a probability that
the correction is right.
.TP
.BI -t \ INT
The number of threads for mapping [1].
.SS Input options
.TP 10
.BI -r \ FILE
Reference file.
.TP
.BI -x \ FILE
Index file.
.TP
.BI -1 \ FILE
Single-end read files or paired-end read files 1. Chromap supports multiple
input files concatenated by ",". For example, setting this option to
"Library1_R1.fastq.gz,Library2_R1.fastq.gz,Library3_R1.fastq.gz" will take
all three files as input and map them in this order. Similarly,
.BR -2
and
.BR -b
also support multiple input files. And the ordering of the input files for all
the three options should match.
.TP
.BI -2 \ FILE
Paired-end read files 2.
.TP
.BI -b \ FILE
Cell barcode files.
.TP
.BI --barcode-whitelist \ FILE
Cell barcode whitelist file. This is supposed to be a txt file where each line
is a whitelisted barcode.
.TP
.BI --read-format \ STR
Format for read files and barcode files ["r1:0:-1,bc:0:-1"] as 10x Genomics
single-end format.
.SS Output options
.TP 10
.BR -o \ FILE
Output file.
.TP
.BR --output-mappings-not-in-whitelist
Output mappings with barcode not in the whitelist.
.TP
.BR --chr-order \ FILE
Custom chromosome order file. If not specified, the order of reference sequences will be used.
.TP
.BR --BED
Output mappings in BED/BEDPE format. Note that only one of the formats should be
set.
.TP
.BR --TagAlign
Output mappings in TagAlign/PairedTagAlign format.
.TP
.BR --SAM
Output mappings in SAM format.
.TP
.BR --pairs
Output mappings in pairs format (defined by 4DN for HiC data).
.TP
.BR --pairs-natural-chr-order \ FILE
Custom chromosome order file for pairs flipping. If not specified, the custom chromosome order will be used.
.TP
.BR --barcode-translate \ FILE
Convert input barcodes to another set of barcodes in the output.
.TP
.BR --summary \ FILE
Summarize the mapping statistics at bulk or barcode level.
.TP
.B -v
Print version number to stdout.
.SS Preset options
.TP 10
.BI --preset \ STR
Preset []. This option applies multiple options at the same time. It should be
applied before other options because options applied later will overwrite the
values set by
.BR --preset .
Available
.I STR
are:
.RS
.TP 10
.B chip
Mapping ChIP-seq reads
.RB ( -l
.I 2000
.B --remove-pcr-duplicates --low-mem
.BR --BED ).
.TP
.B atac
Mapping ATAC-seq/scATAC-seq reads
.RB ( -l
.I 2000
.B --remove-pcr-duplicates --low-mem --trim-adapters --Tn5-shift
.B --remove-pcr-duplicates-at-cell-level
.BR --BED ).
.TP
.B hic
Mapping Hi-C reads
.RB ( -e
.I 4
.B -q
.I 1
.B --low-mem --split-alignment
.BR --pairs ).
================================================
FILE: docs/_config.yml
================================================
theme: jekyll-theme-modernist
================================================
FILE: docs/chromap.html
================================================
chromap
chromap
NAME
SYNOPSIS
DESCRIPTION
OPTIONS
NAME
chromap - fast
alignment and preprocessing of chromatin profiles
SYNOPSIS
* Indexing the
reference genome:
chromap -i [-k
kmer] [-w miniWinSize] -r
ref.fa -o ref.index
* Mapping
(sc)ATAC-seq reads:
chromap --preset
atac -r ref.fa -x
ref.index -1 read1.fq -2
read2.fq -o aln.bed [-b
barcode.fq.gz] [--barcode-whitelist
whitelist.txt]
* Mapping
ChIP-seq reads:
chromap --preset
chip -r ref.fa -x
ref.index -1 read1.fq -2
read2.fq -o aln.bed
* Mapping Hi-C
reads:
chromap --preset
hic -r ref.fa -x
ref.index -1 read1.fq -2
read2.fq -o aln.pairs
chromap --preset hic -r ref.fa
-x ref.index -1 read1.fq
-2 read2.fq --SAM -o aln.sam
DESCRIPTION
Chromap is an
ultrafast method for aligning and preprocessing high
throughput chromatin profiles. Typical use cases include:
(1) trimming sequencing adapters, mapping bulk ATAC-seq or
ChIP-seq genomic reads to the human genome and removing
duplicates; (2) trimming sequencing adapters, mapping single
cell ATAC-seq genomic reads to the human genome, correcting
barcodes, removing duplicates and performing Tn5 shift; (3)
split alignment of Hi-C reads against a reference genome. In
all these three cases, Chromap is 10-20 times faster while
being accurate.
OPTIONS
Indexing
options
|
-k INT |
|
Minimizer k-mer length [17]. |
|
-w INT |
|
Minimizer window size [7]. A minimizer is the smallest
k-mer in a window of w consecutive k-mers. |
--min-frag-length
Min fragment length for
choosing k and w automatically [30]. Users can increase this
value when the min length of the fragments of interest is
long, which can increase the mapping speed. Note that the
default value 30 is the min fragment length that chromap can
map.
Mapping
options
--split-alignment
Allow split alignments. This
option should be set only when mapping Hi-C reads.
|
-e INT |
|
Max edit distance allowed to map a read [8]. |
|
-s INT |
|
Min number of minimizers required to map a read [2]. |
-f INT1[,INT2]
Ignore minimizers occurring more
than INT1 [500] times. INT2 [1000] is the
threshold for a second round of seeding.
|
-l INT |
|
Max insert size, only for paired-end read mapping
[1000]. |
|
-q INT |
|
Min MAPQ in range [0, 60] for mappings to be output
[30]. |
--min-read-length INT
Skip mapping the reads of
length less than INT [30]. Note that this is
different from the index option --min-frag-length,
which sets -k and -w for indexing the
genome.
--trim-adapters
Try to trim adapters on
3’. This only works for paired-end reads. When the
fragment length indicated by the read pair is less than the
length of the reads, the two mates are overlapped with each
other. Then the regions outside the overlap are regarded as
adapters and trimmed.
--remove-pcr-duplicates
Remove PCR duplicates.
--remove-pcr-duplicates-at-bulk-level
Remove PCR duplicates at bulk
level for single cell data.
--remove-pcr-duplicates-at-cell-level
Remove PCR duplicates at cell
level for single cell data.
--Tn5-shift
Perform Tn5 shift. When this
option is turned on, the forward mapping start positions are
increased by 4bp and the reverse mapping end positions are
decreased by 5bp. Note that this works only when
--SAM is NOT set.
|
--low-mem |
|
Use low memory mode. When this option is set, multiple
temporary intermediate mapping files might be generated on
disk and they are merged at the end of processing to reduce
memory usage. When this is NOT set, all the mapping results
are kept in the memory before they are saved on disk, which
works more efficiently for datasets that are not too
large. |
--bc-error-threshold INT
Max Hamming distance allowed to
correct a barcode [1]. Note that the max supported threshold
is 2.
--bc-probability-threshold FLT
Min probability to correct a
barcode [0.9]. When there are multiple whitelisted barcodes
with the same Hamming distance to the barcode to correct,
chromap will process the base quality of the mismatched
bases, and compute a probability that the correction is
right.
|
-t INT |
|
The number of threads for mapping [1]. |
|
Input
options
|
-r FILE |
|
Reference file. |
|
-x FILE |
|
Index file. |
|
-1 FILE |
|
Single-end read files or paired-end read files 1.
Chromap supports multiple input files concatenated by
",". For example, setting this option to
"read11.fq,read12.fq,read13.fq" will make all
three files as input and map them in this order. Similarly,
-2 and -b also support multiple input files.
And the ordering of the input files for all the three
options should match. |
|
-2 FILE |
|
Paired-end read files 2. |
|
-b FILE |
|
Cell barcode files. |
--barcode-whitelist FILE
Cell barcode whitelist file.
This is supposed to be a txt file where each line is a
whitelisted barcode.
--read-format STR
Format for read files and
barcode files ["r1:0:-1,bc:0:-1"] as 10x Genomics
single-end format.
Output
options
--output-mappings-not-in-whitelist
Output mappings with barcode
not in the whitelist.
--chr-order FILE
Customized chromosome order.
|
--BED |
|
Output mappings in BED/BEDPE format. Note that only one
of the formats should be set. |
--TagAlign
Output mappings in
TagAlign/PairedTagAlign format.
|
--SAM |
|
Output mappings in SAM format. |
|
--pairs |
|
Output mappings in pairs format (defined by 4DN for HiC
data). |
--pairs-natural-chr-order FILE
Natural chromosome order for
pairs flipping.
|
-v |
|
Print version number to stdout. |
|
Preset
options
--preset STR
Preset []. This option applies
multiple options at the same time. It should be applied
before other options because options applied later will
overwrite the values set by --preset. Available
STR are:
|
chip |
|
Mapping ChIP-seq reads (-l 2000
--remove-pcr-duplicates --low-mem --BED). |
|
atac |
|
Mapping ATAC-seq/scATAC-seq reads (-l 2000
--remove-pcr-duplicates --low-mem --trim-adapters
--Tn5-shift --remove-pcr-duplicates-at-cell-level
--BED). |
|
hic |
|
Mapping Hi-C reads (-e 4 -q
1 --low-mem --split-alignment --pairs). |
================================================
FILE: docs/index.md
================================================
## Getting help
* [README][doc]: general documentation
* [Manpage](chromap.html): explanation of command-line options
* [Preprint][biorxiv]: free of charge preprint that describes the method
* [GitHub Issues page][issue]: report bugs, request features and ask questions
## Acquiring Chromap
* `git clone https://github.com/haowenz/chromap.git`
* [GitHub Release page][release]: versioned packages
* Also [available from BioConda][bioconda]
[doc]: https://github.com/haowenz/chromap/blob/master/README.md
[biorxiv]: https://www.biorxiv.org/content/10.1101/2021.06.18.448995v1
[bioconda]: https://anaconda.org/bioconda/chromap
[release]: https://github.com/haowenz/chromap/releases
[issue]: https://github.com/haowenz/chromap/issues
================================================
FILE: src/alignment.cc
================================================
#include "alignment.h"
#include
namespace chromap {
// Returns the length of the longest run of consecutive positions, within the
// first read_length positions, at which pattern and text have the same base
// code (bases are compared through CharToUint8).
//
// pattern, text: sequences with at least read_length readable characters.
// read_length:   number of positions to compare; a value <= 0 yields 0.
int GetLongestMatchLength(const char *pattern, const char *text,
                          const int read_length) {
  int max_match = 0;
  int current_match = 0;
  for (int i = 0; i < read_length; ++i) {
    if (CharToUint8(pattern[i]) == CharToUint8(text[i])) {
      ++current_match;
      if (current_match > max_match) {
        max_match = current_match;
      }
    } else {
      // A mismatch ends the current run. Without this reset the counter keeps
      // growing across mismatches and the result is no longer a run length.
      current_match = 0;
    }
  }
  return max_match;
}
int AdjustGapBeginning(const Strand mapping_strand, const char *ref,
const char *read, int *gap_beginning, int read_end,
int ref_start_position, int ref_end_position,
int *n_cigar, uint32_t **cigar) {
int i, j;
if (mapping_strand == kPositive) {
if (*gap_beginning <= 0) {
return ref_start_position;
}
// printf("%d\n", *gap_beginning);
for (i = *gap_beginning - 1, j = ref_start_position - 1; i >= 0 && j >= 0;
--i, --j) {
// printf("%c %c\n", read[i], ref[j]);
if (read[i] != ref[j] && read[i] != ref[j] - 'a' + 'A') {
break;
}
}
*gap_beginning = i + 1;
// TODO: add soft clip in cigar
if (n_cigar && *n_cigar > 0) {
if (((*cigar)[0] & 0xf) == BAM_CMATCH) {
(*cigar)[0] += (ref_start_position - 1 - j) << 4;
}
}
return j + 1;
}
if (*gap_beginning <= 0) {
return ref_end_position;
}
// printf("%d\n", *gap_beginning);
/*char *tmp = new char[255] ;
strncpy(tmp, ref + ref_start_position, ref_end_position - ref_start_position
+ 1 + 10) ; printf("%s %d. %d %d\n", tmp, strlen(tmp), ref_end_position -
ref_start_position + 1 + 10, strlen(ref)) ; delete[] tmp;*/
for (i = read_end + 1, j = ref_end_position + 1; read[i] && ref[j];
++i, ++j) {
// printf("%c %c %c %c %c %c\n", read[i], ref[j - 1], ref[j], ref[j + 1],
// ref[j + 2], ref[j + 3]);
if (read[i] != ref[j] && read[i] != ref[j] - 'a' + 'A') {
break;
}
}
*gap_beginning = *gap_beginning + i - (read_end + 1);
if (n_cigar && *n_cigar > 0) {
if (((*cigar)[*n_cigar - 1] & 0xf) == BAM_CMATCH) {
(*cigar)[*n_cigar - 1] += (j - (ref_end_position + 1)) << 4;
}
}
return j - 1;
}
void GenerateNMAndMDTag(const char *pattern, const char *text,
int mapping_start_position,
MappingInMemory &mapping_in_memory) {
const char *read = text;
const char *reference = pattern + mapping_start_position;
const uint32_t *cigar = mapping_in_memory.cigar;
const int n_cigar = mapping_in_memory.n_cigar;
mapping_in_memory.NM = 0;
mapping_in_memory.MD_tag.clear();
int num_matches = 0;
int read_position = 0;
int reference_position = 0;
for (int ci = 0; ci < n_cigar; ++ci) {
uint32_t current_cigar_uint = cigar[ci];
uint8_t cigar_operation = bam_cigar_op(current_cigar_uint);
int num_cigar_operations = bam_cigar_oplen(current_cigar_uint);
if (cigar_operation == BAM_CMATCH) {
for (int opi = 0; opi < num_cigar_operations; ++opi) {
if (reference[reference_position] == read[read_position] ||
reference[reference_position] - 'a' + 'A' == read[read_position]) {
// a match
++num_matches;
} else {
// a mismatch
++mapping_in_memory.NM;
mapping_in_memory.MD_tag.append(std::to_string(num_matches));
num_matches = 0;
mapping_in_memory.MD_tag.push_back(reference[reference_position]);
}
++reference_position;
++read_position;
}
} else if (cigar_operation == BAM_CINS) {
mapping_in_memory.NM += num_cigar_operations;
read_position += num_cigar_operations;
} else if (cigar_operation == BAM_CDEL) {
mapping_in_memory.NM += num_cigar_operations;
mapping_in_memory.MD_tag.append(std::to_string(num_matches));
num_matches = 0;
mapping_in_memory.MD_tag.push_back('^');
for (int opi = 0; opi < num_cigar_operations; ++opi) {
mapping_in_memory.MD_tag.push_back(reference[reference_position]);
++reference_position;
}
} else {
std::cerr << "Unexpected cigar op: " << (int)cigar_operation << "\n";
}
}
mapping_in_memory.MD_tag.append(std::to_string(num_matches));
}
// Banded bit-parallel edit-distance alignment of a read (text) against a
// reference window (pattern).
//
// The band is 2 * error_threshold + 1 cells wide and slides one reference
// base per read base, so pattern must provide at least
// read_length + 2 * error_threshold characters.
//
// Returns the smallest number of errors found in the band after the last read
// base and stores the matching pattern index in *mapping_end_position. When
// the error count at the band start exceeds 3 * error_threshold the scan is
// abandoned: error_threshold + 1 is returned and *mapping_end_position is left
// untouched. Ties with the current minimum are resolved in favour of the band
// offset equal to error_threshold.
int BandedAlignPatternToText(int error_threshold, const char *pattern,
                             const char *text, const int read_length,
                             int *mapping_end_position) {
  const int band_extent = 2 * error_threshold;

  // Per-base match masks, indexed by the CharToUint8 code; preloaded with the
  // first band_extent reference bases.
  uint32_t match_masks[5] = {0, 0, 0, 0, 0};
  for (int k = 0; k < band_extent; ++k) {
    match_masks[CharToUint8(pattern[k])] |= (1 << k);
  }
  const uint32_t top_band_bit = 1 << band_extent;

  uint32_t vertical_pos = 0;
  uint32_t vertical_neg = 0;
  int errors_at_band_start = 0;
  for (int column = 0; column < read_length; ++column) {
    // Bring the next reference base into the top cell of the band.
    match_masks[CharToUint8(pattern[column + band_extent])] |= top_band_bit;

    const uint32_t eq = match_masks[CharToUint8(text[column])] | vertical_neg;
    const uint32_t diagonal_zero =
        ((vertical_pos + (eq & vertical_pos)) ^ vertical_pos) | eq;
    const uint32_t horizontal_neg = vertical_pos & diagonal_zero;
    const uint32_t horizontal_pos =
        vertical_neg | ~(vertical_pos | diagonal_zero);
    const uint32_t shifted_diagonal = diagonal_zero >> 1;
    vertical_neg = shifted_diagonal & horizontal_pos;
    vertical_pos = horizontal_neg | ~(shifted_diagonal | horizontal_pos);

    // A cleared lowest diagonal bit means the band start gained an error.
    if ((diagonal_zero & 1) == 0) {
      ++errors_at_band_start;
    }
    if (errors_at_band_start > 3 * error_threshold) {
      return error_threshold + 1;
    }
    for (uint32_t &mask : match_masks) {
      mask >>= 1;
    }
  }

  // Walk down the final band column and keep the best end position.
  const int band_start_position = read_length - 1;
  int best_num_errors = errors_at_band_start;
  int running_errors = errors_at_band_start;
  *mapping_end_position = band_start_position;
  for (int offset = 1; offset <= band_extent; ++offset) {
    running_errors += static_cast<int>((vertical_pos >> (offset - 1)) & 1);
    running_errors -= static_cast<int>((vertical_neg >> (offset - 1)) & 1);
    const bool strictly_better = running_errors < best_num_errors;
    const bool preferred_tie =
        running_errors == best_num_errors && offset == error_threshold;
    if (strictly_better || preferred_tie) {
      best_num_errors = running_errors;
      *mapping_end_position = band_start_position + offset;
    }
  }
  return best_num_errors;
}
// Banded bit-parallel alignment of the read (text) against a reference window
// (pattern), scanning from the first read base, that stops ("drops off") as
// soon as the error count at the band start exceeds 2 * error_threshold.
//
// Returns the smallest error count found in the band at the position where
// the scan stopped.
// *mapping_end_position is relative to pattern (reference). It is negated
// when the alignment is deemed to have terminated at the beginning of the
// read (see the final check below).
// *read_mapping_length is relative to text (read): the number of read bases
// processed before the scan stopped.
int BandedAlignPatternToTextWithDropOff(int error_threshold,
const char *pattern, const char *text,
const int read_length,
int *mapping_end_position,
int *read_mapping_length) {
// Per-base match masks, indexed by the CharToUint8 code, preloaded with the
// first 2 * error_threshold reference bases.
uint32_t Peq[5] = {0, 0, 0, 0, 0};
for (int i = 0; i < 2 * error_threshold; i++) {
uint8_t base = CharToUint8(pattern[i]);
Peq[base] = Peq[base] | (1 << i);
}
uint32_t highest_bit_in_band_mask = 1 << (2 * error_threshold);
uint32_t lowest_bit_in_band_mask = 1;
uint32_t VP = 0;
uint32_t VN = 0;
uint32_t X = 0;
uint32_t D0 = 0;
uint32_t HN = 0;
uint32_t HP = 0;
// State of the previous column, kept so the column that triggered the
// drop-off can be discarded.
uint32_t prev_VP = 0;
uint32_t prev_VN = 0;
int num_errors_at_band_start_position = 0;
int i = 0;
int fail_beginning = 0; // the alignment failed at the beginning part
int prev_num_errors_at_band_start_position = 0;
for (; i < read_length; i++) {
// Bring the next reference base into the top cell of the band.
uint8_t pattern_base = CharToUint8(pattern[i + 2 * error_threshold]);
Peq[pattern_base] = Peq[pattern_base] | highest_bit_in_band_mask;
X = Peq[CharToUint8(text[i])] | VN;
D0 = ((VP + (X & VP)) ^ VP) | X;
HN = VP & D0;
HP = VN | ~(VP | D0);
X = D0 >> 1;
prev_VN = VN;
prev_VP = VP;
VN = X & HP;
VP = HN | ~(X | HP);
prev_num_errors_at_band_start_position = num_errors_at_band_start_position;
// A cleared lowest bit of D0 adds one error at the band start.
num_errors_at_band_start_position += 1 - (D0 & lowest_bit_in_band_mask);
if (num_errors_at_band_start_position > 2 * error_threshold) {
// return error_threshold + 1;
// The minimum error in this band could still be below error_threshold,
// but stopping here should be fine since it does not affect the 5' end of
// the read.
// Stopping within the first 4 * error_threshold bases and within the first
// half of the read is flagged as a failure at the beginning.
if (i < 4 * error_threshold && i < read_length / 2) {
fail_beginning = 1;
}
break;
}
for (int ai = 0; ai < 5; ai++) {
Peq[ai] >>= 1;
}
}
/*char tmp[255] ;
strncpy(tmp, pattern, read_length + 2 * error_threshold);
printf("%s\n%s\n", tmp, text);
printf("%d\n", i) ;
fflush(stdout);*/
// The scan stopped early: roll back to the state before the offending column.
if (i < read_length) {
num_errors_at_band_start_position = prev_num_errors_at_band_start_position;
VN = prev_VN;
VP = prev_VP;
}
int band_start_position = i - 1;
int min_num_errors = num_errors_at_band_start_position;
*read_mapping_length = i;
*mapping_end_position = band_start_position;
// Walk down the band column and keep the end position with the fewest
// errors; a tie is taken only at band offset error_threshold.
for (i = 0; i < 2 * error_threshold; i++) {
num_errors_at_band_start_position =
num_errors_at_band_start_position + ((VP >> i) & (uint32_t)1);
num_errors_at_band_start_position =
num_errors_at_band_start_position - ((VN >> i) & (uint32_t)1);
if (num_errors_at_band_start_position < min_num_errors ||
(num_errors_at_band_start_position == min_num_errors &&
i + 1 == error_threshold)) {
min_num_errors = num_errors_at_band_start_position;
*mapping_end_position = band_start_position + 1 + i;
}
}
// Signal "terminated at the beginning" by negating the end position: either
// the drop-off was flagged above, or the read is longer than 60 and the end
// position, adjusted by error_threshold and the error count, is below 30.
if (fail_beginning ||
(read_length > 60 &&
*mapping_end_position + 1 - error_threshold - min_num_errors < 30)) {
*mapping_end_position = -*mapping_end_position;
}
return min_num_errors;
}
// Banded bit-parallel alignment with drop-off that scans the read (text) and
// the reference window (pattern) backwards, starting from the last read base.
// The scan stops as soon as the error count at the band start exceeds
// 2 * error_threshold.
//
// Returns the smallest error count found in the band at the position where
// the scan stopped.
// *mapping_end_position receives the band position of that minimum, counted
// in scan direction; it is negated when the alignment is deemed to have
// terminated at the beginning of the scan (see the final check below).
// *read_mapping_length receives the number of read bases processed before the
// scan stopped.
int BandedAlignPatternToTextWithDropOffFrom3End(int error_threshold,
const char *pattern,
const char *text,
const int read_length,
int *mapping_end_position,
int *read_mapping_length) {
// Per-base match masks, indexed by the CharToUint8 code, preloaded with the
// last 2 * error_threshold reference bases of the window in reverse order.
uint32_t Peq[5] = {0, 0, 0, 0, 0};
for (int i = 0; i < 2 * error_threshold; i++) {
uint8_t base =
CharToUint8(pattern[read_length + 2 * error_threshold - 1 - i]);
Peq[base] = Peq[base] | (1 << i);
}
uint32_t highest_bit_in_band_mask = 1 << (2 * error_threshold);
uint32_t lowest_bit_in_band_mask = 1;
uint32_t VP = 0;
uint32_t VN = 0;
uint32_t X = 0;
uint32_t D0 = 0;
uint32_t HN = 0;
uint32_t HP = 0;
// State of the previous column, kept so the column that triggered the
// drop-off can be discarded.
uint32_t prev_VP = 0;
uint32_t prev_VN = 0;
int num_errors_at_band_start_position = 0;
int i = 0;
int fail_beginning = 0; // the alignment failed at the beginning part
int prev_num_errors_at_band_start_position = 0;
for (; i < read_length; i++) {
// printf("%c %c %d\n", pattern[read_length - 1 - i], pattern[read_length -
// 1 - i + error_threshold], text[read_length - 1 - i]);
// Bring the next reference base (moving towards the window start) into the
// top cell of the band.
uint8_t pattern_base = CharToUint8(pattern[read_length - 1 - i]);
Peq[pattern_base] = Peq[pattern_base] | highest_bit_in_band_mask;
X = Peq[CharToUint8(text[read_length - 1 - i])] | VN;
D0 = ((VP + (X & VP)) ^ VP) | X;
HN = VP & D0;
HP = VN | ~(VP | D0);
X = D0 >> 1;
prev_VN = VN;
prev_VP = VP;
VN = X & HP;
VP = HN | ~(X | HP);
prev_num_errors_at_band_start_position = num_errors_at_band_start_position;
// A cleared lowest bit of D0 adds one error at the band start.
num_errors_at_band_start_position += 1 - (D0 & lowest_bit_in_band_mask);
/*printf("->%d %d %c %c", i, num_errors_at_band_start_position,
pattern[read_length - 1 - i], text[read_length - 1 - i]) ; int tmp =
num_errors_at_band_start_position; for (int j = 0; j < 2 * error_threshold;
j++) { tmp = tmp + ((VP >> j) & (uint32_t) 1); tmp = tmp - ((VN >> j) &
(uint32_t) 1); printf(" %d", tmp);
}
printf("\n");*/
if (num_errors_at_band_start_position > 2 * error_threshold) {
// return error_threshold + 1;
// Stopping within the first 4 * error_threshold scanned bases and within
// the first half of the read is flagged as a failure at the beginning.
if (i < 4 * error_threshold && i < read_length / 2) {
fail_beginning = 1;
}
break;
}
for (int ai = 0; ai < 5; ai++) {
Peq[ai] >>= 1;
}
}
// printf("li %d: %d %d %d\n", fail_beginning, i, error_threshold,
// read_length);
// The scan stopped early: roll back to the state before the offending column.
if (i < read_length) {
num_errors_at_band_start_position = prev_num_errors_at_band_start_position;
VN = prev_VN;
VP = prev_VP;
}
int band_start_position = i - 1;
int min_num_errors = num_errors_at_band_start_position;
*read_mapping_length = i;
*mapping_end_position = band_start_position;
// printf("-1: %d\n", num_errors_at_band_start_position);
// Walk down the band column and keep the position with the fewest errors; a
// tie is taken only at band offset error_threshold.
for (i = 0; i < 2 * error_threshold; i++) {
num_errors_at_band_start_position =
num_errors_at_band_start_position + ((VP >> i) & (uint32_t)1);
num_errors_at_band_start_position =
num_errors_at_band_start_position - ((VN >> i) & (uint32_t)1);
// printf("%d: %d\n", i, num_errors_at_band_start_position);
if (num_errors_at_band_start_position < min_num_errors ||
(num_errors_at_band_start_position == min_num_errors &&
i + 1 == error_threshold)) {
min_num_errors = num_errors_at_band_start_position;
*mapping_end_position = band_start_position + (1 + i);
}
}
// Signal "terminated at the beginning" by negating the position: either the
// drop-off was flagged above, or the read is longer than 60 and the position,
// adjusted by error_threshold and the error count, is below 30.
if (fail_beginning ||
(read_length > 60 &&
*mapping_end_position + 1 - error_threshold - min_num_errors < 30)) {
*mapping_end_position = -*mapping_end_position;
}
return min_num_errors;
}
// Banded bit-parallel alignment of one read (text) against four reference
// windows (patterns[0..3]) at once, one window per 32-bit SSE lane. This is
// the vectorised counterpart of BandedAlignPatternToText.
//
// error_threshold:        half band width; each window must provide at least
//                         read_length + 2 * error_threshold characters.
// mapping_edit_distances: output, 4 entries; receives the per-lane minimum
//                         error count. No particular alignment is required.
// mapping_end_positions:  output, 4 entries; receives the per-lane window
//                         index at which that minimum is reached.
//
// Early exit: once all four lanes have more than 3 * error_threshold errors at
// the band start, the current counts are written to mapping_edit_distances and
// the function returns without touching mapping_end_positions.
void BandedAlign4PatternsToText(int error_threshold, const char **patterns,
                                const char *text, int read_length,
                                int32_t *mapping_edit_distances,
                                int32_t *mapping_end_positions) {
  // A compile-time constant keeps Peq a standard fixed-size array; with a
  // plain int this would be a variable-length array, which is a compiler
  // extension in C++.
  constexpr int ALPHABET_SIZE = 5;
  const char *reference_sequence0 = patterns[0];
  const char *reference_sequence1 = patterns[1];
  const char *reference_sequence2 = patterns[2];
  const char *reference_sequence3 = patterns[3];
  uint32_t highest_bit_in_band_mask = 1 << (2 * error_threshold);
  // One mask per lane: the top band bit set in that lane only.
  __m128i highest_bit_in_band_mask_vpu0 =
      _mm_set_epi32(0, 0, 0, highest_bit_in_band_mask);
  __m128i highest_bit_in_band_mask_vpu1 =
      _mm_set_epi32(0, 0, highest_bit_in_band_mask, 0);
  __m128i highest_bit_in_band_mask_vpu2 =
      _mm_set_epi32(0, highest_bit_in_band_mask, 0, 0);
  __m128i highest_bit_in_band_mask_vpu3 =
      _mm_set_epi32(highest_bit_in_band_mask, 0, 0, 0);
  // Init Peq: feed the first 2 * error_threshold reference bases of every
  // window through the top band bit, shifting down after each one.
  __m128i Peq[ALPHABET_SIZE];
  for (int ai = 0; ai < ALPHABET_SIZE; ai++) {
    Peq[ai] = _mm_setzero_si128();
  }
  for (int i = 0; i < 2 * error_threshold; i++) {
    uint8_t base0 = CharToUint8(reference_sequence0[i]);
    uint8_t base1 = CharToUint8(reference_sequence1[i]);
    uint8_t base2 = CharToUint8(reference_sequence2[i]);
    uint8_t base3 = CharToUint8(reference_sequence3[i]);
    Peq[base0] = _mm_or_si128(highest_bit_in_band_mask_vpu0, Peq[base0]);
    Peq[base1] = _mm_or_si128(highest_bit_in_band_mask_vpu1, Peq[base1]);
    Peq[base2] = _mm_or_si128(highest_bit_in_band_mask_vpu2, Peq[base2]);
    Peq[base3] = _mm_or_si128(highest_bit_in_band_mask_vpu3, Peq[base3]);
    for (int ai = 0; ai < ALPHABET_SIZE; ai++) {
      Peq[ai] = _mm_srli_epi32(Peq[ai], 1);
    }
  }
  uint32_t lowest_bit_in_band_mask = 1;
  __m128i lowest_bit_in_band_mask_vpu = _mm_set1_epi32(lowest_bit_in_band_mask);
  __m128i VP = _mm_setzero_si128();
  __m128i VN = _mm_setzero_si128();
  __m128i X = _mm_setzero_si128();
  __m128i D0 = _mm_setzero_si128();
  __m128i HN = _mm_setzero_si128();
  __m128i HP = _mm_setzero_si128();
  // All-ones vector; XOR with it implements bitwise NOT.
  __m128i max_mask_vpu = _mm_set1_epi32(0xffffffff);
  __m128i num_errors_at_band_start_position_vpu = _mm_setzero_si128();
  __m128i early_stop_threshold_vpu = _mm_set1_epi32(error_threshold * 3);
  for (int i = 0; i < read_length; i++) {
    // Bring the next reference base of every window into the top band cell.
    uint8_t base0 = CharToUint8(reference_sequence0[i + 2 * error_threshold]);
    uint8_t base1 = CharToUint8(reference_sequence1[i + 2 * error_threshold]);
    uint8_t base2 = CharToUint8(reference_sequence2[i + 2 * error_threshold]);
    uint8_t base3 = CharToUint8(reference_sequence3[i + 2 * error_threshold]);
    Peq[base0] = _mm_or_si128(highest_bit_in_band_mask_vpu0, Peq[base0]);
    Peq[base1] = _mm_or_si128(highest_bit_in_band_mask_vpu1, Peq[base1]);
    Peq[base2] = _mm_or_si128(highest_bit_in_band_mask_vpu2, Peq[base2]);
    Peq[base3] = _mm_or_si128(highest_bit_in_band_mask_vpu3, Peq[base3]);
    // Same column update as the scalar version:
    //   X  = Peq[c] | VN
    //   D0 = ((VP + (X & VP)) ^ VP) | X
    //   HN = VP & D0
    //   HP = VN | ~(VP | D0)
    //   X  = D0 >> 1
    //   VN = X & HP
    //   VP = HN | ~(X | HP)
    X = _mm_or_si128(Peq[CharToUint8(text[i])], VN);
    D0 = _mm_and_si128(X, VP);
    D0 = _mm_add_epi32(D0, VP);
    D0 = _mm_xor_si128(D0, VP);
    D0 = _mm_or_si128(D0, X);
    HN = _mm_and_si128(VP, D0);
    HP = _mm_or_si128(VP, D0);
    HP = _mm_xor_si128(HP, max_mask_vpu);
    HP = _mm_or_si128(HP, VN);
    X = _mm_srli_epi32(D0, 1);
    VN = _mm_and_si128(X, HP);
    VP = _mm_or_si128(X, HP);
    VP = _mm_xor_si128(VP, max_mask_vpu);
    VP = _mm_or_si128(VP, HN);
    // E is 1 in every lane whose lowest D0 bit is clear, i.e. the band start
    // gained an error in that lane.
    __m128i E = _mm_and_si128(D0, lowest_bit_in_band_mask_vpu);
    E = _mm_xor_si128(E, lowest_bit_in_band_mask_vpu);
    num_errors_at_band_start_position_vpu =
        _mm_add_epi32(num_errors_at_band_start_position_vpu, E);
    __m128i early_stop = _mm_cmpgt_epi32(num_errors_at_band_start_position_vpu,
                                         early_stop_threshold_vpu);
    int tmp = _mm_movemask_epi8(early_stop);
    // Stop only when every lane is over the threshold.
    if (tmp == 0xffff) {
      // Unaligned store: the caller's buffer is not guaranteed to be 16-byte
      // aligned.
      _mm_storeu_si128(reinterpret_cast<__m128i *>(mapping_edit_distances),
                       num_errors_at_band_start_position_vpu);
      return;
    }
    for (int ai = 0; ai < ALPHABET_SIZE; ai++) {
      Peq[ai] = _mm_srli_epi32(Peq[ai], 1);
    }
  }
  int band_start_position = read_length - 1;
  __m128i min_num_errors_vpu = num_errors_at_band_start_position_vpu;
  // Start every lane at the band start, as the scalar version does, so a lane
  // whose minimum is already at the band start does not keep a stale value.
  for (int li = 0; li < 4; ++li) {
    mapping_end_positions[li] = band_start_position;
  }
  // Walk down the final band column, one bit per iteration.
  for (int i = 0; i < 2 * error_threshold; i++) {
    __m128i lowest_bit_in_VP_vpu =
        _mm_and_si128(VP, lowest_bit_in_band_mask_vpu);
    __m128i lowest_bit_in_VN_vpu =
        _mm_and_si128(VN, lowest_bit_in_band_mask_vpu);
    num_errors_at_band_start_position_vpu = _mm_add_epi32(
        num_errors_at_band_start_position_vpu, lowest_bit_in_VP_vpu);
    num_errors_at_band_start_position_vpu = _mm_sub_epi32(
        num_errors_at_band_start_position_vpu, lowest_bit_in_VN_vpu);
    __m128i mapping_end_positions_update_mask_vpu = _mm_cmplt_epi32(
        num_errors_at_band_start_position_vpu, min_num_errors_vpu);
    __m128i mapping_end_positions_update_mask_vpu1 = _mm_cmpeq_epi32(
        num_errors_at_band_start_position_vpu, min_num_errors_vpu);
    int mapping_end_positions_update_mask =
        _mm_movemask_epi8(mapping_end_positions_update_mask_vpu);
    int mapping_end_positions_update_mask1 =
        _mm_movemask_epi8(mapping_end_positions_update_mask_vpu1);
    // The byte mask carries 4 bits per 32-bit lane; test one and skip 4.
    for (int li = 0; li < 4; ++li) {
      if ((mapping_end_positions_update_mask & 1) == 1 ||
          ((mapping_end_positions_update_mask1 & 1) == 1 &&
           i + 1 == error_threshold)) {
        mapping_end_positions[li] = band_start_position + 1 + i;
      }
      mapping_end_positions_update_mask =
          mapping_end_positions_update_mask >> 4;
      mapping_end_positions_update_mask1 =
          mapping_end_positions_update_mask1 >> 4;
    }
    min_num_errors_vpu = _mm_min_epi32(min_num_errors_vpu,
                                       num_errors_at_band_start_position_vpu);
    VP = _mm_srli_epi32(VP, 1);
    VN = _mm_srli_epi32(VN, 1);
  }
  _mm_storeu_si128(reinterpret_cast<__m128i *>(mapping_edit_distances),
                   min_num_errors_vpu);
}
// Banded bit-parallel alignment of one read (text) against eight reference
// windows (patterns[0..7]) at once, one window per 16-bit SSE lane. This is
// the vectorised counterpart of BandedAlignPatternToText.
//
// error_threshold:        half band width; each window must provide at least
//                         read_length + 2 * error_threshold characters. The
//                         top band bit 1 << (2 * error_threshold) is stored in
//                         a 16-bit lane, so it only fits for
//                         error_threshold <= 7.
// mapping_edit_distances: output, 8 entries; receives the per-lane minimum
//                         error count. No particular alignment is required.
// mapping_end_positions:  output, 8 entries; receives the per-lane window
//                         index at which that minimum is reached.
//
// Early exit: once all eight lanes have more than 3 * error_threshold errors
// at the band start, the current counts are written to mapping_edit_distances
// and the function returns without touching mapping_end_positions.
void BandedAlign8PatternsToText(int error_threshold, const char **patterns,
                                const char *text, int read_length,
                                int16_t *mapping_edit_distances,
                                int16_t *mapping_end_positions) {
  // A compile-time constant keeps Peq a standard fixed-size array; with a
  // plain int this would be a variable-length array, which is a compiler
  // extension in C++.
  constexpr int ALPHABET_SIZE = 5;
  const char *reference_sequence0 = patterns[0];
  const char *reference_sequence1 = patterns[1];
  const char *reference_sequence2 = patterns[2];
  const char *reference_sequence3 = patterns[3];
  const char *reference_sequence4 = patterns[4];
  const char *reference_sequence5 = patterns[5];
  const char *reference_sequence6 = patterns[6];
  const char *reference_sequence7 = patterns[7];
  uint16_t highest_bit_in_band_mask = 1 << (2 * error_threshold);
  // One mask per lane: the top band bit set in that lane only.
  __m128i highest_bit_in_band_mask_vpu0 =
      _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, highest_bit_in_band_mask);
  __m128i highest_bit_in_band_mask_vpu1 =
      _mm_set_epi16(0, 0, 0, 0, 0, 0, highest_bit_in_band_mask, 0);
  __m128i highest_bit_in_band_mask_vpu2 =
      _mm_set_epi16(0, 0, 0, 0, 0, highest_bit_in_band_mask, 0, 0);
  __m128i highest_bit_in_band_mask_vpu3 =
      _mm_set_epi16(0, 0, 0, 0, highest_bit_in_band_mask, 0, 0, 0);
  __m128i highest_bit_in_band_mask_vpu4 =
      _mm_set_epi16(0, 0, 0, highest_bit_in_band_mask, 0, 0, 0, 0);
  __m128i highest_bit_in_band_mask_vpu5 =
      _mm_set_epi16(0, 0, highest_bit_in_band_mask, 0, 0, 0, 0, 0);
  __m128i highest_bit_in_band_mask_vpu6 =
      _mm_set_epi16(0, highest_bit_in_band_mask, 0, 0, 0, 0, 0, 0);
  __m128i highest_bit_in_band_mask_vpu7 =
      _mm_set_epi16(highest_bit_in_band_mask, 0, 0, 0, 0, 0, 0, 0);
  // Init Peq: feed the first 2 * error_threshold reference bases of every
  // window through the top band bit, shifting down after each one.
  __m128i Peq[ALPHABET_SIZE];
  for (int ai = 0; ai < ALPHABET_SIZE; ai++) {
    Peq[ai] = _mm_setzero_si128();
  }
  for (int i = 0; i < 2 * error_threshold; i++) {
    uint8_t base0 = CharToUint8(reference_sequence0[i]);
    uint8_t base1 = CharToUint8(reference_sequence1[i]);
    uint8_t base2 = CharToUint8(reference_sequence2[i]);
    uint8_t base3 = CharToUint8(reference_sequence3[i]);
    uint8_t base4 = CharToUint8(reference_sequence4[i]);
    uint8_t base5 = CharToUint8(reference_sequence5[i]);
    uint8_t base6 = CharToUint8(reference_sequence6[i]);
    uint8_t base7 = CharToUint8(reference_sequence7[i]);
    Peq[base0] = _mm_or_si128(highest_bit_in_band_mask_vpu0, Peq[base0]);
    Peq[base1] = _mm_or_si128(highest_bit_in_band_mask_vpu1, Peq[base1]);
    Peq[base2] = _mm_or_si128(highest_bit_in_band_mask_vpu2, Peq[base2]);
    Peq[base3] = _mm_or_si128(highest_bit_in_band_mask_vpu3, Peq[base3]);
    Peq[base4] = _mm_or_si128(highest_bit_in_band_mask_vpu4, Peq[base4]);
    Peq[base5] = _mm_or_si128(highest_bit_in_band_mask_vpu5, Peq[base5]);
    Peq[base6] = _mm_or_si128(highest_bit_in_band_mask_vpu6, Peq[base6]);
    Peq[base7] = _mm_or_si128(highest_bit_in_band_mask_vpu7, Peq[base7]);
    for (int ai = 0; ai < ALPHABET_SIZE; ai++) {
      Peq[ai] = _mm_srli_epi16(Peq[ai], 1);
    }
  }
  uint16_t lowest_bit_in_band_mask = 1;
  __m128i lowest_bit_in_band_mask_vpu = _mm_set1_epi16(lowest_bit_in_band_mask);
  __m128i VP = _mm_setzero_si128();
  __m128i VN = _mm_setzero_si128();
  __m128i X = _mm_setzero_si128();
  __m128i D0 = _mm_setzero_si128();
  __m128i HN = _mm_setzero_si128();
  __m128i HP = _mm_setzero_si128();
  // All-ones vector; XOR with it implements bitwise NOT.
  __m128i max_mask_vpu = _mm_set1_epi16(0xffff);
  __m128i num_errors_at_band_start_position_vpu = _mm_setzero_si128();
  __m128i early_stop_threshold_vpu = _mm_set1_epi16(error_threshold * 3);
  for (int i = 0; i < read_length; i++) {
    // Bring the next reference base of every window into the top band cell.
    uint8_t base0 = CharToUint8(reference_sequence0[i + 2 * error_threshold]);
    uint8_t base1 = CharToUint8(reference_sequence1[i + 2 * error_threshold]);
    uint8_t base2 = CharToUint8(reference_sequence2[i + 2 * error_threshold]);
    uint8_t base3 = CharToUint8(reference_sequence3[i + 2 * error_threshold]);
    uint8_t base4 = CharToUint8(reference_sequence4[i + 2 * error_threshold]);
    uint8_t base5 = CharToUint8(reference_sequence5[i + 2 * error_threshold]);
    uint8_t base6 = CharToUint8(reference_sequence6[i + 2 * error_threshold]);
    uint8_t base7 = CharToUint8(reference_sequence7[i + 2 * error_threshold]);
    Peq[base0] = _mm_or_si128(highest_bit_in_band_mask_vpu0, Peq[base0]);
    Peq[base1] = _mm_or_si128(highest_bit_in_band_mask_vpu1, Peq[base1]);
    Peq[base2] = _mm_or_si128(highest_bit_in_band_mask_vpu2, Peq[base2]);
    Peq[base3] = _mm_or_si128(highest_bit_in_band_mask_vpu3, Peq[base3]);
    Peq[base4] = _mm_or_si128(highest_bit_in_band_mask_vpu4, Peq[base4]);
    Peq[base5] = _mm_or_si128(highest_bit_in_band_mask_vpu5, Peq[base5]);
    Peq[base6] = _mm_or_si128(highest_bit_in_band_mask_vpu6, Peq[base6]);
    Peq[base7] = _mm_or_si128(highest_bit_in_band_mask_vpu7, Peq[base7]);
    // Same column update as the scalar version:
    //   X  = Peq[c] | VN
    //   D0 = ((VP + (X & VP)) ^ VP) | X
    //   HN = VP & D0
    //   HP = VN | ~(VP | D0)
    //   X  = D0 >> 1
    //   VN = X & HP
    //   VP = HN | ~(X | HP)
    X = _mm_or_si128(Peq[CharToUint8(text[i])], VN);
    D0 = _mm_and_si128(X, VP);
    D0 = _mm_add_epi16(D0, VP);
    D0 = _mm_xor_si128(D0, VP);
    D0 = _mm_or_si128(D0, X);
    HN = _mm_and_si128(VP, D0);
    HP = _mm_or_si128(VP, D0);
    HP = _mm_xor_si128(HP, max_mask_vpu);
    HP = _mm_or_si128(HP, VN);
    X = _mm_srli_epi16(D0, 1);
    VN = _mm_and_si128(X, HP);
    VP = _mm_or_si128(X, HP);
    VP = _mm_xor_si128(VP, max_mask_vpu);
    VP = _mm_or_si128(VP, HN);
    // E is 1 in every lane whose lowest D0 bit is clear, i.e. the band start
    // gained an error in that lane.
    __m128i E = _mm_and_si128(D0, lowest_bit_in_band_mask_vpu);
    E = _mm_xor_si128(E, lowest_bit_in_band_mask_vpu);
    num_errors_at_band_start_position_vpu =
        _mm_add_epi16(num_errors_at_band_start_position_vpu, E);
    __m128i early_stop = _mm_cmpgt_epi16(num_errors_at_band_start_position_vpu,
                                         early_stop_threshold_vpu);
    int tmp = _mm_movemask_epi8(early_stop);
    // Stop only when every lane is over the threshold.
    if (tmp == 0xffff) {
      // Unaligned store: the caller's buffer is not guaranteed to be 16-byte
      // aligned.
      _mm_storeu_si128(reinterpret_cast<__m128i *>(mapping_edit_distances),
                       num_errors_at_band_start_position_vpu);
      return;
    }
    for (int ai = 0; ai < ALPHABET_SIZE; ai++) {
      Peq[ai] = _mm_srli_epi16(Peq[ai], 1);
    }
  }
  int band_start_position = read_length - 1;
  __m128i min_num_errors_vpu = num_errors_at_band_start_position_vpu;
  // Start every lane at the band start, as the scalar version does, so a lane
  // whose minimum is already at the band start does not keep a stale value.
  for (int li = 0; li < 8; ++li) {
    mapping_end_positions[li] = band_start_position;
  }
  // Walk down the final band column, one bit per iteration.
  for (int i = 0; i < 2 * error_threshold; i++) {
    __m128i lowest_bit_in_VP_vpu =
        _mm_and_si128(VP, lowest_bit_in_band_mask_vpu);
    __m128i lowest_bit_in_VN_vpu =
        _mm_and_si128(VN, lowest_bit_in_band_mask_vpu);
    num_errors_at_band_start_position_vpu = _mm_add_epi16(
        num_errors_at_band_start_position_vpu, lowest_bit_in_VP_vpu);
    num_errors_at_band_start_position_vpu = _mm_sub_epi16(
        num_errors_at_band_start_position_vpu, lowest_bit_in_VN_vpu);
    __m128i mapping_end_positions_update_mask_vpu = _mm_cmplt_epi16(
        num_errors_at_band_start_position_vpu, min_num_errors_vpu);
    __m128i mapping_end_positions_update_mask_vpu1 = _mm_cmpeq_epi16(
        num_errors_at_band_start_position_vpu, min_num_errors_vpu);
    int mapping_end_positions_update_mask =
        _mm_movemask_epi8(mapping_end_positions_update_mask_vpu);
    int mapping_end_positions_update_mask1 =
        _mm_movemask_epi8(mapping_end_positions_update_mask_vpu1);
    // The byte mask carries 2 bits per 16-bit lane; test one and skip 2.
    for (int li = 0; li < 8; ++li) {
      if ((mapping_end_positions_update_mask & 1) == 1 ||
          ((mapping_end_positions_update_mask1 & 1) == 1 &&
           i + 1 == error_threshold)) {
        mapping_end_positions[li] = band_start_position + 1 + i;
      }
      mapping_end_positions_update_mask =
          mapping_end_positions_update_mask >> 2;
      mapping_end_positions_update_mask1 =
          mapping_end_positions_update_mask1 >> 2;
    }
    min_num_errors_vpu = _mm_min_epi16(min_num_errors_vpu,
                                       num_errors_at_band_start_position_vpu);
    VP = _mm_srli_epi16(VP, 1);
    VN = _mm_srli_epi16(VN, 1);
  }
  _mm_storeu_si128(reinterpret_cast<__m128i *>(mapping_edit_distances),
                   min_num_errors_vpu);
}
// Finds where a mapping starts inside the reference window (pattern), given
// the error count min_num_errors of the alignment being traced.
//
// Shortcut: if there are no errors, or the plain character-by-character
// mismatch count of text against pattern shifted by error_threshold equals
// min_num_errors, the start is error_threshold.
//
// Otherwise the banded bit-parallel scan is run backwards from the last read
// base, and the final band column is searched for cells whose error count
// equals min_num_errors. *mapping_start_position is set to
// 2 * error_threshold - offset for the last such band offset (1-based) seen;
// the search stops immediately at offset == error_threshold. If no cell
// matches, the result stays at 2 * error_threshold.
void BandedTraceback(int error_threshold, int min_num_errors,
                     const char *pattern, const char *text,
                     const int read_length, int *mapping_start_position) {
  // First check whether the alignment can be explained without gaps.
  bool gap_free = (min_num_errors == 0);
  if (!gap_free) {
    int mismatch_count = 0;
    for (int k = 0; k < read_length; ++k) {
      if (pattern[k + error_threshold] != text[k]) {
        ++mismatch_count;
      }
    }
    gap_free = (mismatch_count == min_num_errors);
  }
  if (gap_free) {
    *mapping_start_position = error_threshold;
    return;
  }

  // There are gaps, so trace back with the banded edit distance, scanning
  // both sequences from their ends.
  const int band_extent = 2 * error_threshold;
  uint32_t match_masks[5] = {0, 0, 0, 0, 0};
  for (int k = 0; k < band_extent; ++k) {
    match_masks[CharToUint8(pattern[read_length - 1 + band_extent - k])] |=
        (1 << k);
  }
  const uint32_t top_band_bit = 1 << band_extent;

  uint32_t vertical_pos = 0;
  uint32_t vertical_neg = 0;
  int running_errors = 0;
  for (int column = 0; column < read_length; ++column) {
    const int position = read_length - 1 - column;
    match_masks[CharToUint8(pattern[position])] |= top_band_bit;

    const uint32_t eq = match_masks[CharToUint8(text[position])] | vertical_neg;
    const uint32_t diagonal_zero =
        ((vertical_pos + (eq & vertical_pos)) ^ vertical_pos) | eq;
    const uint32_t horizontal_neg = vertical_pos & diagonal_zero;
    const uint32_t horizontal_pos =
        vertical_neg | ~(vertical_pos | diagonal_zero);
    const uint32_t shifted_diagonal = diagonal_zero >> 1;
    vertical_neg = shifted_diagonal & horizontal_pos;
    vertical_pos = horizontal_neg | ~(shifted_diagonal | horizontal_pos);

    if ((diagonal_zero & 1) == 0) {
      ++running_errors;
    }
    for (uint32_t &mask : match_masks) {
      mask >>= 1;
    }
  }

  *mapping_start_position = band_extent;
  for (int offset = 1; offset <= band_extent; ++offset) {
    running_errors += static_cast<int>((vertical_pos >> (offset - 1)) & 1);
    running_errors -= static_cast<int>((vertical_neg >> (offset - 1)) & 1);
    if (running_errors != min_num_errors) {
      continue;
    }
    *mapping_start_position = band_extent - offset;
    if (offset == error_threshold) {
      return;
    }
  }
}
// Finds where a mapping ends inside the reference window (pattern), given the
// error count min_num_errors of the alignment being traced.
//
// Shortcut: if there are no errors, or the plain character-by-character
// mismatch count of text against pattern shifted by error_threshold equals
// min_num_errors, the end is read_length + error_threshold.
//
// Otherwise the banded bit-parallel scan is run forwards from the first read
// base, and the final band column is searched for cells whose error count
// equals min_num_errors. *mapping_end_position is set to read_length + offset
// for the last such band offset (1-based) seen; the search stops immediately
// at offset == error_threshold. If no cell matches, the result stays at
// read_length + 1.
void BandedTracebackToEnd(int error_threshold, int min_num_errors,
                          const char *pattern, const char *text,
                          const int read_length, int *mapping_end_position) {
  // First check whether the alignment can be explained without gaps.
  bool gap_free = (min_num_errors == 0);
  if (!gap_free) {
    int mismatch_count = 0;
    for (int k = 0; k < read_length; ++k) {
      if (pattern[k + error_threshold] != text[k]) {
        ++mismatch_count;
      }
    }
    gap_free = (mismatch_count == min_num_errors);
  }
  if (gap_free) {
    *mapping_end_position = read_length + error_threshold;
    return;
  }

  // There are gaps, so trace with the banded edit distance from the start.
  const int band_extent = 2 * error_threshold;
  uint32_t match_masks[5] = {0, 0, 0, 0, 0};
  for (int k = 0; k < band_extent; ++k) {
    match_masks[CharToUint8(pattern[k])] |= (1 << k);
  }
  const uint32_t top_band_bit = 1 << band_extent;

  uint32_t vertical_pos = 0;
  uint32_t vertical_neg = 0;
  int running_errors = 0;
  for (int column = 0; column < read_length; ++column) {
    match_masks[CharToUint8(pattern[column + band_extent])] |= top_band_bit;

    const uint32_t eq = match_masks[CharToUint8(text[column])] | vertical_neg;
    const uint32_t diagonal_zero =
        ((vertical_pos + (eq & vertical_pos)) ^ vertical_pos) | eq;
    const uint32_t horizontal_neg = vertical_pos & diagonal_zero;
    const uint32_t horizontal_pos =
        vertical_neg | ~(vertical_pos | diagonal_zero);
    const uint32_t shifted_diagonal = diagonal_zero >> 1;
    vertical_neg = shifted_diagonal & horizontal_pos;
    vertical_pos = horizontal_neg | ~(shifted_diagonal | horizontal_pos);

    if ((diagonal_zero & 1) == 0) {
      ++running_errors;
    }
    for (uint32_t &mask : match_masks) {
      mask >>= 1;
    }
  }

  const int band_start_position = read_length;
  *mapping_end_position = band_start_position + 1;
  for (int offset = 1; offset <= band_extent; ++offset) {
    running_errors += static_cast<int>((vertical_pos >> (offset - 1)) & 1);
    running_errors -= static_cast<int>((vertical_neg >> (offset - 1)) & 1);
    if (running_errors != min_num_errors) {
      continue;
    }
    *mapping_end_position = band_start_position + offset;
    if (offset == error_threshold) {
      return;
    }
  }
}
} // namespace chromap
================================================
FILE: src/alignment.h
================================================
#ifndef ALIGNMENT_H_
#define ALIGNMENT_H_
#include "mapping_in_memory.h"
#include "sam_mapping.h"
#include "sequence_batch.h"
#include "utils.h"
namespace chromap {
// Alignment helpers. Throughout this header `pattern` refers to the reference
// and `text` to the read (see the comments on the individual functions).
// NOTE(review): only BandedTracebackToEnd was checked against its
// implementation when these notes were written; verify the other descriptions
// against alignment.cc.

// Presumably returns the length of the longest run of matching characters
// between pattern and text within read_length -- TODO confirm.
int GetLongestMatchLength(const char *pattern, const char *text,
const int read_length);
// Return newly adjusted reference start/end position for kPositive/kNegative
// mappings.
int AdjustGapBeginning(const Strand mapping_strand, const char *ref,
const char *read, int *gap_beginning, int read_end,
int ref_start_position, int ref_end_position,
int *n_cigar, uint32_t **cigar);
// Reference (pattern) mapping start position and cigar must be computed before
// calling this function. Read (text) must be already at the start position.
void GenerateNMAndMDTag(const char *pattern, const char *text,
int mapping_start_position,
MappingInMemory &mapping_in_memory);
// Banded alignment of one pattern against the text. The end position is
// written to *mapping_end_position; the return value is presumably the number
// of errors -- TODO confirm.
int BandedAlignPatternToText(int error_threshold, const char *pattern,
const char *text, const int read_length,
int *mapping_end_position);
// Return negative number if the termination is deemed to be at the beginning
// of the read. mapping_end_position is relative to pattern (reference);
// read_mapping_length is for text (read).
int BandedAlignPatternToTextWithDropOff(int error_threshold,
const char *pattern, const char *text,
const int read_length,
int *mapping_end_position,
int *read_mapping_length);
// Same parameter list as BandedAlignPatternToTextWithDropOff; judging by the
// name it works from the 3' end of the read -- TODO confirm.
int BandedAlignPatternToTextWithDropOffFrom3End(
int error_threshold, const char *pattern, const char *text,
const int read_length, int *mapping_end_position, int *read_mapping_length);
// Multi-pattern variants taking 4 patterns (32-bit outputs) or 8 patterns
// (16-bit outputs). The output arrays presumably hold one entry per pattern --
// TODO confirm.
void BandedAlign4PatternsToText(int error_threshold, const char **patterns,
const char *text, int read_length,
int32_t *mapping_edit_distances,
int32_t *mapping_end_positions);
void BandedAlign8PatternsToText(int error_threshold, const char **patterns,
const char *text, int read_length,
int16_t *mapping_edit_distances,
int16_t *mapping_end_positions);
// Given the error count of an alignment (min_num_errors), recovers the
// position on the pattern where that alignment starts.
void BandedTraceback(int error_threshold, int min_num_errors,
const char *pattern, const char *text,
const int read_length, int *mapping_start_position);
// Counterpart of BandedTraceback that recovers the position on the pattern
// where the alignment ends. The pattern is read up to index
// read_length - 1 + 2 * error_threshold.
void BandedTracebackToEnd(int error_threshold, int min_num_errors,
const char *pattern, const char *text,
const int read_length, int *mapping_end_position);
} // namespace chromap
#endif // ALIGNMENT_H_
================================================
FILE: src/barcode_translator.h
================================================
#ifndef BARCODETRANSLATOR_H_
#define BARCODETRANSLATOR_H_
#include
#include
#include
#include
#include
#include
#include
#include
#include "khash.h"
#include "utils.h"
namespace chromap {
// Instantiates the khash table type `k64_str`: a map (the `1` selects map
// rather than set mode) from a uint64_t key to a char * value, using khash's
// 64-bit integer hash and equality functions.
KHASH_INIT(k64_str, uint64_t, char *, 1, kh_int64_hash_func,
kh_int64_hash_equal);
// The class for handling barcode convertion.
class BarcodeTranslator {
public:
BarcodeTranslator() {
barcode_translate_table_ = NULL;
from_bc_length_ = -1;
}
~BarcodeTranslator() {
if (barcode_translate_table_ != NULL) {
khiter_t k;
for (k = kh_begin(barcode_translate_table_);
k != kh_end(barcode_translate_table_); ++k) {
if (kh_exist(barcode_translate_table_, k))
free(kh_value(barcode_translate_table_, k));
}
kh_destroy(k64_str, barcode_translate_table_);
}
}
void SetTranslateTable(const std::string &file) {
barcode_translate_table_ = kh_init(k64_str);
if (1) {
gzFile barcode_translate_file = gzopen(file.c_str(), "r");
const uint32_t line_buffer_size = 512;
char file_line[line_buffer_size];
while (gzgets(barcode_translate_file, file_line, line_buffer_size) != NULL) {
int line_len = strlen(file_line);
if (file_line[line_len - 1] == '\n') {
file_line[line_len - 1] = '\0';
}
std::string tmp_string(file_line);
ProcessTranslateFileLine(tmp_string);
}
} else {
// Old implementation, which does not support gzipped input.
std::ifstream file_stream(file);
std::string file_line;
while (getline(file_stream, file_line)) {
ProcessTranslateFileLine(file_line);
}
}
mask_ = (1ull << (2 * from_bc_length_)) - 1;
/*for (int i = 0; i < from_bc_length_; ++i)
{
mask_ |= (3ull << (2*i));
}*/
}
std::string Translate(uint64_t bc, uint32_t bc_length) {
if (barcode_translate_table_ == NULL) {
return Seed2Sequence(bc, bc_length);
}
std::string ret;
uint64_t i;
for (i = 0; i < bc_length / from_bc_length_; ++i) {
uint64_t seed = (bc << (2 * i * from_bc_length_)) >>
(2 * (bc_length / from_bc_length_ - 1) * from_bc_length_);
seed &= mask_;
khiter_t barcode_translate_table_iter =
kh_get(k64_str, barcode_translate_table_, seed);
if (barcode_translate_table_iter == kh_end(barcode_translate_table_)) {
std::cerr << "Barcode does not exist in the translation table."
<< std::endl;
exit(-1);
}
std::string bc_to(
kh_value(barcode_translate_table_, barcode_translate_table_iter));
if (i == 0) {
ret = bc_to;
} else {
ret += "-" + bc_to;
}
}
return ret;
}
private:
khash_t(k64_str) * barcode_translate_table_;
int from_bc_length_;
uint64_t mask_;
std::string Seed2Sequence(uint64_t seed, uint32_t seed_length) const {
std::string sequence;
sequence.reserve(seed_length);
uint64_t mask_ = 3;
for (uint32_t i = 0; i < seed_length; ++i) {
sequence.push_back(
Uint8ToChar((seed >> ((seed_length - 1 - i) * 2)) & mask_));
}
return sequence;
}
void ProcessTranslateFileLine(std::string &line) {
int i;
int len = line.length();
std::string to;
for (i = 0; i < len; ++i) {
if (line[i] == ',' || line[i] == '\t') break;
}
to = line.substr(0, i);
// from = line.substr(i + 1, len - i - 1);
from_bc_length_ = len - i - 1;
uint64_t from_seed =
GenerateSeedFromSequence(line.c_str(), len, i + 1, from_bc_length_);
int khash_return_code;
khiter_t barcode_translate_table_iter = kh_put(
k64_str, barcode_translate_table_, from_seed, &khash_return_code);
kh_value(barcode_translate_table_, barcode_translate_table_iter) =
strdup(to.c_str());
}
};
} // namespace chromap
#endif
================================================
FILE: src/bed_mapping.h
================================================
#ifndef BEDMAPPING_H_
#define BEDMAPPING_H_
#include
#include "mapping.h"
namespace chromap {
// Single-end fragment record that carries a cell barcode.
class MappingWithBarcode : public Mapping {
 public:
  uint32_t read_id_;
  uint64_t cell_barcode_;
  uint32_t fragment_start_position_;
  uint16_t fragment_length_;
  // Packed flags: 6-bit mapping quality, strand bit and uniqueness bit.
  uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1;
  uint8_t num_dups_;

  // Only the duplicate counter is initialised; the other fields are left
  // unset.
  MappingWithBarcode() : num_dups_(0) {}

  MappingWithBarcode(uint32_t read_id, uint64_t cell_barcode,
                     uint32_t fragment_start_position,
                     uint16_t fragment_length, uint8_t mapq, uint8_t direction,
                     uint8_t is_unique, uint8_t num_dups)
      : read_id_(read_id),
        cell_barcode_(cell_barcode),
        fragment_start_position_(fragment_start_position),
        fragment_length_(fragment_length),
        mapq_(mapq),
        direction_(direction),
        is_unique_(is_unique),
        num_dups_(num_dups) {}

  // Lexicographic order on (start, length, barcode, mapq, direction,
  // uniqueness, read id).
  bool operator<(const MappingWithBarcode &m) const {
    if (fragment_start_position_ != m.fragment_start_position_) {
      return fragment_start_position_ < m.fragment_start_position_;
    }
    if (fragment_length_ != m.fragment_length_) {
      return fragment_length_ < m.fragment_length_;
    }
    if (cell_barcode_ != m.cell_barcode_) {
      return cell_barcode_ < m.cell_barcode_;
    }
    if (mapq_ != m.mapq_) {
      return mapq_ < m.mapq_;
    }
    if (direction_ != m.direction_) {
      return direction_ < m.direction_;
    }
    if (is_unique_ != m.is_unique_) {
      return is_unique_ < m.is_unique_;
    }
    return read_id_ < m.read_id_;
  }

  // Two records are equal when barcode and start position agree.
  bool operator==(const MappingWithBarcode &m) const {
    return cell_barcode_ == m.cell_barcode_ &&
           fragment_start_position_ == m.fragment_start_position_;
  }

  bool IsSamePosition(const MappingWithBarcode &m) const {
    return fragment_start_position_ == m.fragment_start_position_;
  }

  uint64_t GetBarcode() const { return cell_barcode_; }

  // Tn5 adjustment: direction 1 moves the start forward by 4, otherwise the
  // length shrinks by 5.
  void Tn5Shift() {
    if (direction_ == 1) {
      fragment_start_position_ += 4;
      return;
    }
    fragment_length_ -= 5;
  }

  bool IsPositiveStrand() const { return direction_ > 0; }

  // Inclusive start of the fragment.
  uint32_t GetStartPosition() const { return fragment_start_position_; }

  // Exclusive end of the fragment.
  uint32_t GetEndPosition() const {
    return fragment_start_position_ + fragment_length_;
  }
};
// Single-end fragment record for bulk (barcode-free) data.
class MappingWithoutBarcode : public Mapping {
 public:
  uint32_t read_id_;
  uint32_t fragment_start_position_;
  uint16_t fragment_length_;
  // Packed flags: 6-bit mapping quality, strand bit and uniqueness bit.
  uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1;
  uint16_t num_dups_;  // Need higher limit in bulk setting

  // Only the duplicate counter is initialised; the other fields are left
  // unset.
  MappingWithoutBarcode() : num_dups_(0) {}

  // `num_dups` is taken as uint16_t so that it matches num_dups_; the former
  // uint8_t parameter silently truncated counts above 255 and defeated the
  // wider member.
  MappingWithoutBarcode(uint32_t read_id, uint32_t fragment_start_position,
                        uint16_t fragment_length, uint16_t mapq,
                        uint8_t direction, uint8_t is_unique,
                        uint16_t num_dups)
      : read_id_(read_id),
        fragment_start_position_(fragment_start_position),
        fragment_length_(fragment_length),
        mapq_(mapq),
        direction_(direction),
        is_unique_(is_unique),
        num_dups_(num_dups) {}

  // Lexicographic order on (start, length, mapq, direction, uniqueness,
  // read id).
  bool operator<(const MappingWithoutBarcode &m) const {
    return std::tie(fragment_start_position_, fragment_length_, mapq_,
                    direction_, is_unique_, read_id_) <
           std::tie(m.fragment_start_position_, m.fragment_length_, m.mapq_,
                    m.direction_, m.is_unique_, m.read_id_);
  }

  // Two records are equal when their start positions agree.
  bool operator==(const MappingWithoutBarcode &m) const {
    return fragment_start_position_ == m.fragment_start_position_;
  }

  bool IsSamePosition(const MappingWithoutBarcode &m) const {
    return fragment_start_position_ == m.fragment_start_position_;
  }

  // There is no barcode in bulk data; 0 is returned.
  uint64_t GetBarcode() const { return 0; }

  // Tn5 adjustment: direction 1 moves the start forward by 4, otherwise the
  // length shrinks by 5.
  void Tn5Shift() {
    if (direction_ == 1) {
      fragment_start_position_ += 4;
    } else {
      fragment_length_ -= 5;
    }
  }

  bool IsPositiveStrand() const { return direction_ > 0; }

  // Inclusive start of the fragment.
  uint32_t GetStartPosition() const { return fragment_start_position_; }

  // Exclusive end of the fragment.
  uint32_t GetEndPosition() const {
    return fragment_start_position_ + fragment_length_;
  }
};
// Paired-end fragment record that carries a cell barcode.
class PairedEndMappingWithBarcode : public Mapping {
 public:
  uint32_t read_id_;
  uint64_t cell_barcode_;
  uint32_t fragment_start_position_;
  uint16_t fragment_length_;
  // Packed flags: 6-bit mapping quality, strand bit and uniqueness bit.
  uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1;
  uint8_t num_dups_;
  uint16_t positive_alignment_length_;
  uint16_t negative_alignment_length_;

  // Only the duplicate counter is initialised; the other fields are left
  // unset.
  PairedEndMappingWithBarcode() : num_dups_(0) {}

  PairedEndMappingWithBarcode(uint32_t read_id, uint64_t cell_barcode,
                              uint32_t fragment_start_position,
                              uint16_t fragment_length, uint8_t mapq,
                              uint8_t direction, uint8_t is_unique,
                              uint8_t num_dups,
                              uint16_t positive_alignment_length,
                              uint16_t negative_alignment_length)
      : read_id_(read_id),
        cell_barcode_(cell_barcode),
        fragment_start_position_(fragment_start_position),
        fragment_length_(fragment_length),
        mapq_(mapq),
        direction_(direction),
        is_unique_(is_unique),
        num_dups_(num_dups),
        positive_alignment_length_(positive_alignment_length),
        negative_alignment_length_(negative_alignment_length) {}

  // Lexicographic order on (start, length, barcode, mapq, direction,
  // uniqueness, read id, positive length, negative length).
  bool operator<(const PairedEndMappingWithBarcode &m) const {
    if (fragment_start_position_ != m.fragment_start_position_) {
      return fragment_start_position_ < m.fragment_start_position_;
    }
    if (fragment_length_ != m.fragment_length_) {
      return fragment_length_ < m.fragment_length_;
    }
    if (cell_barcode_ != m.cell_barcode_) {
      return cell_barcode_ < m.cell_barcode_;
    }
    if (mapq_ != m.mapq_) {
      return mapq_ < m.mapq_;
    }
    if (direction_ != m.direction_) {
      return direction_ < m.direction_;
    }
    if (is_unique_ != m.is_unique_) {
      return is_unique_ < m.is_unique_;
    }
    if (read_id_ != m.read_id_) {
      return read_id_ < m.read_id_;
    }
    if (positive_alignment_length_ != m.positive_alignment_length_) {
      return positive_alignment_length_ < m.positive_alignment_length_;
    }
    return negative_alignment_length_ < m.negative_alignment_length_;
  }

  // Two records are equal when barcode, start and length all agree.
  bool operator==(const PairedEndMappingWithBarcode &m) const {
    return cell_barcode_ == m.cell_barcode_ &&
           fragment_start_position_ == m.fragment_start_position_ &&
           fragment_length_ == m.fragment_length_;
  }

  bool IsSamePosition(const PairedEndMappingWithBarcode &m) const {
    return fragment_start_position_ == m.fragment_start_position_ &&
           fragment_length_ == m.fragment_length_;
  }

  uint64_t GetBarcode() const { return cell_barcode_; }

  // Tn5 adjustment: the start moves forward by 4 and the fragment shrinks by
  // 9 in total (4 on the positive side, 5 on the negative side).
  void Tn5Shift() {
    fragment_start_position_ += 4;
    fragment_length_ -= 9;
    positive_alignment_length_ -= 4;
    negative_alignment_length_ -= 5;
  }

  bool IsPositiveStrand() const { return direction_ > 0; }

  // Inclusive start of the fragment.
  uint32_t GetStartPosition() const { return fragment_start_position_; }

  // Exclusive end of the fragment.
  uint32_t GetEndPosition() const {
    return fragment_start_position_ + fragment_length_;
  }
};
// Paired-end fragment record for bulk (barcode-free) data.
class PairedEndMappingWithoutBarcode : public Mapping {
 public:
  uint32_t read_id_;
  uint32_t fragment_start_position_;
  uint16_t fragment_length_;
  // Packed flags: 6-bit mapping quality, strand bit and uniqueness bit.
  uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1;
  uint8_t num_dups_;
  uint16_t positive_alignment_length_;
  uint16_t negative_alignment_length_;

  // Only the duplicate counter is initialised; the other fields are left
  // unset.
  PairedEndMappingWithoutBarcode() : num_dups_(0) {}

  // NOTE(review): num_dups arrives as uint16_t but is stored in the 8-bit
  // num_dups_, so values above 255 are truncated here -- confirm callers
  // clamp the count.
  PairedEndMappingWithoutBarcode(uint32_t read_id,
                                 uint32_t fragment_start_position,
                                 uint16_t fragment_length, uint8_t mapq,
                                 uint8_t direction, uint8_t is_unique,
                                 uint16_t num_dups,
                                 uint16_t positive_alignment_length,
                                 uint16_t negative_alignment_length)
      : read_id_(read_id),
        fragment_start_position_(fragment_start_position),
        fragment_length_(fragment_length),
        mapq_(mapq),
        direction_(direction),
        is_unique_(is_unique),
        num_dups_(num_dups),
        positive_alignment_length_(positive_alignment_length),
        negative_alignment_length_(negative_alignment_length) {}

  // Lexicographic order on (start, length, mapq, direction, uniqueness,
  // read id, positive length, negative length).
  bool operator<(const PairedEndMappingWithoutBarcode &m) const {
    if (fragment_start_position_ != m.fragment_start_position_) {
      return fragment_start_position_ < m.fragment_start_position_;
    }
    if (fragment_length_ != m.fragment_length_) {
      return fragment_length_ < m.fragment_length_;
    }
    if (mapq_ != m.mapq_) {
      return mapq_ < m.mapq_;
    }
    if (direction_ != m.direction_) {
      return direction_ < m.direction_;
    }
    if (is_unique_ != m.is_unique_) {
      return is_unique_ < m.is_unique_;
    }
    if (read_id_ != m.read_id_) {
      return read_id_ < m.read_id_;
    }
    if (positive_alignment_length_ != m.positive_alignment_length_) {
      return positive_alignment_length_ < m.positive_alignment_length_;
    }
    return negative_alignment_length_ < m.negative_alignment_length_;
  }

  // Two records are equal when start and length agree.
  bool operator==(const PairedEndMappingWithoutBarcode &m) const {
    return fragment_start_position_ == m.fragment_start_position_ &&
           fragment_length_ == m.fragment_length_;
  }

  bool IsSamePosition(const PairedEndMappingWithoutBarcode &m) const {
    return fragment_start_position_ == m.fragment_start_position_ &&
           fragment_length_ == m.fragment_length_;
  }

  // There is no barcode in bulk data; 0 is returned.
  uint64_t GetBarcode() const { return 0; }

  // Tn5 adjustment: the start moves forward by 4 and the fragment shrinks by
  // 9 in total (4 on the positive side, 5 on the negative side).
  void Tn5Shift() {
    fragment_start_position_ += 4;
    fragment_length_ -= 9;
    positive_alignment_length_ -= 4;
    negative_alignment_length_ -= 5;
  }

  bool IsPositiveStrand() const { return direction_ > 0; }

  // Inclusive start of the fragment.
  uint32_t GetStartPosition() const { return fragment_start_position_; }

  // Exclusive end of the fragment.
  uint32_t GetEndPosition() const {
    return fragment_start_position_ + fragment_length_;
  }
};
} // namespace chromap
#endif // BEDMAPPING_H_
================================================
FILE: src/candidate.h
================================================
#ifndef CANDIDATE_H_
#define CANDIDATE_H_
#include
namespace chromap {
// A candidate mapping location together with its seed support.
struct Candidate {
  // The high 32 bits save the reference sequence index in the reference
  // sequence batch. The low 32 bits save the reference position on that
  // sequence.
  uint64_t position = 0;

  // The number of minimizers supports the position.
  uint8_t count = 0;

  // Index of the reference sequence (high 32 bits of `position`).
  inline uint32_t GetReferenceSequenceIndex() const {
    return static_cast<uint32_t>(position >> 32);
  }

  // Position on the reference sequence (low 32 bits of `position`).
  inline uint32_t GetReferenceSequencePosition() const {
    return static_cast<uint32_t>(position);
  }

  // Const-qualified like the other accessors so that it can be called through
  // const references.
  inline uint8_t GetCount() const { return count; }

  // Orders candidates by descending support first, then by ascending
  // position.
  inline bool operator<(const Candidate &c) const {
    if (count != c.count) {
      return count > c.count;
    }
    return position < c.position;
  }
};
} // namespace chromap
#endif // CANDIDATE_H_
================================================
FILE: src/candidate_position_generating_config.h
================================================
#ifndef CANDIDATE_POSITION_GENERATING_CONFIG_H_
#define CANDIDATE_POSITION_GENERATING_CONFIG_H_
#include
namespace chromap {
// Immutable bundle of the thresholds used while generating candidate
// positions. It answers whether a seed counts as frequent or as repetitive and
// whether sorted candidate lists should be merged with a heap.
class CandidatePositionGeneratingConfig {
 public:
  // Every threshold has to be supplied explicitly.
  CandidatePositionGeneratingConfig() = delete;

  CandidatePositionGeneratingConfig(uint32_t max_seed_frequency,
                                    uint32_t repetitive_seed_frequency,
                                    bool use_heap_merge)
      : max_seed_frequency_(max_seed_frequency),
        repetitive_seed_frequency_(repetitive_seed_frequency),
        use_heap_merge_(use_heap_merge) {}

  ~CandidatePositionGeneratingConfig() = default;

  // A seed is frequent once its frequency reaches max_seed_frequency_.
  bool IsFrequentSeed(uint32_t seed_frequency) const {
    return !(seed_frequency < max_seed_frequency_);
  }

  // A seed is repetitive once its frequency reaches
  // repetitive_seed_frequency_.
  bool IsRepetitiveSeed(uint32_t seed_frequency) const {
    return !(seed_frequency < repetitive_seed_frequency_);
  }

  bool UseHeapMerge() const { return use_heap_merge_; }

  uint32_t GetMaxSeedFrequency() const { return max_seed_frequency_; }

 private:
  // Seeds are used only while their frequency stays below this threshold.
  const uint32_t max_seed_frequency_;

  // Seeds whose frequency is at least this threshold are treated as
  // repetitive.
  const uint32_t repetitive_seed_frequency_;

  // Merge sorted candidate lists with a heap; meant for the case where the
  // number of candidate positions is very large.
  const bool use_heap_merge_;
};
} // namespace chromap
#endif // CANDIDATE_POSITION_GENERATING_CONFIG_H_
================================================
FILE: src/candidate_processor.cc
================================================
#include "candidate_processor.h"
#include
#include
#include
#include
#include
#include
namespace chromap {
void CandidateProcessor::GenerateCandidates(
int error_threshold, const Index &index,
MappingMetadata &mapping_metadata) const {
const std::vector &minimizers = mapping_metadata.minimizers_;
std::vector &positive_hits = mapping_metadata.positive_hits_;
std::vector &negative_hits = mapping_metadata.negative_hits_;
std::vector &positive_candidates =
mapping_metadata.positive_candidates_;
std::vector &negative_candidates =
mapping_metadata.negative_candidates_;
uint32_t &repetitive_seed_length = mapping_metadata.repetitive_seed_length_;
const CandidatePositionGeneratingConfig first_round_generating_config(
/*max_seed_frequency=*/max_seed_frequencies_[0],
/*repetitive_seed_frequency=*/max_seed_frequencies_[0],
/*use_heap_merge=*/false);
repetitive_seed_length = 0;
int repetitive_seed_count = index.GenerateCandidatePositions(
first_round_generating_config, mapping_metadata);
bool use_high_frequency_minimizers = false;
if (positive_hits.size() + negative_hits.size() == 0) {
positive_hits.clear();
negative_hits.clear();
repetitive_seed_length = 0;
const CandidatePositionGeneratingConfig second_round_generating_config(
/*max_seed_frequency=*/max_seed_frequencies_[1],
/*repetitive_seed_frequency=*/max_seed_frequencies_[0],
/*use_heap_merge=*/true);
repetitive_seed_count = index.GenerateCandidatePositions(
second_round_generating_config, mapping_metadata);
use_high_frequency_minimizers = true;
if (positive_hits.size() == 0 || negative_hits.size() == 0) {
use_high_frequency_minimizers = false;
}
}
int num_required_seeds = minimizers.size() - repetitive_seed_count;
num_required_seeds = num_required_seeds > 1 ? num_required_seeds : 1;
num_required_seeds = num_required_seeds > min_num_seeds_required_for_mapping_
? min_num_seeds_required_for_mapping_
: num_required_seeds;
if (use_high_frequency_minimizers) {
num_required_seeds = min_num_seeds_required_for_mapping_;
}
// std::cerr << "Normal positive gen on one dir\n";
GenerateCandidatesOnOneStrand(error_threshold, num_required_seeds,
minimizers.size(), positive_hits,
positive_candidates);
// std::cerr << "Normal negative gen on one dir\n";
GenerateCandidatesOnOneStrand(error_threshold, num_required_seeds,
minimizers.size(), negative_hits,
negative_candidates);
// fprintf(stderr, "p+n: %d\n", positive_candidates->size() +
// negative_candidates->size()) ;
}
// Return 0 if it supplements normally. Return 1 if the supplement could be too
// aggressive, and MAPQ needs setting to 0.
//
// For each mate of a read pair whose existing candidates are all weakly
// supported, extra candidates are generated on the strand opposite to the
// other mate's candidates, using `search_range` around them, and are then
// merged into that mate's candidate lists.
int CandidateProcessor::SupplementCandidates(
int error_threshold, uint32_t search_range, const Index &index,
PairedEndMappingMetadata &paired_end_mapping_metadata) const {
// Supplementary candidates collected per mate (1/2) and per strand; they are
// merged into the regular candidate lists after both mates were processed.
std::vector augment_positive_candidates1;
std::vector augment_positive_candidates2;
std::vector augment_negative_candidates1;
std::vector augment_negative_candidates2;
int ret = 0;
for (int mate = 0; mate <= 1; ++mate) {
// Views onto the current mate's data, the other mate's candidates and the
// output lists; bound below depending on `mate`.
std::vector *minimizers;
std::vector *positive_hits;
std::vector *negative_hits;
std::vector *positive_candidates;
std::vector *negative_candidates;
std::vector *mate_positive_candidates;
std::vector *mate_negative_candidates;
std::vector *augment_positive_candidates;
std::vector *augment_negative_candidates;
uint32_t *repetitive_seed_length;
if (mate == 0) {
minimizers = &paired_end_mapping_metadata.mapping_metadata1_.minimizers_;
positive_hits =
&paired_end_mapping_metadata.mapping_metadata1_.positive_hits_;
negative_hits =
&paired_end_mapping_metadata.mapping_metadata1_.negative_hits_;
positive_candidates =
&paired_end_mapping_metadata.mapping_metadata1_.positive_candidates_;
negative_candidates =
&paired_end_mapping_metadata.mapping_metadata1_.negative_candidates_;
mate_positive_candidates =
&paired_end_mapping_metadata.mapping_metadata2_.positive_candidates_;
mate_negative_candidates =
&paired_end_mapping_metadata.mapping_metadata2_.negative_candidates_;
augment_positive_candidates = &augment_positive_candidates1;
augment_negative_candidates = &augment_negative_candidates1;
repetitive_seed_length = &paired_end_mapping_metadata.mapping_metadata1_
.repetitive_seed_length_;
} else {
minimizers = &paired_end_mapping_metadata.mapping_metadata2_.minimizers_;
positive_hits =
&paired_end_mapping_metadata.mapping_metadata2_.positive_hits_;
negative_hits =
&paired_end_mapping_metadata.mapping_metadata2_.negative_hits_;
positive_candidates =
&paired_end_mapping_metadata.mapping_metadata2_.positive_candidates_;
negative_candidates =
&paired_end_mapping_metadata.mapping_metadata2_.negative_candidates_;
mate_positive_candidates =
&paired_end_mapping_metadata.mapping_metadata1_.positive_candidates_;
mate_negative_candidates =
&paired_end_mapping_metadata.mapping_metadata1_.negative_candidates_;
augment_positive_candidates = &augment_positive_candidates2;
augment_negative_candidates = &augment_negative_candidates2;
repetitive_seed_length = &paired_end_mapping_metadata.mapping_metadata2_
.repetitive_seed_length_;
}
// Augment only when no existing candidate on either strand is supported by
// at least half of the minimizers.
uint32_t mm_count = minimizers->size();
bool augment_flag = true;
uint32_t candidate_num = positive_candidates->size();
for (uint32_t i = 0; i < candidate_num; ++i) {
if ((*positive_candidates)[i].count >= mm_count / 2) {
augment_flag = false;
break;
}
}
candidate_num = negative_candidates->size();
if (augment_flag) {
for (uint32_t i = 0; i < candidate_num; ++i) {
if ((*negative_candidates)[i].count >= mm_count / 2) {
augment_flag = false;
break;
}
}
}
if (augment_flag) {
// The hit buffers are reused as scratch space for the rescue.
positive_hits->clear();
negative_hits->clear();
positive_hits->reserve(max_seed_frequencies_[0]);
negative_hits->reserve(max_seed_frequencies_[0]);
int positive_rescue_result = 0;
int negative_rescue_result = 0;
// The other mate's positive-strand candidates drive the search on this
// mate's negative strand, and vice versa. The "positive"/"negative" in the
// result names refers to the strand of the other mate's candidates.
if (mate_positive_candidates->size() > 0) {
positive_rescue_result =
GenerateCandidatesFromRepetitiveReadWithMateInfoOnOneStrand(
kNegative, search_range, error_threshold, index, *minimizers,
*mate_positive_candidates, *repetitive_seed_length,
*negative_hits, *augment_negative_candidates);
}
if (mate_negative_candidates->size() > 0) {
negative_rescue_result =
GenerateCandidatesFromRepetitiveReadWithMateInfoOnOneStrand(
kPositive, search_range, error_threshold, index, *minimizers,
*mate_negative_candidates, *repetitive_seed_length,
*positive_hits, *augment_positive_candidates);
}
// If one of the strands did not supplement due to too many best candidates,
// and the filtered strand has better best candidates,
// and there is no candidate directly from minimizers,
// then the supplement is flagged as too aggressive. Note that the
// supplementary candidates are kept (the clear() calls are commented out);
// only the return value changes.
if (((positive_rescue_result < 0 && negative_rescue_result > 0 &&
-positive_rescue_result >= negative_rescue_result) ||
(positive_rescue_result > 0 && negative_rescue_result < 0 &&
positive_rescue_result <= -negative_rescue_result)) &&
positive_candidates->size() + negative_candidates->size() == 0) {
// augment_positive_candidates->clear();
// augment_negative_candidates->clear();
ret = 1;
}
}
}
// Fold the supplementary candidates into the regular lists, using each
// metadata's *_candidates_buffer_ as scratch space for the merge.
if (augment_positive_candidates1.size() > 0) {
MergeCandidates(
error_threshold,
paired_end_mapping_metadata.mapping_metadata1_.positive_candidates_,
augment_positive_candidates1,
paired_end_mapping_metadata.mapping_metadata1_
.positive_candidates_buffer_);
}
if (augment_negative_candidates1.size() > 0) {
MergeCandidates(
error_threshold,
paired_end_mapping_metadata.mapping_metadata1_.negative_candidates_,
augment_negative_candidates1,
paired_end_mapping_metadata.mapping_metadata1_
.negative_candidates_buffer_);
}
if (augment_positive_candidates2.size() > 0) {
MergeCandidates(
error_threshold,
paired_end_mapping_metadata.mapping_metadata2_.positive_candidates_,
augment_positive_candidates2,
paired_end_mapping_metadata.mapping_metadata2_
.positive_candidates_buffer_);
}
if (augment_negative_candidates2.size() > 0) {
MergeCandidates(
error_threshold,
paired_end_mapping_metadata.mapping_metadata2_.negative_candidates_,
augment_negative_candidates2,
paired_end_mapping_metadata.mapping_metadata2_
.negative_candidates_buffer_);
}
return ret;
}
// Filters the candidates of both mates by pairing them across opposite
// strands. The unfiltered candidates are read from each metadata's
// *_candidates_buffer_ and the survivors are appended to the regular
// candidate lists.
void CandidateProcessor::ReduceCandidatesForPairedEndRead(
    uint32_t mapping_positions_distance,
    PairedEndMappingMetadata &paired_end_mapping_metadata) const {
  auto &metadata1 = paired_end_mapping_metadata.mapping_metadata1_;
  auto &metadata2 = paired_end_mapping_metadata.mapping_metadata2_;

  // Mate 1 on the positive strand paired with mate 2 on the negative strand.
  ReduceCandidatesForPairedEndReadOnOneDirection(
      mapping_positions_distance, metadata1.positive_candidates_buffer_,
      metadata2.negative_candidates_buffer_, metadata1.positive_candidates_,
      metadata2.negative_candidates_);

  // Mate 1 on the negative strand paired with mate 2 on the positive strand.
  ReduceCandidatesForPairedEndReadOnOneDirection(
      mapping_positions_distance, metadata1.negative_candidates_buffer_,
      metadata2.positive_candidates_buffer_, metadata1.negative_candidates_,
      metadata2.positive_candidates_);
}
// Rescues candidates for a read on one strand with the help of its mate's
// candidates: the index first collects hits near the mate candidates into
// `hits`, then every cluster of hits becomes a candidate (a single seed is
// enough). Returns the value reported by the index lookup unchanged.
int CandidateProcessor::
    GenerateCandidatesFromRepetitiveReadWithMateInfoOnOneStrand(
        const Strand strand, uint32_t search_range, int error_threshold,
        const Index &index, const std::vector &minimizers,
        const std::vector &mate_candidates,
        uint32_t &repetitive_seed_length, std::vector &hits,
        std::vector &candidates) const {
  // Step 1: let the index fill `hits`.
  const int index_result =
      index.GenerateCandidatePositionsFromRepetitiveReadWithMateInfoOnOneStrand(
          strand, search_range, min_num_seeds_required_for_mapping_,
          max_seed_frequencies_[0], error_threshold, minimizers,
          mate_candidates, repetitive_seed_length, hits);

  // Step 2: turn the hits into candidates.
  GenerateCandidatesOnOneStrand(error_threshold, /*num_seeds_required=*/1,
                                minimizers.size(), hits, candidates);

  return index_result;
}
// Groups the hits into clusters of nearby reference positions and appends one
// candidate per cluster that contains at least `num_seeds_required` hits.
// Each hit stores the reference id in its high 32 bits and the reference
// position in its low 32 bits; the clustering compares each hit with its
// predecessor, so the input is presumably sorted -- TODO confirm with callers.
// A UINT64_MAX sentinel is appended to `hits` so that the final cluster is
// flushed by the loop; the sentinel is left in `hits` on return.
void CandidateProcessor::GenerateCandidatesOnOneStrand(
    int error_threshold, int num_seeds_required, uint32_t num_minimizers,
    std::vector &hits, std::vector &candidates) const {
  hits.emplace_back(UINT64_MAX);

  // State of the cluster that is currently being built.
  int cluster_size = 1;
  // Length of the current run of hits at exactly the same position.
  int run_length = 1;
  // Longest such run in the cluster, and the position it belongs to.
  int best_run_length = 1;
  uint64_t best_hit = hits[0];
  uint64_t last_hit = hits[0];

  for (uint32_t pi = 1; pi < hits.size(); ++pi) {
    const uint64_t current_hit = hits[pi];
    const uint32_t current_id = current_hit >> 32;
    const uint32_t current_position = current_hit;
    const uint32_t last_id = last_hit >> 32;
    const uint32_t last_position = last_hit;
#ifdef LI_DEBUG
    printf("%s: %d %d\n", __func__, current_id, current_position);
#endif
    // A cluster ends when the reference changes, when the gap to the previous
    // hit exceeds the error threshold, or when the cluster already holds as
    // many hits as there are minimizers and the hit lies too far beyond the
    // best position.
    const bool cluster_is_full = (uint32_t)cluster_size >= num_minimizers;
    const bool closes_cluster =
        current_id != last_id ||
        current_position > last_position + error_threshold ||
        (cluster_is_full &&
         current_position > (uint32_t)best_hit + error_threshold);

    if (closes_cluster) {
      if (cluster_size >= num_seeds_required) {
        Candidate candidate;
        candidate.position = best_hit;
        candidate.count = best_run_length;
        candidates.push_back(candidate);
      }
      // Start a new cluster at the current hit.
      cluster_size = 1;
      run_length = 1;
      best_run_length = 1;
      best_hit = current_hit;
    } else {
      if (current_hit == best_hit) {
        ++run_length;
        ++best_run_length;
      } else if (current_hit == last_hit) {
        ++run_length;
        if (run_length > best_run_length) {
          best_hit = last_hit;
          best_run_length = run_length;
        }
      } else {
        run_length = 1;
      }
      ++cluster_size;
    }
    last_hit = current_hit;
  }
}
// Merges the position-ordered candidate lists c1 and c2 into `buffer` and
// swaps the result into c1. A candidate is dropped when it lies within
// `error_threshold` of the candidate kept just before it; when both lists
// hold the same position, the entry with the larger count wins (c2 on ties).
// If c1 is empty, c1 and c2 are swapped and nothing is filtered.
void CandidateProcessor::MergeCandidates(int error_threshold,
                                         std::vector &c1,
                                         std::vector &c2,
                                         std::vector &buffer) const {
  if (c1.size() == 0) {
    c1.swap(c2);
    return;
  }

  const uint32_t size1 = c1.size();
  const uint32_t size2 = c2.size();
  uint32_t i = 0;
  uint32_t j = 0;
  buffer.clear();
#ifdef LI_DEBUG
  for (i = 0; i < size1; ++i)
    printf("c1: %d %d %d\n", (int)(c1[i].position >> 32), (int)c1[i].position,
           c1[i].count);
  for (i = 0; i < size2; ++i)
    printf("c2: %d %d %d\n", (int)(c2[i].position >> 32), (int)c2[i].position,
           c2[i].count);
  i = 0;
#endif

  // Keeps `candidate` unless it is too close to the last kept candidate.
  auto keep_if_distant = [&buffer, error_threshold](const Candidate &candidate) {
    if (buffer.empty() ||
        candidate.position > buffer.back().position + error_threshold) {
      buffer.push_back(candidate);
    }
  };

  // Standard two-way merge by position.
  while (i < size1 && j < size2) {
    if (c1[i].position == c2[j].position) {
      keep_if_distant(c1[i].count > c2[j].count ? c1[i] : c2[j]);
      ++i;
      ++j;
    } else if (c1[i].position < c2[j].position) {
      keep_if_distant(c1[i]);
      ++i;
    } else {
      keep_if_distant(c2[j]);
      ++j;
    }
  }
  // Drain whichever list still has entries.
  for (; i < size1; ++i) {
    keep_if_distant(c1[i]);
  }
  for (; j < size2; ++j) {
    keep_if_distant(c2[j]);
  }

  c1.swap(buffer);
}
// Sweeps two position-ordered candidate lists in parallel and keeps the
// candidates that have a partner in the other list within
// `mapping_positions_distance`. A small number of unpaired candidates is kept
// as well when they sit on the same reference as the candidate they are
// compared with and their count reaches the best count seen so far among
// paired candidates of their list (starting from 6).
void CandidateProcessor::ReduceCandidatesForPairedEndReadOnOneDirection(
    uint32_t mapping_positions_distance,
    const std::vector &candidates1,
    const std::vector &candidates2,
    std::vector &filtered_candidates1,
    std::vector &filtered_candidates2) const {
  // At most this many unpaired candidates are kept per list.
  const int max_unpaired_kept = 5;
  int unpaired_kept1 = 0;
  int unpaired_kept2 = 0;
  int best_count1 = 6;
  int best_count2 = 6;
  uint32_t idx1 = 0;
  uint32_t idx2 = 0;
  // Entries of candidates2 before this index were already emitted as partners.
  uint32_t emitted_until2 = idx2;
#ifdef LI_DEBUG
  for (uint32_t i = 0; i < candidates1.size(); ++i)
    printf("%s 0: %d %d:%d\n", __func__, i,
           (int)(candidates1[i].position >> 32), (int)candidates1[i].position);
  for (uint32_t i = 0; i < candidates2.size(); ++i)
    printf("%s 1: %d %d:%d\n", __func__, i,
           (int)(candidates2[i].position >> 32), (int)candidates2[i].position);
#endif
  while (idx1 < candidates1.size() && idx2 < candidates2.size()) {
    const auto &first = candidates1[idx1];
    const auto &second = candidates2[idx2];
    const bool on_same_reference =
        (first.position >> 32) == (second.position >> 32);

    if (first.position > second.position + mapping_positions_distance) {
      // `second` lies too far before `first`: it has no partner here.
      if (idx2 >= emitted_until2 && unpaired_kept2 < max_unpaired_kept &&
          on_same_reference && second.count >= best_count2) {
        filtered_candidates2.emplace_back(second);
        ++unpaired_kept2;
      }
      ++idx2;
      continue;
    }

    if (second.position > first.position + mapping_positions_distance) {
      // `first` lies too far before `second`: it has no partner here.
      if (unpaired_kept1 < max_unpaired_kept && on_same_reference &&
          first.count >= best_count1) {
        filtered_candidates1.emplace_back(first);
        ++unpaired_kept1;
      }
      ++idx1;
      continue;
    }

    // Found a pair. Keep `first`, then scan candidates2 forward from idx2 and
    // keep every not-yet-emitted entry that is still within range of `first`.
    // idx2 itself stays put so that the next entry of candidates1 can pair
    // with the same entries.
    filtered_candidates1.emplace_back(first);
    if (first.count > best_count1) {
      best_count1 = first.count;
    }
    uint32_t scan2 = idx2;
    for (; scan2 < candidates2.size() &&
           candidates2[scan2].position <=
               first.position + mapping_positions_distance;
         ++scan2) {
      if (scan2 < emitted_until2) {
        continue;
      }
      filtered_candidates2.emplace_back(candidates2[scan2]);
      if (candidates2[scan2].count > best_count2) {
        best_count2 = candidates2[scan2].count;
      }
    }
    emitted_until2 = scan2;
    ++idx1;
  }
}
} // namespace chromap
================================================
FILE: src/candidate_processor.h
================================================
#ifndef CANDIDATE_PROCESSOR_H_
#define CANDIDATE_PROCESSOR_H_
#include
#include
#include
#include
#include
#include
#include "candidate.h"
#include "index.h"
#include "mapping_metadata.h"
#include "paired_end_mapping_metadata.h"
#include "sequence_batch.h"
#include "utils.h"
namespace chromap {
// Generates, supplements and filters mapping candidates for reads. The object
// only holds two construction-time settings, and every member function
// declared here is const.
class CandidateProcessor {
public:
// Both settings are required, so default construction is disabled.
CandidateProcessor() = delete;
// Stores the minimum number of seeds required for a mapping and the seed
// frequency thresholds (see the note on max_seed_frequencies_ below).
CandidateProcessor(int min_num_seeds_required_for_mapping,
const std::vector max_seed_frequencies)
: min_num_seeds_required_for_mapping_(min_num_seeds_required_for_mapping),
max_seed_frequencies_(max_seed_frequencies) {}
~CandidateProcessor() = default;
// Generates candidates for a single read; inputs and outputs travel through
// `mapping_metadata`.
void GenerateCandidates(int error_threshold, const Index &index,
MappingMetadata &mapping_metadata) const;
// Adds candidates for a read pair within `search_range`.
// NOTE(review): the meaning of the int return value is not visible from this
// header - confirm against the definition.
int SupplementCandidates(
int error_threshold, uint32_t search_range, const Index &index,
PairedEndMappingMetadata &paired_end_mapping_metadata) const;
// Filters the candidates of both ends of a read pair using
// `mapping_positions_distance` as the maximum distance between paired
// candidates.
void ReduceCandidatesForPairedEndRead(
uint32_t mapping_positions_distance,
PairedEndMappingMetadata &paired_end_mapping_metadata) const;
private:
// Turns the hits of one strand into candidates for that strand.
void GenerateCandidatesOnOneStrand(int error_threshold,
int num_seeds_required,
uint32_t num_minimizers,
std::vector &hits,
std::vector &candidates) const;
// Generates candidates on one strand for a read, using the candidates of its
// mate (`mate_candidates`) and `search_range`.
// NOTE(review): return value semantics are not visible here - confirm.
int GenerateCandidatesFromRepetitiveReadWithMateInfoOnOneStrand(
const Strand strand, uint32_t search_range, int error_threshold,
const Index &index, const std::vector &minimizers,
const std::vector &mate_candidates,
uint32_t &repetitive_seed_length, std::vector &hits,
std::vector &candidates) const;
// Merges `c1` and `c2` by position, skipping candidates that fall within
// `error_threshold` of the last one kept. `buffer` is scratch space; the
// merged result is swapped into `c1`.
void MergeCandidates(int error_threshold, std::vector &c1,
std::vector &c2,
std::vector &buffer) const;
// Filters `candidates1` / `candidates2` into the `filtered_*` vectors: paired
// candidates (positions within `mapping_positions_distance`) are kept, plus a
// limited number of unpaired candidates with a high count.
void ReduceCandidatesForPairedEndReadOnOneDirection(
uint32_t mapping_positions_distance,
const std::vector &candidates1,
const std::vector &candidates2,
std::vector &filtered_candidates1,
std::vector &filtered_candidates2) const;
const int min_num_seeds_required_for_mapping_;
// Vector of size 2. The first element is the frequency threshold, and the
// second element is the frequency threshold to run rescue. The second element
// should always be larger than the first one.
// TODO(Haowen): add an error check.
const std::vector max_seed_frequencies_;
};
} // namespace chromap
#endif // CANDIDATE_PROCESSOR_H_
================================================
FILE: src/chromap.cc
================================================
#include "chromap.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
namespace chromap {
void Chromap::ConstructIndex() {
// TODO(Haowen): Need a faster algorithm
// Load all sequences in the reference into one batch
SequenceBatch reference;
reference.InitializeLoading(index_parameters_.reference_file_path);
reference.LoadAllSequences();
const uint32_t num_sequences = reference.GetNumSequences();
Index index(index_parameters_);
index.Construct(num_sequences, reference);
index.Statistics(num_sequences, reference);
index.Save();
reference.FinalizeLoading();
}
// Loads up to read_batch_size_ single-end reads into `read_batch` and, unless
// mapping_parameters_.is_bulk_data is set, the matching barcodes into
// `barcode_batch`.
//
// Returns the number of reads loaded (0 when the input is exhausted). Exits
// via ExitWithMessage when the read and barcode inputs do not yield the same
// number of records.
//
// `parallel_parsing` selects the OpenMP task based path below; bulk data always
// takes the sequential path because it has no barcode input to overlap with.
uint32_t Chromap::LoadSingleEndReadsWithBarcodes(SequenceBatch &read_batch,
SequenceBatch &barcode_batch,
bool parallel_parsing) {
//double real_start_time = GetRealTime();
uint32_t num_loaded_reads = 0;
if (!parallel_parsing || mapping_parameters_.is_bulk_data) {
// Sequential path: load one read and its barcode at a time.
while (num_loaded_reads < read_batch_size_) {
// LoadOneSequenceAndSaveAt returns true when there is nothing left to load.
bool no_more_read = read_batch.LoadOneSequenceAndSaveAt(num_loaded_reads);
// For bulk data there is no barcode input, so mirror the read status.
bool no_more_barcode = no_more_read;
if (!mapping_parameters_.is_bulk_data) {
no_more_barcode =
barcode_batch.LoadOneSequenceAndSaveAt(num_loaded_reads);
}
if (no_more_read && no_more_barcode) {
break;
} else if (no_more_read || no_more_barcode){
// Exactly one of the two inputs ran out first.
ExitWithMessage("Numbers of reads and barcodes don't match!");
}
++num_loaded_reads;
}
} else {
// Parallel path: reads and barcodes are parsed by two separate OpenMP tasks,
// each writing its own shared counter; the taskwait below joins them.
// NOTE(review): presumably called from inside an OpenMP parallel region so
// the tasks can actually run concurrently - confirm at the call site.
uint32_t num_loaded_barcode = 0 ;
#pragma omp task shared(num_loaded_reads, read_batch)
{
uint32_t i = 0 ;
for (i = 0 ; i < read_batch_size_; ++i) {
if (read_batch.LoadOneSequenceAndSaveAt(i) == true) { // true: no more read
break ;
}
}
num_loaded_reads = i ;
}
#pragma omp task shared(num_loaded_barcode, barcode_batch)
{ // bulk data will go to the other big branch
uint32_t i = 0 ;
for (i = 0 ; i < read_batch_size_; ++i) {
if (barcode_batch.LoadOneSequenceAndSaveAt(i) == true) { // true: no more read
break ;
}
}
num_loaded_barcode = i ;
}
#pragma omp taskwait
// Both counters are final here; they must agree.
if (num_loaded_reads != num_loaded_barcode) {
ExitWithMessage("Numbers of reads and barcodes don't match!");
}
}
/*if (num_loaded_reads > 0) {
std::cerr << "Loaded " << num_loaded_reads << " reads in "
<< GetRealTime() - real_start_time << "s.\n";
} else {
std::cerr << "No more reads.\n";
}*/
return num_loaded_reads;
}
// Loads up to read_batch_size_ read pairs into `read_batch1` / `read_batch2`
// and, unless mapping_parameters_.is_bulk_data is set, the matching barcodes
// into `barcode_batch`.
//
// Returns the number of pairs loaded (0 when the input is exhausted). Exits
// via ExitWithMessage when the inputs do not yield the same number of records.
//
// With `parallel_parsing` each input is parsed in its own OpenMP task;
// otherwise the records are loaded one pair at a time.
uint32_t Chromap::LoadPairedEndReadsWithBarcodes(SequenceBatch &read_batch1,
SequenceBatch &read_batch2,
SequenceBatch &barcode_batch,
bool parallel_parsing) {
// double real_start_time = Chromap<>::GetRealTime();
uint32_t num_loaded_pairs = 0;
if (!parallel_parsing) {
// Sequential path.
while (num_loaded_pairs < read_batch_size_) {
// LoadOneSequenceAndSaveAt returns true when there is nothing left to load.
bool no_more_read1 = read_batch1.LoadOneSequenceAndSaveAt(num_loaded_pairs);
bool no_more_read2 = read_batch2.LoadOneSequenceAndSaveAt(num_loaded_pairs);
// For bulk data there is no barcode input, so mirror the status of read 2.
bool no_more_barcode = no_more_read2;
if (!mapping_parameters_.is_bulk_data) {
no_more_barcode =
barcode_batch.LoadOneSequenceAndSaveAt(num_loaded_pairs);
}
if (no_more_read1 && no_more_read2 && no_more_barcode) {
break;
} else if (no_more_read1 || no_more_read2 || no_more_barcode){
// At least one input ran out before the others.
ExitWithMessage("Numbers of reads and barcodes don't match!");
}
++num_loaded_pairs;
}
} else {
// Parallel path: one OpenMP task per input, each writing its own shared
// counter; the taskwait below joins them.
// NOTE(review): presumably called from inside an OpenMP parallel region so
// the tasks can actually run concurrently - confirm at the call site.
uint32_t num_loaded_read1 = 0;
uint32_t num_loaded_read2 = 0;
uint32_t num_loaded_barcode = 0;
#pragma omp task shared(num_loaded_read1, read_batch1)
{
uint32_t i = 0 ;
for (i = 0 ; i < read_batch_size_; ++i) {
if (read_batch1.LoadOneSequenceAndSaveAt(i) == true) { // true: no more read
break ;
}
}
num_loaded_read1 = i ;
}
#pragma omp task shared(num_loaded_read2, read_batch2)
{
uint32_t i = 0 ;
for (i = 0 ; i < read_batch_size_; ++i) {
if (read_batch2.LoadOneSequenceAndSaveAt(i) == true) { // true: no more read
break ;
}
}
num_loaded_read2 = i ;
}
#pragma omp task shared(num_loaded_barcode, barcode_batch)
{
// The barcode task is a no-op for bulk data.
if (!mapping_parameters_.is_bulk_data) {
uint32_t i = 0 ;
for (i = 0 ; i < read_batch_size_; ++i) {
if (barcode_batch.LoadOneSequenceAndSaveAt(i) == true) { // true: no more read
break ;
}
}
num_loaded_barcode = i ;
}
}
#pragma omp taskwait
// Bulk data loaded no barcodes; reuse the read 2 count so that the
// consistency check below only compares the two read inputs.
if (mapping_parameters_.is_bulk_data) {
num_loaded_barcode = num_loaded_read2;
}
if (num_loaded_read1 != num_loaded_read2 || num_loaded_read2 != num_loaded_barcode) {
ExitWithMessage("Numbers of reads and barcodes don't match!");
}
num_loaded_pairs = num_loaded_read1 ;
}
// if (num_loaded_pairs > 0) {
// std::cerr << "Loaded " << num_loaded_pairs << " pairs in "<<
// Chromap<>::GetRealTime() - real_start_time << "s. ";
//} else {
// std::cerr << "No more reads.\n";
//}
return num_loaded_pairs;
}
// Detects an overlap between the two reads of pair `pair_index` and, when one
// is found, trims both reads with TrimSequenceAt.
//
// The shorter read is searched for inside the "negative" sequence of the
// longer read (presumably its reverse complement - confirm in SequenceBatch)
// with a seed-and-extend scheme: a seed of min_read_length / 2 bases taken
// from the shorter read is located with std::string::find, then the bases
// before and after the seed are compared, tolerating at most
// `error_threshold_for_merging` mismatches. The first acceptable overlap wins.
//
// Nothing is changed when no overlap is found.
void Chromap::TrimAdapterForPairedEndRead(uint32_t pair_index,
SequenceBatch &read_batch1,
SequenceBatch &read_batch2) {
const uint32_t raw_read1_length = read_batch1.GetSequenceLengthAt(pair_index);
const uint32_t raw_read2_length = read_batch2.GetSequenceLengthAt(pair_index);
const char *raw_read1 = read_batch1.GetSequenceAt(pair_index);
const char *raw_read2 = read_batch2.GetSequenceAt(pair_index);
const std::string &raw_negative_read1 =
read_batch1.GetNegativeSequenceAt(pair_index);
const std::string &raw_negative_read2 =
read_batch2.GetNegativeSequenceAt(pair_index);
// In the actual adaptor trimming, we assume length(read1)<=length(read2). So
// we can have the case that read1 is a subset of read2.
// From here on "read1" is the shorter raw read and "read2" the longer one;
// the two are swapped when raw read 1 is the longer of the pair.
const char *read1 =
raw_read1_length <= raw_read2_length ? raw_read1 : raw_read2;
const std::string &negative_read2 = raw_read1_length <= raw_read2_length
? raw_negative_read2
: raw_negative_read1;
const uint32_t read1_length = raw_read1_length <= raw_read2_length
? raw_read1_length
: raw_read2_length;
const uint32_t read2_length = raw_read1_length <= raw_read2_length
? raw_read2_length
: raw_read1_length;
const int min_overlap_length = mapping_parameters_.min_read_length;
const int seed_length = min_overlap_length / 2;
const int error_threshold_for_merging = 1;
bool is_merged = false;
// Try error_threshold_for_merging + 1 consecutive seeds from the start of
// read1: with at most that many mismatches, at least one seed matches exactly.
// NOTE(review): find() reads seed_length characters starting at
// read1 + si * seed_length, and nothing here checks that read1 has at least
// (si + 1) * seed_length bases - confirm shorter reads cannot reach this code.
for (int si = 0; si < error_threshold_for_merging + 1; ++si) {
size_t seed_start_position =
negative_read2.find(read1 + si * seed_length, 0, seed_length);
while (seed_start_position != std::string::npos) {
// The seed starts si * seed_length bases into read1, so that many bases
// must exist before the hit in negative_read2.
const bool before_seed_is_enough_long =
seed_start_position >= (size_t)(si * seed_length);
// Implied overlap: from the aligned start of read1 to the end of read2.
const bool overlap_is_enough_long =
(int)(read2_length - seed_start_position + seed_length * si) >=
min_overlap_length;
if (!before_seed_is_enough_long || !overlap_is_enough_long) {
// Not usable; move on to the next occurrence of the seed.
seed_start_position = negative_read2.find(
read1 + si * seed_length, seed_start_position + 1, seed_length);
continue;
}
bool can_merge = true;
int num_errors = 0;
// The bases before the seed.
for (int i = 0; i < seed_length * si; ++i) {
if (negative_read2[seed_start_position - si * seed_length + i] !=
read1[i]) {
++num_errors;
}
if (num_errors > error_threshold_for_merging) {
can_merge = false;
break;
}
}
// The bases after the seed.
// The mismatch count carries over from the loop above; this loop stops at
// the end of whichever read ends first.
for (uint32_t i = seed_length; i + seed_start_position < read2_length &&
si * seed_length + i < read1_length;
++i) {
if (negative_read2[seed_start_position + i] !=
read1[si * seed_length + i]) {
++num_errors;
}
if (num_errors > error_threshold_for_merging) {
can_merge = false;
break;
}
}
if (can_merge) {
// Trim adapters and TODO: fix sequencing errors
int overlap_length =
read2_length - seed_start_position + si * seed_length;
int read2_offset = 0;
// The case that read1 is strictly contained in read2. overlap_length is
// inferred from the longer read2, which could be longer than read1. In
// that case, we don't trim read1 (make overlap length equal to read1
// length) and trim read2 as the original plan.
if (overlap_length > (int)read1_length) {
read2_offset = overlap_length - read1_length;
overlap_length = read1_length;
}
// Map the shorter/longer roles back onto the original batches.
if (raw_read1_length <= raw_read2_length) {
read_batch1.TrimSequenceAt(pair_index, overlap_length);
read_batch2.TrimSequenceAt(pair_index, overlap_length + read2_offset);
} else {
read_batch1.TrimSequenceAt(pair_index, overlap_length + read2_offset);
read_batch2.TrimSequenceAt(pair_index, overlap_length);
}
is_merged = true;
// std::cerr << "Trimmed! overlap length: " << overlap_length << ", " <<
// read1.GetLength() << " " << read2.GetLength() << "\n";
break;
}
seed_start_position = negative_read2.find(
read1 + si * seed_length, seed_start_position + 1, seed_length);
}
// Stop trying further seeds once the pair has been trimmed.
if (is_merged) {
break;
}
}
}
bool Chromap::PairedEndReadWithBarcodeIsDuplicate(
uint32_t pair_index, const SequenceBatch &barcode_batch,
const SequenceBatch &read_batch1, const SequenceBatch &read_batch2) {
int dedupe_seed_length = 16;
uint32_t barcode_length = barcode_batch.GetSequenceLengthAt(pair_index);
uint64_t barcode_key =
barcode_batch.GenerateSeedFromSequenceAt(pair_index, 0, barcode_length);
uint64_t read1_seed1 =
read_batch1.GenerateSeedFromSequenceAt(pair_index, 0, dedupe_seed_length);
uint64_t read2_seed1 =
read_batch2.GenerateSeedFromSequenceAt(pair_index, 0, dedupe_seed_length);
uint64_t read_seed_key =
(read1_seed1 << (dedupe_seed_length * 2)) | read2_seed1;
uint64_t read1_seed2 = read_batch1.GenerateSeedFromSequenceAt(
pair_index, dedupe_seed_length, dedupe_seed_length * 2);
uint64_t read2_seed2 = read_batch2.GenerateSeedFromSequenceAt(
pair_index, dedupe_seed_length, dedupe_seed_length * 2);
khiter_t barcode_table_iterator =
kh_get(k64_seq, barcode_lookup_table_, barcode_key);
if (barcode_table_iterator != kh_end(barcode_lookup_table_)) {
uint32_t read_lookup_table_index =
kh_value(barcode_lookup_table_, barcode_table_iterator);
// std::cerr << "Have barcode, try to check read. " <<
// read_lookup_table_index << "\n";
khash_t(k128) *read_lookup_table =
read_lookup_tables_[read_lookup_table_index];
khiter_t read_lookup_table_iterator =
kh_get(k128, read_lookup_table, read_seed_key);
if (read_lookup_table_iterator != kh_end(read_lookup_table)) {
// std::cerr << "Have barcode, have read, try whether match.\n";
uint128_t read_seeds =
kh_value(read_lookup_table, read_lookup_table_iterator);
if (read_seeds.first == read1_seed2 && read_seeds.second == read2_seed2) {
// std::cerr << "Have barcode, have read, and match.\n";
return true;
} else {
// std::cerr << "Have barcode, have read, but don't match.\n";
return false;
}
} else {
// std::cerr << "Have barcode, no read.\n";
uint128_t read_seeds = {.first = read1_seed2, .second = read2_seed2};
int khash_return_code;
khiter_t read_lookup_table_insert_iterator =
kh_put(k128, read_lookup_table, read_seed_key, &khash_return_code);
assert(khash_return_code != -1 && khash_return_code != 0);
kh_value(read_lookup_table, read_lookup_table_insert_iterator) =
read_seeds;
// std::cerr << "Have barcode, no read.\n";
return false;
}
} else {
// insert the barcode and append a new read hash table to tables and then
// insert the reads
// std::cerr << "No barcode, no read.\n";
int khash_return_code;
khiter_t barcode_table_insert_iterator =
kh_put(k64_seq, barcode_lookup_table_, barcode_key, &khash_return_code);
assert(khash_return_code != -1 && khash_return_code != 0);
kh_value(barcode_lookup_table_, barcode_table_insert_iterator) =
read_lookup_tables_.size();
khash_t(k128) *read_lookup_table = kh_init(k128);
khiter_t read_lookup_table_iterator =
kh_put(k128, read_lookup_table, read_seed_key, &khash_return_code);
assert(khash_return_code != -1 && khash_return_code != 0);
uint128_t read_seeds = {.first = read1_seed2, .second = read2_seed2};
kh_value(read_lookup_table, read_lookup_table_iterator) = read_seeds;
read_lookup_tables_.push_back(read_lookup_table);
// std::cerr << "No barcode, no read.\n";
return false;
}
}
uint32_t Chromap::SampleInputBarcodesAndExamineLength() {
if (mapping_parameters_.is_bulk_data) {
return 0;
}
uint32_t sample_batch_size = 1000;
SequenceBatch barcode_batch(sample_batch_size, barcode_effective_range_);
barcode_batch.InitializeLoading(mapping_parameters_.barcode_file_paths[0]);
uint32_t num_loaded_barcodes = barcode_batch.LoadBatch();
uint32_t cell_barcode_length = barcode_batch.GetSequenceLengthAt(0);
for (uint32_t i = 1; i < num_loaded_barcodes; ++i) {
if (barcode_batch.GetSequenceLengthAt(i) != cell_barcode_length) {
ExitWithMessage("ERROR: barcode lengths are not equal in the sample!");
}
}
barcode_batch.FinalizeLoading();
return cell_barcode_length;
}
// Loads the barcode whitelist from
// mapping_parameters_.barcode_whitelist_file_path (plain or gzip-compressed
// text, one barcode per line) into barcode_whitelist_lookup_table_, with every
// abundance initialised to 0.
//
// Exits via ExitWithMessage when the file cannot be opened or read to its end,
// when a barcode is longer than 32 bases, or when a barcode's length differs
// from barcode_length_. Reports the number of barcodes and the elapsed time on
// stderr.
void Chromap::LoadBarcodeWhitelist() {
  double real_start_time = GetRealTime();
  int num_barcodes = 0;

  gzFile barcode_whitelist_file =
      gzopen(mapping_parameters_.barcode_whitelist_file_path.c_str(), "r");
  if (barcode_whitelist_file == NULL) {
    // Fail here with the same message the end-of-file check below would give,
    // instead of passing a null handle to the other zlib calls.
    ExitWithMessage(
        "ERROR: barcode whitelist file does not exist or is truncated!");
  }

  const uint32_t barcode_buffer_size = 256;
  char barcode[barcode_buffer_size];
  while (gzgets(barcode_whitelist_file, barcode, barcode_buffer_size) != NULL) {
    size_t barcode_length = strlen(barcode);
    // Drop the line terminator. The length guards keep the index in bounds if
    // a line starts with a NUL byte; '\r' is removed too so that whitelists
    // with CRLF line endings are not rejected for having one extra character.
    if (barcode_length > 0 && barcode[barcode_length - 1] == '\n') {
      barcode[barcode_length - 1] = '\0';
      --barcode_length;
    }
    if (barcode_length > 0 && barcode[barcode_length - 1] == '\r') {
      barcode[barcode_length - 1] = '\0';
      --barcode_length;
    }

    if (barcode_length > 32) {
      ExitWithMessage("ERROR: barcode length is greater than 32!");
    }
    if (barcode_length != barcode_length_) {
      // A mismatch on the very first entry means the whitelist disagrees with
      // the input barcodes; a later one means the whitelist itself is
      // inconsistent.
      if (num_barcodes == 0) {
        ExitWithMessage(
            "ERROR: whitelist and input barcode lengths are not equal!");
      } else {
        ExitWithMessage(
            "ERROR: barcode lengths are not equal in the whitelist!");
      }
    }

    uint64_t barcode_key =
        GenerateSeedFromSequence(barcode, barcode_length, 0, barcode_length);
    int khash_return_code;
    khiter_t barcode_whitelist_lookup_table_iterator =
        kh_put(k64_seq, barcode_whitelist_lookup_table_, barcode_key,
               &khash_return_code);
    // Validate the insertion before the returned slot is written to.
    assert(khash_return_code != -1 && khash_return_code != 0);
    kh_value(barcode_whitelist_lookup_table_,
             barcode_whitelist_lookup_table_iterator) = 0;
    ++num_barcodes;
  }

  // gzgets also returns NULL on a read error; only a clean end of file counts
  // as success.
  if (!gzeof(barcode_whitelist_file)) {
    ExitWithMessage(
        "ERROR: barcode whitelist file does not exist or is truncated!");
  }
  gzclose(barcode_whitelist_file);

  std::cerr << "Loaded " << num_barcodes << " barcodes in "
            << GetRealTime() - real_start_time << "s.\n";
}
// Estimates barcode abundance by scanning the barcode files and counting, for
// every whitelist barcode, how many input barcodes match it exactly.
//
// Barcodes containing an N are skipped. Counts are accumulated in the values
// of barcode_whitelist_lookup_table_ and the total in num_sample_barcodes_.
// Scanning stops once num_sample_barcodes_ reaches `max_num_sample_barcodes`
// or all files are exhausted.
//
// Unless mapping_parameters_.skip_barcode_check is set, exits via
// ExitWithMessage when fewer than 1 in 20 barcodes were matched so far.
void Chromap::ComputeBarcodeAbundance(uint64_t max_num_sample_barcodes) {
double real_start_time = GetRealTime();
SequenceBatch barcode_batch(read_batch_size_, barcode_effective_range_);
// NOTE(review): the loop bound uses read_file1_paths while the index is
// applied to barcode_file_paths - presumably both lists have the same length;
// confirm.
for (size_t read_file_index = 0;
read_file_index < mapping_parameters_.read_file1_paths.size();
++read_file_index) {
barcode_batch.InitializeLoading(
mapping_parameters_.barcode_file_paths[read_file_index]);
uint32_t num_loaded_barcodes = barcode_batch.LoadBatch();
while (num_loaded_barcodes > 0) {
for (uint32_t barcode_index = 0; barcode_index < num_loaded_barcodes;
++barcode_index) {
std::vector N_pos; // position of Ns
barcode_batch.GetSequenceNsAt(barcode_index, /*little_endian=*/true,
N_pos);
// Barcodes with ambiguous bases are not counted.
if (N_pos.size() > 0) continue;
uint32_t barcode_length =
barcode_batch.GetSequenceLengthAt(barcode_index);
uint64_t barcode_key = barcode_batch.GenerateSeedFromSequenceAt(
barcode_index, 0, barcode_length);
khiter_t barcode_whitelist_lookup_table_iterator =
kh_get(k64_seq, barcode_whitelist_lookup_table_, barcode_key);
if (barcode_whitelist_lookup_table_iterator !=
kh_end(barcode_whitelist_lookup_table_)) {
// Correct barcode
kh_value(barcode_whitelist_lookup_table_,
barcode_whitelist_lookup_table_iterator) += 1;
++num_sample_barcodes_;
}
}
// Sanity check: the running total of matched barcodes is compared against
// the size of the batch just processed.
if (!mapping_parameters_.skip_barcode_check &&
num_sample_barcodes_ * 20 < num_loaded_barcodes) {
// Since the number of barcodes loaded per batch is constant while the
// matched total keeps growing, this check is actually only effective in
// the first iteration.
ExitWithMessage(
"Less than 5\% barcodes can be found or corrected based on the "
"barcode whitelist.\nPlease check whether the barcode whitelist "
"matches the data, e.g. length, reverse-complement. If this is a "
"false warning, please run Chromap with the option "
"--skip-barcode-check.");
}
// Enough barcodes sampled: stop reading this file ...
if (num_sample_barcodes_ >= max_num_sample_barcodes) {
break;
}
num_loaded_barcodes = barcode_batch.LoadBatch();
}
barcode_batch.FinalizeLoading();
// ... and do not open the remaining files either.
if (num_sample_barcodes_ >= max_num_sample_barcodes) {
break;
}
}
std::cerr << "Compute barcode abundance using " << num_sample_barcodes_
<< " in " << GetRealTime() - real_start_time << "s.\n";
}
void Chromap::UpdateBarcodeAbundance(uint32_t num_loaded_barcodes,
const SequenceBatch &barcode_batch) {
double real_start_time = GetRealTime();
for (uint32_t barcode_index = 0; barcode_index < num_loaded_barcodes;
++barcode_index) {
uint32_t barcode_length = barcode_batch.GetSequenceLengthAt(barcode_index);
uint64_t barcode_key = barcode_batch.GenerateSeedFromSequenceAt(
barcode_index, 0, barcode_length);
khiter_t barcode_whitelist_lookup_table_iterator =
kh_get(k64_seq, barcode_whitelist_lookup_table_, barcode_key);
if (barcode_whitelist_lookup_table_iterator !=
kh_end(barcode_whitelist_lookup_table_)) {
// Correct barcode
kh_value(barcode_whitelist_lookup_table_,
barcode_whitelist_lookup_table_iterator) += 1;
++num_sample_barcodes_;
}
}
std::cerr << "Update barcode abundance using " << num_sample_barcodes_
<< " in " << GetRealTime() - real_start_time << "s.\n";
}
// Checks the barcode at `barcode_index` against the whitelist and, if it is
// not in it, tries to correct it in place.
//
// Returns true when the barcode is in the whitelist (incrementing
// `num_barcode_in_whitelist`) or was corrected to a whitelist barcode
// (incrementing `num_corrected_barcode`); returns false otherwise and leaves
// the barcode unchanged.
//
// Correction enumerates substitutions at one position and, when
// barcode_correction_error_threshold is 2, at a second position as well. When
// the barcode contains Ns, only the N positions are substituted. Each
// whitelist hit is scored as 10^(-Q/10) * abundance, where Q is the base
// quality (Phred+33, clamped to [3, 40], summed over both positions for
// two-base corrections) and abundance is the whitelist count divided by
// num_sample_barcodes_. With a single hit the correction is applied directly;
// with several, the best one is applied only if its share of the total score
// exceeds barcode_correction_probability_threshold.
bool Chromap::CorrectBarcodeAt(uint32_t barcode_index,
SequenceBatch &barcode_batch,
uint64_t &num_barcode_in_whitelist,
uint64_t &num_corrected_barcode) {
const uint32_t barcode_length =
barcode_batch.GetSequenceLengthAt(barcode_index);
const uint64_t barcode_key = barcode_batch.GenerateSeedFromSequenceAt(
barcode_index, 0, barcode_length);
khiter_t barcode_whitelist_lookup_table_iterator =
kh_get(k64_seq, barcode_whitelist_lookup_table_, barcode_key);
std::vector N_pos; // position of Ns
barcode_batch.GetSequenceNsAt(barcode_index, /*little_endian=*/true, N_pos);
// More Ns than allowed corrections: give up immediately.
if (N_pos.size() >
(uint32_t)mapping_parameters_.barcode_correction_error_threshold)
return false;
if (N_pos.size() == 0 && barcode_whitelist_lookup_table_iterator !=
kh_end(barcode_whitelist_lookup_table_)) {
// Correct barcode
++num_barcode_in_whitelist;
return true;
} else if (mapping_parameters_.barcode_correction_error_threshold > 0) {
// Need to correct this barcode
// const char *barcode = barcode_batch->GetSequenceAt(barcode_index);
// std::cerr << barcode_index << " barcode " << barcode << " needs
// correction\n";
const char *barcode_qual = barcode_batch.GetSequenceQualAt(barcode_index);
std::vector corrected_barcodes_with_quals;
// Each base occupies 2 bits of the key. Position i below counts 2-bit groups
// from the least significant end of the key and corresponds to sequence
// index barcode_length - 1 - i.
uint64_t mask = (uint64_t)3;
uint32_t i_start = 0;
uint32_t i_end = barcode_length;
// Without an N, only the 3 bases that differ from the current one are tried.
uint32_t ti_limit = 3;
if (N_pos.size() > 0) {
// With an N, only that position is substituted and all 4 bases are tried.
i_start = N_pos[0];
i_end = N_pos[0] + 1;
ti_limit = 4;
}
for (uint32_t i = i_start; i < i_end; ++i) {
// Key with the 2 bits of position i cleared.
uint64_t barcode_key_to_change = mask << (2 * i);
barcode_key_to_change = ~barcode_key_to_change;
barcode_key_to_change &= barcode_key;
uint64_t base_to_change1 = (barcode_key >> (2 * i)) & mask;
for (uint32_t ti = 0; ti < ti_limit; ++ti) {
// change the base
// (cycles through the base codes modulo 4, starting after the current one)
base_to_change1 += 1;
base_to_change1 &= mask;
// generate the corrected key
uint64_t corrected_barcode_key =
barcode_key_to_change | (base_to_change1 << (2 * i));
barcode_whitelist_lookup_table_iterator = kh_get(
k64_seq, barcode_whitelist_lookup_table_, corrected_barcode_key);
if (barcode_whitelist_lookup_table_iterator !=
kh_end(barcode_whitelist_lookup_table_)) {
// find one possible corrected barcode
double barcode_abundance =
kh_value(barcode_whitelist_lookup_table_,
barcode_whitelist_lookup_table_iterator) /
(double)num_sample_barcodes_;
int qual_offset = 33;
int adjusted_qual =
barcode_qual[barcode_length - 1 - i] - qual_offset;
adjusted_qual = adjusted_qual > 40 ? 40 : adjusted_qual;
adjusted_qual = adjusted_qual < 3 ? 3 : adjusted_qual;
double score =
pow(10.0, ((-adjusted_qual) / 10.0)) * barcode_abundance;
// Single-base candidate: the second position/base are left as 0.
corrected_barcodes_with_quals.emplace_back(
BarcodeWithQual{barcode_length - 1 - i,
Uint8ToChar(base_to_change1), 0, 0, score});
// std::cerr << "1score: " << score << " pos1: " << barcode_length - 1
// - i << " b1: " << base_to_change1 << " pos2: " << 0 << " b2: " <<
// (char)0 << "\n";
}
// Two-base corrections are explored for every first substitution,
// whether or not that single substitution hit the whitelist.
if (mapping_parameters_.barcode_correction_error_threshold == 2) {
uint32_t j_start = i + 1;
uint32_t j_end = barcode_length;
uint32_t ti2_limit = 3;
if (N_pos.size() == 2) {
// Second N: substitute only there, trying all 4 bases.
j_start = N_pos[1];
j_end = N_pos[1] + 1;
ti2_limit = 4;
}
for (uint32_t j = j_start; j < j_end; ++j) {
// NOTE(review): the first assignment below is overwritten by the next
// line and has no effect - looks like a leftover; confirm.
uint64_t barcode_key_to_change2 = mask << (2 * i);
barcode_key_to_change2 = mask << (2 * j);
barcode_key_to_change2 = ~barcode_key_to_change2;
barcode_key_to_change2 &= corrected_barcode_key;
uint64_t base_to_change2 =
(corrected_barcode_key >> (2 * j)) & mask;
for (uint32_t ti2 = 0; ti2 < ti2_limit; ++ti2) {
// change the base
base_to_change2 += 1;
base_to_change2 &= mask;
// generate the corrected key
uint64_t corrected_barcode_key2 =
barcode_key_to_change2 | (base_to_change2 << (2 * j));
barcode_whitelist_lookup_table_iterator =
kh_get(k64_seq, barcode_whitelist_lookup_table_,
corrected_barcode_key2);
if (barcode_whitelist_lookup_table_iterator !=
kh_end(barcode_whitelist_lookup_table_)) {
// find one possible corrected barcode
double barcode_abundance =
kh_value(barcode_whitelist_lookup_table_,
barcode_whitelist_lookup_table_iterator) /
(double)num_sample_barcodes_;
int qual_offset = 33;
int adjusted_qual =
barcode_qual[barcode_length - 1 - j] - qual_offset;
adjusted_qual = adjusted_qual > 40 ? 40 : adjusted_qual;
adjusted_qual = adjusted_qual < 3 ? 3 : adjusted_qual;
int adjusted_qual1 =
barcode_qual[barcode_length - 1 - i] - qual_offset;
adjusted_qual1 = adjusted_qual1 > 40 ? 40 : adjusted_qual1;
adjusted_qual1 = adjusted_qual1 < 3 ? 3 : adjusted_qual1;
// The clamped qualities of both positions are summed before scoring.
adjusted_qual += adjusted_qual1;
double score =
pow(10.0, ((-adjusted_qual) / 10.0)) * barcode_abundance;
corrected_barcodes_with_quals.emplace_back(BarcodeWithQual{
barcode_length - 1 - i, Uint8ToChar(base_to_change1),
barcode_length - 1 - j, Uint8ToChar(base_to_change2),
score});
// std::cerr << "2score: " << score << " pos1: " <<
// barcode_length - 1 - i << " b1: " << base_to_change1 << "
// pos2: " << barcode_length - 1 -j << " b2: " <<
// base_to_change2
// << "\n";
}
}
}
}
}
}
size_t num_possible_corrected_barcodes =
corrected_barcodes_with_quals.size();
if (num_possible_corrected_barcodes == 0) {
// Barcode cannot be corrected, leave it for downstream
return false;
} else if (num_possible_corrected_barcodes == 1) {
// Just correct it
// std::cerr << "Corrected the barcode from " << barcode << " to ";
barcode_batch.CorrectBaseAt(
barcode_index, corrected_barcodes_with_quals[0].corrected_base_index1,
corrected_barcodes_with_quals[0].correct_base1);
// A non-zero second base marks a two-base correction.
if (corrected_barcodes_with_quals[0].correct_base2 != 0) {
barcode_batch.CorrectBaseAt(
barcode_index,
corrected_barcodes_with_quals[0].corrected_base_index2,
corrected_barcodes_with_quals[0].correct_base2);
}
// std::cerr << barcode << "\n";
// std::cerr << "score: " << corrected_barcodes_with_quals[0].score <<
// "\n"; std::cerr << "score: " << corrected_barcodes_with_quals[0].score
// << " pos1: " << corrected_barcodes_with_quals[0].corrected_base_index1
// << " b1: " << corrected_barcodes_with_quals[0].correct_base1 << " pos2:
// " << corrected_barcodes_with_quals[0].corrected_base_index2 << " b2: "
// << corrected_barcodes_with_quals[0].correct_base2 << "\n";
++num_corrected_barcode;
return true;
} else {
// Select the best correction
// Sorted in descending order, so the preferred candidate ends up at index 0.
std::sort(corrected_barcodes_with_quals.begin(),
corrected_barcodes_with_quals.end(),
std::greater());
// int num_ties = 0;
double sum_score = 0;
for (size_t ci = 0; ci < num_possible_corrected_barcodes; ++ci) {
sum_score += corrected_barcodes_with_quals[ci].score;
// std::cerr << ci << " score: " <<
// corrected_barcodes_with_quals[ci].score << " pos1: " <<
// corrected_barcodes_with_quals[ci].corrected_base_index1 << " b1: " <<
// corrected_barcodes_with_quals[ci].correct_base1 << " pos2: " <<
// corrected_barcodes_with_quals[ci].corrected_base_index2 << " b2: " <<
// corrected_barcodes_with_quals[ci].correct_base2 << "\n"; if
// (corrected_barcodes_with_quals[ci].qual ==
// corrected_barcodes_with_quals[0].qual) {
// ++num_ties;
//}
}
int best_corrected_barcode_index = 0;
// if (num_ties > 0) {
// std::mt19937 tmp_generator(11);
// std::uniform_int_distribution distribution(0, num_ties); //
// important: inclusive range best_corrected_barcode_index =
// distribution(tmp_generator);
//}
// std::cerr << "Corrected the barcode from " << barcode << " to ";
double confidence_threshold =
mapping_parameters_.barcode_correction_probability_threshold;
// Apply the best candidate only when its share of the total score is
// above the configured threshold.
if (corrected_barcodes_with_quals[best_corrected_barcode_index].score /
sum_score >
confidence_threshold) {
barcode_batch.CorrectBaseAt(
barcode_index,
corrected_barcodes_with_quals[best_corrected_barcode_index]
.corrected_base_index1,
corrected_barcodes_with_quals[best_corrected_barcode_index]
.correct_base1);
if (corrected_barcodes_with_quals[best_corrected_barcode_index]
.correct_base2 != 0) {
barcode_batch.CorrectBaseAt(
barcode_index,
corrected_barcodes_with_quals[best_corrected_barcode_index]
.corrected_base_index2,
corrected_barcodes_with_quals[best_corrected_barcode_index]
.correct_base2);
}
// std::cerr << barcode << "\n";
// std::cerr << "score: " <<
// corrected_barcodes_with_quals[best_corrected_barcode_index].score <<
// "\n"; std::cerr << "best score: " <<
// corrected_barcodes_with_quals[best_corrected_barcode_index].score <<
// " sum score: " << sum_score << "\n";
++num_corrected_barcode;
return true;
} else {
// std::cerr << "Didnt pass filter: " <<
// corrected_barcodes_with_quals[best_corrected_barcode_index].score /
// sum_score << "\n"; std::cerr << "best score: " <<
// corrected_barcodes_with_quals[best_corrected_barcode_index].score <<
// " sum score: " << sum_score << "\n";
return false;
}
}
} else {
// Not in the whitelist (or contains an N) and correction is disabled.
return false;
}
}
// Prints the whitelist-hit and corrected-barcode counters to stderr.
void Chromap::OutputBarcodeStatistics() {
  std::cerr << "Number of barcodes in whitelist: " << num_barcode_in_whitelist_
            << ".\n"
            << "Number of corrected barcodes: " << num_corrected_barcode_
            << ".\n";
}
void Chromap::OutputMappingStatistics() {
std::cerr << "Number of reads: " << num_reads_ << ".\n";
// std::cerr << "Number of duplicated reads: " << num_duplicated_reads_ <<
// ".\n";
std::cerr << "Number of mapped reads: " << num_mapped_reads_ << ".\n";
std::cerr << "Number of uniquely mapped reads: " << num_uniquely_mapped_reads_
<< ".\n";
std::cerr << "Number of reads have multi-mappings: "
<< num_mapped_reads_ - num_uniquely_mapped_reads_ << ".\n";
std::cerr << "Number of candidates: " << num_candidates_ << ".\n";
std::cerr << "Number of mappings: " << num_mappings_ << ".\n";
std::cerr << "Number of uni-mappings: " << num_uniquely_mapped_reads_
<< ".\n";
std::cerr << "Number of multi-mappings: "
<< num_mappings_ - num_uniquely_mapped_reads_ << ".\n";
}
void Chromap::ParseReadFormat(const std::string &read_format) {
if (read_format.empty()) {
return;
}
read1_effective_range_.InitializeParsing();
read2_effective_range_.InitializeParsing();
barcode_effective_range_.InitializeParsing();
uint32_t i, j;
for (i = 0; i < read_format.size();) {
for (j = i + 1; j < read_format.size() && read_format[j] != ','; ++j)
;
bool parse_success = true;
if (read_format[i] == 'r' && read_format[i + 1] == '1') {
parse_success =
read1_effective_range_.ParseFormatStringAndAppendEffectiveRange(
read_format.c_str() + i, j - i);
} else if (read_format[i] == 'r' && read_format[i + 1] == '2') {
parse_success =
read2_effective_range_.ParseFormatStringAndAppendEffectiveRange(
read_format.c_str() + i, j - i);
} else if (read_format[i] == 'b' && read_format[i + 1] == 'c') {
parse_success =
barcode_effective_range_.ParseFormatStringAndAppendEffectiveRange(
read_format.c_str() + i, j - i);
} else {
parse_success = false;
}
if (!parse_success) {
ExitWithMessage("Unknown read format: " + read_format + "\n");
}
i = j + 1;
}
read1_effective_range_.FinalizeParsing();
read2_effective_range_.FinalizeParsing();
barcode_effective_range_.FinalizeParsing();
}
void Chromap::GenerateCustomRidRanks(
const std::string &custom_rid_order_file_path,
uint32_t num_reference_sequences, const SequenceBatch &reference,
std::vector &rid_ranks) {
for (uint32_t i = 0; i < num_reference_sequences; ++i) {
rid_ranks.emplace_back(i);
}
if (custom_rid_order_file_path.empty()) {
return;
}
std::unordered_map ref_name_to_rank;
std::ifstream custom_rid_order_file_stream(custom_rid_order_file_path);
std::string ref_name;
uint32_t ref_rank = 0;
while (getline(custom_rid_order_file_stream, ref_name)) {
ref_name_to_rank[ref_name] = ref_rank;
ref_rank += 1;
}
custom_rid_order_file_stream.close();
// First, rank the chromosomes in the custom order provided by users.
for (uint32_t i = 0; i < num_reference_sequences; ++i) {
std::string ref_name(reference.GetSequenceNameAt(i));
if (ref_name_to_rank.find(ref_name) != ref_name_to_rank.end()) {
rid_ranks[i] = ref_name_to_rank[ref_name];
} else {
rid_ranks[i] = -1;
}
}
// There might be some rids without any custom order. We just order them based
// on their original order in the reference file.
uint32_t k = ref_name_to_rank.size();
// Rank the remaining chromosomes.
for (uint32_t i = 0; i < num_reference_sequences; ++i) {
if (rid_ranks[i] == -1) {
rid_ranks[i] = k;
++k;
}
}
if (k > num_reference_sequences) {
ExitWithMessage(
"ERROR: unknown chromsome names found in chromosome order file.");
}
}
void Chromap::RerankCandidatesRid(std::vector &candidates) {
for (size_t i = 0; i < candidates.size(); ++i) {
uint64_t rid = (uint32_t)(candidates[i].position >> 32);
rid = custom_rid_rank_[rid];
candidates[i].position =
(candidates[i].position & (uint64_t)0xffffffff) | (rid << 32);
}
}
} // namespace chromap
================================================
FILE: src/chromap.h
================================================
#ifndef CHROMAP_H_
#define CHROMAP_H_
#include
#include
#include
#include
#include
#include
#include // Used these two for k-minhash
#include
#include // Used for frip est params splitting
#include "candidate_processor.h"
#include "cxxopts.hpp"
#include "draft_mapping_generator.h"
#include "feature_barcode_matrix.h"
#include "index.h"
#include "index_parameters.h"
#include "khash.h"
#include "mapping_generator.h"
#include "mapping_metadata.h"
#include "mapping_parameters.h"
#include "mapping_processor.h"
#include "mapping_writer.h"
#include "minimizer_generator.h"
#include "mmcache.hpp"
#include "paired_end_mapping_metadata.h"
#include "sequence_batch.h"
#include "sequence_effective_range.h"
#include "temp_mapping.h"
#include "utils.h"
#define CHROMAP_VERSION "0.3.3-r521"
namespace chromap {
class K_MinHash {
 public:
  /*
   * MinHash Class - used to estimate the number of unique cache slots
   * hit by each barcode. The sketch keeps the k smallest distinct values
   * that were added.
   *
   * @param k - size of MinHash sketch
   * @param range - range of possible cache ids
   */
  K_MinHash(size_t k, size_t range) : k_(k), range_(range) {}

  /*
   * Adds a value to the sketch. Values currently held by the sketch are
   * ignored; once more than k values are held, the largest one is dropped.
   */
  inline void add(size_t num) {
    // insert() reports whether the value was new, so no separate find().
    if (!unique_slots_.insert(num).second) {
      return;
    }
    pq_.push(num);
    // only keep smallest k numbers
    if (pq_.size() > k_) {
      unique_slots_.erase(pq_.top());
      pq_.pop();
    }
  }

  /*
   * Uses the k-MinHash estimator (k * range / k-th smallest value - 1) to
   * return the estimated cardinality.
   *
   * @return 0 while the sketch holds fewer than k values (or k is 0), range
   *         when the k-th smallest value is 0, otherwise the estimate
   *         (never below 0).
   */
  inline size_t compute_cardinality() const {
    // pq_.empty() also covers k == 0, where top() would be undefined.
    if (pq_.empty() || pq_.size() < k_) {
      return 0;
    }
    const size_t kth_smallest = pq_.top();
    if (kth_smallest == 0) {
      // The estimate grows without bound as the k-th smallest value goes to
      // 0; saturate at the number of possible ids instead of dividing by 0.
      return range_;
    }
    const size_t ratio = (k_ * range_) / kth_smallest;
    // Guard the "- 1" against unsigned wrap-around when the ratio is 0.
    return ratio == 0 ? 0 : ratio - 1;
  }

 private:
  size_t k_;
  size_t range_;
  /* Uses an unordered set to have O(1) find queries*/
  std::priority_queue<size_t> pq_;          // max-heap
  std::unordered_set<size_t> unique_slots_;  // keep track of unique values
};
// Top-level driver class: it is constructed either from IndexParameters (to
// build an index via ConstructIndex) or from MappingParameters (to map reads
// via MapSingleEndReads / MapPairedEndReads), and it holds the barcode lookup
// tables, effective read ranges and the mapping/barcode statistics counters
// used by those operations.
//
// NOTE(review): the destructor frees the raw khash tables held by this class
// and no copy constructor is declared here, so a copy of a Chromap object
// would free the same tables twice - confirm that instances are never copied.
class Chromap {
public:
Chromap() = delete;
// For index construction. No barcode tables are allocated; both pointers are
// set to NULL so that the destructor skips them.
Chromap(const IndexParameters &index_parameters)
: index_parameters_(index_parameters) {
barcode_lookup_table_ = NULL;
barcode_whitelist_lookup_table_ = NULL;
}
// For mapping. Allocates both barcode hash tables and parses the read format
// string into the effective ranges.
Chromap(const MappingParameters &mapping_parameters)
: mapping_parameters_(mapping_parameters) {
barcode_lookup_table_ = kh_init(k64_seq);
barcode_whitelist_lookup_table_ = kh_init(k64_seq);
ParseReadFormat(mapping_parameters.read_format);
}
// Destroys the barcode tables that were allocated and every table stored in
// read_lookup_tables_.
~Chromap() {
if (barcode_whitelist_lookup_table_ != NULL) {
kh_destroy(k64_seq, barcode_whitelist_lookup_table_);
}
if (barcode_lookup_table_ != NULL) {
kh_destroy(k64_seq, barcode_lookup_table_);
}
if (read_lookup_tables_.size() > 0) {
for (uint32_t i = 0; i < read_lookup_tables_.size(); ++i) {
kh_destroy(k128, read_lookup_tables_[i]);
}
}
}
void ConstructIndex();
// Mapping entry points; both are function templates defined later in this
// header.
template
void MapSingleEndReads();
template
void MapPairedEndReads();
private:
// Batch loading helpers. Each returns a uint32_t count that the mapping
// functions use as the number of loaded reads / read pairs.
uint32_t LoadSingleEndReadsWithBarcodes(SequenceBatch &read_batch,
SequenceBatch &barcode_batch,
bool parallel_parsing);
uint32_t LoadPairedEndReadsWithBarcodes(SequenceBatch &read_batch1,
SequenceBatch &read_batch2,
SequenceBatch &barcode_batch,
bool parallel_parsing);
void TrimAdapterForPairedEndRead(uint32_t pair_index,
SequenceBatch &read_batch1,
SequenceBatch &read_batch2);
bool PairedEndReadWithBarcodeIsDuplicate(uint32_t pair_index,
const SequenceBatch &barcode_batch,
const SequenceBatch &read_batch1,
const SequenceBatch &read_batch2);
// Barcode preprocessing and correction. The mapping functions store the
// result of SampleInputBarcodesAndExamineLength in barcode_length_.
uint32_t SampleInputBarcodesAndExamineLength();
void LoadBarcodeWhitelist();
void ComputeBarcodeAbundance(uint64_t max_num_sample_barcodes);
void UpdateBarcodeAbundance(uint32_t num_loaded_barcodes,
const SequenceBatch &barcode_batch);
// The two counters are passed by reference so that callers can accumulate
// into their own (per-thread) counters.
bool CorrectBarcodeAt(uint32_t barcode_index, SequenceBatch &barcode_batch,
uint64_t &num_barcode_in_whitelist,
uint64_t &num_corrected_barcode);
// Both statistics functions write to stderr.
void OutputBarcodeStatistics();
void OutputMappingStatistics();
// Splits the comma-separated read format string into r1/r2/bc fields and
// stores them in the three effective ranges below.
void ParseReadFormat(const std::string &read_format);
// User custom rid order file contains a column of reference sequence names
// and there is one name on each row. The reference sequence name on the ith
// row means the rank of this sequence is i. This function loads the custom
// rid order file and generates a mapping from the original rids to their
// custom ranks, e.g., rid_ranks[i] is the custom rank of the ith rid in the
// reference.
void GenerateCustomRidRanks(const std::string &custom_rid_order_file_path,
uint32_t num_reference_sequences,
const SequenceBatch &reference,
std::vector &rid_ranks);
// Replaces the rid in the upper 32 bits of each candidate position with its
// rank from custom_rid_rank_.
// TODO: generate reranked candidates directly.
void RerankCandidatesRid(std::vector &candidates);
// Parameters
const IndexParameters index_parameters_;
const MappingParameters mapping_parameters_;
// Default batch size, # reads for single-end reads, # read pairs for
// paired-end reads.
const uint32_t read_batch_size_ = 500000;
// 0-start, 1-end (inclusive), 2-strand(-1:minus, 1:plus)
SequenceEffectiveRange barcode_effective_range_;
SequenceEffectiveRange read1_effective_range_;
SequenceEffectiveRange read2_effective_range_;
// Rid ranks filled by GenerateCustomRidRanks: custom_rid_rank_ from the custom
// rid order file, pairs_custom_rid_rank_ from the pairs flipping order file.
std::vector custom_rid_rank_;
std::vector pairs_custom_rid_rank_;
khash_t(k64_seq) * barcode_whitelist_lookup_table_;
// For identical read dedupe
khash_t(k64_seq) * barcode_lookup_table_;
std::vector read_lookup_tables_;
// For mapping.
const int min_unique_mapping_mapq_ = 4;
// For mapping stats.
uint64_t num_candidates_ = 0;
uint64_t num_mappings_ = 0;
uint64_t num_mapped_reads_ = 0;
uint64_t num_uniquely_mapped_reads_ = 0;
uint64_t num_reads_ = 0;
// # identical reads.
// uint64_t num_duplicated_reads_ = 0;
// For barcode stats.
const uint64_t initial_num_sample_barcodes_ = 20000000;
uint64_t num_sample_barcodes_ = 0;
uint64_t num_barcode_in_whitelist_ = 0;
uint64_t num_corrected_barcode_ = 0;
uint32_t barcode_length_ = 0;
};
// Maps the single-end reads of every file in
// mapping_parameters_.read_file1_paths and writes the result through
// MappingWriter. MappingRecord is the record type held in the mapping
// containers; its size determines the in-memory mapping budget below.
//
// Outline:
//   1. Load the reference (optionally reordered by a custom rid order file)
//      and the index.
//   2. For non-bulk data, sample the barcodes and, when a whitelist is given,
//      load it and compute the barcode abundance.
//   3. For each read file, map the reads batch by batch inside an OpenMP
//      parallel region while a task loads the next batch.
//   4. Output the mappings: either merge the temp files (low-memory mode) or
//      post-process the in-memory mappings (Tn5 shift, PCR duplicate removal,
//      multi-mapping allocation) and write them.
template
void Chromap::MapSingleEndReads() {
double real_start_time = GetRealTime();
// Load all reference sequences.
SequenceBatch reference;
reference.InitializeLoading(mapping_parameters_.reference_file_path);
reference.LoadAllSequences();
uint32_t num_reference_sequences = reference.GetNumSequences();
// Optionally reorder the reference sequences by the user-provided ranks.
if (mapping_parameters_.custom_rid_order_file_path.length() > 0) {
GenerateCustomRidRanks(mapping_parameters_.custom_rid_order_file_path,
num_reference_sequences, reference,
custom_rid_rank_);
reference.ReorderSequences(custom_rid_rank_);
}
// Load the index; its k-mer and window sizes configure the minimizer
// generator and the cache below.
Index index(mapping_parameters_.index_file_path);
index.Load();
const int kmer_size = index.GetKmerSize();
const int window_size = index.GetWindowSize();
// index.Statistics(num_sequences, reference);
// Two sets of batches: one is being mapped while the "_for_loading" one is
// filled with the next batch; they are swapped after every batch.
SequenceBatch read_batch(read_batch_size_, read1_effective_range_);
SequenceBatch read_batch_for_loading(read_batch_size_,
read1_effective_range_);
SequenceBatch barcode_batch(read_batch_size_, barcode_effective_range_);
SequenceBatch barcode_batch_for_loading(read_batch_size_,
barcode_effective_range_);
// One mapping container per reference sequence.
std::vector> mappings_on_diff_ref_seqs;
mappings_on_diff_ref_seqs.reserve(num_reference_sequences);
for (uint32_t i = 0; i < num_reference_sequences; ++i) {
mappings_on_diff_ref_seqs.emplace_back(std::vector());
}
std::vector> temp_mapping_file_handles;
// Preprocess barcodes for single cell data
if (!mapping_parameters_.is_bulk_data) {
barcode_length_ = SampleInputBarcodesAndExamineLength();
if (!mapping_parameters_.barcode_whitelist_file_path.empty()) {
LoadBarcodeWhitelist();
ComputeBarcodeAbundance(initial_num_sample_barcodes_);
}
}
MinimizerGenerator minimizer_generator(kmer_size, window_size);
CandidateProcessor candidate_processor(
mapping_parameters_.min_num_seeds_required_for_mapping,
mapping_parameters_.max_seed_frequencies);
MappingProcessor mapping_processor(mapping_parameters_,
min_unique_mapping_mapq_);
DraftMappingGenerator draft_mapping_generator(mapping_parameters_);
MappingGenerator mapping_generator(mapping_parameters_,
pairs_custom_rid_rank_);
MappingWriter mapping_writer(
mapping_parameters_, barcode_length_, pairs_custom_rid_rank_);
mapping_writer.OutputHeader(num_reference_sequences, reference);
// Number of mappings kept in memory before they are spilled to temp files in
// low-memory mode: 2^30 bytes worth of records by default, 2^29 bytes worth
// for the SAM, PAF and PAIRS output formats.
uint32_t num_mappings_in_mem = 0;
uint64_t max_num_mappings_in_mem =
1 * ((uint64_t)1 << 30) / sizeof(MappingRecord);
if (mapping_parameters_.mapping_output_format == MAPPINGFORMAT_SAM ||
mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAF ||
mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAIRS) {
max_num_mappings_in_mem = 1 * ((uint64_t)1 << 29) / sizeof(MappingRecord);
}
// Cache from minimizers to candidates, plus one history slot per read of a
// batch that records what is fed back into the cache after the batch.
mm_cache mm_to_candidates_cache(2000003);
mm_to_candidates_cache.SetKmerLength(kmer_size);
struct _mm_history *mm_history = new struct _mm_history[read_batch_size_];
// Use bit encoding to represent mapping results
// bit 0: is barcode in whitelist
// Only allocated when a summary metadata file was requested.
uint8_t *read_map_summary = NULL ;
if (!mapping_parameters_.summary_metadata_file_path.empty()) {
read_map_summary = new uint8_t[read_batch_size_];
memset(read_map_summary, 1, sizeof(*read_map_summary)*read_batch_size_);
}
// Per-thread (threadprivate) counters; every thread adds its own values to
// the member totals at the end of the parallel region.
static uint64_t thread_num_candidates = 0;
static uint64_t thread_num_mappings = 0;
static uint64_t thread_num_mapped_reads = 0;
static uint64_t thread_num_uniquely_mapped_reads = 0;
static uint64_t thread_num_barcode_in_whitelist = 0;
static uint64_t thread_num_corrected_barcode = 0;
#pragma omp threadprivate( \
thread_num_candidates, thread_num_mappings, thread_num_mapped_reads, \
thread_num_uniquely_mapped_reads, thread_num_barcode_in_whitelist, \
thread_num_corrected_barcode)
double real_start_mapping_time = GetRealTime();
// Process the read files one after another.
for (size_t read_file_index = 0;
read_file_index < mapping_parameters_.read_file1_paths.size();
++read_file_index) {
read_batch_for_loading.InitializeLoading(
mapping_parameters_.read_file1_paths[read_file_index]);
if (!mapping_parameters_.is_bulk_data) {
barcode_batch_for_loading.InitializeLoading(
mapping_parameters_.barcode_file_paths[read_file_index]);
}
// Load the first batch before entering the parallel region and make it the
// current batch.
uint32_t num_loaded_reads_for_loading = 0;
uint32_t num_loaded_reads = LoadSingleEndReadsWithBarcodes(
read_batch_for_loading, barcode_batch_for_loading,
mapping_parameters_.num_threads >= 3 ? true : false);
read_batch_for_loading.SwapSequenceBatch(read_batch);
if (!mapping_parameters_.is_bulk_data) {
barcode_batch_for_loading.SwapSequenceBatch(barcode_batch);
}
// Per-thread, per-reference mapping buffers. Two sets exist so that one can
// be filled by the mapping tasks while the other ("_for_saving") is drained
// into mappings_on_diff_ref_seqs.
std::vector>>
mappings_on_diff_ref_seqs_for_diff_threads;
std::vector>>
mappings_on_diff_ref_seqs_for_diff_threads_for_saving;
mappings_on_diff_ref_seqs_for_diff_threads.reserve(
mapping_parameters_.num_threads);
mappings_on_diff_ref_seqs_for_diff_threads_for_saving.reserve(
mapping_parameters_.num_threads);
for (int ti = 0; ti < mapping_parameters_.num_threads; ++ti) {
mappings_on_diff_ref_seqs_for_diff_threads.emplace_back(
std::vector>(num_reference_sequences));
mappings_on_diff_ref_seqs_for_diff_threads_for_saving.emplace_back(
std::vector>(num_reference_sequences));
for (uint32_t i = 0; i < num_reference_sequences; ++i) {
mappings_on_diff_ref_seqs_for_diff_threads[ti][i].reserve(
(num_loaded_reads + num_loaded_reads / 1000 *
mapping_parameters_.max_num_best_mappings) /
mapping_parameters_.num_threads / num_reference_sequences);
mappings_on_diff_ref_seqs_for_diff_threads_for_saving[ti][i].reserve(
(num_loaded_reads + num_loaded_reads / 1000 *
mapping_parameters_.max_num_best_mappings) /
mapping_parameters_.num_threads / num_reference_sequences);
}
}
#pragma omp parallel shared(num_reads_, mm_history, read_map_summary, reference, index, read_batch, barcode_batch, read_batch_for_loading, barcode_batch_for_loading, std::cerr, num_loaded_reads_for_loading, num_loaded_reads, num_reference_sequences, mappings_on_diff_ref_seqs_for_diff_threads, mappings_on_diff_ref_seqs_for_diff_threads_for_saving, mappings_on_diff_ref_seqs, temp_mapping_file_handles, mm_to_candidates_cache, mapping_writer, minimizer_generator, candidate_processor, mapping_processor, draft_mapping_generator, mapping_generator, num_mappings_in_mem, max_num_mappings_in_mem) num_threads(mapping_parameters_.num_threads) reduction(+:num_candidates_, num_mappings_, num_mapped_reads_, num_uniquely_mapped_reads_, num_barcode_in_whitelist_, num_corrected_barcode_)
{
// Every thread resets its own counters and owns one MappingMetadata that is
// reused for all reads it maps.
thread_num_candidates = 0;
thread_num_mappings = 0;
thread_num_mapped_reads = 0;
thread_num_uniquely_mapped_reads = 0;
thread_num_barcode_in_whitelist = 0;
thread_num_corrected_barcode = 0;
MappingMetadata mapping_metadata;
// A single thread drives the batch loop and creates the tasks.
#pragma omp single
{
while (num_loaded_reads > 0) {
double real_batch_start_time = GetRealTime();
num_reads_ += num_loaded_reads;
// Load the next batch in a task while the current batch is mapped.
#pragma omp task
{
num_loaded_reads_for_loading = LoadSingleEndReadsWithBarcodes(
read_batch_for_loading, barcode_batch_for_loading,
mapping_parameters_.num_threads >= 12 ? true : false);
} // end of openmp loading task
// Only reads with an index below this threshold record their minimizers and
// candidates in mm_history for the cache update after the batch.
uint32_t history_update_threshold =
mm_to_candidates_cache.GetUpdateThreshold(num_loaded_reads,
num_reads_,
false,
0.01);
// int grain_size = 10000;
//#pragma omp taskloop grainsize(grain_size) //num_tasks(num_threads_* 50)
#pragma omp taskloop num_tasks( \
mapping_parameters_.num_threads *mapping_parameters_.num_threads)
for (uint32_t read_index = 0; read_index < num_loaded_reads;
++read_index) {
// Correct the barcode against the whitelist when one was given. A read whose
// barcode is not whitelisted is skipped (and bit 0 of its summary entry is
// cleared) unless output_mappings_not_in_whitelist is set.
bool current_barcode_is_whitelisted = true;
if (!mapping_parameters_.barcode_whitelist_file_path.empty()) {
current_barcode_is_whitelisted = CorrectBarcodeAt(
read_index, barcode_batch, thread_num_barcode_in_whitelist,
thread_num_corrected_barcode);
}
if (!(current_barcode_is_whitelisted ||
mapping_parameters_.output_mappings_not_in_whitelist)) {
if (read_map_summary != NULL)
read_map_summary[read_index] = 0;
continue;
}
if (read_batch.GetSequenceLengthAt(read_index) <
(uint32_t)mapping_parameters_.min_read_length) {
continue; // reads are too short, just drop.
}
read_batch.PrepareNegativeSequenceAt(read_index);
mapping_metadata.PrepareForMappingNextRead(
mapping_parameters_.max_seed_frequencies[0]);
minimizer_generator.GenerateMinimizers(
read_batch, read_index, mapping_metadata.minimizers_);
if (mapping_metadata.minimizers_.size() > 0) {
// NOTE(review): the rerank below runs before the cache query / candidate
// generation for this read, i.e. on whatever the candidate vectors hold
// after PrepareForMappingNextRead - confirm that this order is intended.
if (mapping_parameters_.custom_rid_order_file_path.length() > 0) {
RerankCandidatesRid(mapping_metadata.positive_candidates_);
RerankCandidatesRid(mapping_metadata.negative_candidates_);
}
// Generate candidates from the index only when the cache query returns -1.
if (mm_to_candidates_cache.Query(
mapping_metadata,
read_batch.GetSequenceLengthAt(read_index)) == -1) {
candidate_processor.GenerateCandidates(
mapping_parameters_.error_threshold, index,
mapping_metadata);
}
// Record this read's minimizers and candidates for the cache update, stamped
// with the current value of num_reads_.
if (read_index < history_update_threshold) {
mm_history[read_index].timestamp = num_reads_;
mm_history[read_index].minimizers =
mapping_metadata.minimizers_;
mm_history[read_index].positive_candidates =
mapping_metadata.positive_candidates_;
mm_history[read_index].negative_candidates =
mapping_metadata.negative_candidates_;
mm_history[read_index].repetitive_seed_length =
mapping_metadata.repetitive_seed_length_;
}
size_t current_num_candidates =
mapping_metadata.GetNumCandidates();
if (current_num_candidates > 0) {
thread_num_candidates += current_num_candidates;
draft_mapping_generator.GenerateDraftMappings(
read_batch, read_index, reference, mapping_metadata);
const size_t current_num_draft_mappings =
mapping_metadata.GetNumDraftMappings();
if (current_num_draft_mappings > 0) {
// Mappings go into the buffer of the thread executing this iteration. This
// local reference shadows the function-level mappings_on_diff_ref_seqs.
std::vector>
&mappings_on_diff_ref_seqs =
mappings_on_diff_ref_seqs_for_diff_threads
[omp_get_thread_num()];
mapping_generator.GenerateBestMappingsForSingleEndRead(
read_batch, read_index, reference, barcode_batch,
mapping_metadata, mappings_on_diff_ref_seqs);
thread_num_mappings +=
std::min(mapping_metadata.GetNumBestMappings(),
mapping_parameters_.max_num_best_mappings);
++thread_num_mapped_reads;
if (mapping_metadata.GetNumBestMappings() == 1) {
++thread_num_uniquely_mapped_reads;
}
}
}
}
}
// Wait for the tasks created so far by this thread (e.g. the loading task)
// before the batches and buffers are swapped below.
#pragma omp taskwait
// Feed the history entries stamped during this batch into the cache, then
// release the storage of candidate vectors that use less than half of their
// capacity.
for (uint32_t read_index = 0; read_index < history_update_threshold;
++read_index) {
if (mm_history[read_index].timestamp != num_reads_) continue;
mm_to_candidates_cache.Update(
mm_history[read_index].minimizers,
mm_history[read_index].positive_candidates,
mm_history[read_index].negative_candidates,
mm_history[read_index].repetitive_seed_length);
if (mm_history[read_index].positive_candidates.size() <
mm_history[read_index].positive_candidates.capacity() / 2) {
std::vector().swap(
mm_history[read_index].positive_candidates);
}
if (mm_history[read_index].negative_candidates.size() <
mm_history[read_index].negative_candidates.capacity() / 2) {
std::vector().swap(
mm_history[read_index].negative_candidates);
}
}
// std::cerr<<"cache memusage: " <<
// mm_to_candidates_cache.GetMemoryBytes() <<"\n" ;
// Update the per-barcode totals of the summary metadata: bulk data counts all
// reads under key 0; otherwise reads whose summary bit 0 is set are counted
// under their barcode and the rest under the non-whitelist category.
if (!mapping_parameters_.summary_metadata_file_path.empty()) {
if (mapping_parameters_.is_bulk_data)
mapping_writer.UpdateSummaryMetadata(0, SUMMARY_METADATA_TOTAL,
num_loaded_reads) ;
else {
uint32_t nonwhitelist_count = 0;
for (uint32_t read_index = 0; read_index < num_loaded_reads; ++read_index)
if (read_map_summary[read_index] & 1) {
mapping_writer.UpdateSummaryMetadata(
barcode_batch.GenerateSeedFromSequenceAt(read_index, 0, barcode_length_),
SUMMARY_METADATA_TOTAL, 1);
} else {
++nonwhitelist_count;
}
mapping_writer.UpdateSpeicalCategorySummaryMetadata(/*nonwhitelist*/0,
SUMMARY_METADATA_TOTAL, nonwhitelist_count);
}
// By default, set the lowest bit to 1 (whether the barcode is in the whitelist)
memset(read_map_summary, 1, sizeof(*read_map_summary)*read_batch_size_);
}
// Make the batch that was just loaded the current one and hand the filled
// per-thread buffers to the saving task.
// NOTE(review): the barcode batch is swapped unconditionally here, while the
// swap of the first batch is guarded by !is_bulk_data - confirm.
num_loaded_reads = num_loaded_reads_for_loading;
read_batch_for_loading.SwapSequenceBatch(read_batch);
barcode_batch_for_loading.SwapSequenceBatch(barcode_batch);
mappings_on_diff_ref_seqs_for_diff_threads.swap(
mappings_on_diff_ref_seqs_for_diff_threads_for_saving);
// Move the mappings of the finished batch into the shared container; in
// low-memory mode, sort and spill them to a temp file once the in-memory
// budget is exceeded.
#pragma omp task
{
num_mappings_in_mem +=
mapping_processor.MoveMappingsInBuffersToMappingContainer(
num_reference_sequences,
mappings_on_diff_ref_seqs_for_diff_threads_for_saving,
mappings_on_diff_ref_seqs);
if (mapping_parameters_.low_memory_mode &&
num_mappings_in_mem > max_num_mappings_in_mem) {
mapping_processor.ParallelSortOutputMappings(num_reference_sequences,
mappings_on_diff_ref_seqs, 0);
mapping_writer.OutputTempMappings(num_reference_sequences,
mappings_on_diff_ref_seqs,
temp_mapping_file_handles);
if (temp_mapping_file_handles.size() > 850
&& temp_mapping_file_handles.size() % 10 == 1) { // every 10 temp files, double the temp file size
max_num_mappings_in_mem <<= 1;
std::cerr << "Used " << temp_mapping_file_handles.size() << "temp files. Double the temp file volume to " << max_num_mappings_in_mem << "\n" ;
}
num_mappings_in_mem = 0;
}
}
// num_loaded_reads already holds the size of the next batch at this point.
std::cerr << "Mapped " << num_loaded_reads << " reads in "
<< GetRealTime() - real_batch_start_time << "s.\n";
}
} // end of openmp single
// Executed by every thread: add the thread's counters to the reduction
// variables.
{
num_barcode_in_whitelist_ += thread_num_barcode_in_whitelist;
num_corrected_barcode_ += thread_num_corrected_barcode;
num_candidates_ += thread_num_candidates;
num_mappings_ += thread_num_mappings;
num_mapped_reads_ += thread_num_mapped_reads;
num_uniquely_mapped_reads_ += thread_num_uniquely_mapped_reads;
} // end of updating shared mapping stats
} // end of openmp parallel region
read_batch_for_loading.FinalizeLoading();
if (!mapping_parameters_.is_bulk_data) {
barcode_batch_for_loading.FinalizeLoading();
}
}
std::cerr << "Mapped all reads in " << GetRealTime() - real_start_mapping_time
<< "s.\n";
// Release the per-read history and summary buffers and report statistics.
delete[] mm_history;
if (read_map_summary != NULL)
delete[] read_map_summary;
OutputMappingStatistics();
if (!mapping_parameters_.is_bulk_data) {
OutputBarcodeStatistics();
}
index.Destroy();
if (mapping_parameters_.low_memory_mode) {
// First, process the remaining mappings in the memory and save them on
// disk.
if (num_mappings_in_mem > 0) {
mapping_processor.SortOutputMappings(num_reference_sequences,
mappings_on_diff_ref_seqs);
mapping_writer.OutputTempMappings(num_reference_sequences,
mappings_on_diff_ref_seqs,
temp_mapping_file_handles);
num_mappings_in_mem = 0;
}
// Then produce the final output from the temp files.
mapping_writer.ProcessAndOutputMappingsInLowMemory(
num_mappings_in_mem, num_reference_sequences, reference,
barcode_whitelist_lookup_table_, temp_mapping_file_handles);
} else {
// All mappings are in memory: apply the optional post-processing steps and
// write them out.
if (mapping_parameters_.Tn5_shift) {
mapping_processor.ApplyTn5ShiftOnMappings(num_reference_sequences,
mappings_on_diff_ref_seqs);
}
if (mapping_parameters_.remove_pcr_duplicates) {
mapping_processor.RemovePCRDuplicate(num_reference_sequences,
mappings_on_diff_ref_seqs,
mapping_parameters_.num_threads);
std::cerr << "After removing PCR duplications, ";
mapping_processor.OutputMappingStatistics(num_reference_sequences,
mappings_on_diff_ref_seqs);
} else {
mapping_processor.ParallelSortOutputMappings(num_reference_sequences,
mappings_on_diff_ref_seqs,
mapping_parameters_.num_threads);
}
if (mapping_parameters_.allocate_multi_mappings) {
const uint64_t num_multi_mappings =
num_mapped_reads_ - num_uniquely_mapped_reads_;
mapping_processor.AllocateMultiMappings(
num_reference_sequences, num_multi_mappings,
mapping_parameters_.multi_mapping_allocation_distance,
mappings_on_diff_ref_seqs);
std::cerr << "After allocating multi-mappings, ";
mapping_processor.OutputMappingStatistics(num_reference_sequences,
mappings_on_diff_ref_seqs);
mapping_processor.SortOutputMappings(num_reference_sequences,
mappings_on_diff_ref_seqs);
}
mapping_writer.OutputMappings(num_reference_sequences, reference,
mappings_on_diff_ref_seqs);
}
mapping_writer.OutputSummaryMetadata();
reference.FinalizeLoading();
std::cerr << "Total time: " << GetRealTime() - real_start_time << "s.\n";
}
template
void Chromap::MapPairedEndReads() {
double real_start_time = GetRealTime();
// Load reference
SequenceBatch reference;
reference.InitializeLoading(mapping_parameters_.reference_file_path);
reference.LoadAllSequences();
uint32_t num_reference_sequences = reference.GetNumSequences();
// Debugging Info (printing out reference information)
if (mapping_parameters_.debug_cache) {
for (size_t i = 0; i < num_reference_sequences; i++){
std::cout << "[DEBUG][INDEX] seq_i = " << i
<< " , seq_i_name = " << reference.GetSequenceNameAt(i) << std::endl;
}
}
if (mapping_parameters_.custom_rid_order_file_path.length() > 0) {
GenerateCustomRidRanks(mapping_parameters_.custom_rid_order_file_path,
num_reference_sequences, reference,
custom_rid_rank_);
reference.ReorderSequences(custom_rid_rank_);
}
if (mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAIRS) {
GenerateCustomRidRanks(
mapping_parameters_.pairs_flipping_custom_rid_order_file_path,
num_reference_sequences, reference, pairs_custom_rid_rank_);
}
// Load index
Index index(mapping_parameters_.index_file_path);
index.Load();
const int kmer_size = index.GetKmerSize();
const int window_size = index.GetWindowSize();
// index.Statistics(num_sequences, reference);
// Initialize read batches
SequenceBatch read_batch1(read_batch_size_, read1_effective_range_);
SequenceBatch read_batch2(read_batch_size_, read2_effective_range_);
SequenceBatch barcode_batch(read_batch_size_, barcode_effective_range_);
SequenceBatch read_batch1_for_loading(read_batch_size_,
read1_effective_range_);
SequenceBatch read_batch2_for_loading(read_batch_size_,
read2_effective_range_);
SequenceBatch barcode_batch_for_loading(read_batch_size_,
barcode_effective_range_);
// Check cache-related parameters
std::cerr << "Cache Size: " << mapping_parameters_.cache_size << std::endl;
std::cerr << "Cache Update Param: " << mapping_parameters_.cache_update_param << std::endl;
std::vector seeds_for_batch(500000, 0);
// Variables used for counting number of associated cache slots
bool output_num_cache_slots_info = mapping_parameters_.output_num_uniq_cache_slots;
if (mapping_parameters_.summary_metadata_file_path.empty()) {
output_num_cache_slots_info = false;
}
const size_t k_for_minhash = mapping_parameters_.k_for_minhash;
std::cerr << "Output number of associated cache slots: " << output_num_cache_slots_info << std::endl;
std::cerr << "K for MinHash: " << k_for_minhash << std::endl;
int num_locks_for_map = 1000;
omp_lock_t map_locks[num_locks_for_map];
for (int i = 0; i < num_locks_for_map; ++i) {omp_init_lock(&map_locks[i]);}
std::vector> barcode_peak_map(num_locks_for_map);
// Parse out the parameters for chromap score (const, fric, dup, unmapped, lowmapq)
std::vector frip_est_params;
std::stringstream ss(mapping_parameters_.frip_est_params);
std::string token;
while(std::getline(ss, token, ';')) {
try {
auto curr_param = std::stod(token);
frip_est_params.push_back(curr_param);
} catch(...) {
chromap::ExitWithMessage(
"\nException occurred while processing chromap score parameters\n"
);
}
}
if (frip_est_params.size() != 5) {
chromap::ExitWithMessage(
"\nInvalid number of parameters, expecting 5 parameters but found "
+ std::to_string(frip_est_params.size())
+ " parameters\n"
);
}
// Initialize vector to keep track of cache hits for each thread
std::vector cache_hits_per_thread(mapping_parameters_.num_threads, 0);
// Initialize cache
mm_cache mm_to_candidates_cache(mapping_parameters_.cache_size);
mm_to_candidates_cache.SetKmerLength(kmer_size);
struct _mm_history *mm_history1 = new struct _mm_history[read_batch_size_];
struct _mm_history *mm_history2 = new struct _mm_history[read_batch_size_];
// The explanation for read_map_summary is in the single-end mapping function
uint8_t *read_map_summary = NULL ;
if (!mapping_parameters_.summary_metadata_file_path.empty()) {
read_map_summary = new uint8_t[read_batch_size_];
memset(read_map_summary, 1, sizeof(*read_map_summary)*read_batch_size_);
}
std::vector> mappings_on_diff_ref_seqs;
// Initialize mapping container
mappings_on_diff_ref_seqs.reserve(num_reference_sequences);
for (uint32_t i = 0; i < num_reference_sequences; ++i) {
mappings_on_diff_ref_seqs.emplace_back(std::vector());
}
std::vector> temp_mapping_file_handles;
// Preprocess barcodes for single cell data
if (!mapping_parameters_.is_bulk_data) {
barcode_length_ = SampleInputBarcodesAndExamineLength();
if (!mapping_parameters_.barcode_whitelist_file_path.empty()) {
LoadBarcodeWhitelist();
ComputeBarcodeAbundance(initial_num_sample_barcodes_);
}
}
MinimizerGenerator minimizer_generator(kmer_size, window_size);
CandidateProcessor candidate_processor(
mapping_parameters_.min_num_seeds_required_for_mapping,
mapping_parameters_.max_seed_frequencies);
MappingProcessor mapping_processor(mapping_parameters_,
min_unique_mapping_mapq_);
DraftMappingGenerator draft_mapping_generator(mapping_parameters_);
MappingGenerator mapping_generator(mapping_parameters_,
pairs_custom_rid_rank_);
MappingWriter mapping_writer(
mapping_parameters_, barcode_length_, pairs_custom_rid_rank_);
mapping_writer.OutputHeader(num_reference_sequences, reference);
uint32_t num_mappings_in_mem = 0;
uint64_t max_num_mappings_in_mem =
1 * ((uint64_t)1 << 30) / sizeof(MappingRecord);
if (mapping_parameters_.mapping_output_format == MAPPINGFORMAT_SAM ||
mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAF ||
mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAIRS) {
max_num_mappings_in_mem = 1 * ((uint64_t)1 << 29) / sizeof(MappingRecord);
}
static uint64_t thread_num_candidates = 0;
static uint64_t thread_num_mappings = 0;
static uint64_t thread_num_mapped_reads = 0;
static uint64_t thread_num_uniquely_mapped_reads = 0;
static uint64_t thread_num_barcode_in_whitelist = 0;
static uint64_t thread_num_corrected_barcode = 0;
#pragma omp threadprivate( \
thread_num_candidates, thread_num_mappings, thread_num_mapped_reads, \
thread_num_uniquely_mapped_reads, thread_num_barcode_in_whitelist, \
thread_num_corrected_barcode)
double real_start_mapping_time = GetRealTime();
for (size_t read_file_index = 0;
read_file_index < mapping_parameters_.read_file1_paths.size();
++read_file_index) {
// Set read batches to the current read files.
read_batch1_for_loading.InitializeLoading(
mapping_parameters_.read_file1_paths[read_file_index]);
read_batch2_for_loading.InitializeLoading(
mapping_parameters_.read_file2_paths[read_file_index]);
if (!mapping_parameters_.is_bulk_data) {
barcode_batch_for_loading.InitializeLoading(
mapping_parameters_.barcode_file_paths[read_file_index]);
}
// Load the first batches.
uint32_t num_loaded_pairs_for_loading = 0;
uint32_t num_loaded_pairs = LoadPairedEndReadsWithBarcodes(
read_batch1_for_loading, read_batch2_for_loading,
barcode_batch_for_loading, mapping_parameters_.num_threads >= 3 ? true : false);
read_batch1_for_loading.SwapSequenceBatch(read_batch1);
read_batch2_for_loading.SwapSequenceBatch(read_batch2);
if (!mapping_parameters_.is_bulk_data) {
barcode_batch_for_loading.SwapSequenceBatch(barcode_batch);
}
// Setup thread private vectors to save mapping results.
std::vector>>
mappings_on_diff_ref_seqs_for_diff_threads;
std::vector>>
mappings_on_diff_ref_seqs_for_diff_threads_for_saving;
mappings_on_diff_ref_seqs_for_diff_threads.reserve(
mapping_parameters_.num_threads);
mappings_on_diff_ref_seqs_for_diff_threads_for_saving.reserve(
mapping_parameters_.num_threads);
for (int ti = 0; ti < mapping_parameters_.num_threads; ++ti) {
mappings_on_diff_ref_seqs_for_diff_threads.emplace_back(
std::vector>(num_reference_sequences));
mappings_on_diff_ref_seqs_for_diff_threads_for_saving.emplace_back(
std::vector>(num_reference_sequences));
for (uint32_t i = 0; i < num_reference_sequences; ++i) {
mappings_on_diff_ref_seqs_for_diff_threads[ti][i].reserve(
(num_loaded_pairs + num_loaded_pairs / 1000 *
mapping_parameters_.max_num_best_mappings) /
mapping_parameters_.num_threads / num_reference_sequences);
mappings_on_diff_ref_seqs_for_diff_threads_for_saving[ti][i].reserve(
(num_loaded_pairs + num_loaded_pairs / 1000 *
mapping_parameters_.max_num_best_mappings) /
mapping_parameters_.num_threads / num_reference_sequences);
}
}
#pragma omp parallel shared(num_reads_, num_reference_sequences, reference, index, read_batch1, read_batch2, barcode_batch, read_batch1_for_loading, read_batch2_for_loading, barcode_batch_for_loading, minimizer_generator, candidate_processor, mapping_processor, draft_mapping_generator, mapping_generator, mapping_writer, std::cerr, num_loaded_pairs_for_loading, num_loaded_pairs, mappings_on_diff_ref_seqs_for_diff_threads, mappings_on_diff_ref_seqs_for_diff_threads_for_saving, mappings_on_diff_ref_seqs, num_mappings_in_mem, max_num_mappings_in_mem, temp_mapping_file_handles, mm_to_candidates_cache, mm_history1, mm_history2, read_map_summary) num_threads(mapping_parameters_.num_threads) reduction(+:num_candidates_, num_mappings_, num_mapped_reads_, num_uniquely_mapped_reads_, num_barcode_in_whitelist_, num_corrected_barcode_)
{
thread_num_candidates = 0;
thread_num_mappings = 0;
thread_num_mapped_reads = 0;
thread_num_uniquely_mapped_reads = 0;
thread_num_barcode_in_whitelist = 0;
thread_num_corrected_barcode = 0;
PairedEndMappingMetadata paired_end_mapping_metadata;
std::vector best_mapping_indices(
mapping_parameters_.max_num_best_mappings);
std::mt19937 generator(11);
#pragma omp single
{
double real_batch_start_time = GetRealTime();
while (num_loaded_pairs > 0) {
num_reads_ += num_loaded_pairs;
num_reads_ += num_loaded_pairs;
#pragma omp task
{
num_loaded_pairs_for_loading = LoadPairedEndReadsWithBarcodes(
read_batch1_for_loading, read_batch2_for_loading,
barcode_batch_for_loading,
mapping_parameters_.num_threads >= 12 ? true : false);
} // end of openmp loading task
int grain_size = 5000;
uint32_t history_update_threshold =
mm_to_candidates_cache.GetUpdateThreshold(num_loaded_pairs,
num_reads_,
true,
mapping_parameters_.cache_update_param
);
std::fill(cache_hits_per_thread.begin(), cache_hits_per_thread.end(), 0);
if (mapping_parameters_.debug_cache) {
std::cout << "[DEBUG][UPDATE] update_threshold = " << history_update_threshold << std::endl;
}
#pragma omp taskloop grainsize(grain_size)
for (uint32_t pair_index = 0; pair_index < num_loaded_pairs;
++pair_index) {
int thread_id = omp_get_thread_num();
bool current_barcode_is_whitelisted = true;
if (!mapping_parameters_.barcode_whitelist_file_path.empty()) {
current_barcode_is_whitelisted = CorrectBarcodeAt(
pair_index, barcode_batch, thread_num_barcode_in_whitelist,
thread_num_corrected_barcode);
}
// calculate seed value for each barcode to use later (below and summary update)
size_t curr_seed_val = barcode_batch.GenerateSeedFromSequenceAt(pair_index, 0, barcode_length_);
seeds_for_batch[pair_index] = curr_seed_val;
if (current_barcode_is_whitelisted ||
mapping_parameters_.output_mappings_not_in_whitelist) {
if (read_batch1.GetSequenceLengthAt(pair_index) <
(uint32_t)mapping_parameters_.min_read_length ||
read_batch2.GetSequenceLengthAt(pair_index) <
(uint32_t)mapping_parameters_.min_read_length) {
continue; // reads are too short, just drop.
}
read_batch1.PrepareNegativeSequenceAt(pair_index);
read_batch2.PrepareNegativeSequenceAt(pair_index);
if (mapping_parameters_.trim_adapters) {
TrimAdapterForPairedEndRead(pair_index, read_batch1,
read_batch2);
}
paired_end_mapping_metadata.PreparedForMappingNextReadPair(
mapping_parameters_.max_seed_frequencies[0]);
minimizer_generator.GenerateMinimizers(
read_batch1, pair_index,
paired_end_mapping_metadata.mapping_metadata1_.minimizers_);
minimizer_generator.GenerateMinimizers(
read_batch2, pair_index,
paired_end_mapping_metadata.mapping_metadata2_.minimizers_);
if (paired_end_mapping_metadata.BothEndsHaveMinimizers()) {
// declare temp local variable for cache result
int cache_query_result1 = 0;
int cache_query_result2 = 0;
int cache_miss = 0;
cache_query_result1 = mm_to_candidates_cache.Query(paired_end_mapping_metadata.mapping_metadata1_,
read_batch1.GetSequenceLengthAt(pair_index));
if (cache_query_result1 == -1)
{
candidate_processor.GenerateCandidates(
mapping_parameters_.error_threshold,
index,
paired_end_mapping_metadata.mapping_metadata1_
);
++cache_miss;
}
size_t current_num_candidates1 = paired_end_mapping_metadata.mapping_metadata1_.GetNumCandidates();
cache_query_result2 = mm_to_candidates_cache.Query(paired_end_mapping_metadata.mapping_metadata2_,
read_batch2.GetSequenceLengthAt(pair_index));
if (cache_query_result2 == -1)
{
candidate_processor.GenerateCandidates(
mapping_parameters_.error_threshold,
index,
paired_end_mapping_metadata.mapping_metadata2_
);
++cache_miss;
}
size_t current_num_candidates2 = paired_end_mapping_metadata.mapping_metadata2_.GetNumCandidates();
// increment variable for cache_hits
bool curr_read_hit_cache = false;
if (cache_query_result1 >= 0 || cache_query_result2 >= 0) {
cache_hits_per_thread[thread_id]++;
curr_read_hit_cache = true;
}
// update the peak counting data-structure
if (output_num_cache_slots_info && curr_read_hit_cache) {
// calculate which map this barcode is in
size_t map_id = curr_seed_val % num_locks_for_map;
// grab lock for this map, and add to the K-MinHash for this particular barcode
omp_set_lock(&map_locks[map_id]);
auto it = barcode_peak_map[map_id].emplace(curr_seed_val, K_MinHash(k_for_minhash, mapping_parameters_.cache_size)).first;
if (cache_query_result1 >= 0) {it->second.add(cache_query_result1);}
if (cache_query_result2 >= 0) {it->second.add(cache_query_result2);}
omp_unset_lock(&map_locks[map_id]);
}
if (pair_index < history_update_threshold) {
mm_history1[pair_index].timestamp =
mm_history2[pair_index].timestamp = num_reads_;
mm_history1[pair_index].minimizers =
paired_end_mapping_metadata.mapping_metadata1_
.minimizers_;
mm_history1[pair_index].positive_candidates =
paired_end_mapping_metadata.mapping_metadata1_
.positive_candidates_;
mm_history1[pair_index].negative_candidates =
paired_end_mapping_metadata.mapping_metadata1_
.negative_candidates_;
mm_history1[pair_index].repetitive_seed_length =
paired_end_mapping_metadata.mapping_metadata1_
.repetitive_seed_length_;
mm_history2[pair_index].minimizers =
paired_end_mapping_metadata.mapping_metadata2_
.minimizers_;
mm_history2[pair_index].positive_candidates =
paired_end_mapping_metadata.mapping_metadata2_
.positive_candidates_;
mm_history2[pair_index].negative_candidates =
paired_end_mapping_metadata.mapping_metadata2_
.negative_candidates_;
mm_history2[pair_index].repetitive_seed_length =
paired_end_mapping_metadata.mapping_metadata2_
.repetitive_seed_length_;
}
// Test whether we need to augment the candidate list with mate
// information.
int supplementCandidateResult = 0;
if (!mapping_parameters_.split_alignment) {
supplementCandidateResult =
candidate_processor.SupplementCandidates(
mapping_parameters_.error_threshold,
/*search_range=*/2 *
mapping_parameters_.max_insert_size,
index, paired_end_mapping_metadata);
current_num_candidates1 =
paired_end_mapping_metadata.mapping_metadata1_
.GetNumCandidates();
current_num_candidates2 =
paired_end_mapping_metadata.mapping_metadata2_
.GetNumCandidates();
}
if (current_num_candidates1 > 0 &&
current_num_candidates2 > 0 &&
!mapping_parameters_.split_alignment) {
paired_end_mapping_metadata.MoveCandidiatesToBuffer();
// Paired-end filter
candidate_processor.ReduceCandidatesForPairedEndRead(
mapping_parameters_.max_insert_size,
paired_end_mapping_metadata);
current_num_candidates1 =
paired_end_mapping_metadata.mapping_metadata1_
.GetNumCandidates();
current_num_candidates2 =
paired_end_mapping_metadata.mapping_metadata2_
.GetNumCandidates();
}
// Verify candidates
if (current_num_candidates1 > 0 &&
current_num_candidates2 > 0) {
thread_num_candidates +=
current_num_candidates1 + current_num_candidates2;
if (mapping_parameters_.custom_rid_order_file_path.length() >
0) {
RerankCandidatesRid(
paired_end_mapping_metadata.mapping_metadata1_
.positive_candidates_);
RerankCandidatesRid(
paired_end_mapping_metadata.mapping_metadata1_
.negative_candidates_);
RerankCandidatesRid(
paired_end_mapping_metadata.mapping_metadata2_
.positive_candidates_);
RerankCandidatesRid(
paired_end_mapping_metadata.mapping_metadata2_
.negative_candidates_);
}
draft_mapping_generator.GenerateDraftMappings(
read_batch1, pair_index, reference,
paired_end_mapping_metadata.mapping_metadata1_);
const size_t current_num_draft_mappings1 =
paired_end_mapping_metadata.mapping_metadata1_
.GetNumDraftMappings();
draft_mapping_generator.GenerateDraftMappings(
read_batch2, pair_index, reference,
paired_end_mapping_metadata.mapping_metadata2_);
const size_t current_num_draft_mappings2 =
paired_end_mapping_metadata.mapping_metadata2_
.GetNumDraftMappings();
if (current_num_draft_mappings1 > 0 &&
current_num_draft_mappings2 > 0) {
std::vector>
&mappings_on_diff_ref_seqs =
mappings_on_diff_ref_seqs_for_diff_threads
[omp_get_thread_num()];
if (!mapping_parameters_.split_alignment) {
// GenerateBestMappingsForPairedEndRead assumes the
// mappings are sorted by coordinate for non split
// alignments. In split alignment, we don't want to sort
// and this keeps mapping and split_sites vectors
// consistent.
paired_end_mapping_metadata.SortMappingsByPositions();
}
int force_mapq = -1;
if (supplementCandidateResult != 0) {
force_mapq = 0;
}
mapping_generator.GenerateBestMappingsForPairedEndRead(
pair_index, read_batch1, read_batch2, barcode_batch,
reference, best_mapping_indices, generator, force_mapq,
paired_end_mapping_metadata, mappings_on_diff_ref_seqs);
if (paired_end_mapping_metadata.GetNumBestMappings() == 1) {
++thread_num_uniquely_mapped_reads;
++thread_num_uniquely_mapped_reads;
}
thread_num_mappings += std::min(
paired_end_mapping_metadata.GetNumBestMappings(),
mapping_parameters_.max_num_best_mappings);
thread_num_mappings += std::min(
paired_end_mapping_metadata.GetNumBestMappings(),
mapping_parameters_.max_num_best_mappings);
if (paired_end_mapping_metadata.GetNumBestMappings() > 0) {
++thread_num_mapped_reads;
++thread_num_mapped_reads;
if (read_map_summary != NULL)
read_map_summary[pair_index] |= (cache_miss < 2 ? 2 : 0) ;
}
}
} // verify candidate
}
} else {
if (read_map_summary != NULL)
read_map_summary[pair_index] = 0 ;
}
} // end of for pair_index
// if (num_reads_ / 2 > initial_num_sample_barcodes_) {
// if (!is_bulk_data_) {
// if (!barcode_whitelist_file_path_.empty()) {
// UpdateBarcodeAbundance(num_loaded_pairs, barcode_batch);
// }
// }
//}
#pragma omp taskloop grainsize( std::max(history_update_threshold / mapping_parameters_.num_threads, (unsigned int)grain_size) )
// Update cache
for (uint32_t pair_index = 0; pair_index < history_update_threshold;
++pair_index) {
if (mm_history1[pair_index].timestamp != num_reads_) continue;
mm_to_candidates_cache.Update(
mm_history1[pair_index].minimizers,
mm_history1[pair_index].positive_candidates,
mm_history1[pair_index].negative_candidates,
mm_history1[pair_index].repetitive_seed_length,
mapping_parameters_.debug_cache);
mm_to_candidates_cache.Update(
mm_history2[pair_index].minimizers,
mm_history2[pair_index].positive_candidates,
mm_history2[pair_index].negative_candidates,
mm_history2[pair_index].repetitive_seed_length,
mapping_parameters_.debug_cache);
if (mm_history1[pair_index].positive_candidates.size() > 50) {
std::vector().swap(
mm_history1[pair_index].positive_candidates);
}
if (mm_history1[pair_index].negative_candidates.size() > 50) {
std::vector().swap(
mm_history1[pair_index].negative_candidates);
}
if (mm_history2[pair_index].positive_candidates.size() > 50) {
std::vector().swap(
mm_history2[pair_index].positive_candidates);
}
if (mm_history2[pair_index].negative_candidates.size() > 50) {
std::vector().swap(
mm_history2[pair_index].negative_candidates);
}
}
#pragma omp taskwait
if (!mapping_parameters_.summary_metadata_file_path.empty()) {
// Update total read count and number of cache hits
if (mapping_parameters_.is_bulk_data) {
// Sum up cache hits for each thread
int cache_hits_for_batch = 0;
for (int hits: cache_hits_per_thread) {
cache_hits_for_batch += hits;
}
mapping_writer.UpdateSummaryMetadata(0,
SUMMARY_METADATA_TOTAL,
num_loaded_pairs);
mapping_writer.UpdateSummaryMetadata(0,
SUMMARY_METADATA_CACHEHIT,
cache_hits_for_batch);
}
else {
uint32_t nonwhitelist_count = 0;
for (uint32_t pair_index = 0; pair_index < num_loaded_pairs; ++pair_index) {
uint64_t pair_seed = seeds_for_batch[pair_index];
if (read_map_summary[pair_index] & 1) {
mapping_writer.UpdateSummaryMetadata(
pair_seed,
SUMMARY_METADATA_TOTAL,
1);
} else {
++nonwhitelist_count ;
}
if (read_map_summary[pair_index] & 2) {
mapping_writer.UpdateSummaryMetadata(
pair_seed,
SUMMARY_METADATA_CACHEHIT,
1);
}
}
mapping_writer.UpdateSpeicalCategorySummaryMetadata(/*nonwhitelist*/0,
SUMMARY_METADATA_TOTAL, nonwhitelist_count);
}
memset(read_map_summary, 1, sizeof(*read_map_summary)*read_batch_size_);
}
std::cerr << "Mapped " << num_loaded_pairs << " read pairs in "
<< GetRealTime() - real_batch_start_time << "s.\n";
real_batch_start_time = GetRealTime();
// Swap to next batch
num_loaded_pairs = num_loaded_pairs_for_loading;
read_batch1_for_loading.SwapSequenceBatch(read_batch1);
read_batch2_for_loading.SwapSequenceBatch(read_batch2);
barcode_batch_for_loading.SwapSequenceBatch(barcode_batch);
mappings_on_diff_ref_seqs_for_diff_threads.swap(
mappings_on_diff_ref_seqs_for_diff_threads_for_saving);
// Reset for next batch
std::fill(seeds_for_batch.begin(), seeds_for_batch.end(), 0);
#pragma omp task
{
// Handle output
num_mappings_in_mem +=
mapping_processor.MoveMappingsInBuffersToMappingContainer(
num_reference_sequences,
mappings_on_diff_ref_seqs_for_diff_threads_for_saving,
mappings_on_diff_ref_seqs);
if (mapping_parameters_.low_memory_mode &&
num_mappings_in_mem > max_num_mappings_in_mem) {
mapping_processor.ParallelSortOutputMappings(num_reference_sequences,
mappings_on_diff_ref_seqs, 0);
mapping_writer.OutputTempMappings(num_reference_sequences,
mappings_on_diff_ref_seqs,
temp_mapping_file_handles);
if (temp_mapping_file_handles.size() > 850
&& temp_mapping_file_handles.size() % 10 == 1) { // every 10 temp files, double the temp file size
max_num_mappings_in_mem <<= 1;
std::cerr << "Used " << temp_mapping_file_handles.size() << "temp files. Double the temp file volume to " << max_num_mappings_in_mem << "\n" ;
}
num_mappings_in_mem = 0;
}
} // end of omp task to handle output
} // end of while num_loaded_pairs
} // end of openmp single
num_barcode_in_whitelist_ += thread_num_barcode_in_whitelist;
num_corrected_barcode_ += thread_num_corrected_barcode;
num_candidates_ += thread_num_candidates;
num_mappings_ += thread_num_mappings;
num_mapped_reads_ += thread_num_mapped_reads;
num_uniquely_mapped_reads_ += thread_num_uniquely_mapped_reads;
} // end of openmp parallel region
read_batch1_for_loading.FinalizeLoading();
read_batch2_for_loading.FinalizeLoading();
if (!mapping_parameters_.is_bulk_data) {
barcode_batch_for_loading.FinalizeLoading();
}
} // end of for read_file_index
std::cerr << "Mapped all reads in " << GetRealTime() - real_start_mapping_time
<< "s.\n";
delete[] mm_history1;
delete[] mm_history2;
if (read_map_summary != NULL)
delete[] read_map_summary;
OutputMappingStatistics();
if (!mapping_parameters_.is_bulk_data) {
OutputBarcodeStatistics();
}
index.Destroy();
if (mapping_parameters_.low_memory_mode) {
// First, process the remaining mappings in the memory and save them on
// disk.
if (num_mappings_in_mem > 0) {
mapping_processor.SortOutputMappings(num_reference_sequences,
mappings_on_diff_ref_seqs);
mapping_writer.OutputTempMappings(num_reference_sequences,
mappings_on_diff_ref_seqs,
temp_mapping_file_handles);
num_mappings_in_mem = 0;
}
mapping_writer.ProcessAndOutputMappingsInLowMemory(
num_mappings_in_mem, num_reference_sequences, reference,
barcode_whitelist_lookup_table_, temp_mapping_file_handles);
}
else {
if (mapping_parameters_.Tn5_shift) {
mapping_processor.ApplyTn5ShiftOnMappings(num_reference_sequences,
mappings_on_diff_ref_seqs);
}
if (mapping_parameters_.remove_pcr_duplicates) {
mapping_processor.RemovePCRDuplicate(num_reference_sequences,
mappings_on_diff_ref_seqs,
mapping_parameters_.num_threads);
std::cerr << "After removing PCR duplications, ";
mapping_processor.OutputMappingStatistics(num_reference_sequences,
mappings_on_diff_ref_seqs);
} else {
mapping_processor.ParallelSortOutputMappings(num_reference_sequences,
mappings_on_diff_ref_seqs,
mapping_parameters_.num_threads);
}
if (mapping_parameters_.allocate_multi_mappings) {
const uint64_t num_multi_mappings =
num_mapped_reads_ - num_uniquely_mapped_reads_;
mapping_processor.AllocateMultiMappings(
num_reference_sequences, num_multi_mappings,
mapping_parameters_.multi_mapping_allocation_distance,
mappings_on_diff_ref_seqs);
std::cerr << "After allocating multi-mappings, ";
mapping_processor.OutputMappingStatistics(num_reference_sequences,
mappings_on_diff_ref_seqs);
mapping_processor.SortOutputMappings(num_reference_sequences,
mappings_on_diff_ref_seqs);
}
mapping_writer.OutputMappings(num_reference_sequences, reference,
mappings_on_diff_ref_seqs);
// Temporarily disable feature matrix output. Do not delete the following
// commented code.
// if (!is_bulk_data_ && !matrix_output_prefix_.empty()) {
// if constexpr (std::is_same::value) {
// FeatureBarcodeMatrix feature_barcode_matrix(
// cell_by_bin_, bin_size_, multi_mapping_allocation_distance_,
// depth_cutoff_to_call_peak_);
// std::vector> &mappings =
// allocate_multi_mappings_
// ? allocated_mappings_on_diff_ref_seqs
// : (remove_pcr_duplicates_ ? deduped_mappings_on_diff_ref_seqs
// : mappings_on_diff_ref_seqs);
// feature_barcode_matrix.OutputFeatureMatrix(num_reference_sequences,
// reference, mappings,
// matrix_output_prefix_);
// }
//}
}
if (mapping_parameters_.mapping_output_format == MAPPINGFORMAT_SAM)
mapping_writer.AdjustSummaryPairedEndOverCount() ;
// Destroy the locks used for map
for (int i = 0; i < num_locks_for_map; ++i) {
omp_destroy_lock(&map_locks[i]);
}
// Add cardinality information to summary metadata
if (output_num_cache_slots_info) {
for (auto curr_map: barcode_peak_map) {
for (auto &pair: curr_map) {
size_t curr_seed = pair.first;
size_t est_num_slots = pair.second.compute_cardinality();
mapping_writer.UpdateSummaryMetadata(
curr_seed,
SUMMARY_METADATA_CARDINALITY,
est_num_slots);
}
}
}
mapping_writer.OutputSummaryMetadata(frip_est_params, output_num_cache_slots_info);
reference.FinalizeLoading();
if (mapping_parameters_.debug_cache) {mm_to_candidates_cache.PrintStats();}
std::cerr << "Total time: " << GetRealTime() - real_start_time << "s.\n";
}
} // namespace chromap
#endif // CHROMAP_H_
================================================
FILE: src/chromap_driver.cc
================================================
#include "chromap_driver.h"
#include
#include
#include
#include
#include
#include "chromap.h"
#include "cxxopts.hpp"
namespace chromap {
namespace {
// Registers the command-line options that belong to the "Indexing" help
// group on `options`.
void AddIndexingOptions(cxxopts::Options &options) {
  auto add_option = options.add_options("Indexing");

  add_option("i,build-index", "Build index");
  add_option("min-frag-length",
             "Min fragment length for choosing k and w automatically [30]",
             cxxopts::value(), "INT");
  add_option("k,kmer", "Kmer length [17]", cxxopts::value(), "INT");
  add_option("w,window", "Window size [7]", cxxopts::value(), "INT");
}
// Registers the command-line options that belong to the "Mapping" help group
// on `options`, after setting the help text width to 120.
//
// Options that are commented out below are intentionally not registered by
// this function.
void AddMappingOptions(cxxopts::Options &options) {
  options.set_width(120);
  auto add_option = options.add_options("Mapping");

  add_option(
      "preset",
      "Preset parameters for mapping reads (always applied before other "
      "options) []\natac: mapping ATAC-seq/scATAC-seq reads\nchip: mapping "
      "ChIP-seq reads\nhic: mapping Hi-C reads",
      cxxopts::value(), "STR");
  add_option("split-alignment", "Allow split alignments");
  add_option("e,error-threshold", "Max # errors allowed to map a read [8]",
             cxxopts::value(), "INT");
  // Currently disabled:
  // ("A,match-score", "Match score [1]", cxxopts::value(), "INT")
  // ("B,mismatch-penalty", "Mismatch penalty [4]", cxxopts::value(),
  //  "INT")
  // ("O,gap-open-penalties", "Gap open penalty [6,6]",
  //  cxxopts::value>(), "INT[,INT]")
  // ("E,gap-extension-penalties", "Gap extension penalty [1,1]",
  //  cxxopts::value>(), "INT[,INT]")
  add_option("s,min-num-seeds", "Min # seeds to try to map a read [2]",
             cxxopts::value(), "INT");
  add_option("f,max-seed-frequencies",
             "Max seed frequencies for a seed to be selected [500,1000]",
             cxxopts::value>(), "INT[,INT]");
  // Currently disabled:
  // ("n,max-num-best-mappings", "Only report n best mappings [1]",
  //  cxxopts::value(), "INT")
  add_option("l,max-insert-size",
             "Max insert size, only for paired-end read mapping [1000]",
             cxxopts::value(), "INT");
  add_option("q,MAPQ-threshold",
             "Min MAPQ in range [0, 60] for mappings to be output [30]",
             cxxopts::value(), "INT");
  add_option("min-read-length", "Min read length [30]", cxxopts::value(),
             "INT");
  // Currently disabled:
  // ("multi-mapping-allocation-distance", "Uni-mappings within this distance
  //  from any end of multi-mappings are used for allocation [0]",
  //  cxxopts::value(), "INT")
  // ("multi-mapping-allocation-seed", "Seed for random number generator in
  //  multi-mapping allocation [11]", cxxopts::value(), "INT")
  // ("drop-repetitive-reads", "Drop reads with too many best mappings
  //  [500000]", cxxopts::value(), "INT")
  add_option("trim-adapters", "Try to trim adapters on 3'");
  add_option("remove-pcr-duplicates", "Remove PCR duplicates");
  add_option("remove-pcr-duplicates-at-bulk-level",
             "Remove PCR duplicates at bulk level for single cell data");
  add_option("remove-pcr-duplicates-at-cell-level",
             "Remove PCR duplicates at cell level for single cell data");
  // Currently disabled:
  // ("allocate-multi-mappings", "Allocate multi-mappings")
  add_option("Tn5-shift", "Perform Tn5 shift");
  add_option("low-mem", "Use low memory mode");
  add_option("bc-error-threshold",
             "Max Hamming distance allowed to correct a barcode [1]",
             cxxopts::value(), "INT");
  add_option("bc-probability-threshold",
             "Min probability to correct a barcode [0.9]", cxxopts::value(),
             "FLT");
  add_option("t,num-threads", "# threads for mapping [1]", cxxopts::value(),
             "INT");
  add_option("frip-est-params",
             "coefficients used for frip est calculation, separated by semi-colons",
             cxxopts::value(), "STR");
  add_option("turn-off-num-uniq-cache-slots",
             "turn off the output of number of cache slots in summary file");
}
// Registers the command-line options that belong to the "Input" help group
// (reference, index, read/barcode files, whitelist and read format) on
// `options`.
void AddInputOptions(cxxopts::Options &options) {
  auto add_option = options.add_options("Input");

  add_option("r,ref", "Reference file", cxxopts::value(), "FILE");
  add_option("x,index", "Index file", cxxopts::value(), "FILE");
  add_option("1,read1", "Single-end read files or paired-end read files 1",
             cxxopts::value>(), "FILE");
  add_option("2,read2", "Paired-end read files 2", cxxopts::value>(),
             "FILE");
  add_option("b,barcode", "Cell barcode files", cxxopts::value>(), "FILE");
  add_option("barcode-whitelist", "Cell barcode whitelist file",
             cxxopts::value(), "FILE");
  add_option("read-format",
             "Format for read files and barcode files [\"r1:0:-1,bc:0:-1\" "
             "as 10x Genomics single-end format]",
             cxxopts::value(), "STR");
}
// Registers the command-line options that belong to the "Output" help group
// (output file, output format switches, chromosome ordering, barcode
// translation and summary file) on `options`.
//
// Options that are commented out below are intentionally not registered by
// this function.
void AddOutputOptions(cxxopts::Options &options) {
  auto add_option = options.add_options("Output");

  add_option("o,output", "Output file", cxxopts::value(), "FILE");
  // Currently disabled:
  // ("p,matrix-output-prefix", "Prefix of matrix output files",
  //  cxxopts::value(), "FILE")
  add_option("output-mappings-not-in-whitelist",
             "Output mappings with barcode not in the whitelist");
  add_option("chr-order",
             "Custom chromosome order file. If not specified, the order of "
             "reference sequences will be used",
             cxxopts::value(), "FILE");
  add_option("BED", "Output mappings in BED/BEDPE format");
  add_option("TagAlign", "Output mappings in TagAlign/PairedTagAlign format");
  add_option("SAM", "Output mappings in SAM format");
  add_option("pairs",
             "Output mappings in pairs format (defined by 4DN for HiC data)");
  add_option("pairs-natural-chr-order",
             "Custom chromosome order file for pairs flipping. If not specified, "
             "the custom chromosome order will be used",
             cxxopts::value(), "FILE");
  add_option("barcode-translate",
             "Convert barcode to the specified sequences during output",
             cxxopts::value(), "FILE");
  add_option("summary",
             "Summarize the mapping statistics at bulk or barcode level",
             cxxopts::value(), "FILE");
  // Currently disabled:
  // ("PAF", "Output mappings in PAF format (only for test)");
}
// Registers the command-line options that belong to the
// "Development options" help group (alignment scoring, multi-mapping
// handling, PAF output, barcode check and cache tuning) on `options`.
void AddDevelopmentOptions(cxxopts::Options &options) {
  auto add_option = options.add_options("Development options");

  // Alignment scoring.
  add_option("A,match-score", "Match score [1]", cxxopts::value(), "INT");
  add_option("B,mismatch-penalty", "Mismatch penalty [4]", cxxopts::value(),
             "INT");
  add_option("O,gap-open-penalties", "Gap open penalty [6,6]",
             cxxopts::value>(), "INT[,INT]");
  add_option("E,gap-extension-penalties", "Gap extension penalty [1,1]",
             cxxopts::value>(), "INT[,INT]");

  // Multi-mapping handling.
  add_option("n,max-num-best-mappings", "Only report n best mappings [1]",
             cxxopts::value(), "INT");
  add_option("multi-mapping-allocation-distance",
             "Uni-mappings within this distance from any end of "
             "multi-mappings are used for allocation [0]",
             cxxopts::value(), "INT");
  add_option("multi-mapping-allocation-seed",
             "Seed for random number generator in multi-mapping allocation [11]",
             cxxopts::value(), "INT");
  add_option("drop-repetitive-reads",
             "Drop reads with too many best mappings [500000]",
             cxxopts::value(), "INT");
  add_option("allocate-multi-mappings", "Allocate multi-mappings");

  // Test-only output and barcode check.
  add_option("PAF", "Output mappings in PAF format (only for test)");
  add_option("skip-barcode-check",
             "Do not check whether too few barcodes are in the whitelist");

  // Cache tuning and debugging.
  add_option("cache-size", "number of cache entries [4000003]",
             cxxopts::value(), "INT");
  add_option("cache-update-param",
             "value used to control number of reads sampled [0.01]",
             cxxopts::value(), "FLT");
  add_option("debug-cache",
             "verbose output for debugging cache used in chromap");
  add_option("k-for-minhash",
             "number of values stored in each MinHash sketch [250]",
             cxxopts::value(), "INT");
}
// Registers the command-line options that belong to the "Peak" help group
// (cell-by-bin matrix and peak calling thresholds) on `options`.
void AddPeakOptions(cxxopts::Options &options) {
  auto add_option = options.add_options("Peak");

  add_option("cell-by-bin", "Generate cell-by-bin matrix");
  add_option("bin-size", "Bin size to generate cell-by-bin matrix [5000]",
             cxxopts::value(), "INT");
  add_option("depth-cutoff", "Depth cutoff for peak calling [3]",
             cxxopts::value(), "INT");
  add_option("peak-min-length", "Min length of peaks to report [30]",
             cxxopts::value(), "INT");
  add_option("peak-merge-max-length",
             "Peaks within this length will be merged [30]", cxxopts::value(),
             "INT");
}
// Return all file paths that match the input pattern.
std::vector GetMatchedFilePaths(const std::string &pattern) {
glob_t glob_result;
memset(&glob_result, 0, sizeof(glob_result));
const int return_value =
glob(pattern.c_str(), GLOB_TILDE, NULL, &glob_result);
if (return_value != 0) {
globfree(&glob_result);
chromap::ExitWithMessage("glob() failed with return value " +
std::to_string(return_value) + "\n");
}
std::vector matched_file_paths;
matched_file_paths.reserve(glob_result.gl_pathc);
for (size_t i = 0; i < glob_result.gl_pathc; ++i) {
matched_file_paths.push_back(std::string(glob_result.gl_pathv[i]));
std::cerr << matched_file_paths.back() << "\n";
}
globfree(&glob_result);
return matched_file_paths;
}
// Return all file paths that match the input patterns.
std::vector GetMatchedFilePaths(
const std::vector &patterns) {
std::vector all_matched_file_paths;
for (const auto &pattern : patterns) {
std::vector matched_file_paths = GetMatchedFilePaths(pattern);
all_matched_file_paths.reserve(all_matched_file_paths.size() +
matched_file_paths.size());
all_matched_file_paths.insert(
std::end(all_matched_file_paths),
std::make_move_iterator(std::begin(matched_file_paths)),
std::make_move_iterator(std::end(matched_file_paths)));
}
return all_matched_file_paths;
}
} // namespace
void ChromapDriver::ParseArgsAndRun(int argc, char *argv[]) {
cxxopts::Options options(
"chromap", "Fast alignment and preprocessing of chromatin profiles");
options.add_options()("v,version", "Print version")("h,help", "Print help");
AddIndexingOptions(options);
AddMappingOptions(options);
// We don't support peak options for now.
// AddPeakOptions(options);
AddInputOptions(options);
AddOutputOptions(options);
AddDevelopmentOptions(options);
auto result = options.parse(argc, argv);
if (result.count("h")) {
std::cerr << options.help(
{"", "Indexing", "Mapping", "Peak", "Input", "Output"});
return;
}
if (result.count("v")) {
std::cerr << CHROMAP_VERSION << "\n";
return;
}
// Parameters and their default
IndexParameters index_parameters;
MappingParameters mapping_parameters;
if (result.count("preset")) {
std::string read_type = result["preset"].as();
if (read_type == "atac") {
std::cerr << "Preset parameters for ATAC-seq/scATAC-seq are used.\n";
mapping_parameters.max_insert_size = 2000;
mapping_parameters.trim_adapters = true;
mapping_parameters.remove_pcr_duplicates = true;
mapping_parameters.remove_pcr_duplicates_at_bulk_level = false;
mapping_parameters.Tn5_shift = true;
mapping_parameters.mapping_output_format = MAPPINGFORMAT_BED;
mapping_parameters.low_memory_mode = true;
} else if (read_type == "chip") {
std::cerr << "Preset parameters for ChIP-seq are used.\n";
mapping_parameters.max_insert_size = 2000;
mapping_parameters.remove_pcr_duplicates = true;
mapping_parameters.low_memory_mode = true;
mapping_parameters.mapping_output_format = MAPPINGFORMAT_BED;
} else if (read_type == "hic") {
std::cerr << "Preset parameters for Hi-C are used.\n";
mapping_parameters.error_threshold = 4;
mapping_parameters.mapq_threshold = 1;
mapping_parameters.split_alignment = true;
mapping_parameters.low_memory_mode = true;
mapping_parameters.mapping_output_format = MAPPINGFORMAT_PAIRS;
} else {
chromap::ExitWithMessage("Unrecognized preset parameters " + read_type +
"\n");
}
}
// Optional parameters
if (result.count("min-frag-length")) {
int min_fragment_length = result["min-frag-length"].as();
if (min_fragment_length <= 60) {
index_parameters.kmer_size = 17;
index_parameters.window_size = 7;
} else if (min_fragment_length <= 80) {
index_parameters.kmer_size = 19;
index_parameters.window_size = 10;
} else {
index_parameters.kmer_size = 23;
index_parameters.window_size = 11;
}
}
if (result.count("k")) {
index_parameters.kmer_size = result["kmer"].as();
}
if (result.count("w")) {
index_parameters.window_size = result["window"].as();
}
if (result.count("e")) {
mapping_parameters.error_threshold = result["error-threshold"].as();
}
if (result.count("A")) {
mapping_parameters.match_score = result["match-score"].as();
}
if (result.count("B")) {
mapping_parameters.mismatch_penalty = result["mismatch-penalty"].as();
}
if (result.count("O")) {
mapping_parameters.gap_open_penalties =
result["gap-open-penalties"].as>();
}
if (result.count("E")) {
mapping_parameters.gap_extension_penalties =
result["gap-extension-penalties"].as>();
}
if (result.count("s")) {
mapping_parameters.min_num_seeds_required_for_mapping =
result["min-num-seeds"].as();
}
if (result.count("f")) {
mapping_parameters.max_seed_frequencies =
result["max-seed-frequencies"].as>();
}
if (result.count("n")) {
mapping_parameters.max_num_best_mappings =
result["max-num-best-mappings"].as();
}
if (result.count("l")) {
mapping_parameters.max_insert_size = result["max-insert-size"].as();
}
if (result.count("q")) {
mapping_parameters.mapq_threshold = result["MAPQ-threshold"].as();
}
if (result.count("t")) {
mapping_parameters.num_threads = result["num-threads"].as();
}
// check cache-related parameters
if (result.count("cache-update-param")) {
mapping_parameters.cache_update_param = result["cache-update-param"].as();
if (mapping_parameters.cache_update_param < 0.0 || mapping_parameters.cache_update_param > 1.0){
chromap::ExitWithMessage("cache update param is not approriate, must be in this range (0, 1]");
}
}
if (result.count("cache-size")) {
mapping_parameters.cache_size = result["cache-size"].as();
if (mapping_parameters.cache_size < 2000000 || mapping_parameters.cache_size > 15000000) {
chromap::ExitWithMessage("cache size is not in appropriate range\n");
}
}
if (result.count("debug-cache")) {
mapping_parameters.debug_cache = true;
}
if (result.count("frip-est-params")) {
mapping_parameters.frip_est_params = result["frip-est-params"].as();
}
if (result.count("turn-off-num-uniq-cache-slots")) {
mapping_parameters.output_num_uniq_cache_slots = false;
}
if (result.count("k-for-minhash")) {
mapping_parameters.k_for_minhash = result["k-for-minhash"].as();
if (mapping_parameters.k_for_minhash < 1 || mapping_parameters.k_for_minhash >= 2000) {
chromap::ExitWithMessage("Invalid paramter for size of MinHash sketch (--k-for-minhash)");
}
}
if (result.count("min-read-length")) {
mapping_parameters.min_read_length = result["min-read-length"].as();
}
if (result.count("bc-error-threshold")) {
mapping_parameters.barcode_correction_error_threshold =
result["bc-error-threshold"].as();
}
if (result.count("bc-probability-threshold")) {
mapping_parameters.barcode_correction_probability_threshold =
result["bc-probability-threshold"].as();
}
if (result.count("multi-mapping-allocation-distance")) {
mapping_parameters.multi_mapping_allocation_distance =
result["multi-mapping-allocation-distance"].as();
}
if (result.count("multi-mapping-allocation-seed")) {
mapping_parameters.multi_mapping_allocation_seed =
result["multi-mapping-allocation-seed"].as();
}
if (result.count("drop-repetitive-reads")) {
mapping_parameters.drop_repetitive_reads =
result["drop-repetitive-reads"].as();
}
if (result.count("trim-adapters")) {
mapping_parameters.trim_adapters = true;
}
if (result.count("remove-pcr-duplicates")) {
mapping_parameters.remove_pcr_duplicates = true;
}
if (result.count("remove-pcr-duplicates-at-bulk-level")) {
mapping_parameters.remove_pcr_duplicates_at_bulk_level = true;
}
if (result.count("remove-pcr-duplicates-at-cell-level")) {
mapping_parameters.remove_pcr_duplicates_at_bulk_level = false;
}
if (result.count("allocate-multi-mappings")) {
mapping_parameters.allocate_multi_mappings = true;
mapping_parameters.only_output_unique_mappings = false;
}
if (result.count("Tn5-shift")) {
mapping_parameters.Tn5_shift = true;
}
if (result.count("split-alignment")) {
mapping_parameters.split_alignment = true;
}
if (result.count("output-mappings-not-in-whitelist")) {
mapping_parameters.output_mappings_not_in_whitelist = true;
}
if (result.count("BED")) {
mapping_parameters.mapping_output_format = MAPPINGFORMAT_BED;
}
if (result.count("TagAlign")) {
mapping_parameters.mapping_output_format = MAPPINGFORMAT_TAGALIGN;
}
if (result.count("PAF")) {
mapping_parameters.mapping_output_format = MAPPINGFORMAT_PAF;
}
if (result.count("pairs")) {
mapping_parameters.mapping_output_format = MAPPINGFORMAT_PAIRS;
}
if (result.count("SAM")) {
mapping_parameters.mapping_output_format = MAPPINGFORMAT_SAM;
}
if (result.count("low-mem")) {
mapping_parameters.low_memory_mode = true;
}
if (result.count("cell-by-bin")) {
mapping_parameters.cell_by_bin = true;
}
if (result.count("bin-size")) {
mapping_parameters.bin_size = result["bin-size"].as();
}
if (result.count("depth-cutoff")) {
mapping_parameters.depth_cutoff_to_call_peak =
result["depth-cutoff"].as();
}
if (result.count("peak-min-length")) {
mapping_parameters.peak_min_length = result["peak-min-length"].as();
}
if (result.count("peak-merge-max-length")) {
mapping_parameters.peak_merge_max_length =
result["peak-merge-max-length"].as();
}
std::cerr << std::setprecision(2) << std::fixed;
if (result.count("i")) {
if (result.count("r")) {
index_parameters.reference_file_path = result["ref"].as();
} else {
chromap::ExitWithMessage("No reference specified!");
}
if (result.count("o")) {
index_parameters.index_output_file_path =
result["output"].as();
} else {
chromap::ExitWithMessage("No output file specified!");
}
std::cerr << "Build index for the reference.\n";
std::cerr << "Kmer length: " << index_parameters.kmer_size
<< ", window size: " << index_parameters.window_size << "\n";
std::cerr << "Reference file: " << index_parameters.reference_file_path
<< "\n";
std::cerr << "Output file: " << index_parameters.index_output_file_path
<< "\n";
chromap::Chromap chromap_for_indexing(index_parameters);
chromap_for_indexing.ConstructIndex();
} else if (result.count("1")) {
std::cerr << "Start to map reads.\n";
if (result.count("r")) {
mapping_parameters.reference_file_path = result["ref"].as();
} else {
chromap::ExitWithMessage("No reference specified!");
}
if (result.count("o")) {
mapping_parameters.mapping_output_file_path =
result["output"].as();
} else {
chromap::ExitWithMessage("No output file specified!");
}
if (result.count("x")) {
mapping_parameters.index_file_path = result["index"].as();
} else {
chromap::ExitWithMessage("No index file specified!");
}
if (result.count("1")) {
mapping_parameters.read_file1_paths =
GetMatchedFilePaths(result["read1"].as>());
} else {
chromap::ExitWithMessage("No read file specified!");
}
if (result.count("2")) {
mapping_parameters.read_file2_paths =
GetMatchedFilePaths(result["read2"].as>());
}
if (result.count("b")) {
mapping_parameters.is_bulk_data = false;
mapping_parameters.barcode_file_paths =
GetMatchedFilePaths(result["barcode"].as>());
if (result.count("barcode-whitelist") == 0) {
std::cerr << "WARNING: there are input barcode files but a barcode "
"whitelist file is missing!\n";
}
}
if (result.count("barcode-whitelist")) {
if (mapping_parameters.is_bulk_data) {
chromap::ExitWithMessage(
"No barcode file specified but the barcode whitelist file is "
"given!");
}
mapping_parameters.barcode_whitelist_file_path =
result["barcode-whitelist"].as();
}
if (result.count("p")) {
mapping_parameters.matrix_output_prefix =
result["matrix-output-prefix"].as();
if (mapping_parameters.is_bulk_data) {
chromap::ExitWithMessage(
"No barcode file specified but asked to output matrix files!");
}
}
if (result.count("read-format")) {
mapping_parameters.read_format = result["read-format"].as();
}
if (result.count("chr-order")) {
mapping_parameters.custom_rid_order_file_path =
result["chr-order"].as();
}
if (result.count("pairs-natural-chr-order")) {
mapping_parameters.pairs_flipping_custom_rid_order_file_path =
result["pairs-natural-chr-order"].as