Repository: ryanlayer/samplot Branch: master Commit: 2929e4a90e54 Files: 60 Total size: 15.5 MB Directory structure: gitextract_qd67atv0/ ├── .circleci/ │ ├── config.yml │ └── setup.sh ├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── runtests.sh ├── samplot/ │ ├── __init__.py │ ├── __main__.py │ ├── samplot.py │ ├── samplot_vcf.py │ └── templates/ │ └── samplot_vcf.html ├── setup.py ├── ssshtest └── test/ ├── README.md ├── data/ │ ├── 2_59305747-59505747_X_151018513-151218513.BND.bam │ ├── 2_59305747-59505747_X_151018513-151218513.BND.bam.bai │ ├── Alu.2_X.bed.gz.tbi │ ├── Alu.2_X.csionly.bed.gz.csi │ ├── HG002_10X.bam │ ├── HG002_10X.bam.bai │ ├── HG002_1_89475845-89478561_DEL.tenx.bam │ ├── HG002_1_89475845-89478561_DEL.tenx.bam.bai │ ├── HG002_Illumina.bam │ ├── HG002_Illumina.bam.bai │ ├── HG002_ONT.cram │ ├── HG002_ONT.cram.crai │ ├── HG002_PacBio.bam │ ├── HG002_PacBio.bam.bai │ ├── HG003_Illumina.bam │ ├── HG003_Illumina.bam.bai │ ├── HG004_Illumina.bam │ ├── HG004_Illumina.bam.bai │ ├── Homo_sapiens.GRCh37.82.sort.2_X.gff3.gz.tbi │ ├── Homo_sapiens.GRCh37.csionly.2_X.gff3.gz.csi │ ├── NA12878_restricted.bam │ ├── NA12878_restricted.bam.bai │ ├── NA12889_restricted.bam │ ├── NA12889_restricted.bam.bai │ ├── NA12890_restricted.bam │ ├── NA12890_restricted.bam.bai │ ├── README.md │ ├── commands.sh │ ├── examples.bed │ ├── examples_padded.bed │ ├── hg19_chr1_58343117_58343622_deletion.bam │ ├── hg19_chr1_58343117_58343622_deletion.bam.bai │ ├── hg19_chr21_27373431_27375410_inversion.bam │ ├── hg19_chr21_27373431_27375410_inversion.bam.bai │ ├── nanopore-NA12878.bam │ ├── nanopore-NA12878.bam.bai │ ├── subset_alignments.sh │ ├── test.ped │ ├── test.vcf │ ├── test_site/ │ │ ├── README.md │ │ └── index.html │ └── test_site_cmds.sh ├── func/ │ ├── samplot_test.sh │ └── samplot_vcf_test.sh └── unit/ └── samplot_test.py ================================================ FILE CONTENTS ================================================ 
================================================ FILE: .circleci/config.yml ================================================ version: 2 variables: setup_p3: &setup_p3 run: shell: /bin/bash name: Setup Samplot python3 dependencies command: bash .circleci/setup.sh 3 run_plot_tests: &run_plot_tests run: shell: /bin/bash name: Functional Tests for Samplot command: bash test/func/samplot_test.sh no_output_timeout: 1h run_vcf_tests: &run_vcf_tests run: shell: /bin/bash name: Functional Tests for Samplot command: bash test/func/samplot_vcf_test.sh no_output_timeout: 1h run_unit_tests: &run_unit_tests run: shell: /bin/bash name: Functional Tests for Samplot command: python test/unit/samplot_test.py no_output_timeout: 1h macos: &macos macos: xcode: "12.5.1" linux: &linux machine: ubuntu-2004:202201-02 install_samplot: &install_samplot run: name: Install Samplot command: python setup.py install jobs: test-linux-python3: <<: *linux steps: - checkout - *setup_p3 - *install_samplot - *run_plot_tests - *run_vcf_tests - *run_unit_tests test-macos-python3: <<: *macos steps: - checkout - *setup_p3 - *install_samplot - *run_plot_tests - *run_vcf_tests - *run_unit_tests workflows: version: 2 samplot-unit-tests: jobs: - test-linux-python3 - test-macos-python3 samplot-nightly-unit-tests: triggers: - schedule: cron: "0 0 * * *" filters: branches: only: - master jobs: - test-linux-python3 - test-macos-python3 ================================================ FILE: .circleci/setup.sh ================================================ #!/bin/bash set -exo pipefail WORKSPACE=$(pwd) # Set path echo "export PATH=$WORKSPACE/anaconda/bin:$PATH" >> $BASH_ENV source $BASH_ENV ## Passed from .circleci/config.yml (Only 3 permited) pythonversion=$1 if (( $pythonversion != 3 )) then echo -e "\nERROR: Python 3 designation required. Python version $pythonversion was supplied. Please correct and run again\n" exit 1 fi # setup conda and dependencies if [[ ! 
-d $WORKSPACE/anaconda ]]; then mkdir -p $WORKSPACE # step 1: download and install anaconda if [[ $OSTYPE == darwin* ]]; then tag="MacOSX" tag2="darwin" elif [[ $OSTYPE == linux* ]]; then tag="Linux" tag2="linux" else echo "Unsupported OS: $OSTYPE" exit 1 fi curl -O https://repo.anaconda.com/miniconda/Miniconda$pythonversion-latest-$tag-x86_64.sh sudo bash Miniconda$pythonversion-latest-$tag-x86_64.sh -b -p $WORKSPACE/anaconda/ sudo chown -R $USER $WORKSPACE/anaconda/ mkdir -p $WORKSPACE/anaconda/conda-bld/$tag-64 # step 3: setup channels conda config --system --add channels defaults conda config --system --add channels r conda config --system --add channels bioconda conda config --system --add channels conda-forge # step 3: install Samplot requirements conda install -y --file requirements.txt fi ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .vscode/ .DS_Store ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2019 Ryan Layer Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ [![CircleCI](https://circleci.com/gh/ryanlayer/samplot/tree/master.svg?style=svg)](https://circleci.com/gh/ryanlayer/samplot/tree/master) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/samplot/README.html)
`samplot` is a command line tool for rapid, multi-sample structural variant visualization. `samplot` takes SV coordinates and bam files and produces high-quality images that highlight any alignment and depth signals that substantiate the SV. If you use samplot, please cite https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02380-5 # Usage
samplot plot ``` usage: samplot plot [-h] [-n TITLES [TITLES ...]] [-r REFERENCE] [-z Z] -b BAMS [BAMS ...] [-o OUTPUT_FILE] [--output_dir OUTPUT_DIR] -s START -e END -c CHROM [-w WINDOW] [-d MAX_DEPTH] [-t SV_TYPE] [-T TRANSCRIPT_FILE] [--transcript_filename TRANSCRIPT_FILENAME] [--max_coverage_points MAX_COVERAGE_POINTS] [-A ANNOTATION_FILES [ANNOTATION_FILES ...]] [--annotation_filenames ANNOTATION_FILENAMES [ANNOTATION_FILENAMES ...]] [--coverage_tracktype {stack,superimpose,none}] [-a] [-H PLOT_HEIGHT] [-W PLOT_WIDTH] [-q INCLUDE_MQUAL] [--separate_mqual SEPARATE_MQUAL] [-j] [--start_ci START_CI] [--end_ci END_CI] [--long_read LONG_READ] [--ignore_hp] [--min_event_size MIN_EVENT_SIZE] [--xaxis_label_fontsize XAXIS_LABEL_FONTSIZE] [--yaxis_label_fontsize YAXIS_LABEL_FONTSIZE] [--legend_fontsize LEGEND_FONTSIZE] [--annotation_fontsize ANNOTATION_FONTSIZE] [--hide_annotation_labels] [--coverage_only] [--max_coverage MAX_COVERAGE] [--same_yaxis_scales] [--marker_size MARKER_SIZE] [--jitter [JITTER]] [--dpi DPI] [--annotation_scalar ANNOTATION_SCALAR] [--zoom ZOOM] [--debug DEBUG] options: -h, --help show this help message and exit -n TITLES [TITLES ...], --titles TITLES [TITLES ...] Space-delimited list of plot titles. Use quote marks to include spaces (i.e. "plot 1" "plot 2") -r REFERENCE, --reference REFERENCE Reference file for CRAM, required if CRAM files used -z Z, --z Z Number of stdevs from the mean (default 4) -b BAMS [BAMS ...], --bams BAMS [BAMS ...] Space-delimited list of BAM/CRAM file names -o OUTPUT_FILE, --output_file OUTPUT_FILE Output file name/type. Defaults to {type}_{chrom}_{start}_{end}.png --output_dir OUTPUT_DIR Output directory name. Defaults to working dir. 
Ignored if --output_file is set -s START, --start START Start position of region/variant (add multiple for translocation/BND events) -e END, --end END End position of region/variant (add multiple for translocation/BND events) -c CHROM, --chrom CHROM Chromosome (add multiple for translocation/BND events) -w WINDOW, --window WINDOW Window size (count of bases to include in view), default(0.5 * len) -d MAX_DEPTH, --max_depth MAX_DEPTH Max number of normal pairs to plot -t SV_TYPE, --sv_type SV_TYPE SV type. If omitted, plot is created without variant bar -T TRANSCRIPT_FILE, --transcript_file TRANSCRIPT_FILE GFF3 of transcripts --transcript_filename TRANSCRIPT_FILENAME Name for transcript track --max_coverage_points MAX_COVERAGE_POINTS number of points to plot in coverage axis (downsampled from region size for speed) -A ANNOTATION_FILES [ANNOTATION_FILES ...], --annotation_files ANNOTATION_FILES [ANNOTATION_FILES ...] Space-delimited list of bed.gz tabixed files of annotations (such as repeats, mappability, etc.) --annotation_filenames ANNOTATION_FILENAMES [ANNOTATION_FILENAMES ...] Space-delimited list of names for the tracks in --annotation_files --coverage_tracktype {stack,superimpose,none} type of track to use for low MAPQ coverage plot. -a, --print_args Print commandline arguments to a json file, useful with PlotCritic -H PLOT_HEIGHT, --plot_height PLOT_HEIGHT Plot height -W PLOT_WIDTH, --plot_width PLOT_WIDTH Plot width -q INCLUDE_MQUAL, --include_mqual INCLUDE_MQUAL Min mapping quality of reads to be included in plot (default 1) --separate_mqual SEPARATE_MQUAL coverage from reads with MAPQ <= separate_mqual plotted in lighter grey. To disable, pass in negative value -j, --json_only Create only the json file, not the image plot --start_ci START_CI confidence intervals of SV first breakpoint (distance from the breakpoint). Must be a comma-separated pair of ints (i.e. 20,40) --end_ci END_CI confidence intervals of SV end breakpoint (distance from the breakpoint). 
Must be a comma-separated pair of ints (i.e. 20,40) --long_read LONG_READ Min length of a read to be treated as a long-read (default 1000) --ignore_hp Choose to ignore HP tag in alignment files --min_event_size MIN_EVENT_SIZE Min size of an event in long-read CIGAR to include (default 20) --xaxis_label_fontsize XAXIS_LABEL_FONTSIZE Font size for X-axis labels (default 6) --yaxis_label_fontsize YAXIS_LABEL_FONTSIZE Font size for Y-axis labels (default 6) --legend_fontsize LEGEND_FONTSIZE Font size for legend labels (default 6) --annotation_fontsize ANNOTATION_FONTSIZE Font size for annotation labels (default 6) --hide_annotation_labels Hide the label (fourth column text) from annotation files, useful for regions with many annotations --coverage_only Hide all reads and show only coverage --max_coverage MAX_COVERAGE apply a maximum coverage cutoff. Unlimited by default --same_yaxis_scales Set the scales of the Y axes to the max of all --marker_size MARKER_SIZE Size of marks on pairs and splits (default 3) --jitter [JITTER] Add uniform random noise to insert sizes. This can be helpful to resolve overlapping entries. Either a custom value (<1.0) is supplied or 0.08 will be used. --dpi DPI Dots per inches (pixel count, default 300) --annotation_scalar ANNOTATION_SCALAR scaling factor for the optional annotation/trascript tracks --zoom ZOOM Only show +- zoom amount around breakpoints, much faster for large regions. Ignored if region smaller than --zoom (default 500000) --debug DEBUG Print debug statements ```
## Installing `Samplot` is available from bioconda and is installable via the conda package manager: ``` conda install -c bioconda samplot ``` ## Examples: Samplot requires either BAM files or CRAM files as primary input. If you use CRAM, you'll also need a reference genome. You can easily acquire a reference genome file with [GGD](https://github.com/gogetdata/ggd-cli), which is also available from conda. ### Basic use case Using data from NA12878, NA12889, and NA12890 in the [1000 Genomes Project](http://www.internationalgenome.org/about) (available in the test/data directory of samplot), we will inspect a possible deletion in NA12878 at 4:115928726-115931880 with respect to that same region in two unrelated samples NA12889 and NA12890. The following command will create an image of that region: ``` time samplot plot \ -n NA12878 NA12889 NA12890 \ -b samplot/test/data/NA12878_restricted.bam \ samplot/test/data/NA12889_restricted.bam \ samplot/test/data/NA12890_restricted.bam \ -o 4_115928726_115931880.png \ -c chr4 \ -s 115928726 \ -e 115931880 \ -t DEL real 0m3.882s user 0m3.831s sys 0m0.328s ``` The arguments used above are: `-n` The names to be shown for each sample in the plot `-b` The BAM/CRAM files of the samples (space-delimited) `-o` The name of the output file containing the plot `-c` The chromosome of the region of interest `-s` The start location of the region of interest `-e` The end location of the region of interest `-t` The type of the variant of interest This will create an image file named `4_115928726_115931880.png`, shown below: ### Gene and other genomic feature annotations Gene annotations (tabixed, gff3 file) and genome features (tabixed, bgzipped, bed file) can be included in the plots. 
Get the gene annotations: ``` wget ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/Homo_sapiens.GRCh37.82.gff3.gz bedtools sort -i Homo_sapiens.GRCh37.82.gff3.gz \ | bgzip -c > Homo_sapiens.GRCh37.82.sort.gff3.gz tabix Homo_sapiens.GRCh37.82.sort.gff3.gz ``` Get genome annotations, in this case Repeat Masker tracks and a mappability track: ``` wget http://hgdownload.cse.ucsc.edu/goldenpath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig bigWigToBedGraph wgEncodeDukeMapabilityUniqueness35bp.bigWig wgEncodeDukeMapabilityUniqueness35bp.bed bgzip wgEncodeDukeMapabilityUniqueness35bp.bed tabix wgEncodeDukeMapabilityUniqueness35bp.bed.gz curl http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/rmsk.txt.gz \ | bgzip -d -c \ | cut -f 6,7,8,13 \ | bedtools sort -i stdin \ | bgzip -c > rmsk.bed.gz tabix rmsk.bed.gz ``` Plot: ``` samplot plot \ -n NA12878 NA12889 NA12890 \ -b samplot/test/data/NA12878_restricted.bam \ samplot/test/data/NA12889_restricted.bam \ samplot/test/data/NA12890_restricted.bam \ -o 4_115928726_115931880.d100.genes_reps_map.png \ -c chr4 \ -s 115928726 \ -e 115931880 \ -t DEL \ -d 100 \ -T Homo_sapiens.GRCh37.82.sort.gff3.gz \ -A rmsk.bed.gz wgEncodeDukeMapabilityUniqueness35bp.bed.gz ``` ## Generating images from a VCF file To plot images from structural variant calls in a VCF file, use samplot's `vcf` subcommand. This accepts a VCF file and the BAM files of samples you wish to plot, outputting images and an `index.html` page for review. ### Usage
samplot vcf ``` usage: samplot vcf [-h] [--vcf VCF] [-d OUT_DIR] [--ped PED] [--dn_only] [--min_call_rate MIN_CALL_RATE] [--filter FILTER] [-O {png,pdf,eps,jpg}] [--max_hets MAX_HETS] [--min_entries MIN_ENTRIES] [--max_entries MAX_ENTRIES] [--max_mb MAX_MB] [--min_bp MIN_BP] [--important_regions IMPORTANT_REGIONS] -b BAMS [BAMS ...] [--sample_ids SAMPLE_IDS [SAMPLE_IDS ...]] [--command_file COMMAND_FILE] [--format FORMAT] [--gff3 GFF3] [--downsample DOWNSAMPLE] [--manual_run] [--plot_all] [-t THREADS] [--debug] options: -h, --help show this help message and exit --vcf VCF, -v VCF VCF file containing structural variants (default: None) -d OUT_DIR, --out-dir OUT_DIR path to write output images (default: samplot-out) --ped PED path to ped (or .fam) file (default: None) --dn_only plots only putative de novo variants (PED file required) (default: False) --min_call_rate MIN_CALL_RATE only plot variants with at least this call-rate (default: None) --filter FILTER simple filter that samples must meet. Join multiple filters with '&' and specify --filter multiple times for 'or' e.g. DHFFC < 0.7 & SVTYPE = 'DEL' (default: []) -O {png,pdf,eps,jpg}, --output_type {png,pdf,eps,jpg} type of output figure (default: png) --max_hets MAX_HETS only plot variants with at most this many heterozygotes (default: None) --min_entries MIN_ENTRIES try to include homref samples as controls to get this many samples in plot (default: 6) --max_entries MAX_ENTRIES only plot at most this many heterozygotes (default: 10) --max_mb MAX_MB skip variants longer than this many megabases (default: None) --min_bp MIN_BP skip variants shorter than this many bases (default: 20) --important_regions IMPORTANT_REGIONS only report variants that overlap regions in this bed file (default: None) -b BAMS [BAMS ...], --bams BAMS [BAMS ...] Space-delimited list of BAM/CRAM file names (default: None) --sample_ids SAMPLE_IDS [SAMPLE_IDS ...] 
Space-delimited list of sample IDs, must have same order as BAM/CRAM file names. BAM RG tag required if this is omitted. (default: None) --command_file COMMAND_FILE store commands in this file. (default: samplot_vcf_cmds.tmp) --format FORMAT comma separated list of FORMAT fields to include in sample plot title (default: AS,AP,DHFFC) --gff3 GFF3 genomic regions (.gff with .tbi in same directory) used when building HTML table and table filters (default: None) --downsample DOWNSAMPLE Number of normal reads/pairs to plot (default: 1) --manual_run disables auto-run for the plotting commands (default: False) --plot_all plots all samples and all variants - limited by any filtering arguments set (default: False) -t THREADS, --threads THREADS Number of threads to use to generate plots. Default: 1 --debug prints out the reason for skipping any skipped variant entry (default: False) ```
`samplot vcf` can be used to quickly apply some basic filters to variants. Filters are applied via the `--filter` argument, which may be repeated as many times as desired. Each expression specified with the `--filter` option is applied separately in an OR fashion, while `&` characters may be used within a statement for AND operations. ### Example: ``` samplot vcf \ --filter "SVTYPE == 'DEL' & SU >= 8" \ --filter "SVTYPE == 'INV' & SU >= 5" \ --vcf example.vcf\ -d test/\ -O png\ --important_regions regions.bed\ -b example.bam > samplot_commands.sh ``` This example will create a directory named test (in the current working directory). A file named `index.html` will be created inside that directory to explore the images created. **Filters:** The above filters will remove all samples/variants from output except: * `DEL` variants with `SU` of at least 8 * `INV` variants with `SU` of at least 5 The specific `FORMAT` fields available in your VCF file may be different. I recommend SV VCF annotation with [duphold](https://github.com/brentp/duphold) by [brentp](https://github.com/brentp). For more complex expression-based VCF filtering, try brentp's [slivar](https://github.com/brentp/slivar), which provides similar but more broad options for filter expressions. **Region restriction.** Variants can also be filtered by overlap with a set of regions (for example, gene coordinates for genes correlated with a disease). The `important_regions` argument provides a BED file of such regions for this example. **Filtering for de novo SVs** Using a [PED](https://gatkforums.broadinstitute.org/gatk/discussion/7696/pedigree-ped-files) file with `samplot vcf` allows filtering for variants that may be spontaneous/de novo variants. This filter is a simple Mendelian violation test. 
If a sample 1) has valid parent IDs in the PED file, 2) has a non-homref genotype (1/0, 0/1, or 1/1 in VCF), 3) passes filters, and 4) both parents have homref genotypes (0/0 in VCF), the sample may have a de novo variant. Filter parameters are not applied to the parents. The sample is plotted along with both parents, which are labeled as father and mother in the image. Example call with the addition of a PED file:
samplot vcf \
    --filter "SVTYPE == 'DEL' & SU >= 8" \
    --filter "SVTYPE == 'INV' & SU >= 5" \
    --vcf example.vcf\
    -d test/\
    -O png\
    --ped family.ped\
    --important_regions regions.bed\
    -b example.bam > samplot_commands.sh
**Additional notes.** * Variants where fewer than 95% of samples have a call (whether reference or alternate) will be excluded by default. This can be altered via the command-line argument `min_call_rate`. * If you're primarily interested in rare variants, you can use the `max_hets` filter to remove variants that appear in more than `max_hets` samples. * Large variants can now be plotted easily by samplot through use of `samplot plot`'s `zoom` argument. However, you can still choose to only plot variants larger than a given size using the `max_mb` argument. The `zoom` argument takes an integer parameter and shows only the intervals within +/- that parameter on either side of the breakpoints. A dotted line connects the ends of the variant call bar at the top of the window, showing that the region between breakpoint intervals is not shown. * By default, if fewer than 6 samples have a variant and additional homref samples are given, control samples will be added from the homref group to reach a total of 6 samples in the plot. This number may be altered using the `min_entries` argument. * Arguments that are optional in `samplot plot` can be given as arguments to `samplot vcf`. They will be applied to each image generated. #### CRAM inputs Samplot also supports CRAM input, which requires a reference fasta file for reading as noted above. Notice that the reference file is not included in this repository due to size. This time we'll plot an interesting duplication at X:101055330-101067156. 
``` samplot plot \ -n NA12878 NA12889 NA12890 \ -b samplot/test/data/NA12878_restricted.cram \ samplot/test/data/NA12889_restricted.cram \ samplot/test/data/NA12890_restricted.cram \ -o cramX_101055330_101067156.png -c chrX \ -s 101055330 \ -e 101067156 \ -t DUP \ -r hg19.fa ``` The arguments used above are the same as those used for the basic use case, with the addition of the following: `-r` The reference file used for reading CRAM files #### Plotting without the SV Samplot can also plot genomic regions that are unrelated to an SV. If you do not pass the SV type option (`-t`) then the top SV bar will go away and only the region that is given by `-c` `-s` and `-e` will be displayed. #### Long read (Oxford nanopore and PacBio) and linked read support Any alignment that is longer than 1000 bp is treated as a long read, and the plot design will focus on aligned regions and gaps. Aligned regions are in orange, and gaps follow the same DEL/DUP/INV color code used for short reads. The height of the alignment is based on the size of its largest gap. If the bam file has an MI tag, then the reads will be treated as linked reads. The plots will be similar to short read plots, but all alignments with the same MI are plotted at the same height according to the alignment with the largest gap in the group. A green line connects all alignments in a group. 
================================================ FILE: requirements.txt ================================================ matplotlib<3.7 numpy pysam>=0.15 wget Jinja2 ================================================ FILE: runtests.sh ================================================ echo "running unit tests:" python test/unit/samplot_test.py echo "finished unit tests" echo "running functional tests for \`plot\`:" bash test/func/samplot_test.sh printf "\n\nfinished functional tests for \`plot\`:\n" printf "running functional tests for \`vcf\`:\n" bash test/func/samplot_vcf_test.sh echo "finished functional tests for \`vcf\`:" ================================================ FILE: samplot/__init__.py ================================================ #!/usr/bin/env python __version__ = "1.3.1" ================================================ FILE: samplot/__main__.py ================================================ #!/usr/bin/env python import argparse import logging import sys from .__init__ import __version__ from .samplot import add_plot from .samplot_vcf import add_vcf def main(args=None): logging.basicConfig(level=logging.INFO, stream=sys.stderr, format="%(module)s - %(levelname)s: %(message)s") if args is None: args = sys.argv[1:] parser = argparse.ArgumentParser( prog="samplot", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument( "-v", "--version", help="Installed version", action="version", version="%(prog)s " + str(__version__), ) sub = parser.add_subparsers(title="[sub-commands]", dest="command") sub.required = True add_plot(sub) add_vcf(sub) args,extra_args = parser.parse_known_args(args) args.func(parser, args, extra_args) if __name__ == "__main__": sys.exit(main() or 0) ================================================ FILE: samplot/samplot.py ================================================ #!/usr/bin/env python from __future__ import print_function import logging import os import random import re import sys from argparse import 
SUPPRESS import matplotlib matplotlib.use("Agg") #must be before imports of submodules in matplotlib import matplotlib.gridspec as gridspec import matplotlib.patches as mpatches import matplotlib.path as mpath import matplotlib.pyplot as plt import matplotlib.ticker as ticker import numpy as np import pysam import warnings warnings.filterwarnings('ignore', 'FixedFormatter should only be used together with FixedLocator') from matplotlib.offsetbox import AnchoredText logger = logging.getLogger(__name__) INTERCHROM_YAXIS = 5000 COLORS = { "Deletion/Normal": "black", "Deletion": "black", "Duplication": "red", "Inversion": "blue", "InterChrmInversion": "blue", "InterChrm": "black", } READ_TYPES_USED = { "Deletion/Normal": False, "Duplication": False, "Inversion": False, "Aligned long read": False, "Linked read": False, "Split-read": False, "Paired-end read": False, } # pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment.cigartuples CIGAR_MAP = { "M": 0, "I": 1, "D": 2, "N": 3, "S": 4, "H": 5, "P": 6, "=": 7, "X": 8, "B": 9, } def strip_chr(chrom): """ safer way to replace chr string, to support non-human genomes """ if chrom[:3] == "chr": chrom = chrom[3:] return chrom # {{{class plan_step: class plan_step: step_events = ["Align", "ANNOTATION"] def __init__(self, start_pos, end_pos, event, info=None): self.start_pos = start_pos self.end_pos = end_pos self.event = event self.info = info def __str__(self): if self.info: return ( "Step(" + str(self.start_pos) + ", " + str(self.end_pos) + ", " + self.event + ", " + str(self.info) + ")" ) else: return ( "Step(" + str(self.start_pos) + ", " + str(self.end_pos) + ", " + self.event + ")" ) def __repr__(self): return str(self) # }}} # {{{class genome_interval: class genome_interval: def __init__(self, chrm, start, end): self.chrm = chrm self.start = start self.end = end def __str__(self): return "(" + self.chrm + "," + str(self.start) + "," + str(self.end) + ")" def __repr__(self): return str(self) def __eq__(self, 
gi2): return self.chrm == gi2.chrm and self.start == gi2.start and self.end == gi2.end """ return -1 if before, 0 if in, 1 if after """ def intersect(self, gi): if strip_chr(gi.chrm) < strip_chr(self.chrm) or gi.end < self.start: return -1 elif strip_chr(gi.chrm) > strip_chr(self.chrm) or gi.start > self.end: return 1 else: return 0 # }}} # {{{def get_range_hit(ranges, chrm, point): def get_range_hit(ranges, chrm, point): for j in range(len(ranges)): r = ranges[j] if ( strip_chr(r.chrm) == strip_chr(chrm) and r.start <= point and r.end >= point ): return j return None # }}} # {{{def map_genome_point_to_range_points(ranges, chrm, point): def map_genome_point_to_range_points(ranges, chrm, point): range_hit = get_range_hit(ranges, chrm, point) if range_hit == None: return None p = 1.0 / len(ranges) * range_hit + (1.0 / len(ranges)) * ( float(point - ranges[range_hit].start) / float(ranges[range_hit].end - ranges[range_hit].start) ) return p # }}} # {{{def points_in_window(points): def points_in_window(points): """Checks whether these points lie within the window of interest Points is a list of one start, one end coordinate (ints) """ if ( None in points or points[0] < -5 or points[1] < -5 or points[0] > 5 or points[1] > 5 ): return False return True # }}} # {{{ def get_tabix_iter(chrm, start, end, datafile): def get_tabix_iter(chrm, start, end, datafile): """Gets an iterator from a tabix BED/GFF3 file Used to avoid chrX vs. 
X notation issues when extracting data from annotation files """ try: tbx = pysam.TabixFile(datafile) except: tbx = pysam.TabixFile(datafile, index=datafile+".csi") itr = None try: itr = tbx.fetch(chrm, max(0, start - 1000), end + 1000) except ValueError: # try and account for chr/no chr prefix if chrm[:3] == "chr": chrm = chrm[3:] else: chrm = "chr" + chrm try: itr = tbx.fetch(chrm, max(0, start - 1000), end + 1000) except ValueError as e: logger.warning( "Could not fetch {}:{}-{} from {}".format( chrm, start, end, datafile ) ) print(e) return itr # }}} ##Coverage methods # {{{def add_coverage(bam_file, read, coverage, separate_mqual): def add_coverage(read, coverage_matrix, offset, column): """Adds a read to the known coverage Coverage from Pysam read is added to coverage_matrix. offset defines the start position of the current range column specifies which column to add to. """ curr_pos = read.reference_start if not read.cigartuples: return for op, length in read.cigartuples: if op in [CIGAR_MAP["M"], CIGAR_MAP["="], CIGAR_MAP["X"]]: coverage_matrix[curr_pos - offset: curr_pos + length - offset, column] += 1 curr_pos += length elif op == CIGAR_MAP["I"]: curr_pos = curr_pos elif op == CIGAR_MAP["D"]: curr_pos += length elif op == CIGAR_MAP["N"]: curr_pos = length elif op == CIGAR_MAP["S"]: curr_pos = curr_pos elif op == CIGAR_MAP["H"]: curr_pos = curr_pos else: curr_pos += length # }}} # {{{def plot_coverage(coverage, def plot_coverage( coverage, ax, ranges, hp_count, max_coverage, tracktype, yaxis_label_fontsize, max_coverage_points, ): """Plots high and low quality coverage for the region User may specify a preference between stacked and superimposed superimposed may cause unexpected behavior if low-quality depth is greater than high """ cover_x = [] cover_y_lowqual = [] cover_y_highqual = [] cover_y_all = [] for i in range(len(ranges)): r = ranges[i] region_len = r.end-r.start downsample = 1 if region_len > max_coverage_points: downsample = int(region_len / 
max_coverage_points) for i,pos in enumerate(range(r.start, r.end + 1)): if i%downsample != 0: continue cover_x.append(map_genome_point_to_range_points(ranges, r.chrm, pos)) if r.chrm in coverage and pos in coverage[r.chrm]: cover_y_all.append(coverage[r.chrm][pos][0] + coverage[r.chrm][pos][1]) cover_y_highqual.append(coverage[r.chrm][pos][0]) cover_y_lowqual.append(coverage[r.chrm][pos][1]) else: cover_y_lowqual.append(0) cover_y_highqual.append(0) cover_y_all.append(0) cover_y_lowqual = np.array(cover_y_lowqual) cover_y_highqual = np.array(cover_y_highqual) cover_y_all = np.array(cover_y_all) if max_coverage > 0: max_plot_depth = max_coverage elif cover_y_all.max() > 3 * cover_y_all.mean(): max_plot_depth = max( np.percentile(cover_y_all, 99.5), np.percentile(cover_y_all, 99.5) ) else: max_plot_depth = np.percentile(cover_y_all.max(), 99.5) ax2 = ax.twinx() ax2.set_xlim([0, 1]) if 0 == max_plot_depth: max_plot_depth = 0.01 ax2.set_ylim([0, max(1, max_plot_depth)]) bottom_fill = np.zeros(len(cover_y_all)) if tracktype == "stack": ax2.fill_between( cover_x, cover_y_highqual, bottom_fill, color="darkgrey", step="pre", alpha=0.4, ) ax2.fill_between( cover_x, cover_y_all, cover_y_highqual, color="grey", step="pre", alpha=0.15 ) elif tracktype == "superimpose": ax2.fill_between( cover_x, cover_y_lowqual, bottom_fill, color="grey", step="pre", alpha=0.15 ) ax2.fill_between( cover_x, cover_y_highqual, cover_y_lowqual, color="darkgrey", step="pre", alpha=0.4, ) ax2.fill_between( cover_x, cover_y_lowqual, bottom_fill, color="grey", step="pre", alpha=0.15 ) ## tracktype==None also allowed # number of ticks should be 6 if there's one hp, 3 otherwise tick_count = 5 if hp_count == 1 else 2 tick_count = max(int(max_plot_depth / tick_count), 1) # set axis parameters #ax2.yaxis.set_major_locator(ticker.FixedLocator(tick_count)) ax2.yaxis.set_major_locator(ticker.MultipleLocator(tick_count)) ax2.tick_params(axis="y", colors="grey", labelsize=yaxis_label_fontsize) 
ax2.spines["top"].set_visible(False) ax2.spines["bottom"].set_visible(False) ax2.spines["left"].set_visible(False) ax2.spines["right"].set_visible(False) ax2.tick_params(axis="x", length=0) ax2.tick_params(axis="y", length=0) # break the variant plot when we have multiple ranges for i in range(1, len(ranges)): ax2.axvline(x=1.0 / len(ranges), color="white", linewidth=5) return ax2 # }}} ##Pair End methods # {{{class PairedEnd: class PairedEnd: """container of paired-end read info Contains start(int), end(int), strand(bool True=forward), MI (int molecular identifier), HP (int haplotype) """ def __init__(self, chrm, start, end, is_reverse, MI_tag, HP_tag): """Create PairedEnd instance Genomic interval is defined by start and end integers Strand is opposite of is_reverse Molecular identifier and Haplotype are integers if present, else False """ self.pos = genome_interval(chrm, start, end) self.strand = not (is_reverse) # molecular identifier - linked reads only self.MI = None # haplotype - phased reads only self.HP = 0 if MI_tag: self.MI = MI_tag if HP_tag: self.HP = HP_tag def __repr__(self): return "PairedEnd(%s,%s,%s,%s,%s,%s)" % ( self.pos.chrm, self.pos.start, self.pos.end, self.strand, self.MI, self.HP, ) # }}} # {{{ def add_pair_end(bam_file, read, pairs, linked_reads): def add_pair_end(bam_file, read, pairs, linked_reads, ignore_hp): """adds a (mapped, primary, non-supplementary, and paired) read to the pairs list Pysam read is added as simpified PairedEnd instance to pairs Also added to linked_reads list if there is an associated MI tag """ if read.is_unmapped: return if not (read.is_paired): return if read.is_secondary: return if read.is_supplementary: return MI_tag = False HP_tag = False if read.has_tag("MI"): MI_tag = int(read.get_tag("MI")) if not ignore_hp and read.has_tag("HP"): HP_tag = int(read.get_tag("HP")) pe = PairedEnd( bam_file.get_reference_name(read.reference_id), read.reference_start, read.reference_end, read.is_reverse, MI_tag, HP_tag, ) if 
pe.HP not in pairs: pairs[pe.HP] = {} if read.query_name not in pairs[pe.HP]: pairs[pe.HP][read.query_name] = [] if pe.MI: if pe.HP not in linked_reads: linked_reads[pe.HP] = {} if pe.MI not in linked_reads[pe.HP]: linked_reads[pe.HP][pe.MI] = [[], []] linked_reads[pe.HP][pe.MI][0].append(read.query_name) pairs[pe.HP][read.query_name].append(pe) pairs[pe.HP][read.query_name].sort(key=lambda x: x.pos.start) # }}} # {{{def sample_normal(max_depth, pairs, z): def sample_normal(max_depth, pairs, z): """Downsamples paired-end reads Selects max_depth reads Does not remove discordant pairs, those with insert distance greater than z stdevs from mean Returns downsampled pairs list """ sampled_pairs = {} plus_minus_pairs = {} if max_depth == 0: return sampled_pairs for read_name in pairs: pair = pairs[read_name] if len(pair) != 2: continue if pair[0].strand == True and pair[1].strand == False: plus_minus_pairs[read_name] = pair else: sampled_pairs[read_name] = pair if len(plus_minus_pairs) > max_depth: lens = np.array( [pair[1].pos.end - pair[0].pos.start for pair in plus_minus_pairs.values()] ) mean = np.mean(lens) stdev = np.std(lens) inside_norm = {} for read_name in pairs: pair = pairs[read_name] if len(pair) != 2: continue if pair[1].pos.end - pair[0].pos.start >= mean + z * stdev: sampled_pairs[read_name] = pair else: inside_norm[read_name] = pair if len(inside_norm) > max_depth: for read_name in random.sample(list(inside_norm.keys()), max_depth): sampled_pairs[read_name] = inside_norm[read_name] else: for read_name in inside_norm: sampled_pairs[read_name] = inside_norm[read_name] else: for read_name in plus_minus_pairs: sampled_pairs[read_name] = plus_minus_pairs[read_name] return sampled_pairs # }}} # {{{def get_pairs_insert_sizes(pairs): def get_pairs_insert_sizes(ranges, pairs): """Extracts the integer insert sizes for all pairs Return list of integer insert sizes """ pair_insert_sizes = [] for hp in pairs: for read_name in pairs[hp]: if len(pairs[hp][read_name]) 
== 2: size = get_pair_insert_size(ranges, pairs[hp][read_name]) if size: pair_insert_sizes.append(size) return pair_insert_sizes # }}} # {{{def get_pair_insert_size(ranges, pair): def get_pair_insert_size(ranges, pair): """ Gives the outer distance """ first = pair[0] second = pair[1] # make sure both sides are in range if ( get_range_hit(ranges, first.pos.chrm, first.pos.start) != None or get_range_hit(ranges, first.pos.chrm, first.pos.end) != None ) and ( get_range_hit(ranges, second.pos.chrm, second.pos.start) != None or get_range_hit(ranges, second.pos.chrm, second.pos.end) != None ): if first.pos.chrm == second.pos.chrm: return abs(second.pos.end - first.pos.start) else: return INTERCHROM_YAXIS else: return None # }}} # {{{ def get_pairs_plan(ranges, pairs, linked_plan=False): def get_pairs_plan(ranges, pairs, linked_plan=False): steps = [] max_event = 0 insert_sizes = [] for read_name in pairs: pair = pairs[read_name] plan = get_pair_plan(ranges, pair) if plan: insert_size, step = plan insert_sizes.append(insert_size) steps.append(step) if len(insert_sizes) > 0: max_event = max(insert_sizes) plan = [max_event, steps] return plan # }}} # {{{def get_pair_plan(ranges, pair, linked_plan=False): def get_pair_plan(ranges, pair, linked_plan=False): if pair == None or len(pair) != 2: return None first = pair[0] second = pair[1] # see if they are part of a linked read if not linked_plan and (first.MI or second.MI): return None # make sure both ends are in the plotted region first_s_hit = get_range_hit(ranges, first.pos.chrm, first.pos.start) first_e_hit = get_range_hit(ranges, first.pos.chrm, first.pos.end) second_s_hit = get_range_hit(ranges, second.pos.chrm, second.pos.start) second_e_hit = get_range_hit(ranges, second.pos.chrm, second.pos.end) if (first_s_hit == None and first_e_hit == None) or ( second_s_hit == None and second_e_hit == None ): return None insert_size = get_pair_insert_size(ranges, pair) first_hit = first_s_hit if first_s_hit != None else 
first_e_hit second_hit = second_e_hit if second_e_hit != None else second_s_hit start = genome_interval( first.pos.chrm, max(first.pos.start, ranges[first_hit].start), max(first.pos.start, ranges[first_hit].start), ) end = genome_interval( second.pos.chrm, min(second.pos.end, ranges[second_hit].end), min(second.pos.end, ranges[second_hit].end), ) step = plan_step(start, end, "PAIREND") event_type = get_pair_event_type(pair) step.info = {"TYPE": event_type, "INSERTSIZE": insert_size} return insert_size, step # }}} # {{{def get_pair_event_type(pe_read): def get_pair_event_type(pe_read): """Decide what type of event the read supports (del/normal, dup, inv) """ event_by_strand = { (True, False): "Deletion/Normal", (False, True): "Duplication", (False, False): "Inversion", (True, True): "Inversion", } event_type = event_by_strand[pe_read[0].strand, pe_read[1].strand] return event_type # }}} def jitter(value, bounds: float = 0.1) -> float: """ Offset value by a random value within the defined bounds """ assert 0.0 <= bounds < 1.0 return value * (1 + bounds * random.uniform(-1, 1)) # {{{def plot_pair_plan(ranges, step, ax): def plot_pair_plan(ranges, step, ax, marker_size, jitter_bounds): p = [ map_genome_point_to_range_points( ranges, step.start_pos.chrm, step.start_pos.start ), map_genome_point_to_range_points(ranges, step.end_pos.chrm, step.end_pos.end), ] if None in p: return False # some points are far outside of the printable area, so we ignore them if not points_in_window(p): return False READ_TYPES_USED["Paired-end read"] = True y = step.info["INSERTSIZE"] # Offset y-values using jitter to avoid overlapping lines y = jitter(y, bounds=jitter_bounds) event_type = step.info["TYPE"] READ_TYPES_USED[event_type] = True color = COLORS[event_type] # plot the individual pair ax.plot( p, [y, y], "-", color=color, alpha=0.25, lw=0.5, marker="s", markersize=marker_size, zorder=10, ) return True # }}} # {{{def plot_pairs(pairs, def plot_pairs( pairs, ax, ranges, 
curr_min_insert_size, curr_max_insert_size, marker_size, jitter_bounds, ): """Plots all PairedEnd reads for the region """ plan = get_pairs_plan(ranges, pairs) if not plan: [curr_min_insert_size, curr_max_insert_size] max_event, steps = plan for step in steps: plot_pair_plan(ranges, step, ax, marker_size, jitter_bounds) if not curr_min_insert_size or curr_min_insert_size > max_event: curr_min_insert_size = max_event if not curr_max_insert_size or curr_max_insert_size < max_event: curr_max_insert_size = max_event return [curr_min_insert_size, curr_max_insert_size] # }}} ##Split Read methods # {{{class SplitRead: class SplitRead: """container of split read info Contains start(int), end(int), strand(bool True=forward), query position (int), MI (int molecular identifier), HP (int haplotype) """ def __init__(self, chrm, start, end, strand, query_pos, MI_tag=None, HP_tag=None): """Create SplitRead instance Genomic interval is defined by start, end, and query_pos integers Strand is opposite of is_reverse Molecular identifier and Haplotype are integers if present, else False """ self.pos = genome_interval(chrm, start, end) self.strand = strand self.query_pos = query_pos # molecular identifier - linked reads only self.MI = None # haplotype - phased reads only self.HP = 0 if MI_tag: self.MI = MI_tag if HP_tag: self.HP = HP_tag def __repr__(self): return "SplitRead(%s,%s,%s,%s,%s,%s,%s)" % ( self.pos.chrm, self.pos.start, self.pos.end, self.strand, self.query_pos, self.MI, self.HP, ) # }}} # {{{def calc_query_pos_from_cigar(cigar, strand): def calc_query_pos_from_cigar(cigar, strand): """Uses the CIGAR string to determine the query position of a read The cigar arg is a string like the following: 86M65S The strand arg is a boolean, True for forward strand and False for reverse Returns pair of ints for query start, end positions """ cigar_ops = [[int(op[0]), op[1]] for op in re.findall("(\d+)([A-Za-z])", cigar)] order_ops = cigar_ops if not strand: # - strand order_ops = 
order_ops[::-1] qs_pos = 0 qe_pos = 0 q_len = 0 for op_position in range(len(cigar_ops)): op_len = cigar_ops[op_position][0] op_type = cigar_ops[op_position][1] if op_position == 0 and (op_type == "H" or op_type == "S"): qs_pos += op_len qe_pos += op_len q_len += op_len elif op_type == "H" or op_type == "S": q_len += op_len elif op_type == "M" or op_type == "I" or op_type == "X": qe_pos += op_len q_len += op_len return qs_pos, qe_pos # }}} # {{{def add_split(read, splits, bam_file, linked_reads): def add_split(read, splits, bam_file, linked_reads, ignore_hp): """adds a (primary, non-supplementary) read to the splits list Pysam read is added as simpified SplitRead instance to splits Also added to linked_reads list if there is an associated MI tag """ if read.is_secondary: return if read.is_supplementary: return if not read.has_tag("SA"): return qs_pos, qe_pos = calc_query_pos_from_cigar(read.cigarstring, (not read.is_reverse)) HP_tag = False MI_tag = False if read.has_tag("MI"): MI_tag = int(read.get_tag("MI")) if not ignore_hp and read.has_tag("HP"): HP_tag = int(read.get_tag("HP")) sr = SplitRead( bam_file.get_reference_name(read.reference_id), read.reference_start, read.reference_end, not (read.is_reverse), qs_pos, MI_tag, HP_tag, ) if sr.MI: if sr.HP not in linked_reads: linked_reads[sr.HP] = {} if sr.MI not in linked_reads[sr.HP]: linked_reads[sr.HP][sr.MI] = [[], []] linked_reads[sr.HP][sr.MI][1].append(read.query_name) if sr.HP not in splits: splits[sr.HP] = {} splits[sr.HP][read.query_name] = [sr] for sa in read.get_tag("SA").split(";"): if len(sa) == 0: continue A = sa.split(",") chrm = A[0] pos = int(A[1]) strand = A[2] == "+" cigar = A[3] #mapq and nm are never used, annotating this for code readability mapq = int(A[4]) nm = int(A[5]) qs_pos, qe_pos = calc_query_pos_from_cigar(cigar, strand) splits[sr.HP][read.query_name].append( SplitRead(chrm, pos, pos + qe_pos, strand, qs_pos) ) if len(splits[sr.HP][read.query_name]) == 1: del 
splits[sr.HP][read.query_name] else: splits[sr.HP][read.query_name].sort(key=lambda x: x.pos.start) # }}} # {{{def get_split_plan(ranges, split): def get_split_plan(ranges, split, linked_plan=False): """ There can be 2 or more alignments in a split. Plot only those that are in a range, and set the insert size to be the largest gap A split read acts like a long read, so we will covert the split read to a long read, then convert the long read plan back to a split read plan """ alignments = [] for s in split: # see if they are part of a linked read if not linked_plan and (s.MI): return None alignment = Alignment(s.pos.chrm, s.pos.start, s.pos.end, s.strand, s.query_pos) alignments.append(alignment) long_read = LongRead(alignments) long_reads = {} long_reads["convert"] = [long_read] plan = get_long_read_plan("convert", long_reads, ranges) if not plan: return None max_gap, lr_steps = plan if len(lr_steps) < 3: return None sr_steps = [] # a split read will include 3 long read steps, align, event, align for i in range(0, len(lr_steps), 2): if i + 2 > len(lr_steps): break if ( lr_steps[i].info["TYPE"] == "Align" and lr_steps[i + 1].info["TYPE"] != "Align" and lr_steps[i + 2].info["TYPE"] == "Align" ): start = genome_interval( lr_steps[i].end_pos.chrm, lr_steps[i].end_pos.end, lr_steps[i].end_pos.end, ) end = genome_interval( lr_steps[i + 2].start_pos.chrm, lr_steps[i + 2].start_pos.start, lr_steps[i + 2].start_pos.start, ) sr_steps.append( plan_step( start, end, "SPLITREAD", info={"TYPE": lr_steps[i + 1].info["TYPE"], "INSERTSIZE": max_gap}, ) ) return max_gap, sr_steps # }}} # {{{def get_splits_plan(ranges, splits, linked_plan=False): def get_splits_plan(ranges, splits, linked_plan=False): steps = [] max_event = 0 insert_sizes = [] for read_name in splits: split = splits[read_name] plan = get_split_plan(ranges, split) if plan: insert_size, step = plan insert_sizes.append(insert_size) steps += step if len(insert_sizes) > 0: max_event = max(insert_sizes) plan = [max_event, 
steps] return plan # }}} # {{{def plot_split(split, y, ax, ranges): def plot_split_plan(ranges, step, ax, marker_size, jitter_bounds): p = [ map_genome_point_to_range_points( ranges, step.start_pos.chrm, step.start_pos.start ), map_genome_point_to_range_points(ranges, step.end_pos.chrm, step.end_pos.end), ] if None in p: return False # some points are far outside of the printable area, so we ignore them if not points_in_window(p): return False READ_TYPES_USED["Split-read"] = True y = step.info["INSERTSIZE"] # Offset y-values using jitter to avoid overlapping lines y = jitter(y, bounds=jitter_bounds) event_type = step.info["TYPE"] READ_TYPES_USED[event_type] = True color = COLORS[event_type] ax.plot( p, [y, y], ":", color=color, alpha=0.25, lw=1, marker="o", markersize=marker_size, ) # }}} # {{{def plot_splits(splits, def plot_splits( splits, ax, ranges, curr_min_insert_size, curr_max_insert_size, marker_size, jitter_bounds, ): """Plots all SplitReads for the region """ plan = get_splits_plan(ranges, splits) if not plan: [curr_min_insert_size, curr_max_insert_size] max_event, steps = plan for step in steps: plot_split_plan(ranges, step, ax, marker_size, jitter_bounds) if not curr_min_insert_size or curr_min_insert_size > max_event: curr_min_insert_size = max_event if not curr_max_insert_size or curr_max_insert_size < max_event: curr_max_insert_size = max_event return [curr_min_insert_size, curr_max_insert_size] # }}} ##Long Read methods # {{{class Alignment: class Alignment: """container of alignment info, from CIGAR string Contains start(int), end(int), strand(bool True=forward), query position (int) """ def __init__(self, chrm, start, end, strand, query_position): """Create Alignment instance Genomic interval is defined by start, end, and query_pos integers Strand is bool (True for forward) """ self.pos = genome_interval(chrm, start, end) self.strand = strand self.query_position = query_position def __str__(self): return ",".join( [ str(x) for x in [ 
self.pos.chrm, self.pos.start, self.pos.end, self.strand, self.query_position, ] ] ) def __repr__(self): return "Alignment(%s,%s,%s,%s,%s)" % ( self.pos.chrm, self.pos.start, self.pos.end, self.strand, self.query_position, ) # }}} # {{{class LongRead: class LongRead: """container of LongRead info Contains start(int), end(int), list of Alignments """ def __init__(self, alignments): """Create LongRead instance Genomic interval is defined by start, end integers List of Alignments set by parameter """ self.alignments = alignments def __str__(self): return ",".join([str(x) for x in self.alignments]) def __repr__(self): return "LongRead(" + str(self) + ")" # }}} # {{{def get_alignments_from_cigar(chrm, def get_alignments_from_cigar(chrm, curr_pos, strand, cigartuples, reverse=False): """Breaks CIGAR string into individual Aignments Starting point within genome given by curr_pos and strand Set of CIGAR operations and lengths as pairs passed in as cigartuples Direction of alignment set to reverse with reverse boolean Return list of Alignments """ alignments = [] q_pos = 0 if reverse: cigartuples = cigartuples[::-1] for op, length in cigartuples: if op in [CIGAR_MAP["M"], CIGAR_MAP["="], CIGAR_MAP["X"]]: alignments.append( Alignment(chrm, curr_pos, curr_pos + length, strand, q_pos) ) curr_pos += length q_pos += length elif op == CIGAR_MAP["I"]: q_pos += length elif op == CIGAR_MAP["D"]: curr_pos += length elif op == CIGAR_MAP["N"]: curr_pos += length elif op == CIGAR_MAP["S"]: q_pos += length return alignments # }}} # {{{def get_cigartuples_from_string(cigarstring): def get_cigartuples_from_string(cigarstring): """Extracts operations,lengths as tuples from cigar string" Returns list of tuples of [operation,length] """ cigartuples = [] for match in re.findall(r"(\d+)([A-Z]{1})", cigarstring): length = int(match[0]) op = match[1] cigartuples.append((CIGAR_MAP[op], length)) return cigartuples # }}} # {{{def merge_alignments(min_gap, alignments): def merge_alignments(min_gap, 
alignments): """Combines previously identified alignments if close together Alignments are combined if within min_gap distance Returns list of Alignments """ merged_alignments = [] for alignment in alignments: if len(merged_alignments) == 0: merged_alignments.append(alignment) else: if ( alignment.pos.chrm == merged_alignments[-1].pos.chrm and alignment.pos.start < merged_alignments[-1].pos.end + min_gap ): merged_alignments[-1].pos.end = alignment.pos.end else: merged_alignments.append(alignment) return merged_alignments # }}} # {{{def add_long_reads(bam_file, read, long_reads, min_event_size): def add_long_reads(bam_file, read, long_reads, min_event_size, ignore_hp): """Adds a (primary, non-supplementary, long) read to the long_reads list Read added to long_reads if within the inteval defined by ranges Alignments belonging to the LongRead instance combined if within the min_event_size distance apart """ if read.is_supplementary or read.is_secondary: return hp = 0 if not ignore_hp and read.has_tag("HP"): hp = int(read.get_tag("HP")) alignments = get_alignments_from_cigar( bam_file.get_reference_name(read.reference_id), read.pos, not read.is_reverse, read.cigartuples, ) min_gap = min_event_size merged_alignments = merge_alignments(min_gap, alignments) read_strand = not read.is_reverse if read.has_tag("SA"): for sa in read.get_tag("SA").split(";"): if len(sa) == 0: continue rname, pos, strand, cigar, mapq, nm = sa.split(",") sa_pos = int(pos) sa_strand = strand == "+" strand_match = read_strand != sa_strand sa_cigartuples = get_cigartuples_from_string(cigar) sa_alignments = get_alignments_from_cigar( rname, sa_pos, sa_strand, sa_cigartuples, reverse=strand_match ) sa_merged_alignments = merge_alignments(min_gap, sa_alignments) if len(sa_merged_alignments) > 0: merged_alignments += sa_merged_alignments if hp not in long_reads: long_reads[hp] = {} if read.query_name not in long_reads[hp]: long_reads[hp][read.query_name] = [] 
long_reads[hp][read.query_name].append(LongRead(merged_alignments)) # }}} # {{{def add_align_step(alignment, steps, ranges): def add_align_step(alignment, steps, ranges): # alignment can span ranges start_range_hit_i = get_range_hit(ranges, alignment.pos.chrm, alignment.pos.start) end_range_hit_i = get_range_hit(ranges, alignment.pos.chrm, alignment.pos.end) # neither end is in range, add nothing if start_range_hit_i == None and end_range_hit_i == None: return # start is not in range, use end hit if start_range_hit_i == None: start = genome_interval( alignment.pos.chrm, max(alignment.pos.start, ranges[end_range_hit_i].start), max(alignment.pos.start, ranges[end_range_hit_i].start), ) end = genome_interval( alignment.pos.chrm, min(alignment.pos.end, ranges[end_range_hit_i].end), min(alignment.pos.end, ranges[end_range_hit_i].end), ) steps.append(plan_step(start, end, "LONGREAD", info={"TYPE": "Align"})) # end is not in range, use start hit elif end_range_hit_i == None: start = genome_interval( alignment.pos.chrm, max(alignment.pos.start, ranges[start_range_hit_i].start), max(alignment.pos.start, ranges[start_range_hit_i].start), ) end = genome_interval( alignment.pos.chrm, min(alignment.pos.end, ranges[start_range_hit_i].end), min(alignment.pos.end, ranges[start_range_hit_i].end), ) steps.append(plan_step(start, end, "LONGREAD", info={"TYPE": "Align"})) # both are in the same range elif start_range_hit_i == end_range_hit_i: start = genome_interval( alignment.pos.chrm, max(alignment.pos.start, ranges[start_range_hit_i].start), max(alignment.pos.start, ranges[start_range_hit_i].start), ) end = genome_interval( alignment.pos.chrm, min(alignment.pos.end, ranges[end_range_hit_i].end), min(alignment.pos.end, ranges[end_range_hit_i].end), ) steps.append(plan_step(start, end, "LONGREAD", info={"TYPE": "Align"})) # in different ranges else: start_1 = genome_interval( alignment.pos.chrm, max(alignment.pos.start, ranges[start_range_hit_i].start), max(alignment.pos.start, 
ranges[start_range_hit_i].start), ) end_1 = genome_interval( alignment.pos.chrm, ranges[start_range_hit_i].end, ranges[start_range_hit_i].end, ) steps.append(plan_step(start_1, end_1, "LONGREAD", info={"TYPE": "Align"})) start_2 = genome_interval( alignment.pos.chrm, ranges[end_range_hit_i].start, ranges[end_range_hit_i].start, ) end_2 = genome_interval( alignment.pos.chrm, min(alignment.pos.end, ranges[end_range_hit_i].end), min(alignment.pos.end, ranges[end_range_hit_i].end), ) steps.append(plan_step(start_2, end_2, "LONGREAD", info={"TYPE": "Align"})) # }}} # {{{def get_long_read_plan(read_name, long_reads, ranges): def get_long_read_plan(read_name, long_reads, ranges): """Create a plan to render a long read Plan consists of the largest event within the read (used to determine the y-axis position of read) and the alignment types for plotting each Alignment within LongRead.alignments Align, Duplication, Deletion, Inversion, Inversion, InterChrmInversion, InterChrm Returns plan """ alignments = [] # only keep alignments that intersect a range seen = {} if read_name not in long_reads: logger.error("Read name {} not in list of long reads".format(read_name)) sys.exit(1) for long_read in long_reads[read_name]: for alignment in long_read.alignments: if alignment.query_position in seen: continue seen[alignment.query_position] = 1 # check to see if any part of this alignment overlaps a plot # range in_range = False for r in ranges: if r.intersect(alignment.pos) == 0: in_range = True if in_range: alignments.append(alignment) if len(alignments) <= 0: return None alignments.sort(key=lambda x: x.query_position) # we set the primary strand to be the one with the longest alignment # this will affect which alignment is inverted. 
There are clearly edge # cases here that we will need to address as we get more examples # of inversions longest_alignment = 0 longest_alignment_i = -1 for i in range(len(alignments)): l = alignments[i].pos.end - alignments[i].pos.start if longest_alignment < l: longest_alignment = l longest_alignment_i = i primary_strand = alignments[longest_alignment_i].strand steps = [] # long aglinments may spill over the edges, so we will clip that starts curr = alignments[0] add_align_step(curr, steps, ranges) for i in range(1, len(alignments)): last = alignments[i - 1] curr = alignments[i] # figure out what the event is # INTER CHROM if curr.pos.chrm != last.pos.chrm: if curr.strand != last.strand: start = genome_interval(last.pos.chrm, last.pos.end, last.pos.end) end = genome_interval(curr.pos.chrm, curr.pos.end, curr.pos.end) info = {"TYPE": "InterChrmInversion"} steps.append(plan_step(start, end, "LONGREAD", info=info)) else: start = genome_interval(last.pos.chrm, last.pos.end, last.pos.end) end = genome_interval(curr.pos.chrm, curr.pos.start, curr.pos.start) info = {"TYPE": "InterChrm"} steps.append(plan_step(start, end, "LONGREAD", info=info)) add_align_step(curr, steps, ranges) # Inversion elif curr.strand != last.strand: # it is possible that we have a complex even that # is an inverted Duplication if curr.pos.start < last.pos.end: start = genome_interval(last.pos.chrm, last.pos.end, last.pos.end) end = genome_interval(curr.pos.chrm, curr.pos.start, curr.pos.start) info = {"TYPE": "Deletion"} steps.append(plan_step(start, end, "LONGREAD", info=info)) if curr.strand != primary_strand: # last (primary) | curr # +++++++++++++++|------- # ^.......^ # end end # last (primary) | curr # ---------------|+++++++ # ^.......^ # end end start = genome_interval(last.pos.chrm, last.pos.end, last.pos.end) end = genome_interval(curr.pos.chrm, curr.pos.end, curr.pos.end) info = {"TYPE": "Inversion"} steps.append(plan_step(start, end, "LONGREAD", info=info)) else: if curr.pos.start < 
last.pos.end: start = genome_interval(last.pos.chrm, last.pos.end, last.pos.end) end = genome_interval(curr.pos.chrm, curr.pos.start, curr.pos.start) info = {"TYPE": "Duplication"} steps.append(plan_step(start, end, "LONGREAD", info=info)) # last | curr (primary) # +++++++|------------- # ^.......^ # start start # last | curr (primary) # -------|+++++++++++++++ # ^.......^ # start start start = genome_interval(last.pos.chrm, last.pos.start, last.pos.start) end = genome_interval(curr.pos.chrm, curr.pos.start, curr.pos.start) info = {"TYPE": "Inversion"} steps.append(plan_step(start, end, "LONGREAD", info=info)) add_align_step(curr, steps, ranges) # Duplication elif curr.pos.start < last.pos.end: start = genome_interval(last.pos.chrm, last.pos.end, last.pos.end) end = genome_interval(curr.pos.chrm, curr.pos.start, curr.pos.start) info = {"TYPE": "Duplication"} steps.append(plan_step(start, end, "LONGREAD", info=info)) add_align_step(curr, steps, ranges) # Deletion else: start = genome_interval(last.pos.chrm, last.pos.end, last.pos.end) end = genome_interval(curr.pos.chrm, curr.pos.start, curr.pos.start) info = {"TYPE": "Deletion"} # steps.append(plan_step(start, end, 'LONGREAD', info=info)) steps.append(plan_step(start, end, "LONGREAD", info={"TYPE": "Deletion"})) add_align_step(curr, steps, ranges) # if either end is in a range, then add its gap to the list max_gap = None chrms = set([s.start_pos.chrm for s in steps] + [s.end_pos.chrm for s in steps]) # set interchrm dist to 5000 if len(chrms) > 1: max_gap = INTERCHROM_YAXIS else: step_sizes = [ abs(step.end_pos.end - step.start_pos.start) for step in steps if step.info["TYPE"] != "Align" and get_range_hit(ranges, step.start_pos.chrm, step.start_pos.start) != None and get_range_hit(ranges, step.end_pos.chrm, step.end_pos.end) != None ] max_gap = max(step_sizes) if len(step_sizes) > 0 else 0 plan = [max_gap, steps] return plan # }}} ##Variant methods # {{{def plot_variant(sv, sv_type, ax, ranges): def 
plot_variant(sv, sv_type, ax, ranges): """Plots the variant bar at the top of the image """ r = [ map_genome_point_to_range_points(ranges, sv[0].chrm, sv[0].start), map_genome_point_to_range_points(ranges, sv[-1].chrm, sv[-1].end), ] ax.plot(r, [0, 0], "-", color="black", lw=8, solid_capstyle="butt", alpha=0.5) ax.set_xlim([0, 1]) ax.spines["top"].set_visible(False) ax.spines["bottom"].set_visible(False) ax.spines["left"].set_visible(False) ax.spines["right"].set_visible(False) ax.tick_params(axis="x", length=0) ax.tick_params(axis="y", length=0) ax.set_xticklabels([]) ax.set_yticklabels([]) ## make SV title sv_title = "" if sv[0].chrm == sv[-1].chrm: sv_size = float(sv[0].end) - float(sv[0].start) if len(sv) > 1: sv_size = abs(int(float(sv[0].end) - float(sv[-1].start))) sv_size_unit = "bp" if sv_size > 1000000: sv_size = "{0:0.2f}".format(sv_size / 1000000.0) sv_size_unit = "mb" elif sv_size > 1000: sv_size = "{0:0.2f}".format(sv_size / 1000.0) sv_size_unit = "kb" sv_title = str(sv_size) + " " + sv_size_unit + " " + sv_type else: sv_title = sv_type ax.set_title(sv_title, fontsize=8) # }}} # {{{def plot_confidence_interval(chrm, breakpoint,ci, ax, ranges): def plot_confidence_interval(chrm, breakpoint, ci, ax, ranges): """Plots a confidence interval on the variant bar """ r = [ map_genome_point_to_range_points(ranges, chrm, breakpoint - int(ci[0])), map_genome_point_to_range_points(ranges, chrm, breakpoint + int(ci[1])), ] if None in r: # confidence intervals are invalid return ax.plot(r, [0, 0], "-", color="black", lw=0.5, alpha=1) ax.axvline(r[0], color="black", lw=0.5, alpha=1, ymin=0.40, ymax=0.60) ax.axvline(r[1], color="black", lw=0.5, alpha=1, ymin=0.40, ymax=0.60) ax.set_xlim([0, 1]) ax.spines["top"].set_visible(False) ax.spines["bottom"].set_visible(False) ax.spines["left"].set_visible(False) ax.spines["right"].set_visible(False) ax.tick_params(axis="x", length=0) ax.tick_params(axis="y", length=0) ax.set_xticklabels([]) ax.set_yticklabels([]) # }}} # 
# {{{def create_variant_plot(grid,
def create_variant_plot(grid, ax_i, sv, sv_type, ranges, start_ci, end_ci):
    """Plots the pieces of the variant bar at the top, including bar
    and confidence intervals

    grid is the GridSpec the bar is drawn into and ax_i the index of
    the subplot to use. sv is the list of genome intervals for the
    variant; start_ci/end_ci are optional (left, right) distances
    around the first/last breakpoint. Returns the index of the next
    free subplot.
    """
    ax = plt.subplot(grid[ax_i])
    plot_variant(sv, sv_type, ax, ranges)
    ax_i += 1

    # plot confidence intervals if provided
    if start_ci and start_ci != None:
        plot_confidence_interval(sv[0].chrm, sv[0].start, start_ci, ax, ranges)
    if end_ci and end_ci != None:
        plot_confidence_interval(sv[-1].chrm, sv[-1].end, end_ci, ax, ranges)

    # break the variant plot when we have multiple ranges
    for i in range(1, len(ranges)):
        ax.axvline(x=1.0 / len(ranges), color="white", linewidth=5)
        ax.text(
            1.0 / len(ranges),
            0,
            "...",
            fontsize=6,
            fontdict=None,
            horizontalalignment="center",
        )
    return ax_i


# }}}

# Linked Reads methods
# {{{ def get_linked_plan(ranges, pairs, splits, linked_reads, gem_name):
def get_linked_plan(ranges, pairs, splits, linked_reads, gem_name):
    """Builds the drawing plan for a single linked-read gem.

    linked_reads[gem_name] holds two name lists: [0] paired-end read
    names and [1] split-read names sharing the gem barcode. Returns a
    (max insert size, plan steps) tuple, or None when no pair or split
    in the gem produced a usable plan.
    """
    insert_sizes = []
    # candidate breakpoint positions, bucketed per plot range
    gem_poss = [[] for i in range(len(ranges))]
    linked_pair_steps = []
    # collect all the pairs in a gem
    for name in linked_reads[gem_name][0]:
        if name in pairs and len(pairs[name]) == 2:
            pair = pairs[name]
            plan = get_pair_plan(ranges, pair, linked_plan=True)
            if plan:
                insert_size, step = plan
                insert_sizes.append(insert_size)
                linked_pair_steps.append(step)
    # collect all the splits in a gem
    linked_split_steps = []
    for name in linked_reads[gem_name][1]:
        if name in splits:
            split = splits[name]
            plan = get_split_plan(ranges, split, linked_plan=True)
            if plan:
                insert_size, steps = plan
                insert_sizes.append(insert_size)
                linked_split_steps += steps
    if len(linked_split_steps) == 0 and len(linked_pair_steps) == 0:
        return None
    # bucket every step endpoint into the plot range that contains it
    for step in linked_split_steps + linked_pair_steps:
        poss = [
            (step.start_pos.chrm, step.start_pos.start),
            (step.start_pos.chrm, step.start_pos.end),
            (step.end_pos.chrm, step.end_pos.start),
            (step.end_pos.chrm, step.end_pos.end),
        ]
        for pos in poss:
            hit = get_range_hit(ranges, pos[0], pos[1])
            # keep only positions that fall inside one of the plotted ranges
            if hit > -1:
                gem_poss[hit].append(pos[1])
    max_event_size = max(insert_sizes)
    # one LINKED step per range, spanning the extreme positions seen there
    gem_steps = []
    for i in range(len(ranges)):
        if len(gem_poss[i]) == 0:
            continue
        start = genome_interval(ranges[i].chrm, min(gem_poss[i]), min(gem_poss[i]))
        end = genome_interval(ranges[i].chrm, max(gem_poss[i]), max(gem_poss[i]))
        gem_steps.append(plan_step(start, end, "LINKED"))
    # if the gem extends beyond the range, then push the end pos to the
    # end/begining of the range
    # NOTE: assumes at most two ranges, which is the limit elsewhere in
    # this module
    if len(gem_steps) > 1:
        gem_steps[0].end_pos.start = ranges[0].end
        gem_steps[0].end_pos.end = ranges[0].end
        gem_steps[1].start_pos.start = ranges[1].start
        gem_steps[1].start_pos.end = ranges[1].start
    info = {
        "INSERTSIZE": max_event_size,
        "PAIR_STEPS": linked_pair_steps,
        "SPLIT_STEPS": linked_split_steps,
    }
    # attach the pair/split detail to the first step; the plotting code
    # reads it from steps[0].info
    gem_steps[0].info = info
    return max(insert_sizes), gem_steps


# }}}
# {{{def plot_long_reads(long_reads,
def plot_long_reads(long_reads, ax, ranges,
                    curr_min_insert_size, curr_max_insert_size, jitter_bounds):
    """Plots all LongReads for the region

    Each long read is expanded into a plan of steps. "Align" steps are
    drawn as horizontal segments at the read's gap height; event steps
    (deletion, inversion, duplication, inter-chromosomal) are drawn as
    dotted Bezier arcs between their endpoints. Returns the updated
    [min insert size, max insert size] pair.
    """
    Path = mpath.Path
    colors = {
        "Align": "orange",
        "Deletion": "black",
        "Inversion": "blue",
        "Duplication": "red",
        "InterChrm": "black",
        "InterChrmInversion": "blue",
    }

    for read_name in long_reads:
        plan = get_long_read_plan(read_name, long_reads, ranges)
        if plan is None:
            continue
        gap_height = plan[0]
        for step in plan[1]:
            xs = [
                map_genome_point_to_range_points(
                    ranges, step.start_pos.chrm, step.start_pos.start
                ),
                map_genome_point_to_range_points(
                    ranges, step.end_pos.chrm, step.end_pos.end
                ),
            ]
            # some points are far outside of the printable area; skip them
            if not points_in_window(xs):
                continue

            READ_TYPES_USED["Aligned long read"] = True
            event_type = step.info["TYPE"]
            READ_TYPES_USED[event_type] = True

            if event_type == "Align":
                ax.plot(
                    xs,
                    [gap_height, gap_height],
                    "-",
                    color=colors[event_type],
                    alpha=0.25,
                    lw=1,
                )
                curr_max_insert_size = max(curr_max_insert_size, gap_height)
            else:
                left, right = xs
                # bend the connector upward so stacked events stay visible
                arc_top = max(
                    jitter(gap_height * 1.1, bounds=jitter_bounds), gap_height
                )
                arc = mpatches.PathPatch(
                    Path(
                        [
                            (left, gap_height),
                            (left, arc_top),
                            (right, arc_top),
                            (right, gap_height),
                        ],
                        [Path.MOVETO, Path.CURVE4, Path.CURVE4, Path.CURVE4],
                    ),
                    fc="none",
                    color=colors[event_type],
                    alpha=0.25,
                    lw=1,
                    ls=":",
                )
                ax.add_patch(arc)
                # leave vertical headroom for the bent line
                curr_max_insert_size = max(curr_max_insert_size, arc_top)

    return [curr_min_insert_size, curr_max_insert_size]


# }}}

##Setup
# {{{def pair(arg):
def pair(arg):
    """Defines behavior for ArgParse pairs

    Pairs must be comma-separated list of two items; returns the parsed
    [int, int] list or exits the program with an error message.
    """
    try:
        values = [int(field) for field in arg.split(",")]
    except Exception as e:
        logger.error("Invalid pair values")
        print(e, file=sys.stderr)
        sys.exit(1)
    if len(values) != 2:
        logger.error("Invalid number of pair values")
        sys.exit(1)
    return values


# }}}

# {{{def print_arguments(options):
def print_arguments(options):
    """Prints out the arguments to samplot as a json object

    Used as metadata for PlotCritic; written next to the output image
    with a .json extension. No-op unless --print_args or --json_only
    was requested.
    """
    if not (options.print_args or options.json_only):
        return
    import json

    args_filename = os.path.splitext(options.output_file)[0] + ".json"
    args_info = {
        "titles": options.titles or "None",
        "reference": options.reference or "None",
        "bams": options.bams,
        "output_file": options.output_file,
        "start": options.start,
        "end": options.end,
        "chrom": options.chrom,
        "window": options.window,
        "max_depth": options.max_depth or "None",
        "sv_type": options.sv_type,
        "transcript_file": options.transcript_file or "None",
    }
    with open(args_filename, "w") as outfile:
        json.dump(args_info, outfile)


# }}}
"plot 1" "plot 2")', type=str, nargs="+", required=False, ) parser.add_argument( "-r", "--reference", help="Reference file for CRAM, required if " + "CRAM files used", type=str, required=False, ) parser.add_argument( "-z", "--z", type=int, default=4, help="Number of stdevs from the mean (default 4)", required=False, ) def bam_file(bam): if not os.path.isfile(bam): parser.error("alignment file {} does not exist or is not a valid file".format(bam)) options = ["sam", "bam", "cram"] idx_options = ["sai", "bai", "crai", "csi"] fields = os.path.splitext(bam) ext = fields[1][1:].lower() if ext not in options: parser.error("alignment file {} is not in SAM/BAM/CRAM format".format(bam)) idx_type = idx_options[options.index(ext)] #try the type-specific index name picard_bam = os.path.splitext(bam)[0] if (not os.path.isfile(bam + "." + idx_type) and not os.path.isfile(picard_bam + "." + idx_type)): idx_type = idx_options[3] #try the csi index name if not os.path.isfile(bam + "." + idx_type): parser.error("alignment file {} has no index".format(bam)) return bam parser.add_argument( "-b", "--bams", type=bam_file, nargs="+", help="Space-delimited list of BAM/CRAM file names", required=True, ) parser.add_argument( "-o", "--output_file", type=str, help="Output file name/type. " +"Defaults to {type}_{chrom}_{start}_{end}.png", required=False, ) parser.add_argument( "--output_dir", type=str, default=".", help="Output directory name. Defaults to working dir. 
" +"Ignored if --output_file is set", required=False, ) parser.add_argument( "-s", "--start", type=int, help="Start position of region/variant (add multiple for translocation/BND events)", action="append", required=True, ) parser.add_argument( "-e", "--end", type=int, help="End position of region/variant (add multiple for translocation/BND events)", action="append", required=True, ) parser.add_argument( "-c", "--chrom", type=str, help="Chromosome (add multiple for translocation/BND events)", action="append", required=True ) parser.add_argument( "-w", "--window", type=int, help="Window size (count of bases to include " + "in view), default(0.5 * len)", required=False, ) parser.add_argument( "-d", "--max_depth", type=int, help="Max number of normal pairs to plot", default=1, required=False, ) parser.add_argument( "-t", "--sv_type", type=str, help="SV type. If omitted, plot is created " + "without variant bar", required=False, ) def gff_file(transcript_file): if not os.path.isfile(transcript_file): parser.error("transcript file {} does not exist or is not a valid file".format(transcript_file)) options = ["gff", "gff3"] fields = os.path.splitext(transcript_file) ext = fields[1][1:] if ext == "gz": ext = os.path.splitext(fields[0])[1][1:] ext = ext.lower() if ext not in options: parser.error("transcript file {} is not in GFF3 format".format(transcript_file)) idx_file = transcript_file + ".tbi" if not os.path.isfile(idx_file): idx_file = transcript_file + ".csi" if not os.path.isfile(idx_file): parser.error("transcript file {} is missing .tbi/.csi index file".format(transcript_file)) return transcript_file parser.add_argument( "-T", "--transcript_file", help="GFF3 of transcripts", required=False, type=gff_file, ) parser.add_argument( "--transcript_filename", help="Name for transcript track", required=False, type=str, ) parser.add_argument( "--max_coverage_points", help="number of points to plot in coverage axis (downsampled from region size for speed)", required=False, 
type=int, default=1000, ) def bed_file(annotation_file): if not os.path.isfile(annotation_file): parser.error("annotation file {} does not exist or is not a valid file".format(annotation_file)) fields = os.path.splitext(annotation_file) ext = fields[1][1:] if ext == "gz": ext = os.path.splitext(fields[0])[1][1:] ext = ext.lower() if ext != "bed": parser.error("annotation file {} is not in BED format".format(annotation_file)) idx_file = annotation_file + ".tbi" if not os.path.isfile(idx_file): idx_file = annotation_file + ".csi" if not os.path.isfile(idx_file): parser.error("annotation file {} is missing .tbi index file".format(annotation_file)) return annotation_file parser.add_argument( "-A", "--annotation_files", type=bed_file, nargs="+", help="Space-delimited list of bed.gz tabixed " + "files of annotations (such as repeats, " + "mappability, etc.)", required=False, ) parser.add_argument( "--annotation_filenames", type=str, nargs="+", help="Space-delimited list of names for the tracks in --annotation_files", required=False, ) parser.add_argument( "--coverage_tracktype", type=str, help="type of track to use for low MAPQ " + "coverage plot.", choices=["stack", "superimpose", "none"], default="stack", required=False, ) parser.add_argument( "-a", "--print_args", action="store_true", default=False, help="Print commandline arguments to a json file, useful with PlotCritic", required=False, ) parser.add_argument( "-H", "--plot_height", type=int, help="Plot height", required=False ) parser.add_argument( "-W", "--plot_width", type=int, help="Plot width", required=False ) parser.add_argument( "-q", "--include_mqual", type=int, help="Min mapping quality of reads to be included in plot (default 1)", default=1, required=False, ) parser.add_argument( "--separate_mqual", type=int, help="coverage from reads with MAPQ <= separate_mqual " + "plotted in lighter grey. 
To disable, " + "pass in negative value", default=0, required=False, ) parser.add_argument( "-j", "--json_only", action="store_true", default=False, help="Create only the json file, not the " + "image plot", required=False, ) parser.add_argument( "--start_ci", help="confidence intervals of SV first " + "breakpoint (distance from the " + "breakpoint). Must be a " + "comma-separated pair of ints (i.e. 20,40)", type=pair, required=False, ) parser.add_argument( "--end_ci", help="confidence intervals of SV end " + "breakpoint (distance from the " + "breakpoint). Must be a " + "comma-separated pair of ints (i.e. 20,40)", type=pair, required=False, ) parser.add_argument( "--long_read", type=int, default=1000, help="Min length of a read to be treated as a " + "long-read (default 1000)", required=False, ) parser.add_argument( "--ignore_hp", action="store_true", help="Choose to ignore HP tag in alignment files", required=False, ) parser.add_argument( "--min_event_size", type=int, default=20, help="Min size of an event in long-read " + "CIGAR to include (default 20)", required=False, ) parser.add_argument( "--xaxis_label_fontsize", type=int, default=6, help="Font size for X-axis labels (default 6)", required=False, ) parser.add_argument( "--yaxis_label_fontsize", type=int, default=6, help="Font size for Y-axis labels (default 6)", required=False, ) parser.add_argument( "--legend_fontsize", type=int, default=6, help="Font size for legend labels (default 6)", required=False, ) parser.add_argument( "--annotation_fontsize", type=int, default=6, help="Font size for annotation labels (default 6)", required=False, ) parser.add_argument( "--hide_annotation_labels", action="store_true", default=False, help="Hide the label (fourth column text) " + "from annotation files, useful for regions " + "with many annotations", required=False, ) parser.add_argument( "--coverage_only", action="store_true", default=False, help="Hide all reads and show only coverage", required=False, ) 
parser.add_argument( "--max_coverage", default=0, type=int, help="apply a maximum coverage cutoff. Unlimited by default", ) parser.add_argument( "--same_yaxis_scales", action="store_true", default=False, help="Set the scales of the Y axes to the " + "max of all", required=False, ) parser.add_argument( "--marker_size", type=int, default=3, help="Size of marks on pairs and splits (default 3)", required=False, ) parser.add_argument( "--jitter", type=float, nargs="?", const=0.08, default=0.0, help="Add uniform random noise to insert sizes. This can be helpful " "to resolve overlapping entries. Either a custom value (<1.0) is " "supplied or %(const)s will be used." ) parser.add_argument( "--dpi", type=int, default=300, help="Dots per inches (pixel count, default 300)", required=False, ) parser.add_argument( "--annotation_scalar", type=float, default=.3, help="scaling factor for the optional annotation/trascript tracks", required=False, ) parser.add_argument( "--zoom", type=int, default=500000, help="Only show +- zoom amount around breakpoints, " +"much faster for large regions. 
# {{{def estimate_fragment_len(bam)
def estimate_fragment_len(bam, reference):
    """Estimates the median fragment (template) length of an alignment file.

    bam is a SAM/BAM/CRAM path; reference is the FASTA required to
    decode CRAM (may be None for BAM). Samples |TLEN| from up to the
    first 10,000 reads and returns the median, or 0 when fewer than
    5,000 reads were seen (too few for a stable estimate). Exits the
    program if the file cannot be opened.
    """
    try:
        if not reference:
            bam_file = pysam.AlignmentFile(bam, "rb")
        else:
            bam_file = pysam.AlignmentFile(bam, "rc", reference_filename=reference)
    except Exception as err:
        # report the path argument; bam_file is unassigned when the
        # constructor raises, so formatting it would mask the real error
        # with an UnboundLocalError
        logger.error("Error opening file {}".format(bam))
        print(err, file=sys.stderr)
        sys.exit(1)

    # sample template lengths from the first reads in file order
    frag_lens = []
    for i, read in enumerate(bam_file):
        if i >= 10000:
            break
        frag_lens.append(abs(read.tlen))
    bam_file.close()

    if len(frag_lens) >= 5000:
        return np.median(frag_lens)

    logger.warning(
        "Insufficient reads for fragment length estimate.\nContinuing with unmodified window size"
    )
    return 0


# }}}
1) SV is not given, window = 0 1) SV is given 1) it is directly set 2) it is not directly set 2.1) single interval SV 2.2) zoom set 2.3) 2-interval SV """ # if an SV type is given, then expand the window around its bounds if sv_type: # if the sv has one interval then set the window proportional # to sv size and set one range if len(sv) == 1: if arg_window: window = arg_window else: window = int((sv[0].end - sv[0].start) / 2) frag_len = estimate_fragment_len(bams[0], reference) if (0 < frag_len) and (window < 1.5 * frag_len): old_window = window window = int(1.5 * frag_len) logger.warning( "Window size is under 1.5x the estimated fragment length " + "and will be resized to {}. Rerun with -w {} to override".format( window, old_window ) ) ranges = [ genome_interval( sv[0].chrm, max(0, sv[0].start - window), sv[0].end + window ) ] # if region is larger than zoom, set window to zoom and set two ranges if window >= zoom: window = zoom ranges = [ genome_interval( sv[0].chrm, max(0, sv[0].start - window), sv[0].start + window, ), genome_interval( sv[0].chrm, max(0, sv[0].end - window), sv[0].end + window ), ] elif len(sv) == 2: if arg_window: window = arg_window elif zoom: window = zoom else: window = 1000 ranges = [ genome_interval( sv[0].chrm, max(0, sv[0].start - window), sv[0].start + window ), genome_interval( sv[1].chrm, max(0, sv[1].end - window), sv[1].end + window ), ] else: logger.error("{} genome splits are not supported".format(str(len(sv)))) sys.exit(1) else: ranges = [genome_interval(sv[0].chrm, sv[0].start, sv[0].end)] return plot_height, plot_width, window, ranges # }}} # {{{def get_read_data(ranges, def get_read_data( ranges, bams, reference, separate_mqual, include_mqual, coverage_only, long_read_length, min_event_size, same_yaxis_scales, max_depth, z_score, ignore_hp, ): """Reads alignment files to extract reads for the region Region and alignment files given with chrom, start, end, bams If CRAM files are used, reference must be provided Reads with 
mapping quality below include_mqual will not be retrieved If coverage_only, reads are not kept and used only for checking coverage Reads longer than long_read_length will be treated as long reads Max coverages values will be set to same value for all samples if same_yaxis_scales If max_depth, only max_depth reads will be retrieved, although all will be included in coverage If PairedEnd read insert size is greater than z_score standard deviations from mean, read will be treated as discordant """ all_pairs = [] all_splits = [] all_coverages = [] all_long_reads = [] all_linked_reads = [] max_coverage = 0 haplotypes = [0, 1, 2] for bam_file_name in bams: bam_file = None try: if not reference: bam_file = pysam.AlignmentFile(bam_file_name, "rb") else: bam_file = pysam.AlignmentFile( bam_file_name, "rc", reference_filename=reference ) except Exception as err: logger.error("This can be caused by issues with the alignment file. " +"Please make sure that it is sorted and indexed before trying again") print(err, file=sys.stderr) sys.exit(1) pairs = {} splits = {} long_reads = {} coverage = {hp: {} for hp in haplotypes} linked_reads = {} for r in ranges: # Define range boundries range_start = max(0, r.start - 1000) range_end = r.end + 1000 try: bam_iter = bam_file.fetch(r.chrm, range_start, range_end) except ValueError: chrm = r.chrm if chrm[:3] == "chr": chrm = chrm[3:] else: chrm = "chr" + chrm bam_iter = bam_file.fetch(chrm, range_start, range_end) chrm = strip_chr(r.chrm) if chrm not in coverage[0]: for hp in haplotypes: coverage[hp][chrm] = {} # Define a zeros matrix to hold coverage value over the range for all # haplotyps. If using separate_mqual the first column will hold the coverage # for high quality reads and the second column low quality reads. Otherwise # all coverage will be in the second column. 
range_len = range_end - range_start range_hp_coverage = {hp: np.zeros((range_len, 2), dtype=int) for hp in haplotypes} for read in bam_iter: if ( read.is_qcfail or read.is_unmapped or read.is_duplicate or int(read.mapping_quality) < include_mqual ): continue if not coverage_only: if read.query_length >= long_read_length: add_long_reads(bam_file, read, long_reads, min_event_size, ignore_hp) else: add_pair_end(bam_file, read, pairs, linked_reads, ignore_hp) add_split(read, splits, bam_file, linked_reads, ignore_hp) # Add read coverage to the specified haplotype and column hp = 0 if ignore_hp or not read.has_tag("HP") else read.get_tag("HP") column = 0 if separate_mqual and (read.mapping_quality > separate_mqual) else 1 add_coverage(read, range_hp_coverage[hp], range_start, column) # Tally the coverage for each position and updata coverage dict. for hp, range_coverage in range_hp_coverage.items(): # Skip empty haplotypes if (range_coverage.sum() == 0).all(): continue for position in range(range_start, range_end): coverage[hp][chrm][position] = list(range_coverage[position-range_start]) if ( len(pairs) == 0 and len(splits) == 0 and len(long_reads) == 0 and len(linked_reads) == 0 ): if not coverage_only: logger.warning( "No data returned from fetch in regions {} from {}".format( " ".join([str(r) for r in ranges]), bam_file ) ) # Update max_coverage and remove any empty haplotype dict from coverage dict for hp in haplotypes: hp_covered = False for chrm in coverage[hp]: sn_coverages = [ v for values in coverage[hp][chrm].values() for v in values ] curr_max = 0 if len(sn_coverages) > 0: curr_max = np.percentile(sn_coverages, 99.5) if curr_max > max_coverage: max_coverage = curr_max if sum(sn_coverages) > 0: hp_covered = True if not hp_covered: del coverage[hp] all_coverages.append(coverage) all_pairs.append(pairs) all_splits.append(splits) all_long_reads.append(long_reads) all_linked_reads.append(linked_reads) read_data = { "all_pairs": all_pairs, "all_splits": all_splits, 
# {{{def downsample_pairs(max_depth, z_score, all_pairs):
def downsample_pairs(max_depth, z_score, all_pairs):
    """Downsamples to keep only max_depth normal pairs from all
    PairedEnd reads

    all_pairs is a per-sample list of {haplotype: pairs} dicts; each
    entry is replaced in place by its downsampled version and the
    (mutated) list is returned.
    """
    for pairs_by_hp in all_pairs:
        for hp in pairs_by_hp:
            pairs_by_hp[hp] = sample_normal(max_depth, pairs_by_hp[hp], z_score)
    return all_pairs


# }}}

# {{{def set_haplotypes(curr_coverage):
def set_haplotypes(curr_coverage):
    """Creates the descending list of haplotype ids to plot as subplots

    If any phased haplotype (a key other than 0) is present in the
    coverage dict, the full set {2, 1, 0} is returned so each haplotype
    gets its own row; otherwise only the unphased row [0] is used.
    """
    present = set(curr_coverage.keys())
    phased = any(hp != 0 for hp in present)
    if phased:
        present |= {0, 1, 2}
    else:
        present.add(0)
    return sorted(present, reverse=True)


# }}}
ylim_margin = max(1.02 + jitter_bounds, 1.10) for i in range(len(bams)): #ax is never used, annotating this for readability ax = plt.subplot(grid[ax_i]) hps = set_haplotypes(read_data["all_coverages"][i]) inner_axs = gridspec.GridSpecFromSubplotSpec( len(hps), 1, subplot_spec=grid[ax_i], wspace=0.0, hspace=0.5 ) axs = {} for j in range(len(hps)): axs[j] = plt.subplot(inner_axs[hps[j]]) curr_min_insert_size = None curr_max_insert_size = 0 cover_axs = {} for hp in hps: curr_ax = axs[hp] curr_splits = [] if hp in read_data["all_splits"][i]: curr_splits = read_data["all_splits"][i][hp] curr_linked_reads = [] if hp in read_data["all_linked_reads"][i]: curr_linked_reads = read_data["all_linked_reads"][i][hp] curr_long_reads = [] if hp in read_data["all_long_reads"][i]: curr_long_reads = read_data["all_long_reads"][i][hp] curr_pairs = [] if hp in read_data["all_pairs"][i]: curr_pairs = read_data["all_pairs"][i][hp] curr_coverage = {} if hp in read_data["all_coverages"][i]: curr_coverage = read_data["all_coverages"][i][hp] cover_ax = plot_coverage( curr_coverage, curr_ax, ranges, len(hps), max_coverage, coverage_tracktype, yaxis_label_fontsize, max_coverage_points, ) if len(curr_linked_reads) > 0: curr_min_insert_size, curr_max_insert_size = plot_linked_reads( curr_pairs, curr_splits, curr_linked_reads, curr_ax, ranges, curr_min_insert_size, curr_max_insert_size, marker_size, jitter_bounds ) elif len(curr_long_reads) > 0: curr_min_insert_size, curr_max_insert_size = plot_long_reads( curr_long_reads, curr_ax, ranges, curr_min_insert_size, curr_max_insert_size, jitter_bounds ) else: curr_min_insert_size, curr_max_insert_size = plot_pairs( curr_pairs, curr_ax, ranges, curr_min_insert_size, curr_max_insert_size, marker_size, jitter_bounds ) curr_min_insert_size, curr_max_insert_size = plot_splits( curr_splits, curr_ax, ranges, curr_min_insert_size, curr_max_insert_size, marker_size, jitter_bounds, ) cover_axs[hp] = cover_ax if curr_max_insert_size and (curr_max_insert_size > 
max_insert_size): max_insert_size = curr_max_insert_size # {{{ set axis parameters # set the axis title to be either one passed in or filename curr_ax = axs[hps[0]] if titles and len(titles) == len(bams): curr_ax.set_title(titles[i], fontsize=8, loc="left") else: curr_ax.set_title(os.path.basename(bams[i]), fontsize=8, loc="left") if len(axs) > 1: for j in axs: curr_ax = axs[j] fp = dict(size=8, backgroundcolor="white") text = "HP: " if j == 0: text += "Undef" else: text += str(j) at = AnchoredText( text, loc=2, prop=fp, borderpad=0, pad=0, frameon=False ) curr_ax.add_artist(at) for j in hps: curr_ax = axs[j] curr_ax.set_xlim([0, 1]) if same_yaxis_scales: curr_ax.set_ylim([0, max(1, max_insert_size * ylim_margin)]) else: curr_ax.set_ylim([0, max(1, curr_max_insert_size * ylim_margin)]) curr_ax.spines["top"].set_visible(False) curr_ax.spines["bottom"].set_visible(False) curr_ax.spines["left"].set_visible(False) curr_ax.spines["right"].set_visible(False) curr_ax.tick_params(axis="y", labelsize=yaxis_label_fontsize) # if there's one hp, 6 ticks fit. 
Otherwise, do 3 tick_count = 6 if len(hps) == 1 else 3 curr_ax.yaxis.set_major_locator(ticker.LinearLocator(tick_count)) curr_ax.ticklabel_format(useOffset=False, style='plain') curr_ax.tick_params(axis="both", length=0) curr_ax.set_xticklabels([]) if coverage_only: curr_ax.yaxis.set_visible(False) last_sample_num = number_of_axes - 1 if annotation_files: last_sample_num -= len(annotation_files) if transcript_file: last_sample_num -= 1 if ax_i == last_sample_num: curr_ax = axs[hps[-1]] labels = [] if len(ranges) == 1: labels = [ int(ranges[0].start + l * (ranges[0].end - ranges[0].start)) for l in curr_ax.xaxis.get_majorticklocs() ] elif len(ranges) == 2: x_ticks = curr_ax.xaxis.get_majorticklocs() labels_per_range = int( len(curr_ax.xaxis.get_majorticklocs()) / len(ranges) ) labels = [ int(ranges[0].start + l * (ranges[0].end - ranges[0].start)) for l in x_ticks[:labels_per_range] ] try: labels += [ int(ranges[-1].start + l * (ranges[-1].end - ranges[-1].start)) for l in x_ticks[labels_per_range:] ] except Exception as e: logger.error(labels_per_range) print(e, file=sys.stderr) sys.exit(1) else: logger.error("Ranges greater than 2 are not supported") sys.exit(1) curr_ax.set_xticklabels(labels, fontsize=xaxis_label_fontsize) chrms = [x.chrm for x in ranges] curr_ax.set_xlabel("Chromosomal position on " + "/".join(chrms), fontsize=8) curr_ax = axs[hps[int(len(hps) / 2)]] curr_ax.set_ylabel("Insert size", fontsize=8) cover_ax = cover_axs[hps[int(len(hps) / 2)]] cover_ax.set_ylabel("Coverage", fontsize=8) # }}} ax_i += 1 return ax_i # }}} # {{{def plot_legend(fig, legend_fontsize): def plot_legend(fig, legend_fontsize, marker_size): """Plots the figure legend """ marker_colors = [] marker_labels = [] read_colors = { "Deletion/Normal": "black", "Duplication": "red", "Inversion": "blue", "Aligned long read": "orange", "Linked read": "green", } for read_type in READ_TYPES_USED: if read_type in read_colors: color = read_colors[read_type] flag = READ_TYPES_USED[read_type] 
# {{{def create_gridspec(bams, transcript_file, annotation_files, sv_type ):
def create_gridspec(bams, transcript_file, annotation_files, sv_type,
                    read_data, annotation_scalar):
    """Helper function for creation of a correctly-sized GridSpec instance

    One axis per sample, plus an optional variant bar, transcript track,
    and one axis per annotation file. Returns (GridSpec, axis count).
    """
    # count the axes that will be laid out vertically
    num_ax = len(bams)
    if sv_type:
        num_ax += 1
    if transcript_file:
        num_ax += 1
    if annotation_files:
        num_ax += len(annotation_files)

    # relative height of each axis, top to bottom
    ratios = [1] if sv_type else []
    coverages = read_data["all_coverages"]
    for i in range(len(bams)):
        sample_ratio = len(coverages[i]) * 3
        # NOTE(review): this condition checks the outer per-sample list,
        # which is non-empty whenever bams exist, so every sample ratio
        # is forced to 9; possibly len(coverages[i]) was intended —
        # confirm before changing. Behavior preserved here.
        if len(coverages) > 0:
            sample_ratio = 9
        ratios.append(sample_ratio)
    if annotation_files:
        ratios.extend([annotation_scalar] * len(annotation_files))
    if transcript_file:
        ratios.append(annotation_scalar * 3)

    return gridspec.GridSpec(num_ax, 1, height_ratios=ratios), num_ax


# }}}
# {{{def plot_annotations(annotation_files, chrom, start, end,
def plot_annotations(
    annotation_files,
    annotation_filenames,
    ranges,
    hide_annotation_labels,
    annotation_fontsize,
    grid,
    ax_i,
    annotation_scalar,
):
    """Plots annotation information from region
    """
    # default the track titles to the annotation file basenames
    if not annotation_filenames:
        annotation_filenames = [
            os.path.basename(annotation_file)
            for annotation_file in annotation_files
        ]

    for file_idx, annotation_file in enumerate(annotation_files):
        annotation_plan = get_plot_annotation_plan(ranges, annotation_file)
        annotation_filename = annotation_filenames[file_idx]

        # files with nothing to draw neither get nor consume an axis
        if len(annotation_plan) == 0:
            continue
        ax = plt.subplot(grid[ax_i])
        ax_i += 1

        for step in annotation_plan:
            start_point = map_genome_point_to_range_points(
                ranges, step.start_pos.chrm, step.start_pos.start
            )
            end_point = map_genome_point_to_range_points(
                ranges, step.end_pos.chrm, step.end_pos.end
            )
            # if an annotation lies outside the window, its coordinate will
            # be None, so we trim to the window
            p = [
                0 if start_point is None else start_point,
                1 if end_point is None else end_point,
            ]

            if step.event == "ANNOTATION":
                ax.plot(p, [0, 0], "-", color="black", lw=5)
            elif step.event == "FLOAT_ANNOTATION":
                # the float value is used directly as a matplotlib
                # grayscale color string
                ax.plot(p, [0, 0], "-", color=str(step.info), lw=15)
            elif step.event == "STRING_ANNOTATION":
                ax.plot(p, [0, 0], "-", color="black", lw=15)
                if step.info and not hide_annotation_labels:
                    ax.text(
                        p[0],
                        0.06,
                        step.info,
                        color="black",
                        fontsize=annotation_fontsize,
                    )
            else:
                logger.error("Unsupported annotation type: {}".format(step.event))
                sys.exit(1)

        # set axis parameters
        ax.set_xlim([0, 1])
        for side in ("top", "bottom", "left", "right"):
            ax.spines[side].set_visible(False)
        ax.set_title(annotation_filename, fontsize=8, loc="left")
        ax.tick_params(axis="x", length=0)
        ax.tick_params(axis="y", length=0)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
# }}}
# {{{def get_transcript_plan(ranges, transcript_file):
def get_transcript_plan(ranges, transcript_file):
    """Collect TRANSCRIPT/EXON plan steps for all GFF3 records that
    overlap the plot ranges, grouped gene -> transcript -> CDS.
    """

    def parse_attrs(fields):
        # column 9 is a ;-separated list of key=value attributes;
        # column 7 is the strand, stored as True for '+'
        attrs = dict(pair.split("=") for pair in fields[8].split(";"))
        attrs["strand"] = fields[6] == "+"
        return attrs

    genes = {}
    transcripts = {}
    cdss = {}
    for r in ranges:
        itr = get_tabix_iter(r.chrm, r.start, r.end, transcript_file)
        if not itr:
            continue
        for row in itr:
            fields = row.rstrip().split()
            feature = fields[2]
            if feature == "gene":
                attrs = parse_attrs(fields)
                # unnamed genes cannot be labeled, so they are dropped
                if "Name" not in attrs:
                    continue
                genes[attrs["Name"]] = [
                    genome_interval(fields[0], int(fields[3]), int(fields[4])),
                    attrs,
                ]
            elif feature in ["transcript", "mRNA"]:
                attrs = parse_attrs(fields)
                transcripts.setdefault(attrs["Parent"], {})[attrs["ID"]] = [
                    genome_interval(fields[0], int(fields[3]), int(fields[4])),
                    attrs,
                ]
            elif feature == "CDS":
                attrs = parse_attrs(fields)
                by_id = cdss.setdefault(attrs["Parent"], {})
                by_id.setdefault(attrs["ID"], []).append(
                    genome_interval(fields[0], int(fields[3]), int(fields[4]))
                )

    transcript_plan = []
    for gene_name in genes:
        gene_id = genes[gene_name][1]["ID"]
        if gene_id not in transcripts:
            continue
        for transcript_id, (interval, attrs) in transcripts[gene_id].items():
            start, end = get_interval_range_plan_start_end(ranges, interval)
            if not start or not end:
                continue
            step = plan_step(start, end, "TRANSCRIPT")
            step.info = {
                "Name": attrs["Name"],
                "Strand": attrs["strand"],
                "Exons": None,
            }
            # exons are drawn from the CDS records of this transcript
            exons = []
            for cds_exons in cdss.get(transcript_id, {}).values():
                for exon in cds_exons:
                    exon_start, exon_end = get_interval_range_plan_start_end(
                        ranges, exon
                    )
                    if exon_start and exon_end:
                        exons.append(plan_step(exon_start, exon_end, "EXON"))
            if len(exons) > 0:
                step.info["Exons"] = exons
            transcript_plan.append(step)
    return transcript_plan
# }}}
nr_arrows = 2 + int((p[1]-p[0])/0.02) arrow_locs = np.linspace(p[0], p[1], nr_arrows) arrowprops = dict(arrowstyle="->", color="cornflowerblue", lw=0.5, mutation_aspect=2, mutation_scale=3) if step.info["Strand"]: # Add left-facing arrows for arrow_loc in arrow_locs[1:]: ax.annotate( "", xy=(arrow_loc, transcript_idx), xytext=(p[0], transcript_idx), arrowprops=arrowprops, annotation_clip=True, ) else: # Add right-facing arrows for arrow_loc in arrow_locs[:-1]: ax.annotate( "", xy=(arrow_loc, transcript_idx), xytext=(p[1], transcript_idx), arrowprops=arrowprops, annotation_clip=True, ) if step.info["Exons"]: for exon in step.info["Exons"]: p_exon = [ map_genome_point_to_range_points( ranges, exon.start_pos.chrm, exon.start_pos.start ), map_genome_point_to_range_points( ranges, exon.end_pos.chrm, exon.end_pos.end ), ] if not points_in_window(p_exon): continue ax.plot( p_exon, [transcript_idx, transcript_idx], "-", color="cornflowerblue", solid_capstyle="butt", lw=4, ) ax.text( sum(p)/2, transcript_idx + 0.1, step.info["Name"], color="blue", fontsize=annotation_fontsize, ha="center" ) transcript_idx += 1 transcript_idx_max = max(transcript_idx, transcript_idx_max) # set axis parameters ax.set_xlim([0, 1]) ax.set_ylim([transcript_idx_max * -0.1, 0.01+(transcript_idx_max * 1.01)]) ax.spines["top"].set_visible(False) ax.spines["bottom"].set_visible(False) ax.spines["left"].set_visible(False) ax.spines["right"].set_visible(False) ax.tick_params(axis="x", length=0) ax.tick_params(axis="y", length=0) ax.set_xticklabels([]) ax.set_yticklabels([]) ax.set_title(transcript_filename, fontsize=8, loc="left") # }}} ######################################################################## # main block ######################################################################## def plot(parser, options, extra_args=None): """ To support translocations, the SVs are specified as an array of genome_interval. For now we let that array be size 1 or 2. 
""" if options.debug: logger.setLevel(logging.DEBUG) random.seed(options.random_seed) if options.print_args or options.json_only: print_arguments(options) if options.json_only: sys.exit(0) if options.output_file: output_file = options.output_file else: if not os.path.isdir(options.output_dir): os.mkdir(options.output_dir) name_fields = [ options.sv_type, "-".join(options.chrom), "-".join([str(s) for s in options.start]), "-".join([str(e) for e in options.end]), ] if options.sv_type: output_file = os.path.join(options.output_dir, "_".join(name_fields)) else: output_file = os.path.join(options.output_dir, "_".join(name_fields[1:])) if (options.annotation_files and options.annotation_filenames and len(options.annotation_files) != len(options.annotation_filenames)): logger.warning("annotation filenames do not match annotation files") sys.exit(1) for bam in options.bams: if ".cram" in bam: if not options.reference: logger.error("Missing argument reference (-r/--reference) required for CRAM") sys.exit(1) if len(options.chrom) != len(options.start) != len(options.end): logger.error("The number of chromosomes ({}), starts ({}), and ends ({}) do not match.".format( len(options.chrom), len(options.start), len(options.end) ) ) sys.exit() sv = [] for i in range(len(options.chrom)): options.chrom[i] = strip_chr(options.chrom[i]) sv.append(genome_interval(options.chrom[i], options.start[i], options.end[i])) # set up plot plot_height, plot_width, window, ranges = set_plot_dimensions( sv, options.sv_type, options.plot_height, options.plot_width, options.bams, options.reference, options.annotation_files, options.transcript_file, options.window, options.zoom, ) marker_size = options.marker_size # set up sub plots matplotlib.rcParams.update({"font.size": 12}) fig = plt.figure(figsize=(plot_width, plot_height)) # read alignment data read_data, max_coverage = get_read_data( ranges, options.bams, options.reference, options.separate_mqual, options.include_mqual, options.coverage_only, 
options.long_read, options.min_event_size, options.same_yaxis_scales, options.max_depth, options.z, options.ignore_hp, ) # set up grid organizer grid, num_ax = create_gridspec( options.bams, options.transcript_file, options.annotation_files, options.sv_type, read_data, options.annotation_scalar, ) current_axis_idx = 0 # plot variant on top if options.sv_type: current_axis_idx = create_variant_plot( grid, current_axis_idx, sv, options.sv_type, ranges, options.start_ci, options.end_ci, ) if options.max_coverage: max_coverage = options.max_coverage # Plot each sample current_axis_idx = plot_samples( ranges, read_data, grid, current_axis_idx, num_ax, options.bams, options.chrom, options.coverage_tracktype, options.titles, options.same_yaxis_scales, options.xaxis_label_fontsize, options.yaxis_label_fontsize, options.annotation_files, options.transcript_file, options.max_coverage_points, max_coverage, marker_size, options.coverage_only, options.jitter, ) # plot legend plot_legend(fig, options.legend_fontsize, marker_size) # Plot annotation files if options.annotation_files: plot_annotations( options.annotation_files, options.annotation_filenames, ranges, options.hide_annotation_labels, options.annotation_fontsize, grid, current_axis_idx, options.annotation_scalar, ) # Plot sorted/bgziped/tabixed transcript file if options.transcript_file: plot_transcript( options.transcript_file, options.transcript_filename, ranges, grid, options.annotation_fontsize, options.xaxis_label_fontsize, options.annotation_scalar, ) # save matplotlib.rcParams["agg.path.chunksize"] = 100000 plt.tight_layout(pad=0.8, h_pad=0.1, w_pad=0.1) try: plt.savefig(output_file, dpi=options.dpi) except Exception as e: logger.error( "Failed to save figure {}".format(output_file) ) print(e) plt.close(fig) # }}} ================================================ FILE: samplot/samplot_vcf.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- """ Create samplot vcf 
def flatten(value, sep=","):
    """
    Collapse a FORMAT value into a single display string.

    >>> flatten([1,2,3,4])
    '1,2,3,4'
    >>> flatten((5,6))
    '5,6'
    >>> flatten(0.987654321)
    '0.987654'
    >>> flatten(7)
    '7'
    >>> flatten("flatten")
    'flatten'
    """
    flat = None
    # tuple or list
    if isinstance(value, tuple) or isinstance(value, list):
        flat = sep.join([str(i) for i in value])
    # reformats long float values
    elif isinstance(value, float):
        flat = "%.6f" % (value,)
    # string and int
    else:
        flat = str(value)
    return flat


def get_format_fields(ids, variant):
    """
    args:
        ids (list) - list of FORMAT field IDs, e.g. ['AS', 'AP', 'DHFFC']
        variant (pysam.libcbcf.VariantRecord)

    returns:
        list - per-sample "ID=value" strings (space-joined per sample),
        one entry per sample up to the last sample that has any of the
        requested fields
    """
    sample_format = []
    for i, sample_fields in enumerate(variant.samples.values()):
        for field_id in ids:
            sample_field_val = flatten(sample_fields.get(field_id, ""))
            if not sample_field_val:
                continue
            # BUG FIX: pad with empty entries for any earlier samples that
            # had none of the requested fields. The previous code appended
            # at most one slot, so a field-less sample followed by a sample
            # with values raised IndexError.
            while len(sample_format) <= i:
                sample_format.append("")
            if sample_format[i]:
                sample_format[i] += " "
            sample_format[i] += "{}={}".format(field_id, sample_field_val)
    return sample_format
--gff3") overlaps = None except ValueError: if fix_chr: # try removing chr if chrom.startswith("chr"): overlaps = get_overlap( tabix, chrom[3:], start, end, priority, no_hit, False ) # or adding chr else: overlaps = get_overlap( tabix, "chr{chrom}".format(chrom=chrom), start, end, priority, no_hit, False, ) except: # bad regions logger.warning( "Error fetching {chrom}:{start}-{end}".format( chrom=chrom, start=start, end=end ) ) overlaps = None overlap = "" if overlaps: for feature in priority: if feature in overlaps: overlap = feature break else: # fetching overlaps failed overlap = "unknown" if not overlap and no_hit: overlap = no_hit return overlap def parse_ped(path, vcf_samples=None): if path is None: return {} samples = [] look = {} for line in open(path): samples.append(Sample(line)) look[samples[-1].id] = samples[-1] for s in samples: s.dad = look.get(s.paternal_id) if s.dad is not None: s.dad.kids.append(s) s.mom = look.get(s.maternal_id) if s.mom is not None: s.mom.kids.append(s) # match these samples to the ones in the VCF. 
if vcf_samples is not None: result = [] for i, variant_sample in enumerate(vcf_samples): if variant_sample not in look: continue result.append(next(s for s in samples if s.id == variant_sample)) result[-1].i = i samples = result return {s.id: s for s in samples} def get_names_to_bams(bams, name_list=None): """ get mapping from names (read group samples) to bam paths) this is useful because the VCF has the names and we'll want the bam paths for those samples if name_list is passed in as a parameter those will be used instead """ names = {} if name_list: if len(name_list) != len(bams): logger.error("List of sample IDs does not match list of alignment files.") sys.exit(1) for i, p in enumerate(bams): names[name_list[i]] = p else: for p in bams: b = pysam.AlignmentFile(p) # TODO - catch specific exception try: names[b.header["RG"][0]["SM"]] = p except Exception as e: logger.error("No RG field in alignment file {}".format(p)) logger.error("Include ordered list of sample IDs to avoid this error") print(e, file=sys.stderr) sys.exit(1) return names def tryfloat(v): try: return float(v) except: return v def to_exprs(astr): """ an expr is just a 3-tuple of "name", fn, value" e.g. "DHFFC", operator.lt, 0.7" >>> to_exprs("DHFFC < 0.5 & SVTYPE == 'DEL'") [('DHFFC', , 0.5), ('SVTYPE', , 'DEL')] >>> to_exprs("CSQ contains 'HIGH'") [('CSQ', , 'HIGH')] """ astr = (x.strip() for x in astr.strip().split("&")) result = [] for a in astr: a = [x.strip() for x in a.split()] if len(a) == 2: assert a[1] == "exists", ("bad expression", a) a.append("extra_arg") assert len(a) == 3, ("bad expression", a) assert a[1] in cmp_lookup, ( "comparison:" + a[1] + " not supported. 
def check_expr(vdict, expr):
    """
    Evaluate an "anded" list of filter expressions against a value dict.

    args:
        vdict (dict) - INFO/FORMAT values keyed by annotation name
        expr (list) - 3-tuples of (name, comparison function, value),
            as produced by to_exprs()

    returns:
        bool - True only if every expression passes; asking for a
        missing annotation always fails
    """
    return all(
        name in vdict and fcmp(vdict[name], val)
        for name, fcmp, val in expr
    )
def below_max_hets(gts, max_hets, svtype, chrom, start, end):
    """
    skips variants with more than max_hets heterozygotes if max_hets is set

    args:
        gts (list) - genotype tuples, one per sample
        max_hets (int|None) - maximum allowed het/hom-alt count; falsy
            disables the check
        svtype, chrom, start, end - used only for the debug message

    returns:
        bool - True if the variant should be SKIPPED (too many non-ref
        samples). NOTE: despite the name, True means "skip": this matches
        how is_simply_skippable() consumes the result and the convention
        of the sibling filters (no_variant_found, etc.).
    """
    if not max_hets:
        return False
    # requisite hets/hom-alts: any fully-called genotype with an ALT allele
    het_count = sum(sum(x) >= 1 for x in gts if None not in x)
    if het_count > max_hets:
        logger.debug(
            "Skipping {} at {}:{}-{}, more than max_hets heterozygotes".format(
                svtype, chrom, start, end
            )
        )
        # BUG FIX: the return values were swapped -- this branch logged
        # "Skipping" but returned False, while in-limit variants returned
        # True, so the caller skipped every variant within the limit and
        # kept the over-limit ones.
        return True
    return False
def get_variant_samples(
    idxs,
    vcf_samples,
    names_to_bams,
    svtype,
    chrom,
    start,
    end,
):
    """
    Return the plottable samples that also have an alignment file.

    Samples selected by index from vcf_samples are kept only when a
    matching entry exists in names_to_bams; a debug message is logged
    when nothing survives the match.
    """
    variant_samples = [
        vcf_samples[i] for i in idxs if vcf_samples[i] in names_to_bams
    ]
    if len(variant_samples) == 0:
        logger.debug(
            (
                "Skipping {} at {}:{}-{}, no plottable samples "
                + "with matched alignment files"
            ).format(svtype, chrom, start, end),
        )
    return variant_samples
""" denovo_svs = [] if denovo_row != "": test_sample_names = {s.name for s in test_samples} for variant_sample in variant_samples: sample = ped_samples[variant_sample] if sample.mom is None or sample.dad is None: continue if ( sample.mom.id not in test_sample_names and sample.dad.id not in test_sample_names ): denovo_svs.append(sample.id) if len(denovo_svs) <= 0 and dn_only: logger.debug( "Skipping {} at {}:{}-{}, dn_only selected and no de novos found".format( svtype, chrom, start, end ), ) return denovo_svs def get_family_controls( ped, denovo_svs, variant_samples, ped_samples, max_hets, bams, names_to_bams, vcf_samples_set, ): """ tries to find family members to use as controls for putative de novos """ # do DN samples first so we can see parents. # TODO also need to do the non-denovos as they seem to have been forgotten for variant_sample in denovo_svs + [ x for x in variant_samples if x not in denovo_svs ]: sample = ped_samples.get(variant_sample) if sample is None: continue if ( sample.mom is not None and sample.mom.id not in variant_samples and sample.mom.id in vcf_samples_set ): variant_samples.append("mom-of-%s[%s]" % (variant_sample, sample.mom.id)) bams.append(names_to_bams[sample.mom.id]) if ( sample.dad is not None and sample.dad.id not in variant_samples and sample.dad.id in vcf_samples_set ): variant_samples.append("dad-of-%s[%s]" % (variant_sample, sample.dad.id)) bams.append(names_to_bams[sample.dad.id]) for kid in sample.kids: if kid.id not in variant_samples and kid.id in vcf_samples_set: variant_samples.append("kid-of-%s[%s]" % (variant_sample, kid.id)) bams.append(names_to_bams[kid.id]) if max_hets: if len(bams) > 1.5 * max_hets: break if max_hets: if len(bams) > 1.5 * max_hets: break return variant_samples, bams def get_nonfamily_controls( gts, vcf_samples, variant_samples, names_to_bams, min_entries, bams ): # extend with some controls: hom_ref_idxs = [ i for i, gt in enumerate(gts) if len(gt) == 2 and gt[0] == 0 and gt[1] == 0 ] if 
def create_metadata(
    variant,
    translocation_chrom,
    svtype,
    sample_str,
    n_samples,
    annotations,
    denovo_row,
    denovo_svs,
):
    """
    Build the per-variant record consumed by the HTML site template.
    """
    data_dict = dict(
        chrom=variant.chrom,
        chrom2=translocation_chrom,
        start=variant.start,
        end=variant.stop,
        svtype=svtype,
        svlength=variant.stop - variant.start,
        samples=sample_str,
        nsamples=n_samples,
    )
    # region-overlap column only when a gff3 annotation file was supplied
    if annotations:
        data_dict["overlaps"] = get_overlap(
            annotations, variant.chrom, variant.start, variant.stop
        )
    # de novo column only when the site displays one
    if denovo_row != "":
        data_dict["dn"] = ",".join(denovo_svs)
    return data_dict
data_dict["chrom2"] is None: z = 3 if max_entries: bams = bams[:max_entries] variant_samples = variant_samples[:max_entries] # update titles based on FORMAT fields requested title_list = list() for variant_sample in variant_samples: if variant_sample in plot_titles: title_list.append(plot_titles[variant_sample]) else: title_list.append(variant_sample) start = variant.start stop = variant.stop start2 = None stop2 = None if data_dict["chrom2"] is None: template = ( "samplot plot {extra_args} -z {z} -n {titles} " + "{cipos} {ciend} {svtype} -c {chrom} -s {start} " + "-e {end} -o {fig_path} -d {downsample} -b {bams}" ) else: template = ( "samplot plot {extra_args} -z {z} -n {titles} " + "{cipos} {ciend} {svtype} -c {chrom} -s {start} " + "-e {end} -o {fig_path} -d {downsample} -b {bams} " + "-c {chrom2} -s {start2} -e {end2}" ) # For interchromosomal variants the 2nd breakpoint position should # not be encoded in INFO/END tag although some callers still do this. # Currently it is unclear if there is a good replacement. Delly uses # INFO/POS2 for this, GATK-SV uses INFO/END2, dysgu uses INFO/CHR2_POS. 
def is_simply_skippable(
    variant,
    vcf_samples,
    gts,
    important_regions,
    max_mb,
    min_bp,
    min_call_rate,
    max_hets,
    plot_all,
    translocation_chrom,
):
    """
    checks several basic terms that could filter this variant out
    specifically, if the variant type is INS, or fails the important
    regions, max_mb, min_bp, min_call_rate, or max_hets filters

    returns:
        bool - True if the variant should be skipped
    """
    svtype = variant.info.get("SVTYPE", "SV")

    # skips variants outside important regions if those are set
    if not var_in_important_regions(
        important_regions,
        variant.chrom,
        variant.start,
        variant.stop,
        svtype,
    ):
        return True

    # skips insertions
    # BUG FIX: was `svtype in ("INS")` -- `("INS")` is a string, not a
    # tuple, so that was a substring test (e.g. "IN" in "INS" is True)
    if svtype == "INS":
        logger.debug(
            "Skipping {} at {}:{}-{}, INS type not supported".format(
                svtype, variant.chrom, variant.start, variant.stop
            )
        )
        return True

    # skips variants over max_mb length, if set
    if max_mb and (variant.stop - variant.start > max_mb * 1000000):
        logger.debug(
            "Skipping {} at {}:{}-{}, variant length greater than max_mb".format(
                svtype, variant.chrom, variant.start, variant.stop
            )
        )
        return True

    # skips variants under min_bp, if set (translocations are exempt since
    # start/stop are on different chromosomes)
    if (variant.stop - variant.start < min_bp) and translocation_chrom is None:
        logger.debug(
            "Skipping {} at {}:{}-{}, variant length shorter than min_bp".format(
                svtype, variant.chrom, variant.start, variant.stop
            )
        )
        return True

    # skips variants if the call rate is below min_call_rate, if set
    if not above_call_rate(
        gts,
        len(vcf_samples),
        min_call_rate,
        svtype,
        variant.chrom,
        variant.start,
        variant.stop,
    ):
        return True

    # skips variants if there are more hets than max_hets, if set
    if below_max_hets(
        gts, max_hets, svtype, variant.chrom, variant.start, variant.stop
    ):
        return True

    # skips variants where no sample is non-ref, if plot_all is not set
    if not plot_all:
        if no_variant_found(
            gts, svtype, variant.chrom, variant.start, variant.stop
        ):
            return True

    return False
def generate_commands(
    vcf,
    plot_all,
    max_mb,
    min_bp,
    min_call_rate,
    max_hets,
    dn_only,
    ped,
    important_regions,
    format_field_ids,
    min_entries,
    max_entries,
    out_dir,
    output_type,
    downsample,
    filters,
    ped_samples,
    denovo_row,
    names_to_bams,
    annotations,
    pass_through_args,
):
    """
    for every variant in vcf, process and output plot command
    - if and only if it passes filters

    Returns (commands, table_data): the `samplot plot` command strings and
    the per-variant metadata dicts used to build the HTML review table.
    """
    commands = []
    table_data = []
    vcf_samples = vcf.header.samples
    vcf_samples_set = set(vcf_samples)
    vcf_samples_list = list(vcf_samples)
    # tallies reasons variants were skipped, for the debug summary at the end
    vcf_stats = Counter()

    # Check if VCF samples match BAMs
    if vcf_samples_set != set(names_to_bams):
        missing_vcf_samples = vcf_samples_set - set(names_to_bams)
        missing_bam_samples = set(names_to_bams) - vcf_samples_set
        logger.warning(
            "VCF samples and BAMs do not match. "
            "This may be due to different sample names in the VCF and BAMs."
        )
        if missing_vcf_samples:
            logger.warning(
                "VCF samples missing from BAMs: {}".format(", ".join(missing_vcf_samples))
            )
        if missing_bam_samples:
            logger.warning(
                "BAMs missing from VCF samples: {}".format(", ".join(missing_bam_samples))
            )

    # BUGFIX: initialized so the entry-count debug line below does not raise
    # NameError when the VCF contains no variants at all.
    var_count = -1
    for var_count, variant in enumerate(vcf):
        translocation_chrom = None
        svtype = variant.info.get("SVTYPE", "SV")

        # get genotypes
        gts = [s.get("GT", (None, None)) for s in variant.samples.values()]

        # handle translocations
        if svtype in ["BND", "TRA"]:
            try:
                # BUGFIX: was `.get("CHR2")`, which never raises, so the
                # except below was dead code and a missing CHR2 was silent.
                # Indexing raises KeyError so the intended log fires.
                translocation_chrom = variant.info["CHR2"]
            except (KeyError, ValueError) as e:
                logger.debug(e)
                # BUGFIX: added the missing space between the two adjacent
                # string literals ("...{start}skipped..." previously).
                logger.info(
                    f"Translocation {svtype} on {variant.chrom}:{variant.start} "
                    "skipped due to missing CHR2 INFO field."
                )

        if is_simply_skippable(
            variant,
            vcf_samples,
            gts,
            important_regions,
            max_mb,
            min_bp,
            min_call_rate,
            max_hets,
            plot_all,
            translocation_chrom,
        ):
            vcf_stats["Skipped"] += 1
            continue

        # gets the list of samples to plot
        # skips ref samples if plot_all isn't set
        # and applies user-defined filters
        idxs, test_samples = get_plottable_samples(
            gts,
            variant,
            plot_all,
            filters,
            svtype,
            variant.chrom,
            variant.start,
            variant.stop,
        )
        if len(idxs) == 0:
            vcf_stats["No plottable samples"] += 1
            continue

        # matches alignment files to variant samples
        variant_samples = get_variant_samples(
            idxs,
            vcf_samples,
            names_to_bams,
            svtype,
            variant.chrom,
            variant.start,
            variant.stop,
        )
        if len(variant_samples) <= 0:
            vcf_stats["No plottable samples with matched BAM"] += 1
            continue
        bams = [names_to_bams[s] for s in variant_samples]

        # finds putative de novo variants
        denovo_svs = get_denovos(
            denovo_row,
            test_samples,
            variant_samples,
            ped_samples,
            svtype,
            variant.chrom,
            variant.start,
            variant.stop,
            dn_only,
        )
        if dn_only and (len(denovo_svs) <= 0):
            vcf_stats["Non de novo ('--dn_only' specified)"] += 1
            continue

        # save fields for the html.
        n_samples = len(variant_samples)
        # semi-colon delimited eases CSV export from HTML
        sample_str = ";".join(variant_samples)

        # dict holding sample to FORMAT title string
        plot_titles = dict()
        if format_field_ids:
            format_attrs = get_format_title(vcf_samples_list, format_field_ids, variant)
            plot_titles = make_plot_titles(variant_samples, format_attrs)

        # get control samples if possible
        # try to get family members if ped is set
        # and reference samples if ped is not set
        if ped is not None:
            variant_samples, bams = get_family_controls(
                ped,
                denovo_svs,
                variant_samples,
                ped_samples,
                max_hets,
                bams,
                names_to_bams,
                vcf_samples_set,
            )
        elif min_entries and len(bams) < min_entries:
            variant_samples, bams = get_nonfamily_controls(
                gts, vcf_samples, variant_samples, names_to_bams, min_entries, bams
            )

        data_dict = create_metadata(
            variant,
            translocation_chrom,
            svtype,
            sample_str,
            n_samples,
            annotations,
            denovo_row,
            denovo_svs,
        )
        table_data.append(data_dict)

        command = format_template(
            variant,
            data_dict,
            max_entries,
            bams,
            variant_samples,
            plot_titles,
            out_dir,
            output_type,
            svtype,
            downsample,
            pass_through_args,
        )
        commands.append(command)

    logger.debug("VCF entry count: {}".format(var_count + 1))
    if vcf_stats:
        # BUGFIX: "entrys" -> "entries" in the debug message
        logger.debug("VCF entries filtered out: {}".format(sum(vcf_stats.values())))
        for reason, count in vcf_stats.items():
            logger.debug(" - {}: {}".format(reason, count))
    return commands, table_data
def vcf(parser, args, pass_through_args):
    """
    Generate commands and html for plotting/reviewing variants from VCF

    Entry point for `samplot vcf`: validates arguments, matches VCF samples
    to alignment files, builds one `samplot plot` command per passing
    variant, writes the review site, and either runs the commands (serially
    or via a process pool) or dumps them to a file for manual execution.
    """
    if args.debug:
        logger.setLevel(logging.DEBUG)

    if args.dn_only and not args.ped:
        logger.error("Missing --ped, required when using --dn_only")
        sys.exit(1)

    # CRAM decoding needs a reference; require it up front rather than
    # failing inside every plot command
    if cram_input(args.bams):
        if "-r" not in pass_through_args and "--reference" not in pass_through_args:
            logger.error(
                "ERROR: missing reference file required for CRAM. "
                + "Use -r option. (Run `samplot.py -h` for more help)"
            )
            sys.exit(1)

    vcf = pysam.VariantFile(args.vcf)
    vcf_samples = vcf.header.samples
    annotations = None
    if args.gff3:
        annotations = pysam.TabixFile(args.gff3)
    filters = [to_exprs(f) for f in args.filter]

    ped_samples = parse_ped(args.ped, vcf_samples)
    # this is empty unless we have a sample with both parents defined.
    denovo_row = get_dn_row(ped_samples)

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    # connect the sample IDs to bam files
    names_to_bams = get_names_to_bams(args.bams, args.sample_ids)

    # check that at least one VCF sample can be plotted
    if not any(vcf_sample in names_to_bams for vcf_sample in vcf_samples):
        other = "'--sample_ids'" if args.sample_ids else "BAM"
        logger.error("Samples in VCF do not match samples specified in {}".format(other))
        logger.error("VCF samples: {}".format(', '.join(vcf_samples)))
        # BUGFIX: previously joined vcf_samples again here, so both lines of
        # the diagnostic printed the same list; show the BAM-side names.
        logger.error("{} samples: {}".format(other, ', '.join(names_to_bams)))
        sys.exit(1)

    # if important regions are included, load those intervals
    # and only show SVs inside them
    important_regions = read_important_regions(args.important_regions)

    # user-requested FORMAT fields to add to plot title
    format_field_ids = None
    if args.format:
        format_field_ids = args.format.split(",")

    # for every variant in vcf, process and output plot
    # command - if and only if it passes filters
    commands, table_data = generate_commands(
        vcf,
        args.plot_all,
        args.max_mb,
        args.min_bp,
        args.min_call_rate,
        args.max_hets,
        args.dn_only,
        args.ped,
        important_regions,
        format_field_ids,
        args.min_entries,
        args.max_entries,
        args.out_dir,
        args.output_type,
        args.downsample,
        filters,
        ped_samples,
        denovo_row,
        names_to_bams,
        annotations,
        pass_through_args,
    )

    write_site(table_data, args.out_dir, args.output_type, annotations, denovo_row)

    if args.manual_run:
        with open(args.command_file, "w") as outfile:
            outfile.writelines(commands)
    else:
        if args.threads == 1:
            for command in commands:
                run_plot_command(command)
        else:
            from multiprocessing import Pool

            with Pool(processes=args.threads) as pool:
                pool.map(run_plot_command, commands)
def add_vcf(parent_parser):
    """Defines allowed arguments for samplot's vcf plotter

    Registers the `vcf` sub-command and its options on the given
    subparsers object and wires it to the `vcf` entry point.
    """
    import doctest

    parser = parent_parser.add_parser(
        "vcf",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        help="Generates commands to plot images with `samplot plot`,"
        + " using VCF file to define regions",
    )
    # hidden hook: `samplot test` runs this module's doctests and exits
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        r = doctest.testmod()
        print(r)
        sys.exit(r.failed)
    parser.add_argument("--vcf", "-v", help="VCF file containing structural variants")
    parser.add_argument(
        "-d", "--out-dir", help="path to write output images", default="samplot-out",
    )
    parser.add_argument(
        "--ped", help="path to ped (or .fam) file",
    )
    parser.add_argument(
        "--dn_only",
        help="plots only putative de novo variants (PED file required)",
        action="store_true",
    )
    parser.add_argument(
        "--min_call_rate",
        type=float,
        help="only plot variants with at least this call-rate",
        required=False,
    )
    parser.add_argument(
        "--filter",
        action="append",
        help="simple filter that samples"
        + " must meet. Join multiple filters with '&' "
        + "and specify --filter multiple times for 'or'"
        + " e.g. DHFFC < 0.7 & SVTYPE = 'DEL'",
        default=[],
    )
    parser.add_argument(
        "-O",
        "--output_type",
        choices=("png", "pdf", "eps", "jpg"),
        help="type of output figure",
        default="png",
    )
    parser.add_argument(
        "--max_hets",
        type=int,
        help="only plot variants with at most this many heterozygotes",
        required=False,
    )
    parser.add_argument(
        "--min_entries",
        type=int,
        help="try to include homref samples as controls to get this many samples in plot",
        default=6,
        required=False,
    )
    parser.add_argument(
        "--max_entries",
        type=int,
        # BUGFIX: help text previously said "heterozygotes" (copy-paste from
        # --max_hets); this option caps the number of samples plotted.
        help="only plot at most this many samples",
        default=10,
        required=False,
    )
    parser.add_argument(
        "--max_mb",
        type=int,
        help="skip variants longer than this many megabases",
        required=False,
    )
    parser.add_argument(
        "--min_bp",
        type=int,
        help="skip variants shorter than this many bases",
        default=20,
    )
    parser.add_argument(
        "--important_regions",
        help="only report variants that overlap regions in this bed file",
        required=False,
    )
    parser.add_argument(
        "-b",
        "--bams",
        type=str,
        nargs="+",
        help="Space-delimited list of BAM/CRAM file names",
        required=True,
    )
    parser.add_argument(
        "--sample_ids",
        type=str,
        nargs="+",
        help="Space-delimited list of sample IDs, "
        + "must have same order as BAM/CRAM file names. "
        + "BAM RG tag required if this is omitted.",
        required=False,
    )
    parser.add_argument(
        "--command_file",
        help="store commands in this file.",
        default="samplot_vcf_cmds.tmp",
        required=False,
    )
    parser.add_argument(
        "--format",
        default="AS,AP,DHFFC",
        help="comma separated list of FORMAT fields to include in sample plot title",
        required=False,
    )
    parser.add_argument(
        "--gff3",
        help="genomic regions (.gff with .tbi in same directory) "
        + "used when building HTML table and table filters",
        required=False,
    )
    parser.add_argument(
        "--downsample", help="Number of normal reads/pairs to plot", default=1, type=int
    )
    parser.add_argument(
        "--manual_run",
        help="disables auto-run for the plotting commands",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--plot_all",
        help="plots all samples and all variants - "
        + "limited by any filtering arguments set",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "-t",
        "--threads",
        type=int,
        default=1,
        help="Number of threads to use to generate plots. Default: %(default)s",
    )
    parser.add_argument(
        "--debug",
        help="prints out the reason for skipping any skipped variant entry",
        default=False,
        action="store_true",
    )
    parser.set_defaults(func=vcf)
"""Packaging configuration for samplot."""
import re

from setuptools import find_packages, setup

# PyPI long description comes straight from the README.
with open("README.md", "r") as readme_fh:
    long_description = readme_fh.read()

# Single-source the version string from samplot/__init__.py rather than
# duplicating it here.
with open("samplot/__init__.py", "r") as init_fh:
    version = re.search(
        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', init_fh.read(), re.MULTILINE
    ).group(1)

# Runtime dependencies are maintained in requirements.txt.
with open("requirements.txt", "r") as req_fh:
    requires = req_fh.read().splitlines()

setup(
    name="samplot",
    version=version,
    description="plotting package for genomic structural variation",
    long_description=long_description,
    long_description_content_type='text/markdown',
    author="Jonathan Belyeu",
    author_email="jrbelyeu@gmail.com",
    url="https://github.com/ryanlayer/samplot.git",
    packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]),
    package_data={"": ["LICENSE", "README.md"]},
    data_files=[("samplot", ["samplot/templates/samplot_vcf.html"])],
    include_package_data=True,
    install_requires=requires,
    license="MIT",
    zip_safe=False,
    entry_points={"console_scripts": ["samplot = samplot.__main__:main"]},
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
    ],
)
SUCCESSES=0 FAILS=0 FLAG=0 STOP_ON_FAIL=0 trap report EXIT TESTS_TO_RUN=($@) RUN_NAME= #{{{ Command line parsing usage() { cat << EOF usage: $0 OPTIONS OPTIONS can be: -h Show this message -v Print success messages EOF } # Check options passed in. while getopts "h v" OPTION do case $OPTION in h) usage exit 1 ;; v) VERBOSE=1 ;; ?) usage exit ;; esac done #}}} #{{{ exit codes EX_OK=0 #The command was used incorrectly, e.g., with the wrong number of arguments, a #bad flag, a bad syntax in a parameter, or whatever. EX_USAGE=64 #The input data was incorrect in some way. This should only be used for user's #data and not system files. EX_DATAERR=65 #An input file (not a system file) did not exist or was not readable. This #could also include errors like ``No message'' to a mailer (if it cared to #catch it). EX_NOINPUT=66 #The user specified did not exist. This might be used for mail addresses or #remote logins. EX_NOUSER=67 #The host specified did not exist. This is used in mail addresses or network #requests. EX_NOHOST=68 #A service is unavailable. This can occur if a support program or file does #not exist. This can also be used as a catchall message when something you #wanted to do doesn't work, but you don't know why. EX_UNAVAILABLE=69 #An internal software error has been detected. This should be limited to #non-operating system related errors as possible. EX_SOFTWARE=70 #An operating system error has been detected. This is intended to be used for #such things as ``cannot fork'', ``cannot create pipe'', or the like. It #includes things like getuid returning a user that does not exist in the passwd #file. EX_OSERR=71 #Some system file (e.g., /etc/passwd, /var/run/utmp, etc.) does not exist, #cannot be opened, or has some sort of error (e.g., syntax error). EX_OSFILE=72 #A (user specified) output file cannot be created. EX_CANTCREAT=73 #An error occurred while doing I/O on some file. EX_IOERR=74 #Temporary failure, indicating something that is not really an error. 
In #sendmail, this means that a mailer (e.g.) could not create a connection, and #the request should be reattempted later. EX_TEMPFAIL=75 #The remote system returned something that was ``not possible'' during a #protocol exchange. EX_PROTOCOL=76 #You did not have sufficient permission to perform the operation. This is not #intended for file system problems, which should use EX_NOINPUT or #EX_CANTCREAT, but rather for higher level permissions. EX_NOPERM=77 #Something was found in an unconfigured or misconfigured state. EX_CONFIG=78 #}}} #{{{ function report { function report { rm -f $STDOUT_FILE $STDERR_FILE echo -e "\n$PROGRAM_NAME v$VERSION\n" if [ "$STOP_ON_FAIL" -ne "0" ] then if [ "$FAILS" -ne "0" ] then printf "${BOLD}TESTING STOPPED ON FIRST FAIL${NC}\n\n" fi fi printf "${NC}%-10s${NC}Tests\n" $TOTAL if [ "$FAILS" -ne "0" ] then printf "${BRED}%-10s${NC}${BOLD}Failures${NC}\n" $FAILS printf "${BGREEN}%-10s${NC}Successes\n" $SUCCESSES else printf "${BRED}%-10s${NC}Failures\n" $FAILS printf "${BGREEN}%-10s${NC}${BOLD}Successes${NC}\n" $SUCCESSES fi tear_down exit $FAILS } #}}} #{{{ function run { function run { RUN_NAME=$1 shift FLAG=0 if [ "${#TESTS_TO_RUN[*]}" -eq 0 ] then FLAG=1 else for i in "${TESTS_TO_RUN[@]}" do if [ "$RUN_NAME" == "$i" ] then FLAG=1 break fi done fi if [ "$FLAG" -eq 0 ] then return else export $RUN_NAME=1 fi CMD="$@" START=$(date +%s); O="$("$@" >$STDOUT_FILE 2>$STDERR_FILE)" RETVAL=$? 
END=$(date +%s); TOTAL_TIME=$((END-START)) RUN_TIME="$TOTAL_TIME sec" OUTVAL=`cat $STDOUT_FILE` ERRVAL=`cat $STDERR_FILE` #make it pretty RUN_NAME=${BOLD}$RUN_NAME${NC} ELINES=$(wc -l $STDERR_FILE | awk '{print $1 }' &) OLINES=$(wc -l $STDOUT_FILE | awk '{print $1 }' &) wait echo -e "\n$RUN_NAME ran in $RUN_TIME with $ELINES/$OLINES lines to STDERR/OUT" } #}}} #{{{ function print_exit_code { function print_exit_code { case $1 in $EX_OK) echo "EX_OK" ;; $EX_USAGE) echo "EX_USAGE" ;; $EX_DATAERR) echo "EX_DATAERR" ;; $EX_NOINPUT) echo "EX_NOINPUT" ;; $EX_NOUSER) echo "EX_NOUSER" ;; $EX_NOHOST) echo "EX_NOHOST" ;; $EX_UNAVAILABLE) echo "EX_UNAVAILABLE" ;; $EX_SOFTWARE) echo "EX_SOFTWARE" ;; $EX_OSERR) echo "EX_OSERR" ;; $EX_OSFILE) echo "EX_OSFILE" ;; $EX_CANTCREAT) echo "EX_CANTCREAT" ;; $EX_IOERR) echo "EX_IOERR" ;; $EX_TEMPFAIL) echo "EX_TEMPFAIL" ;; $EX_PROTOCOL) echo "EX_PROTOCOL" ;; $EX_NOPERM) echo "EX_NOPERM" ;; $EX_CONFIG) echo "EX_CONFIG" ;; *) echo "Unknown code: $1" esac } #}}} #{{{function assert_exit_code { function assert_exit_code { if [ "$FLAG" -eq 0 ];then return; fi LINE=$(caller | cut -d " " -f1) TOTAL=$((TOTAL + 1)) E=$(print_exit_code $1) O=$(print_exit_code $RETVAL) if [ $RETVAL -ne $1 ] then FAILS=$((FAILS + 1)) echo -e "$FAIL EXIT CODE (LINE $LINE)" echo -e "-->\texpected $E, observed $O" tail $STDERR_FILE if [ $STOP_ON_FAIL -ne "0" ];then exit; fi else SUCCESSES=$((SUCCESSES + 1)) echo -e "$PASS EXIT CODE (LINE $LINE)" if [ $VERBOSE ] then echo -e "-->\texpected $E, observed $O" fi fi } #}}} #{{{ function assert_no_stdout { function assert_no_stdout { if [ "$FLAG" -eq 0 ];then return; fi LINE=$(caller | cut -d " " -f1) TOTAL=$((TOTAL + 1)) if [ -n "$OUTVAL" ] then FAILS=$((FAILS + 1)) echo -e "$FAIL NON-EMPTY STDOUT (LINE $LINE)" echo -e "-->\t$OUTVAL" tail $STDERR_FILE if [ $STOP_ON_FAIL -ne "0" ];then exit; fi else SUCCESSES=$((SUCCESSES + 1)) echo -e "$PASS EMPTY STDOUT (LINE $LINE)" fi } #}}} #{{{ function assert_no_stderr { function 
assert_no_stderr { if [ "$FLAG" -eq 0 ];then return; fi LINE=$(caller | cut -d " " -f1) TOTAL=$((TOTAL + 1)) if [ -n "$ERRVAL" ] then FAILS=$((FAILS + 1)) echo -e "$FAIL NON-EMPTY STDERR(LINE $LINE)" echo -e "-->\t$ERRVAL" tail $STDERR_FILE if [ $STOP_ON_FAIL -ne "0" ];then exit; fi else SUCCESSES=$((SUCCESSES + 1)) echo -e "$PASS EMPTY STDERR(LINE $LINE)" fi } #}}} #{{{function assert_stderr { function assert_stderr { if [ "$FLAG" -eq 0 ];then return; fi LINE=$(caller | cut -d " " -f1) TOTAL=$((TOTAL + 1)) if [ -z "$ERRVAL" ] then FAILS=$((FAILS + 1)) echo -e "$FAIL EMPTY STDERR(LINE $LINE)" tail $STDERR_FILE if [ $STOP_ON_FAIL -ne "0" ];then exit; fi else SUCCESSES=$((SUCCESSES + 1)) echo -e "$PASS EMPTY STDERR(LINE $LINE)" if [ $VERBOSE ] then echo -e "-->\t$ERRVAL" fi fi } #}}} #{{{function assert_stdout { function assert_stdout { if [ "$FLAG" -eq 0 ];then return; fi LINE=$(caller | cut -d " " -f1) TOTAL=$((TOTAL + 1)) if [ -z "$OUTVAL" ] then FAILS=$((FAILS + 1)) echo -e "$FAIL EMPTY STDOUT (LINE $LINE)" tail $STDERR_FILE if [ $STOP_ON_FAIL -ne "0" ];then exit; fi else SUCCESSES=$((SUCCESSES + 1)) echo -e "$PASS NON-EMPTY STDOUT (LINE $LINE)" if [ $VERBOSE ] then echo -e "-->\t$ERRVAL" fi fi } #}}} #{{{function assert_in_stderr { function assert_in_stderr { if [ "$FLAG" -eq 0 ];then return; fi LINE=$(caller | cut -d " " -f1) TOTAL=$((TOTAL + 1)) if [ -z "$ERRVAL" ] then FAILS=$((FAILS + 1)) echo -e "$FAIL EMPTY STDERR (LINE $LINE)" tail $STDERR_FILE if [ $STOP_ON_FAIL -ne "0" ];then exit; fi else if [[ $ERRVAL == *"$1"* ]] then SUCCESSES=$((SUCCESSES + 1)) echo -e "$PASS STDERR CONTAINS \"$1\" (LINE $LINE)" if [ $VERBOSE ] then echo -e "-->\t$ERRVAL" fi else FAILS=$((FAILS + 1)) echo -e "$FAIL STDERR DOES NOT CONTAIN \"$1\" (LINE $LINE)" echo -e "-->\t$ERRVAL" tail $STDERR_FILE if [ $STOP_ON_FAIL -ne "0" ];then exit; fi fi fi } #}}} #{{{function assert_in_stdout { function assert_in_stdout { if [ "$FLAG" -eq 0 ];then return; fi LINE=$(caller | cut -d " " -f1) 
TOTAL=$((TOTAL + 1)) if [ -z "$OUTVAL" ] then FAILS=$((FAILS + 1)) echo -e "$FAIL EMPTY STDOUT (LINE $LINE)" tail $STDERR_FILE if [ $STOP_ON_FAIL -ne "0" ];then exit; fi else if [[ $OUTVAL == *"$1"* ]] then SUCCESSES=$((SUCCESSES + 1)) echo -e "$PASS STDOUT CONTAINS \"$1\" (LINE $LINE)" if [ $VERBOSE ] then echo -e "-->\t$OUTVAL" fi else FAILS=$((FAILS + 1)) echo -e "$FAIL STDOUT DOES NOT CONTAIN \"$1\" (LINE $LINE)" echo -e "-->\t$OUTVAL" tail $STDERR_FILE if [ $STOP_ON_FAIL -ne "0" ];then exit; fi fi fi } #}}} #{{{ function assert_equal { function assert_equal { if [ "$FLAG" -eq 0 ];then return; fi LINE=$(caller | cut -d " " -f1) TOTAL=$((TOTAL + 1)) if [ "$1" == "$2" ] then SUCCESSES=$((SUCCESSES + 1)) echo -e "$PASS \"$1\" == \"$2\" (LINE $LINE)" else FAILS=$((FAILS + 1)) echo -e "$FAIL \"$1\" != \"$2\" (LINE $LINE)" tail $STDERR_FILE if [ $STOP_ON_FAIL -ne "0" ];then exit; fi fi } #}}} #{{{ function assert_true { function assert_true { COMMAND=("$@") RES=`${COMMAND[@]}` echo $RES || "AAAAAAAAAAA" if [ "$FLAG" -eq 0 ];then return; fi LINE=$(caller | cut -d " " -f1) TOTAL=$((TOTAL + 1)) if [ "${COMMAND[@]}" == true ] then SUCCESSES=$((SUCCESSES + 1)) echo -e "$PASS $* (LINE $LINE)" else FAILS=$((FAILS + 1)) echo -e "$FAIL $* (LINE $LINE)" tail $STDERR_FILE if [ $STOP_ON_FAIL -ne "0" ];then exit; fi fi } #}}} #{{{function tear_down function tear_down { : #define this function in your test to clean things up in the end } #}}} ================================================ FILE: test/README.md ================================================ These BAM files contain reads that align to the small set of SV-containing regions below. 
Regions included in BAM files ``` 2 89160083 89186670 INV 4 113984874 113987369 DEL 5 1021803 1026877 DEL 12 12543868 12547613 INV 12 47289448 47310758 INV 19 12693867 12699924 DEL ``` Variants: ``` 2 89161083 89185670 INV 4 113985874 113986369 DEL 5 1022803 1025877 DEL 12 12544868 12546613 INV 12 47290448 47309758 INV 19 12694867 12698924 DEL ``` ================================================ FILE: test/data/2_59305747-59505747_X_151018513-151218513.BND.bam ================================================ [File too large to display: 15.2 MB] ================================================ FILE: test/data/README.md ================================================ This directory contains data and scripts for the download of that data. Alignments are from Genome in a Bottle public resources. Running the `subset_alignments.sh` script will download the data available in these alignment files. These alignments only includes reads from the regions included in the `examples_padded.bed` file. The regions of interest (SVs and one normal region) are indicated in the `examples.bed` file. ================================================ FILE: test/data/commands.sh ================================================ set -e #download hg19 reference for cram FILE="hg19.fa.gz" if [ ! 
-f $FILE ]; then wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz gunzip hg19.fa.gz bgzip hg19.fa fi #images of each type with all technologies mkdir -p test_imgs samplot plot -n Illumina PacBio ONT 10X -t DEL -c 1 -s 24804397 -e 24807302 -o test_imgs/DEL_1_24804397_24807302.png -b HG002_Illumina.bam HG002_PacBio.bam HG002_ONT.cram HG002_10X.bam -r hg19.fa.gz samplot plot -n Illumina PacBio ONT 10X -t DUP -c 4 -s 99813786 -e 99817098 -o test_imgs/DUP_4_99813786_99817098.png -b HG002_Illumina.bam HG002_PacBio.bam HG002_ONT.cram HG002_10X.bam -r hg19.fa.gz samplot plot -n Illumina PacBio ONT 10X -t DUP -c 11 -s 67974431 -e 67975639 -o test_imgs/DUP_11_67974431_67975639.png -b HG002_Illumina.bam HG002_PacBio.bam HG002_ONT.cram HG002_10X.bam -r hg19.fa.gz samplot plot -n Illumina PacBio ONT 10X -t INV -c 12 -s 12544867 -e 12546613 -o test_imgs/INV_12_12544867_12546613.png -b HG002_Illumina.bam HG002_PacBio.bam HG002_ONT.cram HG002_10X.bam -r hg19.fa.gz #zoom example samplot plot -n Illumina PacBio ONT 10X -t DUP -c 4 -s 99813786 -e 99817098 -o test_imgs/DUP_4_99813786_99817098_zoom.png -b HG002_Illumina.bam HG002_PacBio.bam HG002_ONT.cram HG002_10X.bam -r hg19.fa.gz --zoom 1000 #trios with no variant samplot plot -n HG002 HG003 HG004 -c 1 -s 43059290 -e 43059950 -o test_imgs/1_43059290_43059950.png -b HG002_Illumina.bam HG003_Illumina.bam HG004_Illumina.bam #trios of each type samplot plot -n HG002 HG003 HG004 -t DEL -c 1 -s 24804397 -e 24807302 -o test_imgs/trio_DEL_1_24804397_24807302.png -b HG002_Illumina.bam HG003_Illumina.bam HG004_Illumina.bam samplot plot -n HG002 HG003 HG004 -t DUP -c 4 -s 99813786 -e 99817098 -o test_imgs/trio_DUP_4_99813786_99817098.png -b HG002_Illumina.bam HG003_Illumina.bam HG004_Illumina.bam samplot plot -n HG002 HG003 HG004 -t DUP -c 11 -s 67974431 -e 67975639 -o test_imgs/trio_DUP_11_67974431_67975639.png -b HG002_Illumina.bam HG003_Illumina.bam HG004_Illumina.bam samplot plot -n HG002 HG003 HG004 -t INV -c 12 -s 
12544867 -e 12546613 -o test_imgs/trio_INV_12_12544867_12546613.png -b HG002_Illumina.bam HG003_Illumina.bam HG004_Illumina.bam #create a temporary example website mkdir -p test_site samplot vcf -d test_site/ --vcf test.vcf --sample_ids HG002 HG003 HG004 -b HG002_Illumina.bam HG003_Illumina.bam HG004_Illumina.bam > test_site_cmds.sh ================================================ FILE: test/data/examples.bed ================================================ 1 24804398 24807302 DEL HET 1 43059290 43059950 NA HOM 4 99813787 99817098 DUP HOM 11 67974432 67975639 DUP HET 12 12544868 12546613 INV HOM 19 12694867 12698924 DEL HOM ================================================ FILE: test/data/examples_padded.bed ================================================ 1 24802398 24809302 DEL HET 1 43057290 43061950 NA HOM 4 99811787 99819098 DUP HOM 11 67972432 67977639 DUP HET 12 12542868 12548613 INV HOM 19 12692867 12700924 DEL HOM ================================================ FILE: test/data/subset_alignments.sh ================================================ #download example regions from GIAB 300X Illumina Ashkenazi Trio samtools view -h -b -L examples_padded.bed ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/NIST_HiSeq_HG002_Homogeneity-10953946/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG002.hs37d5.300x.bam > HG002_Illumina.bam samtools view -h -b -L examples_padded.bed ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG003_NA24149_father/NIST_HiSeq_HG003_Homogeneity-12389378/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG003.hs37d5.300x.bam > HG003_Illumina.bam samtools view -h -b -L examples_padded.bed ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG004_NA24143_mother/NIST_HiSeq_HG004_Homogeneity-14572558/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG004.hs37d5.300x.bam > HG004_Illumina.bam #download example regions from GIAB 300X Illumina Ashkenazi Trio Son samtools view -h -b -L examples_padded.bed 
ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/PacBio_MtSinai_NIST/Baylor_NGMLR_bam_GRCh37/HG002_PB_70x_RG_HP10XtrioRTG.bam > HG002_PacBio.bam samtools view -h -b -L examples_padded.bed ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/analysis/10XGenomics_ChromiumGenome_LongRanger2.2_Supernova2.0.1_04122018/GRCh37/NA24385_300G/HG002_10x_84x_RG_HP10xtrioRTG.bam > HG002_10X.bam samtools view -h -b -C -L examples_padded.bed ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/Ultralong_OxfordNanopore/combined_2018-08-10/HG002_ONTrel2_16x_RG_HP10xtrioRTG.cram > HG002_ONT.cram #index new alignment files samtools index HG002_10X.bam samtools index HG002_Illumina.bam samtools index HG002_ONT.cram samtools index HG002_PacBio.bam samtools index HG003_Illumina.bam samtools index HG004_Illumina.bam ================================================ FILE: test/data/test.ped ================================================ 0001 HG004 0 0 0 0001 HG003 0 0 1 0001 HG002 HG003 HG004 0 ================================================ FILE: test/data/test.vcf ================================================ ##fileformat=VCFv4.1 ##fileDate=20170929 ##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= 
##bcftools_viewVersion=1.3.1-1-g4d44e83-dirty+htslib-1.3.1-12-g0454d47 ##bcftools_viewCommand=view -c 1 -s NA12878 ALL.wgs.mergedSV.v8.20130502.svs.genotypes.vcf.gz ##bcftools_viewCommand=view -i '(SVTYPE="DEL" || SVTYPE="DUP" || SVTYPE="INV" || SVTYPE="INS")' test.vcf #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG002 HG003 HG004 1 24804398 1 T 2087.90 PASS SVTYPE=DEL;CIEND=100,100;CIPOS=1000,1000;END=24807302; GT:GQ:SQ:GL:DP:RO:AO:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:146:581.24:-64,-6,-51:109:77:31:76:30:52:9:2:24:18:0.28 0/0:130:1506.66:-153,-15,-2:52:0:52:0:51:0:23:3:0:24:1 1/0:200:0.00:-0,-54,-181:182:182:0:181:0:120:0:0:61:0:0 1 24804399 1 T 2087.90 PASS SVTYPE=TRA;CIEND=0,0;CIPOS=0,0;END=43059290;CHR2=1 GT:GQ:SQ:GL:DP:RO:AO:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:146:581.24:-64,-6,-51:109:77:31:76:30:52:9:2:24:18:0.28 0/0:130:1506.66:-153,-15,-2:52:0:52:0:51:0:23:3:0:24:1 1/0:200:0.00:-0,-54,-181:182:182:0:181:0:120:0:0:61:0:0 1 24804400 1 T 2087.90 PASS SVTYPE=TRA;CIEND=0,0;CIPOS=0,0;END=99813787;CHR2=4 GT:GQ:SQ:GL:DP:RO:AO:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:146:581.24:-64,-6,-51:109:77:31:76:30:52:9:2:24:18:0.28 0/0:130:1506.66:-153,-15,-2:52:0:52:0:51:0:23:3:0:24:1 1/0:200:0.00:-0,-54,-181:182:182:0:181:0:120:0:0:61:0:0 1 43059290 2 T 2087.90 PASS SVTYPE=DEL;CIEND=0,0;CIPOS=0,0;END=43059950; GT:GQ:SQ:GL:DP:RO:AO:QR:QA:RS:AS:ASC:RP:AP:AB 0/0:146:581.24:-64,-6,-51:109:77:31:76:30:52:9:2:24:18:0.28 0/0:130:1506.66:-153,-15,-2:52:0:52:0:51:0:23:3:0:24:1 0/0:200:0.00:-0,-54,-181:182:182:0:181:0:120:0:0:61:0:0 4 99813787 3 T 2087.90 PASS SVTYPE=DUP;CIEND=0,0;CIPOS=0,0;END=99817098; GT:GQ:SQ:GL:DP:RO:AO:QR:QA:RS:AS:ASC:RP:AP:AB 1/1:146:581.24:-64,-6,-51:109:77:31:76:30:52:9:2:24:18:0.28 1/1:130:1506.66:-153,-15,-2:52:0:52:0:51:0:23:3:0:24:1 1/1:200:0.00:-0,-54,-181:182:182:0:181:0:120:0:0:61:0:0 11 67974432 4 T 2087.90 PASS SVTYPE=DUP;CIEND=0,0;CIPOS=0,0;END=67975639; GT:GQ:SQ:GL:DP:RO:AO:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:146:581.24:-64,-6,-51:109:77:31:76:30:52:9:2:24:18:0.28 
0/1:130:1506.66:-153,-15,-2:52:0:52:0:51:0:23:3:0:24:1 0/0:200:0.00:-0,-54,-181:182:182:0:181:0:120:0:0:61:0:0 12 12544868 5 T 2087.90 PASS SVTYPE=INV;CIEND=0,0;CIPOS=0,0;END=12546613; GT:GQ:SQ:GL:DP:RO:AO:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:146:581.24:-64,-6,-51:109:77:31:76:30:52:9:2:24:18:0.28 0/1:130:1506.66:-153,-15,-2:52:0:52:0:51:0:23:3:0:24:1 1/0:200:0.00:-0,-54,-181:182:182:0:181:0:120:0:0:61:0:0 19 12694867 6 T 2087.90 PASS SVTYPE=DEL;CIEND=0,0;CIPOS=0,0;END=12698924; GT:GQ:SQ:GL:DP:RO:AO:QR:QA:RS:AS:ASC:RP:AP:AB 1/1:146:581.24:-64,-6,-51:109:77:31:76:30:52:9:2:24:18:0.28 1/1:130:1506.66:-153,-15,-2:52:0:52:0:51:0:23:3:0:24:1 1/1:200:0.00:-0,-54,-181:182:182:0:181:0:120:0:0:61:0:0 19 12694868 6 T 2087.90 PASS SVTYPE=DEL;CIEND=0,0;CIPOS=0,0;END=12698924; GT:GQ:SQ:GL:DP:RO:AO:QR:QA:RS:AS:ASC:RP:AP:AB 1/0:146:581.24:-64,-6,-51:109:77:31:76:30:52:9:2:24:18:0.28 0/0:130:1506.66:-153,-15,-2:52:0:52:0:51:0:23:3:0:24:1 0/0:200:0.00:-0,-54,-181:182:182:0:181:0:120:0:0:61:0:0 ================================================ FILE: test/data/test_site/README.md ================================================ Site generated using: ``` samplot vcf --vcf test/data/test.vcf -b test/data/HG002_Illumina.bam test/data/HG003_Illumina.bam test/data/HG004_Illumina.bam --format GT,GQ --sample_ids HG002 HG003 HG004 ``` ================================================ FILE: test/data/test_site/index.html ================================================ samplot
================================================ FILE: test/data/test_site_cmds.sh ================================================ samplot -z 4 --minq 0 -n HG002 HG004 control-sample:HG003 --start_ci '0,0' --end_ci '0,0' -t DEL -c 1 -s 24804397 -e 24807302 -o test_site/DEL_1_24804397_24807302.png -d 1 -b HG002_Illumina.bam HG004_Illumina.bam HG003_Illumina.bam samplot -z 4 --minq 0 -n HG002 HG003 HG004 --start_ci '0,0' --end_ci '0,0' -t DUP -c 4 -s 99813786 -e 99817098 -o test_site/DUP_4_99813786_99817098.png -d 1 -b HG002_Illumina.bam HG003_Illumina.bam HG004_Illumina.bam samplot -z 3 --minq 0 -n HG002 HG003 control-sample:HG004 --start_ci '0,0' --end_ci '0,0' -t DUP -c 11 -s 67974431 -e 67975639 -o test_site/DUP_11_67974431_67975639.png -d 1 -b HG002_Illumina.bam HG003_Illumina.bam HG004_Illumina.bam samplot -z 3 --minq 0 -n HG002 HG003 HG004 --start_ci '0,0' --end_ci '0,0' -t INV -c 12 -s 12544867 -e 12546613 -o test_site/INV_12_12544867_12546613.png -d 1 -b HG002_Illumina.bam HG003_Illumina.bam HG004_Illumina.bam samplot -z 4 --minq 0 -n HG002 HG003 HG004 --start_ci '0,0' --end_ci '0,0' -t DEL -c 19 -s 12694866 -e 12698924 -o test_site/DEL_19_12694866_12698924.png -d 1 -b HG002_Illumina.bam HG003_Illumina.bam HG004_Illumina.bam ================================================ FILE: test/func/samplot_test.sh ================================================ #!/bin/bash test -e ssshtest || wget -q https://raw.githubusercontent.com/ryanlayer/ssshtest/master/ssshtest . 
ssshtest STOP_ON_FAIL=0 data_path="test/data/" func_path="test/func/" bam_1=$data_path"NA12878_restricted.bam" bam_2=$data_path"NA12889_restricted.bam" bam_3=$data_path"NA12890_restricted.bam" sv_chrm=chr4 sv_start=115928730 sv_end=115931875 sv_type=DEL out_file_name=$func_path"test_del.png" rm -f $out_file_name run basic_operation \ samplot plot \ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam_1 $bam_2 $bam_3 \ -o $out_file_name \ -t $sv_type if [ $basic_operation ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name sv_chrm=chr4 sv_start=115928730 sv_end=115931875 sv_type=DEL out_file_name=$func_path"test_max_coverage.png" rm -f $out_file_name run max_coverage \ samplot plot \ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam_1 $bam_2 $bam_3 \ -o $out_file_name \ --max_coverage 50\ -t $sv_type if [ $max_coverage ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name sv_chrm=chr4 sv_start=115928730 sv_end=115931875 sv_type=DEL out_file_name=$func_path"test_coverage_only.png" rm -f $out_file_name run coverage_only \ samplot plot \ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam_1 \ -o $out_file_name \ --coverage_only \ -t $sv_type if [ $coverage_only ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name out_file_name=$func_path"test_same_yaxis.png" sv_chrm=chrX sv_start=101055330 sv_end=101067156 sv_type=DUP rm -f $out_file_name run same_yaxis \ samplot plot \ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam_1 $bam_2 $bam_3\ -o $out_file_name \ -t $sv_type \ --same_yaxis_scales if [ $same_yaxis ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name sv_chrm=chr4 sv_start=115928730 sv_end=115931875 sv_type=DEL out_file_name=$func_path"test_zoom.png" rm -f
$out_file_name run basic_operation_zoom \ samplot plot \ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam_1 $bam_2 $bam_3 \ -o $out_file_name \ -t $sv_type \ --zoom 500 if [ $basic_operation_zoom ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name out_file_name=$func_path"sample.png" run sampling_normal \ samplot plot\ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam_1 $bam_2 $bam_3 \ -o $out_file_name \ -t $sv_type \ -d 10 if [ $sampling_normal ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name out_file_name=$func_path"sample_zoom.png" run sampling_normal_zoom \ samplot plot \ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam_1 $bam_2 $bam_3 \ -o $out_file_name \ -t $sv_type \ -d 10 \ --zoom 500 if [ $sampling_normal_zoom ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name sv_chrm=chrX sv_start=101055330 sv_end=101067156 sv_type=DUP out_file_name=$func_path"dup.png" rm -f $out_file_name run common_insert_size_scale \ samplot plot\ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam_1 $bam_2 $bam_3 \ -o $out_file_name \ -t $sv_type \ -d 10 \ --common_insert_size if [ $common_insert_size_scale ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name out_file_name=$func_path"dup_zoom.png" rm -f $out_file_name run common_insert_size_scale_zoom \ samplot plot \ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam_1 $bam_2 $bam_3 \ -o $out_file_name \ -t $sv_type \ -d 10 \ --zoom 500 \ --common_insert_size if [ $common_insert_size_scale_zoom ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name out_file_name=$func_path"no_sv_type.png" rm -f $out_file_name run no_sv_type \ samplot plot \ -c $sv_chrm -s 
$sv_start -e $sv_end \ -b $bam_1 $bam_2 $bam_3 \ -o $out_file_name \ -d 10 \ --common_insert_size if [ $no_sv_type ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name sv_chrm=X sv_start=101055330 sv_end=101067156 sv_type=DUP out_file_name=$func_path"longread_nanopore_dup.png" bam=$data_path"nanopore-NA12878.bam" rm -f $out_file_name run nanopore_dup \ samplot plot \ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam \ -o $out_file_name \ -t $sv_type \ -d 10 if [ $nanopore_dup ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_in_stderr "Insufficient reads for fragment length estimate." fi rm $out_file_name out_file_name=$func_path"longread_nanopore_dup_zoom.png" rm -f $out_file_name run nanopore_dup_zoom \ samplot plot\ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam \ -o $out_file_name \ -t $sv_type \ -d 10 \ --zoom 1000 if [ $nanopore_dup_zoom ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_in_stderr "Insufficient reads for fragment length estimate." fi rm $out_file_name sv_chrm=4 sv_start=115928730 sv_end=115931875 sv_type=DEL out_file_name=$func_path"longread_nanopore_del.png" bam=$data_path"nanopore-NA12878.bam" rm -f $out_file_name run nanopore_del \ samplot plot\ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam \ -o $out_file_name \ -t $sv_type \ -d 10 if [ $nanopore_del ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_in_stderr "Insufficient reads for fragment length estimate." 
fi rm $out_file_name out_file_name=$func_path"longread_nanopore_del_zoom.png" rm -f $out_file_name run nanopore_del_zoom \ samplot plot\ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam \ -o $out_file_name \ -t $sv_type \ -d 10 \ --zoom 500 if [ $nanopore_del_zoom ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_in_stderr "Insufficient reads for fragment length estimate." fi rm $out_file_name sv_chrm=chr1 sv_start=58343117 sv_end=58343622 sv_type=DEL out_file_name=$func_path"longread_del.png" bam=$data_path"hg19_chr1_58343117_58343622_deletion.bam" rm -f $out_file_name run longread_del \ samplot plot \ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam \ -o $out_file_name \ -t $sv_type \ -d 10 if [ $longread_del ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_in_stderr "Insufficient reads for fragment length estimate." fi rm $out_file_name out_file_name=$func_path"longread_del_zoom_big_zoom.png" rm -f $out_file_name run longread_del_zoom_big_zoom \ samplot plot\ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam \ -o $out_file_name \ -t $sv_type \ -d 10 \ --zoom 500 if [ $longread_del_zoom_big_zoom ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_in_stderr "Insufficient reads for fragment length estimate." assert_no_stdout fi rm $out_file_name out_file_name=$func_path"longread_del_zoom_zoom.png" rm -f $out_file_name run longread_del_zoom_zoom \ samplot plot\ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam \ -o $out_file_name \ -t $sv_type \ -d 10 \ --zoom 200 if [ $longread_del_zoom_zoom ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_in_stderr "Insufficient reads for fragment length estimate." 
fi rm $out_file_name sv_chrm=chr21 sv_start=27373431 sv_end=27375410 sv_type=INV out_file_name=$func_path"longread_inv.png" bam=$data_path"hg19_chr21_27373431_27375410_inversion.bam" rm -f $out_file_name run longread_inv \ samplot plot\ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam \ -o $out_file_name \ -t $sv_type \ -d 10 if [ $longread_inv ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_in_stderr "Insufficient reads for fragment length estimate." fi rm $out_file_name out_file_name=$func_path"longread_inv_zoom.png" rm -f $out_file_name run longread_inv_zoom \ samplot plot\ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam \ -o $out_file_name \ -t $sv_type \ -d 10 \ --zoom 750 if [ $longread_inv_zoom ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_in_stderr "Insufficient reads for fragment length estimate." fi rm $out_file_name sv_chrm=1 sv_start=89475845 sv_end=89478561 sv_type=DEL out_file_name=$func_path"linkedread_del.png" bam=$data_path"HG002_1_89475845-89478561_DEL.tenx.bam" rm -f $out_file_name run linkedread_del \ samplot plot\ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam \ -o $out_file_name \ -t $sv_type \ -d 10 if [ $linkedread_del ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name out_file_name=$func_path"linkedread_del_zoom.png" rm -f $out_file_name run linkedread_del_zoom \ samplot plot\ -c $sv_chrm -s $sv_start -e $sv_end \ -b $bam \ -o $out_file_name \ -t $sv_type \ -d 10 \ --zoom 500 if [ $linkedread_del_zoom ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name sv_chrm_1=2 sv_start_1=59405943 sv_end_1=59405943 sv_chrm_2=X sv_start_2=151118533 sv_end_2=151118533 sv_type=BND out_file_name=$func_path"translocation.png" bam=$data_path"2_59305747-59505747_X_151018513-151218513.BND.bam" run 
translocation \ samplot plot\ -c $sv_chrm_1 -s $sv_start_1 -e $sv_end_1 \ -c $sv_chrm_2 -s $sv_start_2 -e $sv_end_2 \ -b $bam \ -o $out_file_name \ -t $sv_type \ -A $data_path"Alu.2_X.bed.gz" \ -T $data_path"Homo_sapiens.GRCh37.82.sort.2_X.gff3.gz" \ --zoom 10000 if [ $translocation ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name out_file_name=$func_path"csi-annotation.png" bam=$data_path"2_59305747-59505747_X_151018513-151218513.BND.bam" run translocation_csi \ samplot plot\ -c $sv_chrm_1 -s $sv_start_1 -e $sv_end_1 \ -c $sv_chrm_2 -s $sv_start_2 -e $sv_end_2 \ -b $bam \ -o $out_file_name \ -t $sv_type \ -A $data_path"Alu.2_X.csionly.bed.gz" \ -T $data_path"Homo_sapiens.GRCh37.csionly.2_X.gff3.gz" \ --zoom 10000 if [ $translocation_csi ]; then assert_exit_code 0 assert_equal $out_file_name $( ls $out_file_name ) assert_no_stdout assert_no_stderr fi rm $out_file_name rm -rf $func_path"img/" ssshtest ================================================ FILE: test/func/samplot_vcf_test.sh ================================================ #!/bin/bash test -e ssshtest || wget -q https://raw.githubusercontent.com/ryanlayer/ssshtest/master/ssshtest .
ssshtest STOP_ON_FAIL=0 data_path="test/data/" func_path="test/func/" bam_1=$data_path"NA12878_restricted.bam" bam_2=$data_path"NA12889_restricted.bam" bam_3=$data_path"NA12890_restricted.bam" vcf_file=$data_path"test.vcf" cmd_file=$func_path"test.cmd" test_dir=$func_path"test_vcf_dir" rm -f $cmd_file rm -rf $test_dir run from_vcf \ samplot vcf \ -d $test_dir \ --vcf $vcf_file \ --sample_ids HG002 HG003 HG004 \ -b $data_path"HG002_Illumina.bam" \ $data_path"HG003_Illumina.bam" \ $data_path"HG004_Illumina.bam" \ --manual_run\ --command_file $cmd_file if [ $from_vcf ]; then assert_no_stderr assert_exit_code 0 assert_equal $test_dir/index.html $( ls $test_dir/index.html ) assert_equal $cmd_file $( ls $cmd_file ) fi rm -f $cmd_file rm -rf $test_dir vcf_file=$data_path"test.vcf" cmd_file=$func_path"test.cmd" test_dir=$func_path"test_vcf_gff3_dir" rm -f $cmd_file rm -rf $test_dir run from_vcf_gff3 \ samplot vcf \ -d $test_dir \ --vcf $vcf_file \ --sample_ids HG002 HG003 HG004 \ -b $data_path"HG002_Illumina.bam" \ $data_path"HG003_Illumina.bam" \ $data_path"HG004_Illumina.bam" \ --gff3 $data_path"Homo_sapiens.GRCh37.82.sort.2_X.gff3.gz"\ --manual_run\ --command_file $cmd_file if [ $from_vcf_gff3 ]; then assert_no_stderr assert_exit_code 0 assert_equal $test_dir/index.html $( ls $test_dir/index.html ) assert_equal $cmd_file $( ls $cmd_file ) fi rm -f $cmd_file rm -rf $test_dir vcf_file=$data_path"test.vcf" cmd_file=$func_path"test.cmd" test_dir=$func_path"test_vcf_gff3_dir" rm -f $cmd_file rm -rf $test_dir run from_vcf_annotated \ samplot vcf \ -d $test_dir \ --vcf $vcf_file \ --sample_ids HG002 HG003 HG004 \ -b $data_path"HG002_Illumina.bam" \ $data_path"HG003_Illumina.bam" \ $data_path"HG004_Illumina.bam" \ -T $data_path"Homo_sapiens.GRCh37.82.sort.2_X.gff3.gz"\ -A $data_path"Alu.2_X.bed.gz" \ --manual_run\ --command_file $cmd_file if [ $from_vcf_annotated ]; then assert_no_stderr assert_exit_code 0 assert_equal $test_dir/index.html $( ls $test_dir/index.html ) 
assert_equal $cmd_file $( ls $cmd_file ) fi rm -f $cmd_file rm -rf $test_dir vcf_file=$data_path"test.vcf" cmd_file=$func_path"test.cmd" test_dir=$func_path"test_vcf_auto_dir" rm -rf $test_dir run from_vcf_auto \ samplot vcf \ -d $test_dir \ --vcf $vcf_file \ --sample_ids HG002 HG003 HG004 \ -b $data_path"HG002_Illumina.bam" \ $data_path"HG003_Illumina.bam" \ $data_path"HG004_Illumina.bam" if [ $from_vcf_auto ]; then assert_in_stderr "Window size is under 1.5x the estimated fragment length and will be resized to 847. Rerun with -w 604 to override" assert_exit_code 0 assert_equal $test_dir/index.html $( ls $test_dir/index.html ) assert_equal $test_dir/DEL_1_24804397_24807302.png $( ls $test_dir/DEL_1_24804397_24807302.png ) assert_equal $test_dir/DUP_4_99813786_99817098.png $( ls $test_dir/DUP_4_99813786_99817098.png ) assert_equal $test_dir/DUP_11_67974431_67975639.png $( ls $test_dir/DUP_11_67974431_67975639.png ) assert_equal $test_dir/INV_12_12544867_12546613.png $( ls $test_dir/INV_12_12544867_12546613.png ) assert_equal $test_dir/DEL_19_12694866_12698924.png $( ls $test_dir/DEL_19_12694866_12698924.png ) assert_equal $test_dir/TRA_1_24804398_43059290.png $( ls $test_dir/TRA_1_24804398_43059290.png ) assert_equal $test_dir/TRA_1_24804399_99813787.png $( ls $test_dir/TRA_1_24804399_99813787.png ) fi rm -f $cmd_file rm -rf $test_dir vcf_file=$data_path"test.vcf" cmd_file=$func_path"test.cmd" test_dir=$func_path"test_vcf_auto_multithread_dir" rm -rf $test_dir run from_vcf_auto_multithread \ samplot vcf \ -d $test_dir \ --vcf $vcf_file \ --sample_ids HG002 HG003 HG004 \ -b $data_path"HG002_Illumina.bam" \ $data_path"HG003_Illumina.bam" \ $data_path"HG004_Illumina.bam" \ -t 2 if [ $from_vcf_auto_multithread ]; then assert_in_stderr "Window size is under 1.5x the estimated fragment length and will be resized to 847. 
Rerun with -w 604 to override" assert_exit_code 0 assert_equal $test_dir/index.html $( ls $test_dir/index.html ) assert_equal $test_dir/DEL_1_24804397_24807302.png $( ls $test_dir/DEL_1_24804397_24807302.png ) assert_equal $test_dir/DUP_4_99813786_99817098.png $( ls $test_dir/DUP_4_99813786_99817098.png ) assert_equal $test_dir/DUP_11_67974431_67975639.png $( ls $test_dir/DUP_11_67974431_67975639.png ) assert_equal $test_dir/INV_12_12544867_12546613.png $( ls $test_dir/INV_12_12544867_12546613.png ) assert_equal $test_dir/DEL_19_12694866_12698924.png $( ls $test_dir/DEL_19_12694866_12698924.png ) assert_equal $test_dir/TRA_1_24804398_43059290.png $( ls $test_dir/TRA_1_24804398_43059290.png ) assert_equal $test_dir/TRA_1_24804399_99813787.png $( ls $test_dir/TRA_1_24804399_99813787.png ) fi rm -f $cmd_file rm -rf $test_dir vcf_file=$data_path"test.vcf" cmd_file=$func_path"test.cmd" test_dir=$func_path"test_plotall_dir" rm -f $cmd_file rm -rf $test_dir run plot_all \ samplot vcf \ -d $test_dir \ --vcf $vcf_file \ --sample_ids HG002 HG003 HG004 \ -b $data_path"HG002_Illumina.bam" \ $data_path"HG003_Illumina.bam" \ $data_path"HG004_Illumina.bam" \ --plot_all if [ $plot_all ]; then assert_in_stderr "Window size is under 1.5x the estimated fragment length and will be resized to 847. 
Rerun with -w 604 to override" assert_exit_code 0 assert_equal "$test_dir/index.html" $( ls $test_dir/index.html ) assert_equal "$test_dir/DEL_19_12694866_12698924.png" $( ls "$test_dir/DEL_19_12694866_12698924.png" ) assert_equal "$test_dir/DUP_4_99813786_99817098.png" $( ls "$test_dir/DUP_4_99813786_99817098.png" ) assert_equal "$test_dir/DUP_4_99813786_99817098.png" $( ls "$test_dir/DUP_4_99813786_99817098.png" ) assert_equal "$test_dir/TRA_1_24804398_43059290.png" $( ls $test_dir/TRA_1_24804398_43059290.png ) assert_equal "$test_dir/TRA_1_24804399_99813787.png" $( ls $test_dir/TRA_1_24804399_99813787.png ) assert_equal "$test_dir/DEL_1_24804397_24807302.png" $( ls "$test_dir/DEL_1_24804397_24807302.png" ) assert_equal "$test_dir/DUP_11_67974431_67975639.png" $( ls "$test_dir/DUP_11_67974431_67975639.png" ) assert_equal "$test_dir/INV_12_12544867_12546613.png" $( ls "$test_dir/INV_12_12544867_12546613.png" ) fi rm -f $cmd_file rm -rf $test_dir vcf_file=$data_path"test.vcf" cmd_file=$func_path"test.cmd" test_dir=$func_path"test_plotall_multithread_dir" rm -f $cmd_file rm -rf $test_dir run plot_all_multithread \ samplot vcf \ -d $test_dir \ --vcf $vcf_file \ --sample_ids HG002 HG003 HG004 \ -b $data_path"HG002_Illumina.bam" \ $data_path"HG003_Illumina.bam" \ $data_path"HG004_Illumina.bam" \ --plot_all \ -t 2 if [ $plot_all_multithread ]; then assert_in_stderr "Window size is under 1.5x the estimated fragment length and will be resized to 847. 
Rerun with -w 604 to override" assert_exit_code 0 assert_equal "$test_dir/index.html" $( ls $test_dir/index.html ) assert_equal "$test_dir/DEL_19_12694866_12698924.png" $( ls "$test_dir/DEL_19_12694866_12698924.png" ) assert_equal "$test_dir/DUP_4_99813786_99817098.png" $( ls "$test_dir/DUP_4_99813786_99817098.png" ) assert_equal "$test_dir/DUP_4_99813786_99817098.png" $( ls "$test_dir/DUP_4_99813786_99817098.png" ) assert_equal "$test_dir/TRA_1_24804398_43059290.png" $( ls $test_dir/TRA_1_24804398_43059290.png ) assert_equal "$test_dir/TRA_1_24804399_99813787.png" $( ls $test_dir/TRA_1_24804399_99813787.png ) assert_equal "$test_dir/DEL_1_24804397_24807302.png" $( ls "$test_dir/DEL_1_24804397_24807302.png" ) assert_equal "$test_dir/DUP_11_67974431_67975639.png" $( ls "$test_dir/DUP_11_67974431_67975639.png" ) assert_equal "$test_dir/INV_12_12544867_12546613.png" $( ls "$test_dir/INV_12_12544867_12546613.png" ) fi rm -f $cmd_file rm -rf $test_dir vcf_file=$data_path"test.vcf" cmd_file=$func_path"test.cmd" test_dir=$func_path"test_vcf_dir" ped_file=$data_path"test.ped" run denovo_only_noped \ samplot vcf \ -d $test_dir \ --vcf $vcf_file \ --sample_ids HG002 HG003 HG004 \ -b $data_path"HG002_Illumina.bam" \ $data_path"HG003_Illumina.bam" \ $data_path"HG004_Illumina.bam" \ --dn_only if [ $denovo_only_noped ]; then assert_in_stderr "Missing --ped, required when using --dn_only" fi vcf_file=$data_path"test.vcf" cmd_file=$func_path"test.cmd" test_dir=$func_path"test_vcf_dir" ped_file=$data_path"test.ped" rm -f $cmd_file rm -rf $test_dir run denovo_only \ samplot vcf \ -d $test_dir \ --sample_ids HG002 HG003 HG004 \ --vcf $vcf_file \ -b $data_path"HG002_Illumina.bam" \ $data_path"HG003_Illumina.bam" \ $data_path"HG004_Illumina.bam" \ --dn_only \ --ped $data_path"test.ped" if [ $denovo_only ]; then assert_no_stderr assert_exit_code 0 echo "====================================================================" ls "$test_dir/DUP_4_99813786_99817098.png" echo 
"====================================================================" assert_equal "$test_dir/DEL_19_12694867_12698924.png" $( ls "$test_dir/DEL_19_12694867_12698924.png" ) assert_equal "" $( ls "$test_dir/DUP_4_99813786_99817098.png" ) assert_equal "" $( ls "$test_dir/TRA_1_24804399_43059290.png" ) assert_equal "" $( ls "$test_dir/TRA_1_24804398_99813787.png" ) assert_equal "" $( ls "$test_dir/DEL_1_24804397_24807302.png" ) fi # rm -rf ssshtest ================================================ FILE: test/unit/samplot_test.py ================================================ import unittest import sys from samplot import samplot bam_1 = 'test/data/NA12878_restricted.bam' bam_2 = 'test/data/NA12889_restricted.bam' bam_3 = 'test/data/NA12890_restricted.bam' bams=[bam_1, bam_2, bam_3] sv_chrm = 'chr4' sv_start = 115928730 sv_end = 115931875 sv_type = 'DEL' #{{{ class Test_set_plot_dimensions(unittest.TestCase): class Test_set_plot_dimensions(unittest.TestCase): #{{{ def test_set_plot_dimensions(self): def test_set_plot_dimensions(self): ''' def set_plot_dimensions(sv, sv_type, arg_plot_height, arg_plot_width, bams, reference, annotation_files, transcript_file, arg_window, zoom): ''' plot_height = None plot_width = None annotation_files = None transcript_file = None zoom = 500000 window = None sv = [samplot.genome_interval(sv_chrm,sv_start,sv_end)] # Test basic function where window is set to be proportional to SV size r_plot_height, r_plot_width, r_window, r_ranges = \ samplot.set_plot_dimensions(sv, sv_type, plot_height, plot_width, bams, None, annotation_files, transcript_file, window, zoom) self.assertEqual(r_plot_height, 5) self.assertEqual(r_plot_width, 8) this_window = int((sv_end - sv_start)/2) self.assertEqual( r_window, this_window) self.assertEqual( r_ranges[0], samplot.genome_interval(sv_chrm, sv_start - this_window, sv_end + this_window)) # Test to see if zoom is ignored when it is larger than window zoom = 10000 r_plot_height, r_plot_width, r_window, 
r_ranges = \ samplot.set_plot_dimensions(sv, sv_type, plot_height, plot_width, bams, None, annotation_files, transcript_file, window, zoom) self.assertEqual( r_ranges[0], samplot.genome_interval(sv_chrm, sv_start - this_window, sv_end + this_window)) # Test to see if zoom creates two ranges zoom = 100 r_plot_height, r_plot_width, r_window, r_ranges = \ samplot.set_plot_dimensions(sv, sv_type, plot_height, plot_width, bams, None, annotation_files, transcript_file, window, zoom) self.assertEqual( r_window, zoom) self.assertEqual( len(r_ranges), 2) self.assertEqual( r_ranges[0], samplot.genome_interval(sv_chrm, sv_start - zoom, sv_start + zoom,)) self.assertEqual( r_ranges[1], samplot.genome_interval(sv_chrm, sv_end - zoom, sv_end + zoom) ) # Test to multiple sv regions window = None zoom = None sv = [samplot.genome_interval(sv_chrm,sv_start,sv_start), samplot.genome_interval(sv_chrm,sv_end,sv_end)] r_plot_height, r_plot_width, r_window, r_ranges = \ samplot.set_plot_dimensions(sv, sv_type, plot_height, plot_width, bams, None, annotation_files, transcript_file, window, zoom) self.assertEqual( len(r_ranges), 2) self.assertEqual( r_ranges[0], samplot.genome_interval(sv_chrm, sv_start-1000, sv_start+1000) ) self.assertEqual( r_ranges[1], samplot.genome_interval(sv_chrm, sv_end-1000, sv_end+1000) ) #}}} #{{{def test_get_read_data(self): def test_get_read_data(self): ''' read_data,max_coverage = get_read_data(ranges, options.bams, options.reference, options.separate_mqual, options.include_mqual, options.coverage_only, options.long_read, options.same_yaxis_scales, options.max_depth, options.z, options.ignore_hp) ''' plot_height = None plot_width = None annotation_files = None transcript_file = None zoom = 500000 window = None sv = [samplot.genome_interval(sv_chrm,sv_start,sv_end)] # Test basic function where window is set to be proportional to SV size r_plot_height, r_plot_width, r_window, r_ranges = \ samplot.set_plot_dimensions(sv, sv_type, plot_height, plot_width, bams, 
None, annotation_files, transcript_file, window, zoom) reference = None separate_mqual = 0 include_mqual = 1 coverage_only = None long_read = 1000 long_event_size = 100 same_yaxis_scales = None max_depth = 100 z = 4 ignore_hp = False read_data,max_coverage = samplot.get_read_data(r_ranges, bams, reference, separate_mqual, include_mqual, coverage_only, long_read, long_event_size, same_yaxis_scales, max_depth, z, ignore_hp) #}}} #}}} #{{{ class Test_genome_interval(unittest.TestCase): class Test_genome_interval(unittest.TestCase): #{{{ def test_init(self): def test_init(self): gi = samplot.genome_interval('chr1', 1, 1000) self.assertEqual(gi.chrm, 'chr1') self.assertEqual(gi.start, 1) self.assertEqual(gi.end, 1000) #}}} #{{{ def test_init(self): def test_intersect(self): gi = samplot.genome_interval('chr8', 500, 1000) self.assertEqual(-1, gi.intersect(samplot.genome_interval('chr7', 500, 1000))) self.assertEqual(1, gi.intersect(samplot.genome_interval('chr9', 500, 1000))) self.assertEqual(-1, gi.intersect(samplot.genome_interval('chr8', 100, 499))) self.assertEqual(1, gi.intersect(samplot.genome_interval('chr8', 1001, 2000))) self.assertEqual(0, gi.intersect(samplot.genome_interval('chr8', 1, 500))) self.assertEqual(0, gi.intersect(samplot.genome_interval('chr8', 500, 501))) self.assertEqual(0, gi.intersect(samplot.genome_interval('chr8', 1000, 2000))) #}}} #{{{ def test_get_range_hit(self): def test_get_range_hit(self): gi_0 = samplot.genome_interval('chr8', 500, 1000) ranges = [gi_0] self.assertEqual(0, samplot.get_range_hit(ranges, 'chr8', 500)) gi_1 = samplot.genome_interval('chr8', 2000, 3000) ranges = [gi_0, gi_1] self.assertEqual(0, samplot.get_range_hit(ranges, 'chr8', 500)) self.assertEqual(1, samplot.get_range_hit(ranges, 'chr8', 2500)) self.assertEqual(None, samplot.get_range_hit(ranges, 'chr7', 2500)) self.assertEqual(None, samplot.get_range_hit(ranges, 'chr8', 100)) self.assertEqual(None, samplot.get_range_hit(ranges, 'chr8', 10000)) #}}} #{{{ def 
test_map_genome_point_to_range_points(self): def test_map_genome_point_to_range_points(self): gi_0 = samplot.genome_interval('chr8', 100, 200) ranges = [gi_0] self.assertEqual(None, samplot.map_genome_point_to_range_points(ranges, 'chr8', 10)) self.assertEqual(0.0, samplot.map_genome_point_to_range_points(ranges, 'chr8', 100)) self.assertEqual(0.25, samplot.map_genome_point_to_range_points(ranges, 'chr8', 125)) self.assertEqual(0.5, samplot.map_genome_point_to_range_points(ranges, 'chr8', 150)) self.assertEqual(0.75, samplot.map_genome_point_to_range_points(ranges, 'chr8', 175)) self.assertEqual(1.0, samplot.map_genome_point_to_range_points(ranges, 'chr8', 200)) self.assertEqual(None, samplot.map_genome_point_to_range_points(ranges, 'chr8', 201)) gi_1 = samplot.genome_interval('chr8', 300, 400) ranges = [gi_0, gi_1] self.assertEqual(None, samplot.map_genome_point_to_range_points(ranges, 'chr8', 10)) self.assertEqual(0.0, samplot.map_genome_point_to_range_points(ranges, 'chr8', 100)) self.assertEqual(0.25/2, samplot.map_genome_point_to_range_points(ranges, 'chr8', 125)) self.assertEqual(0.5/2, samplot.map_genome_point_to_range_points(ranges, 'chr8', 150)) self.assertEqual(0.75/2, samplot.map_genome_point_to_range_points(ranges, 'chr8', 175)) self.assertEqual(1.0/2, samplot.map_genome_point_to_range_points(ranges, 'chr8', 200)) self.assertEqual(None, samplot.map_genome_point_to_range_points(ranges, 'chr8', 201)) self.assertEqual(0.5, samplot.map_genome_point_to_range_points(ranges, 'chr8', 300)) self.assertEqual(0.5+0.25/2, samplot.map_genome_point_to_range_points(ranges, 'chr8', 325)) self.assertEqual(0.5+0.5/2, samplot.map_genome_point_to_range_points(ranges, 'chr8', 350)) self.assertEqual(0.5+0.75/2, samplot.map_genome_point_to_range_points(ranges, 'chr8', 375)) self.assertEqual(1.0, samplot.map_genome_point_to_range_points(ranges, 'chr8', 400)) gi_0 = samplot.genome_interval('chr8', 100, 200) gi_1 = samplot.genome_interval('chr9', 300, 400) ranges = [gi_0, gi_1] 
self.assertEqual(None, samplot.map_genome_point_to_range_points(ranges, 'chr8', 10)) self.assertEqual(0.0, samplot.map_genome_point_to_range_points(ranges, 'chr8', 100)) self.assertEqual(0.25/2, samplot.map_genome_point_to_range_points(ranges, 'chr8', 125)) self.assertEqual(0.5/2, samplot.map_genome_point_to_range_points(ranges, 'chr8', 150)) self.assertEqual(0.75/2, samplot.map_genome_point_to_range_points(ranges, 'chr8', 175)) self.assertEqual(1.0/2, samplot.map_genome_point_to_range_points(ranges, 'chr8', 200)) self.assertEqual(None, samplot.map_genome_point_to_range_points(ranges, 'chr8', 201)) self.assertEqual(0.5, samplot.map_genome_point_to_range_points(ranges, 'chr9', 300)) self.assertEqual(0.5+0.25/2, samplot.map_genome_point_to_range_points(ranges, 'chr9', 325)) self.assertEqual(0.5+0.5/2, samplot.map_genome_point_to_range_points(ranges, 'chr9', 350)) self.assertEqual(0.5+0.75/2, samplot.map_genome_point_to_range_points(ranges, 'chr9', 375)) self.assertEqual(1.0, samplot.map_genome_point_to_range_points(ranges, 'chr9', 400)) #}}} #}}} #{{{ class Test_long_read_plan(unittest.TestCase): class Test_long_read_plan(unittest.TestCase): #{{{ def test_init(self): def test_add_align_step(self): alignment = samplot.Alignment('chr8', 100, 500, True, 0) # both are in the same range gi_0 = samplot.genome_interval('chr8', 100, 1000) ranges = [gi_0] steps = [] samplot.add_align_step(alignment, steps, ranges) self.assertEqual(1, len(steps)) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(100, steps[0].start_pos.start) self.assertEqual(100, steps[0].start_pos.end) self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(500, steps[0].end_pos.start) self.assertEqual(500, steps[0].end_pos.end) self.assertEqual('Align', steps[0].info['TYPE']) # in different ranges gi_0 = samplot.genome_interval('chr8', 100, 200) gi_1 = samplot.genome_interval('chr8', 300, 1000) ranges = [gi_0, gi_1] steps = [] samplot.add_align_step(alignment, steps, ranges) 
self.assertEqual(2, len(steps)) #start self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(100, steps[0].start_pos.start) self.assertEqual(100, steps[0].start_pos.end) #end self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(200, steps[0].end_pos.start) self.assertEqual(200, steps[0].end_pos.end) #event self.assertEqual('Align', steps[0].info['TYPE']) #start self.assertEqual('chr8', steps[1].start_pos.chrm) self.assertEqual(300, steps[1].start_pos.start) self.assertEqual(300, steps[1].start_pos.end) #end self.assertEqual('chr8', steps[1].end_pos.chrm) self.assertEqual(500, steps[1].end_pos.start) self.assertEqual(500, steps[1].end_pos.end) #event self.assertEqual('Align', steps[1].info['TYPE']) # start is not in range, use end hit gi_0 = samplot.genome_interval('chr8', 10, 20) gi_1 = samplot.genome_interval('chr8', 300, 1000) ranges = [gi_0, gi_1] steps = [] samplot.add_align_step(alignment, steps, ranges) self.assertEqual(1, len(steps)) #start self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(300, steps[0].start_pos.start) self.assertEqual(300, steps[0].start_pos.end) #end self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(500, steps[0].end_pos.start) self.assertEqual(500, steps[0].end_pos.end) #event self.assertEqual('Align', steps[0].info['TYPE']) # end is not in range, use start hit gi_0 = samplot.genome_interval('chr8', 100, 200) gi_1 = samplot.genome_interval('chr8', 3000, 4000) ranges = [gi_0, gi_1] steps = [] samplot.add_align_step(alignment, steps, ranges) #start self.assertEqual(1, len(steps)) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(100, steps[0].start_pos.start) self.assertEqual(100, steps[0].start_pos.end) #end self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(200, steps[0].end_pos.end) self.assertEqual(200, steps[0].end_pos.start) #event self.assertEqual('Align', steps[0].info['TYPE']) # neither end is in range, add nothing gi_0 = 
samplot.genome_interval('chr8', 10, 20) gi_1 = samplot.genome_interval('chr8', 3000, 4000) ranges = [gi_0, gi_1] steps = [] samplot.add_align_step(alignment, steps, ranges) self.assertEqual(0, len(steps)) #}}} #{{{def test_get_alignments_from_cigar(self): def test_get_alignments_from_cigar(self): ''' alignments = get_alignments_from_cigar( bam_file.get_reference_name(read.reference_id), read.pos, not read.is_reverse, read.cigartuples) ''' CIGAR_MAP = { 'M' : 0, 'I' : 1, 'D' : 2, 'N' : 3, 'S' : 4, 'H' : 5, 'P' : 6, '=' : 7, 'X' : 8, 'B' : 9 } cigar = [(CIGAR_MAP['M'], 100), (CIGAR_MAP['D'], 100), (CIGAR_MAP['M'], 100)] alignments = samplot.get_alignments_from_cigar('chr8', 100, True, cigar) self.assertEqual(2,len(alignments)) self.assertEqual('chr8', alignments[0].pos.chrm) self.assertEqual(100, alignments[0].pos.start) self.assertEqual(200, alignments[0].pos.end) self.assertEqual(True, alignments[0].strand) self.assertEqual(0, alignments[0].query_position) self.assertEqual('chr8', alignments[1].pos.chrm) self.assertEqual(300, alignments[1].pos.start) self.assertEqual(400, alignments[1].pos.end) self.assertEqual(True, alignments[1].strand) self.assertEqual(100, alignments[1].query_position) #}}} #{{{def test_get_long_read_plan(self): def test_get_long_read_plan(self): gi_0 = samplot.genome_interval('chr8', 100, 250) gi_1 = samplot.genome_interval('chr8', 300, 400) ranges = [gi_0, gi_1] long_reads = {} read_name = 'Test' alignments = [samplot.Alignment('chr8', 100, 200, True, 0)] long_reads[read_name] = [ samplot.LongRead(alignments) ] max_gap, steps = samplot.get_long_read_plan(read_name, long_reads, ranges) self.assertEqual(0, max_gap) self.assertEqual(1, len(steps)) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(100, steps[0].start_pos.start) self.assertEqual(100, steps[0].start_pos.end) self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(200, steps[0].end_pos.start) self.assertEqual(200, steps[0].end_pos.end) 
self.assertEqual('LONGREAD', steps[0].event) self.assertEqual('Align', steps[0].info['TYPE']) alignments = [samplot.Alignment('chr8', 100, 299, True, 0)] long_reads[read_name] = [ samplot.LongRead(alignments) ] max_gap, steps = samplot.get_long_read_plan(read_name, long_reads, ranges) self.assertEqual(0, max_gap) self.assertEqual(1, len(steps)) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(100, steps[0].start_pos.start) self.assertEqual(100, steps[0].start_pos.end) self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(250, steps[0].end_pos.start) self.assertEqual(250, steps[0].end_pos.end) self.assertEqual('Align', steps[0].info['TYPE']) alignments = [samplot.Alignment('chr8', 100, 350, True, 0)] long_reads[read_name] = [ samplot.LongRead(alignments) ] max_gap, steps = samplot.get_long_read_plan(read_name, long_reads, ranges) self.assertEqual(0, max_gap) self.assertEqual(2, len(steps)) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(100, steps[0].start_pos.start) self.assertEqual(100, steps[0].start_pos.end) self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(250, steps[0].end_pos.start) self.assertEqual(250, steps[0].end_pos.end) self.assertEqual('Align', steps[0].info['TYPE']) self.assertEqual('chr8', steps[1].start_pos.chrm) self.assertEqual(300, steps[1].start_pos.start) self.assertEqual(300, steps[1].start_pos.end) self.assertEqual('chr8', steps[1].end_pos.chrm) self.assertEqual(350, steps[1].end_pos.start) self.assertEqual(350, steps[1].end_pos.end) self.assertEqual('Align', steps[1].info['TYPE']) alignments = [samplot.Alignment('chr8', 100, 250, True, 0), samplot.Alignment('chr8', 300, 350, True, 150)] long_reads[read_name] = [ samplot.LongRead(alignments) ] max_gap, steps = samplot.get_long_read_plan(read_name, long_reads, ranges) self.assertEqual(50, max_gap) self.assertEqual(3, len(steps)) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(100, steps[0].start_pos.start) 
self.assertEqual(100, steps[0].start_pos.end) self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(250, steps[0].end_pos.start) self.assertEqual(250, steps[0].end_pos.end) self.assertEqual('Align', steps[0].info['TYPE']) self.assertEqual('chr8', steps[1].start_pos.chrm) self.assertEqual(250, steps[1].start_pos.start) self.assertEqual(250, steps[1].start_pos.end) self.assertEqual('chr8', steps[1].end_pos.chrm) self.assertEqual(300, steps[1].end_pos.start) self.assertEqual(300, steps[1].end_pos.end) self.assertEqual('Deletion', steps[1].info['TYPE']) self.assertEqual('chr8', steps[2].start_pos.chrm) self.assertEqual(300, steps[2].start_pos.start) self.assertEqual(300, steps[2].start_pos.end) self.assertEqual('chr8', steps[2].end_pos.chrm) self.assertEqual(350, steps[2].end_pos.start) self.assertEqual(350, steps[2].end_pos.end) self.assertEqual('Align', steps[2].info['TYPE']) gi_0 = samplot.genome_interval('chr8', 100, 250) gi_1 = samplot.genome_interval('chr9', 300, 400) ranges = [gi_0, gi_1] alignments = [samplot.Alignment('chr8', 100, 250, True, 0), samplot.Alignment('chr9', 300, 350, True, 150)] long_reads[read_name] = [ samplot.LongRead(alignments) ] max_gap, steps = samplot.get_long_read_plan(read_name, long_reads, ranges) self.assertEqual(5000, max_gap) self.assertEqual(3, len(steps)) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(100, steps[0].start_pos.start) self.assertEqual(100, steps[0].start_pos.end) self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(250, steps[0].end_pos.start) self.assertEqual(250, steps[0].end_pos.end) self.assertEqual('Align', steps[0].info['TYPE']) self.assertEqual('chr8', steps[1].start_pos.chrm) self.assertEqual(250, steps[1].start_pos.start) self.assertEqual(250, steps[1].start_pos.end) self.assertEqual('chr9', steps[1].end_pos.chrm) self.assertEqual(300, steps[1].end_pos.start) self.assertEqual(300, steps[1].end_pos.end) self.assertEqual('InterChrm', steps[1].info['TYPE']) 
self.assertEqual('chr9', steps[2].start_pos.chrm) self.assertEqual(300, steps[2].start_pos.start) self.assertEqual(300, steps[2].start_pos.end) self.assertEqual('chr9', steps[2].end_pos.chrm) self.assertEqual(350, steps[2].end_pos.start) self.assertEqual(350, steps[2].end_pos.end) self.assertEqual('Align', steps[2].info['TYPE']) #}}} #}}} #{{{class Test_annotation_plan(unittest.TestCase): class Test_annotation_plan(unittest.TestCase): #{{{def test_get_alignments_from_cigar(self): def test_get_alignments_from_cigar(self): gi_1 = samplot.genome_interval('chr8', 100, 200) gi_2 = samplot.genome_interval('chr8', 300, 400) ranges = [gi_1, gi_2] i = samplot.genome_interval('chr8', 110, 120) s, e = samplot.get_interval_range_plan_start_end(ranges, i) self.assertEqual('chr8',s.chrm) self.assertEqual(110,s.start) self.assertEqual(110,s.end) self.assertEqual('chr8',e.chrm) self.assertEqual(120,e.start) self.assertEqual(120,e.end) i = samplot.genome_interval('chr8', 110, 220) s, e = samplot.get_interval_range_plan_start_end(ranges, i) self.assertEqual('chr8',s.chrm) self.assertEqual(110,s.start) self.assertEqual(110,s.end) self.assertEqual('chr8',e.chrm) self.assertEqual(200,e.start) self.assertEqual(200,e.end) i = samplot.genome_interval('chr8', 220, 320) s, e = samplot.get_interval_range_plan_start_end(ranges, i) self.assertEqual('chr8',s.chrm) self.assertEqual(300,s.start) self.assertEqual(300,s.end) self.assertEqual('chr8',e.chrm) self.assertEqual(320,e.start) self.assertEqual(320,e.end) i = samplot.genome_interval('chr8', 120, 320) s, e = samplot.get_interval_range_plan_start_end(ranges, i) self.assertEqual('chr8',s.chrm) self.assertEqual(120,s.start) self.assertEqual(120,s.end) self.assertEqual('chr8',e.chrm) self.assertEqual(320,e.start) self.assertEqual(320,e.end) i = samplot.genome_interval('chr8', 320, 520) s, e = samplot.get_interval_range_plan_start_end(ranges, i) self.assertEqual('chr8',s.chrm) self.assertEqual(320,s.start) self.assertEqual(320,s.end) 
self.assertEqual('chr8',e.chrm) self.assertEqual(400,e.start) self.assertEqual(400,e.end) i = samplot.genome_interval('chr8', 30, 50) s, e = samplot.get_interval_range_plan_start_end(ranges, i) self.assertEqual(None, s) self.assertEqual(None, e) i = samplot.genome_interval('chr8', 3000, 5000) s, e = samplot.get_interval_range_plan_start_end(ranges, i) self.assertEqual(None, s) self.assertEqual(None, e) #}}} #}}} #{{{class Test_splits(unittest.TestCase): class Test_splits(unittest.TestCase): #{{{def test_get_split_plan(self): def test_get_split_plan(self): splits = {} hp = 0 splits[hp] = {} read_name_1 = 'Test1' ranges = [samplot.genome_interval('chr8', 100, 200), samplot.genome_interval('chr8', 600, 800) ] #both in same ragne #Deletion splits[hp][read_name_1] = [\ samplot.SplitRead('chr8', 100, 150, True, 0, False, False), samplot.SplitRead('chr8', 170, 180, True, 50, False, False)] plan = samplot.get_split_plan(ranges, splits[hp][read_name_1]) max_gap, steps = plan self.assertEqual(20, max_gap) self.assertEqual(1, len(steps)) self.assertEqual('SPLITREAD', steps[0].event) self.assertEqual('Deletion', steps[0].info['TYPE']) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(150, steps[0].start_pos.start) self.assertEqual(150, steps[0].start_pos.end) self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(170, steps[0].end_pos.start) self.assertEqual(170, steps[0].end_pos.end) #Duplication splits[hp][read_name_1] = [\ samplot.SplitRead('chr8', 100, 150, True, 0, False, False), samplot.SplitRead('chr8', 130, 180, True, 50, False, False)] plan = samplot.get_split_plan(ranges, splits[hp][read_name_1]) max_gap, steps = plan self.assertEqual(20, max_gap) self.assertEqual(1, len(steps)) self.assertEqual('SPLITREAD', steps[0].event) self.assertEqual('Duplication', steps[0].info['TYPE']) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(150, steps[0].start_pos.start) self.assertEqual(150, steps[0].start_pos.end) 
self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(130, steps[0].end_pos.start) self.assertEqual(130, steps[0].end_pos.end) #Inversion splits[hp][read_name_1] = [\ samplot.SplitRead('chr8', 100, 150, True, 0, False, False), samplot.SplitRead('chr8', 151, 180, False, 50, False, False)] plan = samplot.get_split_plan(ranges, splits[hp][read_name_1]) max_gap, steps = plan self.assertEqual(30, max_gap) self.assertEqual(1, len(steps)) self.assertEqual('SPLITREAD', steps[0].event) self.assertEqual('Inversion', steps[0].info['TYPE']) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(150, steps[0].start_pos.start) self.assertEqual(150, steps[0].start_pos.end) self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(151, steps[0].end_pos.start) self.assertEqual(151, steps[0].end_pos.end) #both in same ragne splits[hp][read_name_1] = [\ samplot.SplitRead('chr8', 100, 150, True, 0, False, False)] plan = samplot.get_split_plan(ranges, splits[hp][read_name_1]) self.assertEqual(None, plan) #both in same ragne splits[hp][read_name_1] = [\ samplot.SplitRead('chr8', 550, 650, True, 0, False, False), samplot.SplitRead('chr8', 700, 750, True, 50, False, False)] plan = samplot.get_split_plan(ranges, splits[hp][read_name_1]) max_gap, steps = plan self.assertEqual(50, max_gap) self.assertEqual(1, len(steps)) self.assertEqual('SPLITREAD', steps[0].event) self.assertEqual('Deletion', steps[0].info['TYPE']) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(650, steps[0].start_pos.start) self.assertEqual(650, steps[0].start_pos.end) self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(700, steps[0].end_pos.start) self.assertEqual(700, steps[0].end_pos.end) #both in same ragne splits[hp][read_name_1] = [\ samplot.SplitRead('chr8', 150, 175, True, 0, False, False), samplot.SplitRead('chr8', 650, 675, True, 50, False, False)] plan = samplot.get_split_plan(ranges, splits[hp][read_name_1]) max_gap, steps = plan 
self.assertEqual(475, max_gap) self.assertEqual(1, len(steps)) self.assertEqual('SPLITREAD', steps[0].event) self.assertEqual('Deletion', steps[0].info['TYPE']) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(175, steps[0].start_pos.start) self.assertEqual(175, steps[0].start_pos.end) self.assertEqual('chr8', steps[0].end_pos.chrm) self.assertEqual(650, steps[0].end_pos.start) self.assertEqual(650, steps[0].end_pos.end) #inter chrom ranges = [samplot.genome_interval('chr8', 100, 200), samplot.genome_interval('chr9', 600, 800) ] splits[hp][read_name_1] = [\ samplot.SplitRead('chr8', 150, 175, True, 0, False, False), samplot.SplitRead('chr9', 650, 675, True, 50, False, False)] plan = samplot.get_split_plan(ranges, splits[hp][read_name_1]) max_gap, steps = plan self.assertEqual(samplot.INTERCHROM_YAXIS, max_gap) self.assertEqual(1, len(steps)) self.assertEqual('SPLITREAD', steps[0].event) self.assertEqual('InterChrm', steps[0].info['TYPE']) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(175, steps[0].start_pos.start) self.assertEqual(175, steps[0].start_pos.end) self.assertEqual('chr9', steps[0].end_pos.chrm) self.assertEqual(650, steps[0].end_pos.start) self.assertEqual(650, steps[0].end_pos.end) splits[hp][read_name_1] = [\ samplot.SplitRead('chr8', 150, 175, True, 0, False, False), samplot.SplitRead('chr9', 650, 675, False, 50, False, False)] plan = samplot.get_split_plan(ranges, splits[hp][read_name_1]) max_gap, steps = plan self.assertEqual(samplot.INTERCHROM_YAXIS, max_gap) self.assertEqual(1, len(steps)) self.assertEqual('SPLITREAD', steps[0].event) self.assertEqual('InterChrmInversion', steps[0].info['TYPE']) self.assertEqual('chr8', steps[0].start_pos.chrm) self.assertEqual(175, steps[0].start_pos.start) self.assertEqual(175, steps[0].start_pos.end) self.assertEqual('chr9', steps[0].end_pos.chrm) self.assertEqual(650, steps[0].end_pos.start) self.assertEqual(650, steps[0].end_pos.end) #}}} #{{{def 
test_get_splits_plan(self): def test_get_splits_plan(self): splits = {} hp = 0 splits[hp] = {} ranges = [samplot.genome_interval('chr8', 100, 200), samplot.genome_interval('chr9', 600, 800) ] #Deletion splits[hp]['del'] = [\ samplot.SplitRead('chr8', 100, 150, True, 0, False, False), samplot.SplitRead('chr8', 170, 180, True, 50, False, False)] #Duplication splits[hp]['dup'] = [\ samplot.SplitRead('chr8', 100, 150, True, 0, False, False), samplot.SplitRead('chr8', 130, 180, True, 50, False, False)] #Inversion splits[hp]['inv'] = [\ samplot.SplitRead('chr8', 100, 150, True, 0, False, False), samplot.SplitRead('chr8', 151, 180, False, 50, False, False)] #Bad split splits[hp]['bad'] = [\ samplot.SplitRead('chr8', 100, 150, True, 0, False, False)] #Interchm splits[hp]['interchm'] = [\ samplot.SplitRead('chr8', 150, 175, True, 0, False, False), samplot.SplitRead('chr9', 650, 675, True, 50, False, False)] #InterchmInv splits[hp]['interchminv'] = [\ samplot.SplitRead('chr8', 150, 175, True, 0, False, False), samplot.SplitRead('chr9', 650, 675, False, 50, False, False)] plan = samplot.get_splits_plan(ranges, splits[hp]) max_gap, steps = plan self.assertEqual(samplot.INTERCHROM_YAXIS, max_gap) self.assertEqual(5, len(steps)) #}}} #}}} #{{{ class Test_pairs(unittest.TestCase): class Test_pairs(unittest.TestCase): #{{{ def test_get_pair_insert_size(self): def test_get_pair_insert_size(self): ranges = [samplot.genome_interval('chr8', 100, 200), samplot.genome_interval('chr8', 600, 800) ] pairs = {} hp = 0 pairs[hp] = {} read_name_1 = 'Test1' #both in same ragne pairs[hp][read_name_1] = [\ samplot.PairedEnd('chr8', 100, 150, True, False, False), samplot.PairedEnd('chr8', 170, 180, False, False, False)] read_name_2 = 'Test2' pairs[hp][read_name_2] = [\ samplot.PairedEnd('chr8', 100, 150, True, False, False), samplot.PairedEnd('chr8', 170, 180, False, False, False)] pair_insert_sizes = samplot.get_pairs_insert_sizes(ranges, pairs) self.assertEqual(2, len(pair_insert_sizes)) 
self.assertEqual(80, pair_insert_sizes[0]) self.assertEqual(80, pair_insert_sizes[1]) #one starting in range ends out of range pairs[hp][read_name_1] = [\ samplot.PairedEnd('chr8', 100, 150, True, False, False), samplot.PairedEnd('chr8', 190, 240, False, False, False)] pair_insert_sizes = samplot.get_pairs_insert_sizes(ranges, pairs) self.assertEqual(2, len(pair_insert_sizes)) self.assertEqual(140, pair_insert_sizes[0]) self.assertEqual(80, pair_insert_sizes[1]) #one out of range pairs[hp][read_name_1] = [\ samplot.PairedEnd('chr9', 100, 150, True, False, False), samplot.PairedEnd('chr8', 190, 240, False, False, False)] pair_insert_sizes = samplot.get_pairs_insert_sizes(ranges, pairs) self.assertEqual(1, len(pair_insert_sizes)) self.assertEqual(80, pair_insert_sizes[0]) #DUP pairs[hp][read_name_1] = [\ samplot.PairedEnd('chr8', 125, 150, True, False, False), samplot.PairedEnd('chr8', 175, 200, False, False, False)] pair_insert_sizes = samplot.get_pairs_insert_sizes(ranges, pairs) self.assertEqual(2, len(pair_insert_sizes)) self.assertEqual(75, pair_insert_sizes[0]) self.assertEqual(80, pair_insert_sizes[1]) #INV pairs[hp][read_name_1] = [\ samplot.PairedEnd('chr8', 125, 150, True, False, False), samplot.PairedEnd('chr8', 175, 200, True, False, False)] pair_insert_sizes = samplot.get_pairs_insert_sizes(ranges, pairs) self.assertEqual(2, len(pair_insert_sizes)) self.assertEqual(75, pair_insert_sizes[0]) self.assertEqual(80, pair_insert_sizes[1]) #interchrm ranges = [samplot.genome_interval('chr8', 100, 200), samplot.genome_interval('chr9', 600, 800) ] pairs[hp][read_name_1] = [\ samplot.PairedEnd('chr8', 125, 150, True, False, False), samplot.PairedEnd('chr9', 675, 700, True, False, False)] pair_insert_sizes = samplot.get_pairs_insert_sizes(ranges, pairs) self.assertEqual(2, len(pair_insert_sizes)) self.assertEqual(samplot.INTERCHROM_YAXIS, pair_insert_sizes[0]) self.assertEqual(80, pair_insert_sizes[1]) #}}} #{{{ def test_get_pair_plan(self): def 
test_get_pair_plan(self): ranges = [samplot.genome_interval('chr8', 100, 200), samplot.genome_interval('chr8', 600, 800) ] pairs = {} hp = 0 pairs[hp] = {} read_name_1 = 'Test1' #both in same ragne pairs[hp][read_name_1] = [\ samplot.PairedEnd('chr8', 100, 150, False, False, False), samplot.PairedEnd('chr8', 170, 180, True, False, False)] read_name_2 = 'Test2' pairs[hp][read_name_2] = [\ samplot.PairedEnd('chr8', 100, 150, False, False, False), samplot.PairedEnd('chr8', 170, 180, True, False, False)] max_event, steps = samplot.get_pairs_plan(ranges, pairs[hp]) self.assertEqual(80, max_event) self.assertEqual(2, len(steps)) #}}} #}}} #{{{ class Test_linked(unittest.TestCase): class Test_linked(unittest.TestCase): #{{{def test_get_split_insert_size(self): def test_get_linked_plan(self): ranges = [samplot.genome_interval('chr8', 100, 200), samplot.genome_interval('chr8', 600, 800) ] pairs = {} hp = 0 pairs[hp] = {} pairs[hp]['PE_1'] = [\ samplot.PairedEnd('chr8', 100, 150, False, False, False), samplot.PairedEnd('chr8', 170, 180, True, False, False)] pairs[hp]['PE_2'] = [\ samplot.PairedEnd('chr8', 110, 160, False, False, False), samplot.PairedEnd('chr8', 680, 690, True, False, False)] splits = {} splits[hp] = {} splits[hp]['SR_1'] = [\ samplot.SplitRead('chr8', 155, 160, True, 0, False, False), samplot.SplitRead('chr8', 670, 675, True, 50, False, False)] linked_reads = {} linked_reads[hp] = {} MI = 5 linked_reads[hp][MI] = [[],[]] linked_reads[hp][MI][0].append('PE_1') linked_reads[hp][MI][0].append('PE_2') linked_reads[hp][MI][1].append('SR_1') max_event, steps = samplot.get_linked_plan(ranges, pairs[hp], splits[hp], linked_reads[hp], MI) self.assertEqual(580, max_event) self.assertEqual(2, len(steps)) self.assertEqual(2, len(steps[0].info['PAIR_STEPS'])) self.assertEqual(1, len(steps[0].info['SPLIT_STEPS'])) self.assertEqual(100, steps[0].start_pos.start) self.assertEqual(100, steps[0].start_pos.end) self.assertEqual(ranges[0].end, steps[0].end_pos.start) 
self.assertEqual(ranges[0].end, steps[0].end_pos.end) self.assertEqual(ranges[1].start, steps[1].start_pos.start) self.assertEqual(ranges[1].start, steps[1].start_pos.end) self.assertEqual(690, steps[1].end_pos.start) self.assertEqual(690, steps[1].end_pos.end) self.assertEqual(100,steps[0].info['PAIR_STEPS'][0].start_pos.start) self.assertEqual(100,steps[0].info['PAIR_STEPS'][0].start_pos.end) self.assertEqual(180,steps[0].info['PAIR_STEPS'][0].end_pos.start) self.assertEqual(180,steps[0].info['PAIR_STEPS'][0].end_pos.end) self.assertEqual(110,steps[0].info['PAIR_STEPS'][1].start_pos.start) self.assertEqual(110,steps[0].info['PAIR_STEPS'][1].start_pos.end) self.assertEqual(690,steps[0].info['PAIR_STEPS'][1].end_pos.start) self.assertEqual(690,steps[0].info['PAIR_STEPS'][1].end_pos.end) self.assertEqual(160,steps[0].info['SPLIT_STEPS'][0].start_pos.start) self.assertEqual(160,steps[0].info['SPLIT_STEPS'][0].start_pos.end) self.assertEqual(670,steps[0].info['SPLIT_STEPS'][0].end_pos.start) self.assertEqual(670,steps[0].info['SPLIT_STEPS'][0].end_pos.end) #}}} #}}} if __name__ == '__main__': unittest.main()