Repository: PacktPublishing/Bioinformatics-with-Python-Cookbook-third-edition Branch: main Commit: 9b10894b1a19 Files: 93 Total size: 188.9 KB Directory structure: gitextract_jf5fqbmn/ ├── .gitignore ├── Chapter01/ │ ├── Interfacing_R.py │ ├── R_magic.py │ ├── base_setup.sh │ └── bioinformatics_base.txt ├── Chapter02/ │ ├── .gitignore │ ├── Arrow.py │ ├── Matplotlib.py │ ├── NumPy.py │ ├── Pandas_Basic.py │ ├── Pandas_Join.py │ └── Pandas_Memory.py ├── Chapter03/ │ ├── Accessing_Databases.py │ ├── Basic_Sequence_Processing.py │ ├── Filtering_SNPs.py │ ├── LCT.bed │ ├── Processing_BED_with_HTSeq.py │ ├── Working_with_BAM.py │ ├── Working_with_FASTQ.py │ └── Working_with_VCF.py ├── Chapter04/ │ ├── 2L.py │ ├── Exploration.py │ ├── Mendel.py │ ├── Preparation.py │ ├── QIIME2_Metagenomics.py │ └── samples.tsv ├── Chapter05/ │ ├── .gitignore │ ├── Annotations.py │ ├── Gene_Ontology.py │ ├── Getting_Gene.py │ ├── Low_Quality.py │ ├── Orthology.py │ └── Reference_Genome.py ├── Chapter06/ │ ├── .gitignore │ ├── Admixture.py │ ├── Data_Formats.py │ ├── Exploratory_Analysis.py │ ├── PCA.py │ ├── Pop_Stats.py │ └── Sgkit.py ├── Chapter07/ │ ├── .gitignore │ ├── Alignment.py │ ├── Comparison.py │ ├── Exploration.py │ ├── Reconstruction.py │ ├── Selection.py │ ├── Trees.py │ └── Visualization.py ├── Chapter08/ │ ├── .gitignore │ ├── Distance.py │ ├── Intro.py │ ├── Mass.py │ ├── PDB.py │ ├── Parser.py │ ├── PyMol_Intro.py │ ├── PyMol_Movie.py │ ├── Stats.py │ └── mmCIF.py ├── Chapter09/ │ ├── galaxy/ │ │ ├── .gitignore │ │ ├── LCT.bed │ │ ├── api.py │ │ ├── encrypt.py │ │ └── galaxy.yaml │ ├── nextflow/ │ │ ├── .gitignore │ │ └── pipeline.nf │ └── snakemake/ │ ├── .gitignore │ ├── Snakefile │ └── plot_pca.py ├── Chapter10/ │ ├── Clustering.py │ ├── Decision_Tree.py │ ├── PCA.py │ └── Random_Forest.py ├── Chapter11/ │ ├── .gitignore │ ├── Dask_Distributed.py │ ├── Dask_Intro.py │ ├── MP_intro.py │ └── Zarr_Intro.py ├── Chapter12/ │ ├── Builtin.py │ ├── Lazy.py │ ├── Mutability.py │ ├── Persistence1.py │ ├── Persistence2.py │ ├── Pure.py │ ├── Recursion.py │ ├── Tools.py │ ├── my_genes.csv │ └── my_genes.csv.base ├── Datasets.py ├── LICENSE ├── README.md ├── Welcome.ipynb └── docker/ ├── Chapter01/ │ └── Dockerfile └── main/ └── Dockerfile ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .ipynb_checkpoints .Rhistory __pycache__ ================================================ FILE: Chapter01/Interfacing_R.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: percent # format_version: '1.3' # jupytext_version: 1.13.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # %% [markdown] # ## The next cell will get a ~65 MB data file 'sequence.index', you only need to run the cell once # %% # !rm sequence.index 2>/dev/null # !wget -nd http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/20130502.phase3.sequence.index -O sequence.index # %% [markdown] # # Interfacing with R # %% import os from IPython.display import Image import rpy2.robjects as robjects import rpy2.robjects.lib.ggplot2 as ggplot2 from rpy2.robjects.functions import SignatureTranslatedFunction import pandas as pd import rpy2.robjects as ro from rpy2.robjects import pandas2ri from rpy2.robjects.conversion import localconverter # %% read_delim = 
robjects.r('read.delim') seq_data = read_delim('sequence.index', header=True, stringsAsFactors=False) #In R: # seq.data <- read.delim('sequence.index', header=TRUE, stringsAsFactors=FALSE) # %% print('This data frame has %d columns and %d rows' % (seq_data.ncol, seq_data.nrow)) print(seq_data.colnames) #In R: # print(colnames(seq.data)) # print(nrow(seq.data)) # print(ncol(seq.data)) print('Columns in Python %d ' % robjects.r.ncol(seq_data)[0]) #access some functions as_integer = robjects.r('as.integer') match = robjects.r.match my_col = match('READ_COUNT', seq_data.colnames)[0] # Vector returned print('Type of read count before as.integer: %s' % seq_data[my_col - 1].rclass[0]) seq_data[my_col - 1] = as_integer(seq_data[my_col - 1]) print('Type of read count after as.integer: %s' % seq_data[my_col - 1].rclass[0]) my_col = match('BASE_COUNT', seq_data.colnames)[0] # Vector returned seq_data[my_col - 1] = as_integer(seq_data[my_col - 1]) my_col = match('CENTER_NAME', seq_data.colnames)[0] seq_data[my_col - 1] = robjects.r.toupper(seq_data[my_col - 1]) robjects.r.assign('seq.data', seq_data) robjects.r('print(c("Column names in R: ",colnames(seq.data)))') robjects.r('seq.data <- seq.data[seq.data$WITHDRAWN==0, ]') #Lets remove all withdrawn sequences robjects.r("seq.data <- seq.data[, c('STUDY_ID', 'STUDY_NAME', 'CENTER_NAME', 'SAMPLE_ID', 'SAMPLE_NAME', 'POPULATION', 'INSTRUMENT_PLATFORM', 'LIBRARY_LAYOUT', 'PAIRED_FASTQ', 'READ_COUNT', 'BASE_COUNT', 'ANALYSIS_GROUP')]") #Lets shorten the dataframe #Population as factor robjects.r('seq.data$POPULATION <- as.factor(seq.data$POPULATION)') # %% ggplot2.theme = SignatureTranslatedFunction(ggplot2.theme, init_prm_translate = {'axis_text_x': 'axis.text.x'}) bar = ggplot2.ggplot(seq_data) + ggplot2.geom_bar() + ggplot2.aes_string(x='CENTER_NAME') + ggplot2.theme(axis_text_x=ggplot2.element_text(angle=90, hjust=1, size=40), axis_text_y=ggplot2.element_text(size=40), text=ggplot2.element_text(size=40)) robjects.r.png('out.png', width=16, height=9, units="in", res=600) bar.plot() dev_off = robjects.r('dev.off') dev_off() Image(filename='out.png') # %% #Get Yoruba and CEU robjects.r('yri_ceu <- seq.data[seq.data$POPULATION %in% c("YRI", "CEU") & seq.data$BASE_COUNT < 2E9 & seq.data$READ_COUNT < 3E7, ]') yri_ceu = robjects.r('yri_ceu') # %% scatter = ggplot2.ggplot(yri_ceu) + ggplot2.aes_string(x='BASE_COUNT', y='READ_COUNT', shape='factor(POPULATION)', col='factor(ANALYSIS_GROUP)') + ggplot2.geom_point() robjects.r.png('out.png', width=16, height=9, units="in", res=600) scatter.plot() dev_off = robjects.r('dev.off') dev_off() Image(filename='out.png') # %% with localconverter(ro.default_converter + pandas2ri.converter): pd_yri_ceu = ro.conversion.rpy2py(yri_ceu) del pd_yri_ceu['PAIRED_FASTQ'] # no_paired = pandas2ri.py2ri(pd_yri_ceu) with localconverter(ro.default_converter + pandas2ri.converter): no_paired = ro.conversion.py2rpy(pd_yri_ceu) robjects.r.assign('no.paired', no_paired) robjects.r("print(colnames(no.paired))") # %% ================================================ FILE: Chapter01/R_magic.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: percent # format_version: '1.3' # jupytext_version: 1.13.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # %% [markdown] # ## The cell below will get the data file, you only need to run it once # %% [markdown] # (you do not need to do this if you have done it in the 
Interfacing_R notebook) # %% # !rm sequence.index 2>/dev/null # !wget -nd http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/20130502.phase3.sequence.index -O sequence.index # %% import rpy2.robjects as robjects import rpy2.robjects.lib.ggplot2 as ggplot2 # %load_ext rpy2.ipython # %% language="R" # seq.data <- read.delim('sequence.index', header=TRUE, stringsAsFactors=FALSE) # seq.data$READ_COUNT <- as.integer(seq.data$READ_COUNT) # seq.data$BASE_COUNT <- as.integer(seq.data$BASE_COUNT) # %% # seq_data = %R seq.data print(type(seq_data)) #pandas dataframe??? # %% my_col = list(seq_data.columns).index("CENTER_NAME") seq_data['CENTER_NAME'] = seq_data['CENTER_NAME'].apply(lambda x: x.upper()) # %% # %R -i seq_data # %R print(colnames(seq_data)) # %% language="R" # seq_data <- seq_data[seq_data$WITHDRAWN==0, ] # seq_data$POPULATION <- as.factor(seq_data$POPULATION) # %% language="R" # bar <- ggplot(seq_data) + aes(factor(CENTER_NAME)) + geom_bar() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # print(bar) # %% language="R" # seq_data$POPULATION <- as.factor(seq_data$POPULATION) # yri_ceu <- seq_data[seq_data$POPULATION %in% c("YRI", "CEU") & seq_data$BASE_COUNT < 2E9 & seq_data$READ_COUNT < 3E7, ] # %% language="R" # scatter <- ggplot(yri_ceu, aes(x=BASE_COUNT, y=READ_COUNT, col=factor(ANALYSIS_GROUP), shape=POPULATION)) + geom_point() # print(scatter) # %% language="R" # library(gridExtra) # library(grid) # g <- grid.arrange(bar, scatter, ncol=1) # g # %% language="R" # png('fig.png') # g # dev.off() ================================================ FILE: Chapter01/base_setup.sh ================================================ conda create -n bioinformatics_base python=3.9.7  conda activate bioinformatics_base conda config --add channels bioconda conda config --add channels conda-forge conda install \ biopython==1.79 \ jupyterlab==3.2.1 \ jupytext==1.13 \ matplotlib==3.4.3 \ numpy==1.21.3 \ pandas==1.3.4 \ scipy==1.7.1 conda list --explicit > bioinformatics_base.txt ================================================ FILE: Chapter01/bioinformatics_base.txt ================================================ # This file may be used to create an environment using: # $ conda create --name --file # platform: linux-64 @EXPLICIT https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2021.10.8-ha878542_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.36.1-hea4e1c9_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-11.2.0-h5c6108e_11.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-11.2.0-he4da1e4_11.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/mysql-common-8.0.27-ha770c72_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pandoc-2.15-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/tzdata-2021e-he74cb21_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-11.2.0-h69a702a_11.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libgomp-11.2.0-h1d223b6_11.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-1_gnu.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-11.2.0-h1d223b6_11.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.3-h516909a_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/expat-2.4.1-h9c3ff4c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/icu-68.2-h9c3ff4c_0.tar.bz2 
https://conda.anaconda.org/conda-forge/linux-64/jbig-2.1-h7f98852_2003.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/jpeg-9d-h36c2ea0_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/lerc-3.0-h9c3ff4c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.8-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h9c3ff4c_4.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.16-h516909a_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.4-h7f98852_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.18-pthreads_h8fe5266_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libopus-1.3.1-h7f98852_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libsodium-1.0.18-h36c2ea0_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.32.1-h7f98852_1000.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.2.1-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.11-h36c2ea0_1013.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.9.3-h9c3ff4c_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.2-h58526e2_4.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/nspr-4.32-h9c3ff4c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/openssl-1.1.1l-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pcre-8.45-h9c3ff4c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-h36c2ea0_1001.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.9-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.3-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.5-h516909a_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h516909a_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/gettext-0.19.8.1-h73d1719_1008.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-12_linux64_openblas.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.10-h9b69904_4.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libvorbis-1.3.7-h9c3ff4c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.13-h7f98852_1003.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/readline-8.1-h46c0cb4_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/zeromq-4.3.4-h9c3ff4c_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.11-h36c2ea0_1013.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-12_linux64_openblas.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libglib-2.70.0-h174f98d_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-12_linux64_openblas.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libllvm11-11.1.0-hf817b99_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.37-h21135ba_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.9.12-h72842e0_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.36.0-h9cd32fc_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.11-h27826a3_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.0-ha95c52a_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/freetype-2.10.4-h0708190_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/glib-tools-2.70.0-h780b84a_1.tar.bz2 
https://conda.anaconda.org/conda-forge/linux-64/gstreamer-1.18.5-h76c114f_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/krb5-1.19.2-hcc1bbae_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libclang-11.1.0-default_ha53f305_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.3.0-h6f004c6_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.0.3-he3ba5ed_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/mysql-libs-8.0.27-hfa10184_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/nss-3.69-hb5efdd6_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/python-3.9.7-hb7a2778_3_cpython.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/async_generator-1.10-py_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/attrs-21.2.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/backcall-0.2.0-pyh9f0ad1d_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/backports-1.0-py_2.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-2.0.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/decorator-5.1.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/defusedxml-0.7.1-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/entrypoints-0.3-pyhd8ed1ab_1003.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.13.1-hba837de_1005.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/glib-2.70.0-h780b84a_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/gst-plugins-base-1.18.5-hf529b03_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/idna-3.1-pyhd3deb0d_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/json5-0.9.5-pyh9f0ad1d_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.12-hddcbb42_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libpq-13.3-hd57d9b9_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/nest-asyncio-1.5.1-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/olefile-0.46-pyh9f0ad1d_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.4.0-hb52868f_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pandocfilters-1.5.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/parso-0.8.2-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pickleshare-0.7.5-py_1003.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/prometheus_client-0.11.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.7.0-pyhd3deb0d_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pycparser-2.20-pyh9f0ad1d_2.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.0.3-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.9-2_cp39.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pytz-2021.3-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/send2trash-1.8.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/testpath-0.5.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/traitlets-5.1.1-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/typing_extensions-3.10.0.2-pyha770c72_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-py_1.tar.bz2 
https://conda.anaconda.org/conda-forge/noarch/wheel-0.37.0-pyhd8ed1ab_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/zipp-3.6.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/babel-2.9.1-pyh44b312d_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/certifi-2021.10.8-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/cffi-1.14.6-py39h4bc2ebd_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/chardet-4.0.0-py39hf3d152e_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/cycler-0.10.0-py_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/dbus-1.13.6-h48d8840_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/debugpy-1.4.1-py39he80948d_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/importlib-metadata-4.8.1-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/jedi-0.18.0-py39hf3d152e_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/jupyter_core-4.9.1-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.3.2-py39h1a9c180_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-1.1.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/markupsafe-2.0.1-py39h3811e60_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.1.3-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/mistune-0.8.4-py39h3811e60_1004.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/numpy-1.21.3-py39hdbf815f_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/packaging-21.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pexpect-4.8.0-pyh9f0ad1d_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pillow-8.3.2-py39ha612740_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyqt5-sip-4.19.18-py39he80948d_7.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyrsistent-0.17.3-py39h3811e60_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pysocks-1.7.1-py39hf3d152e_3.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.2-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0-py39h3811e60_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyzmq-22.3.0-py39h37b5a0c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/setuptools-58.2.0-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/sniffio-1.2.0-py39hf3d152e_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/tornado-6.1-py39h3811e60_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/websocket-client-0.57.0-py39hf3d152e_4.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/anyio-3.3.4-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/argon2-cffi-21.1.0-py39h3811e60_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/backports.functools_lru_cache-1.6.4-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/biopython-1.79-py39h3811e60_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/bleach-4.1.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/brotlipy-0.7.0-py39h3811e60_1001.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/cryptography-35.0.0-py39h95dcef6_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jinja2-3.0.2-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.1.2-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jupyter_client-7.0.6-pyhd8ed1ab_0.tar.bz2 
https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.4.3-py39h2fa2bec_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.2.8-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pandas-1.3.4-py39hde0f152_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pip-21.3.1-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pygments-2.10.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/qt-5.12.9-hda022c4_4.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/scipy-1.7.1-py39hee8e79c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/terminado-0.12.1-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jupyterlab_pygments-0.1.2-pyh9f0ad1d_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/nbformat-5.1.3-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pyopenssl-21.0.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyqt-impl-5.12.3-py39h0fcd23e_7.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.5-pyh9f0ad1d_2.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jupytext-1.13.0-pyh6002c4b_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/nbclient-0.5.4-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/prompt-toolkit-3.0.21-pyha770c72_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyqtchart-5.12-py39h0fcd23e_7.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyqtwebengine-5.12.1-py39h0fcd23e_7.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/urllib3-1.26.7-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/ipython-7.28.0-py39hef51801_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/nbconvert-6.2.0-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyqt-5.12.3-py39hf3d152e_7.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/requests-2.26.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/ipykernel-6.4.2-py39hef51801_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.4.3-py39hf3d152e_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/requests-unixsocket-0.2.0-py_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jupyter_server-1.11.1-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/notebook-6.4.5-pyha770c72_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jupyterlab_server-2.8.2-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/nbclassic-0.3.4-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jupyterlab-3.2.1-pyhd8ed1ab_0.tar.bz2 ================================================ FILE: Chapter02/.gitignore ================================================ *png VAERSDataUseGuide_en_September2021.pdf ================================================ FILE: Chapter02/Arrow.py ================================================ import gzip import pandas as pd from pyarrow import csv import pyarrow.compute as pc vdata_pd = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1") columns = list(vdata_pd.columns) vdata_pd.info(memory_usage="deep") vdata_arrow = csv.read_csv("2021VAERSDATA.csv.gz") tot_bytes = sum([ vdata_arrow[name].nbytes for name in vdata_arrow.column_names]) print(f"Total {tot_bytes // (1024 ** 2)} MB") for name in vdata_arrow.column_names: arr_bytes = vdata_arrow[name].nbytes arr_type = vdata_arrow[name].type pd_bytes = vdata_pd[name].memory_usage(index=False, deep=True) pd_type = vdata_pd[name].dtype print( name, 
    arr_type, arr_bytes // (1024 ** 2),
    pd_type, pd_bytes // (1024 ** 2),)

# %timeit pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1")
# %timeit csv.read_csv("2021VAERSDATA.csv.gz")

# REMOVE SYMPTOM_TEXT
vdata_pd = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1", usecols=lambda x: x != "SYMPTOM_TEXT")
vdata_pd.info(memory_usage="deep")

columns.remove("SYMPTOM_TEXT")
vdata_arrow = csv.read_csv(
    "2021VAERSDATA.csv.gz",
    convert_options=csv.ConvertOptions(include_columns=columns))
vdata_arrow.nbytes

# %timeit pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1", usecols=lambda x: x != "SYMPTOM_TEXT")
# %timeit csv.read_csv("2021VAERSDATA.csv.gz", convert_options=csv.ConvertOptions(include_columns=columns))

vdata = vdata_arrow.to_pandas()
vdata.info(memory_usage="deep")

# There's more
vdata = vdata_arrow.to_pandas(self_destruct=True)

================================================
FILE: Chapter02/Matplotlib.py
================================================
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

vdata = pd.read_csv(
    "2021VAERSDATA.csv.gz", encoding="iso-8859-1",
    usecols=lambda name: name != "SYMPTOM_TEXT")

num_rows = len(vdata)
perc_nan = {}
for col_name in vdata.columns:
    num_nans = len(vdata[col_name][vdata[col_name].isna()])
    perc_nan[col_name] = 100 * num_nans / num_rows

labels = perc_nan.keys()
bar_values = list(perc_nan.values())
x_positions = np.arange(len(labels))

fig = plt.figure()
fig.suptitle("Fraction of empty values per column")
ax = fig.add_subplot()
ax.bar(x_positions, bar_values)
ax.set_ylabel("Percent of empty values")
ax.set_xlabel("Column")
ax.set_xticks(x_positions)
ax.set_xticklabels(labels)
ax.legend()
fig.savefig("naive_chart.png")

# OO interface vs matlab...
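# A quick hedged aside before the cleaner version below (the output file
# names here are illustrative, not part of the recipe): the MATLAB-style
# pyplot API draws on an implicit "current" figure, while the OO API
# keeps explicit Figure/Axes handles, which is what makes the
# multi-panel charts later in this script manageable.
plt.bar(x_positions, bar_values)  # implicit: acts on the current Axes
plt.savefig("implicit_style.png")

oo_fig, oo_ax = plt.subplots()  # explicit handles, one object each
oo_ax.bar(x_positions, bar_values)
oo_fig.savefig("explicit_style.png")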
fig = plt.figure(figsize=(16, 9), tight_layout=True, dpi=600)
fig.suptitle("Fraction of empty values per column", fontsize="48")
ax = fig.add_subplot()
b1 = ax.bar(x_positions, bar_values)
ax.set_ylabel("Percent of empty values", fontsize="xx-large")
ax.set_xticks(x_positions)
ax.set_xticklabels(labels, rotation=45, ha="right")
ax.set_ylim(0, 100)
ax.set_xlim(-0.5, len(labels))
for i, x in enumerate(x_positions):
    ax.text(
        x, 2, "%.1f" % bar_values[i],
        rotation=90, va="bottom", ha="center",
        backgroundcolor="white")
fig.text(0.2, 0.01, "Column", fontsize="xx-large")
fig.savefig("cleaner_chart.png")

dead = vdata[vdata.DIED == "Y"]
vax = pd.read_csv("2021VAERSVAX.csv.gz", encoding="iso-8859-1").set_index("VAERS_ID")
vax.groupby("VAX_TYPE").size().sort_values()

vax_dead = dead.join(vax, on="VAERS_ID", how="inner")  # join on id, discuss
vax_dead.iloc[0]

dead_counts = vax_dead["VAX_TYPE"].value_counts()
large_values = dead_counts[dead_counts >= 10]
other_sum = dead_counts[dead_counts < 10].sum()
large_values = large_values.append(pd.Series({"OTHER": other_sum}))

distance_df = vax_dead[vax_dead.DATEDIED.notna() & vax_dead.VAX_DATE.notna()]
distance_df["DATEDIED"] = pd.to_datetime(distance_df["DATEDIED"])
distance_df["VAX_DATE"] = pd.to_datetime(distance_df["VAX_DATE"])
distance_df = distance_df[distance_df.DATEDIED >= "2021"]
distance_df = distance_df[distance_df.VAX_DATE >= "2021"]
distance_df = distance_df[distance_df.DATEDIED >= distance_df.VAX_DATE]
time_distances = distance_df["DATEDIED"] - distance_df["VAX_DATE"]
time_distances_d = time_distances.astype(int) / (10**9 * 60 * 60 * 24)

date_died = pd.to_datetime(vax_dead[vax_dead.DATEDIED.notna()]["DATEDIED"])
date_died = date_died[date_died >= "2021"]
date_died_counts = date_died.value_counts().sort_index()
cum_deaths = date_died_counts.cumsum()

state_dead = vax_dead[vax_dead["STATE"].notna()][["STATE", "SEX"]]
top_states = sorted(state_dead["STATE"].value_counts().head(10).index)
top_state_dead = state_dead[state_dead["STATE"].isin(top_states)].groupby(["STATE", "SEX"]).size()#.reset_index()
top_state_dead.loc["MN", "U"] = 0  # fill the missing (state, sex) group so the stacked bars below align
top_state_dead = top_state_dead.sort_index().reset_index()
top_state_females = top_state_dead[top_state_dead.SEX == "F"][0]
top_state_males = top_state_dead[top_state_dead.SEX == "M"][0]
top_state_unk = top_state_dead[top_state_dead.SEX == "U"][0]

fig, ((vax_cnt, time_dist), (death_time, state_reps)) = plt.subplots(
    2, 2, figsize=(16, 9), tight_layout=True, dpi=600)

vax_cnt.set_title("Vaccines involved in deaths")
wedges, texts = vax_cnt.pie(large_values)
vax_cnt.legend(wedges, large_values.index, loc="lower left")

time_dist.hist(time_distances_d, bins=50)
time_dist.set_title("Days between vaccine administration and death")
time_dist.set_xlabel("Days")
time_dist.set_ylabel("Observations")

death_time.plot(date_died_counts.index, date_died_counts, ".")
death_time.set_title("Deaths over time")
death_time.set_ylabel("Daily deaths")
death_time.set_xlabel("Date")
tw = death_time.twinx()
tw.plot(cum_deaths.index, cum_deaths)
tw.set_ylabel("Cumulative deaths")

state_reps.set_title("Deaths per state stratified by sex")
state_reps.bar(top_states, top_state_females, label="Females")
state_reps.bar(top_states, top_state_males, label="Males", bottom=top_state_females)
state_reps.bar(top_states, top_state_unk, label="Unknown", bottom=top_state_females.values + top_state_males.values)
state_reps.legend()
state_reps.set_xlabel("State")
state_reps.set_ylabel("Deaths")

fig.savefig("summary.png")
fig
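# A hedged footnote on the nanosecond arithmetic above: a timedelta
# Series also exposes whole days directly through the .dt accessor, so
# the line computing time_distances_d has a clearer equivalent. Because
# DATEDIED and VAX_DATE carry no time-of-day component here, the two
# spellings agree (this check is an aside, not part of the recipe).
time_distances_days = time_distances.dt.days
assert (time_distances_days == time_distances_d).all()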
================================================ FILE: Chapter02/NumPy.py ================================================ import numpy as np import pandas as pd import matplotlib.pyplot as plt vdata = pd.read_csv( "2021VAERSDATA.csv.gz", encoding="iso-8859-1") vdata["STATE"] = vdata["STATE"].str.upper() top_states = pd.DataFrame({ "size": vdata.groupby("STATE").size().sort_values(ascending=False).head(5)}).reset_index() top_states["rank"] = top_states.index top_states = top_states.set_index("STATE") top_vdata = vdata[vdata["STATE"].isin(top_states.index)] top_vdata["state_code"] = top_vdata["STATE"].apply( lambda state: top_states["rank"].at[state] ).astype(np.uint8) top_vdata = top_vdata[top_vdata["AGE_YRS"].notna()] top_vdata.loc[:,"AGE_YRS"] = top_vdata["AGE_YRS"].astype(int) top_states age_state = top_vdata[["state_code", "AGE_YRS"]] age_state["state_code"] state_code_arr = age_state["state_code"].values type(state_code_arr), state_code_arr.shape, state_code_arr.dtype age_state["AGE_YRS"] age_arr = age_state["AGE_YRS"].values type(age_arr), age_arr.shape, age_arr.dtype age_arr.max() age_state_mat = np.zeros((5,6), dtype=np.uint64) for row in age_state.itertuples(): age_state_mat[row.state_code, row.AGE_YRS//20] += 1 age_state_mat cal = age_state_mat[0,:] kids = age_state_mat[:,0] def compute_frac(arr_1d): return arr_1d / arr_1d.sum() frac_age_stat_mat = np.apply_along_axis(compute_frac, 1, age_state_mat) perc_age_stat_mat = frac_age_stat_mat * 100 perc_age_stat_mat = perc_age_stat_mat.astype(np.uint8) perc_age_stat_mat perc_age_stat_mat = perc_age_stat_mat[:, :5] perc_age_stat_mat fig = plt.figure() ax = fig.add_subplot() ax.matshow(perc_age_stat_mat, cmap=plt.get_cmap("Greys")) ax.set_yticks(range(5)) ax.set_yticklabels(top_states.index) ax.set_xticks(range(6)) ax.set_xticklabels(["0-19", "20-39", "40-59", "60-79", "80-99", "100-119"]) fig.savefig("matrix.png") ================================================ FILE: Chapter02/Pandas_Basic.py ================================================ # # Using Pandas to process vaccine adverse events # # ## Data Access # # Go to https://vaers.hhs.gov/data/datasets.html and Download 2021 **zip** Data. Please do not download only the CSV File. # # Drop it on the directory where this notebook is. 
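# Aside (not part of the original recipe): the VAERS export is not UTF-8,
# which is why every read_csv call below passes encoding="iso-8859-1".
# Once the compressed files exist, you can check the encoding yourself
# with chardet (already in the Chapter01 base environment); it prints its
# best guess, typically a Latin-1/Windows-1252 family codec:
#
# ```
# import gzip
# import chardet
# with gzip.open("2021VAERSDATA.csv.gz", "rb") as f:
#     print(chardet.detect(f.read(100_000)))
# ```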
# !unzip 2021VAERSData.zip
# !gzip -9 *csv

import pandas as pd
import matplotlib.pyplot as plt

vdata = pd.read_csv(
    "2021VAERSDATA.csv.gz", encoding="iso-8859-1")
vdata.columns
vdata.dtypes
vdata.shape

vdata.iloc[0]
vdata = vdata.set_index("VAERS_ID")
vdata.loc[916600]
vdata.head(3)
vdata.iloc[:3]
vdata.iloc[:5, 2:4]

vdata["AGE_YRS"].max()
vdata.AGE_YRS.max()

vdata["AGE_YRS"].sort_values().plot(use_index=False)

fig, ax = plt.subplots(1, 2, sharey=True, dpi=300)
fig.suptitle("Age of adverse events")
vdata["AGE_YRS"].sort_values().plot(
    use_index=False, ax=ax[0],
    xlabel="Observation", ylabel="Age")
vdata["AGE_YRS"].plot.hist(bins=20, orientation="horizontal")
fig.savefig("adverse.png")

vdata["AGE_YRS"].dropna().apply(lambda x: int(x)).value_counts()  # not documented

vdata.DIED.value_counts(dropna=False)  # NA is a problem; note how it is handled below
vdata["is_dead"] = (vdata.DIED == "Y")

dead = vdata[vdata.is_dead]
vax = pd.read_csv("2021VAERSVAX.csv.gz", encoding="iso-8859-1").set_index("VAERS_ID")
print(vax.columns)
print(vax.shape)
print(vax.VAX_TYPE.unique())
vax.groupby("VAX_TYPE").size().sort_values()

vax19 = vax[vax.VAX_TYPE == "COVID19"]
vax19_dead = dead.join(vax19)  # join on id, discuss
vax19_dead.index.value_counts()

baddies = vax19_dead.groupby("VAX_LOT").size().sort_values(ascending=False)
for i, (lot, cnt) in enumerate(baddies.items()):
    print(lot, cnt, len(vax19_dead[vax19_dead.VAX_LOT == lot].groupby("STATE")))
    if i == 10:
        break

# The data above is not totally correct - at least in terms of interpretation; to see why, we need to check the next recipe

================================================
FILE: Chapter02/Pandas_Join.py
================================================
# # Pandas advanced

import numpy as np
import pandas as pd

# # Code to sample original data
#
# ```
# vdata = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1")
# vdata.sample(frac=0.9).to_csv("vdata_sample.csv.gz", index=False)
# vax = pd.read_csv("2021VAERSVAX.csv.gz", encoding="iso-8859-1")
# vax.sample(frac=0.9).to_csv("vax_sample.csv.gz", index=False)
# ```

vdata = pd.read_csv("vdata_sample.csv.gz")  # No encoding
vax = pd.read_csv("vax_sample.csv.gz")

vdata_with_vax = vdata.join(
    vax.set_index("VAERS_ID"),
    on="VAERS_ID", how="inner")
len(vdata), len(vax), len(vdata_with_vax)

lost_vdata = vdata.loc[~vdata.index.isin(vdata_with_vax.index)]
lost_vdata
lost_vax = vax[~vax["VAERS_ID"].isin(vdata_with_vax["VAERS_ID"])]
lost_vax

# Left, Right and outer caveats
vdata_with_vax_left = vdata.join(
    vax.set_index("VAERS_ID"),
    on="VAERS_ID")
vdata_with_vax_left.groupby("VAERS_ID").size().sort_values()
len(vdata_with_vax_left), len(vdata_with_vax_left.VAERS_ID.unique())

# +
#vdata_all = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1")
#vax_all = pd.read_csv("2021VAERSVAX.csv.gz", encoding="iso-8859-1")
# -

dead = vdata[vdata.DIED == "Y"]
vax19 = vax[vax.VAX_TYPE == "COVID19"]
vax19_dead = vax19.join(dead.set_index("VAERS_ID"), on="VAERS_ID", how="right")  # join on id, discuss
len(vax19), len(dead), len(vax19_dead)
len(vax19_dead[vax19_dead.VAERS_ID.duplicated()])
len(vax19_dead) - len(dead)

vax19_dead["STATE"] = vax19_dead["STATE"].str.upper()
dead_lot = vax19_dead[["VAERS_ID", "VAX_LOT", "STATE"]].set_index(["VAERS_ID", "VAX_LOT"])
dead_lot_clean = dead_lot[~dead_lot.index.duplicated()]
dead_lot_clean = dead_lot_clean.reset_index()
dead_lot_clean[dead_lot_clean.VAERS_ID.isna()]

baddies =
dead_lot_clean.groupby("VAX_LOT").size().sort_values(ascending=False) for i, (lot, cnt) in enumerate(baddies.items()): print(lot, cnt, len(dead_lot_clean[dead_lot_clean.VAX_LOT == lot].groupby("STATE"))) if i == 10: break ================================================ FILE: Chapter02/Pandas_Memory.py ================================================ # # Pandas advanced import numpy as np import pandas as pd vdata = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1") vdata.info(memory_usage="deep") for name in vdata.columns: col_bytes = vdata[name].memory_usage(index=False, deep=True) col_type = vdata[name].dtype print( name, col_type, col_bytes // (1024 ** 2)) vdata.DIED.memory_usage(index=False, deep=True) vdata.DIED.fillna(False).astype(bool).memory_usage(index=False, deep=True) vdata.STATE.unique() vdata["STATE"] = vdata.STATE.str.upper() states = list(vdata["STATE"].unique()) states vdata["encoded_state"] = vdata.STATE.apply(lambda state: states.index(state)) vdata["encoded_state"] = vdata["encoded_state"].astype(np.uint8) vdata[["encoded_state", "STATE"]].head(10) vdata["STATE"].memory_usage(index=False, deep=True) vdata["encoded_state"].memory_usage(index=False, deep=True) vdata.index states = list(pd.read_csv( "vdata_sample.csv.gz", converters={ "STATE": lambda state: state.upper() # You need to know the states in advance }, usecols=["STATE"] )["STATE"].unique()) vdata = pd.read_csv( "vdata_sample.csv.gz", index_col="VAERS_ID", converters={ "DIED": lambda died: died == "Y", "STATE": lambda state: states.index(state.upper()) }, usecols=lambda name: name != "SYMPTOM_TEXT" ) vdata["STATE"] = vdata["STATE"].astype(np.uint8) vdata.info(memory_usage="deep") ================================================ FILE: Chapter03/Accessing_Databases.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- from Bio import Entrez, Medline, SeqIO # ### Do not forget to inform NCBI of your email address (change below) Entrez.email = "put@your_email.here" #This gives you the list of available databases handle = Entrez.einfo() rec = Entrez.read(handle) print(rec) handle = Entrez.esearch(db="nucleotide", term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]') rec_list = Entrez.read(handle) if int(rec_list['RetMax']) < int(rec_list['Count']): handle = Entrez.esearch(db="nucleotide", term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]', retmax=rec_list['Count']) rec_list = Entrez.read(handle) id_list = rec_list['IdList'] hdl = Entrez.efetch(db='nucleotide', id=id_list, rettype='gb', retmax=rec_list['Count']) recs = list(SeqIO.parse(hdl, 'gb')) for rec in recs: if rec.name == 'KM288867': break print(rec.name) print(rec.description) for feature in rec.features: if feature.type == 'gene': print(feature.qualifiers['gene']) elif feature.type == 'exon': loc = feature.location print('Exon', loc.start, loc.end, loc.strand) else: print('not processed:\n%s' % feature) for name, value in rec.annotations.items(): print('%s=%s' % (name, value)) print(len(rec.seq)) refs = rec.annotations['references'] print(refs) for ref in refs: if ref.pubmed_id != '': print(ref.pubmed_id) handle = Entrez.efetch(db="pubmed", id=[ref.pubmed_id], rettype="medline", retmode="text") records = Medline.parse(handle) for med_rec in records: for k, v in med_rec.items(): print('%s: %s' % (k, v)) 
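# A hedged aside, not part of the original recipe: instead of re-running
# esearch with a larger retmax as done above, NCBI's history server can
# keep the full result set server-side, and efetch can then page through
# it with retstart/retmax. This is the standard Biopython pattern:
search = Entrez.read(Entrez.esearch(
    db='nucleotide',
    term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]',
    usehistory='y'))
page = Entrez.efetch(
    db='nucleotide', rettype='gb', retmode='text',
    webenv=search['WebEnv'], query_key=search['QueryKey'],
    retstart=0, retmax=100)  # loop over retstart for the next pages
first_page = list(SeqIO.parse(page, 'gb'))
print(len(first_page))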
================================================ FILE: Chapter03/Basic_Sequence_Processing.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- from Bio import Entrez, Seq, SeqIO, SeqRecord Entrez.email = "put@your_email.here" hdl = Entrez.efetch(db='nucleotide', id=['NM_002299'], rettype='gb') # Lactase gene #for l in hdl: # print l gb_rec = SeqIO.read(hdl, 'gb') for feature in gb_rec.features: if feature.type == 'CDS': location = feature.location # Note translation existing cds = SeqRecord.SeqRecord(gb_rec.seq[location.start:location.end], 'NM_002299', description='LCT CDS only') w_hdl = open('example.fasta', 'w') SeqIO.write([cds], w_hdl, 'fasta') w_hdl.close() recs = SeqIO.parse('example.fasta', 'fasta') for rec in recs: seq = rec.seq print(rec.description) print(seq[:10]) print((seq[:12], seq[-12:])) rna = seq.transcribe() rna prot = seq.translate() prot ================================================ FILE: Chapter03/Filtering_SNPs.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # # Getting the necessary data # You will need to do this only once # !rm -rf centro.vcf.gz 2>/dev/null # !rm -rf standard.vcf.gz 2>/dev/null # !tabix -fh ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/preview/ag1000g.AC.phase1.AR1.vcf.gz 3L:1-200000 |bgzip -c > centro.vcf.gz # !tabix -fh ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/preview/ag1000g.AC.phase1.AR1.vcf.gz 3L:21000000-21200000 |bgzip -c > standard.vcf.gz # !tabix -p vcf centro.vcf.gz # !tabix -p vcf standard.vcf.gz # # Recipe # + from collections import defaultdict import functools import numpy as np import seaborn as sns import matplotlib.pyplot as plt from cyvcf2 import VCF # - def do_window(recs, size, fun): start = None win_res = [] for rec in recs: if not rec.is_snp or len(rec.ALT) > 1: continue if start is None: start = rec.POS my_win = 1 + (rec.POS - start) // size while len(win_res) < my_win: win_res.append([]) win_res[my_win - 1].extend(fun(rec)) return win_res def apply_win_funs(wins, funs): fun_results = [] for win in wins: my_funs = {} for name, fun in funs.items(): try: my_funs[name] = fun(win) except: my_funs[name] = None fun_results.append(my_funs) return fun_results wins = {} size = 2000 names = ['centro.vcf.gz', 'standard.vcf.gz'] for name in names: recs = VCF(name) wins[name] = do_window(recs, size, lambda x: [1]) stats = {} fig, ax = plt.subplots(figsize=(16, 9), dpi=300, tight_layout=True) for name, nwins in wins.items(): stats[name] = apply_win_funs(nwins, {'sum': sum}) x_lim = [i * size for i in range(len(stats[name]))] ax.plot(x_lim, [x['sum'] for x in stats[name]], label=name) ax.legend() ax.set_xlabel('Genomic location in the downloaded segment', fontsize='xx-large') ax.set_ylabel('Number of variant sites (bi-allelic SNPs)', fontsize='xx-large') fig.suptitle('Number of bi-allelic SNPs along the genome', fontsize='xx-large') fig.savefig('bi.png') # + mq0_wins = {} size = 5000 def get_sample(rec, annot, my_type): return [v for v in rec.format(annot) if v > np.iinfo(my_type).min] for name in names: recs = VCF(name) mq0_wins[name] = do_window(recs, size, 
        functools.partial(get_sample, annot='MQ0', my_type=np.int32))
# -

stats = {}
colors = ['b', 'g']
i = 0
fig, ax = plt.subplots(figsize=(16, 9))
for name, nwins in mq0_wins.items():
    stats[name] = apply_win_funs(nwins, {'median': np.median, '95': functools.partial(np.percentile, q=95)})
    x_lim = [j * size for j in range(len(stats[name]))]
    ax.plot(x_lim, [x['median'] for x in stats[name]], label=name, color=colors[i])
    ax.plot(x_lim, [x['95'] for x in stats[name]], '--', color=colors[i])
    i += 1
#ax.set_ylim(0, 40)
ax.legend()
ax.set_xlabel('Genomic location in the downloaded segment', fontsize='xx-large')
ax.set_ylabel('MQ0', fontsize='xx-large')
fig.suptitle('Distribution of MQ0 along the genome', fontsize='xx-large')
fig.savefig('MQ0.png')


def get_sample_relation(recs, f1, f2):
    rel = defaultdict(int)
    for rec in recs:
        if not rec.is_snp:
            continue
        for pos in range(len(rec.genotypes)):
            v1 = f1(rec, pos)
            v2 = f2(rec, pos)
            if v1 is None or v2 == np.iinfo(type(v2)).min:
                continue  # We ignore Nones
            rel[(v1, v2)] += 1  # careful with the size, floats: round?
            #break
    return rel


rels = {}
for name in names:
    recs = VCF(name)
    rels[name] = get_sample_relation(
        recs,
        lambda rec, pos: 1 if rec.genotypes[pos][0] != rec.genotypes[pos][1] else 0,
        lambda rec, pos: rec.format('DP')[pos][0])

# +
fig, ax = plt.subplots(figsize=(16, 9), dpi=300, tight_layout=True)


def plot_hz_rel(dps, ax, ax2, name, rel):
    frac_hz = []
    cnt_dp = []
    for dp in dps:
        hz = 0.0
        cnt = 0
        for khz, kdp in rel.keys():
            if kdp != dp:
                continue
            cnt += rel[(khz, dp)]
            if khz == 1:
                hz += rel[(khz, dp)]
        frac_hz.append(hz / cnt)
        cnt_dp.append(cnt)
    ax.plot(dps, frac_hz, label=name)
    ax2.plot(dps, cnt_dp, '--', label=name)


ax2 = ax.twinx()
for name, rel in rels.items():
    dps = list(set([x[1] for x in rel.keys()]))
    dps.sort()
    plot_hz_rel(dps, ax, ax2, name, rel)
ax.set_xlim(0, 75)
ax.set_ylim(0, 0.2)
ax2.set_ylabel('Quantity of calls', fontsize='xx-large')
ax.set_ylabel('Fraction of Heterozygote calls', fontsize='xx-large')
ax.set_xlabel('Sample Read Depth (DP)', fontsize='xx-large')
ax.legend()
fig.suptitle('Number of calls per depth and fraction of calls which are Hz', fontsize='xx-large')
fig.savefig('hz.png')
# -


def get_variant_relation(recs, f1, f2):
    rel = defaultdict(int)
    for rec in recs:
        if not rec.is_snp:
            continue
        try:
            v1 = f1(rec)
            v2 = f2(rec)
            if v1 is None or v2 is None:
                continue  # We ignore Nones
            rel[(v1, v2)] += 1  # careful with the size, floats: round?
except: # This is outside the domain (typically None) pass return rel # + accepted_eff = ['INTERGENIC', 'INTRON', 'NON_SYNONYMOUS_CODING', 'SYNONYMOUS_CODING'] def eff_to_int(rec): try: annot = rec.INFO['EFF'] master_type = annot.split('(')[0] return accepted_eff.index(master_type) except ValueError: return len(accepted_eff) # - eff_mq0s = {} for name in names: recs = VCF(name) eff_mq0s[name] = get_variant_relation( recs, lambda r: eff_to_int(r), lambda r: int(r.INFO['DP'])) fig, ax = plt.subplots(figsize=(16,9), dpi=300, tight_layout=True) name = 'standard.vcf.gz' bp_vals = [[] for x in range(len(accepted_eff) + 1)] for k, cnt in eff_mq0s[name].items(): my_eff, mq0 = k bp_vals[my_eff].extend([mq0] * cnt) #memory usage #print(bp_vals[-2]) sns.boxplot(data=bp_vals, sym='', ax=ax) ax.set_xticklabels(accepted_eff + ['OTHER']) ax.set_ylabel('DP (variant)', fontsize='xx-large') fig.suptitle('Distribution of variant DP per SNP type', fontsize='xx-large') fig.savefig('eff.png') ================================================ FILE: Chapter03/LCT.bed ================================================ track name=gene description="Gene information" 2 135836529 135837180 ENSE00002202258 0 - 2 135833110 135833190 ENSE00001660765 0 - 2 135829592 135829676 ENSE00001731451 0 - 2 135823900 135824003 ENSE00001659892 0 - 2 135822019 135822098 ENSE00001777620 0 - 2 135817340 135818061 ENSE00001602826 0 - 2 135812310 135812956 ENSE00000776576 0 - 2 135808442 135809993 ENSE00001008768 0 - 2 135807127 135807396 ENSE00000776573 0 - 2 135804766 135805057 ENSE00000776572 0 - 2 135803929 135804128 ENSE00000776571 0 - 2 135800606 135800809 ENSE00000776570 0 - 2 135798028 135798138 ENSE00003515081 0 - 2 135794640 135794775 ENSE00001630333 0 - 2 135790657 135790881 ENSE00001667885 0 - 2 135789570 135789798 ENSE00001728878 0 - 2 135787839 135788544 ENSE00001653704 0 - 2 135812310 135812959 ENSE00001745158 0 - 2 135808442 135809993 ENSE00001008768 0 - 2 135807127 135807396 ENSE00000776573 0 - 2 135804766 135805057 ENSE00000776572 0 - 2 135803929 135804128 ENSE00000776571 0 - 2 135798028 135798138 ENSE00003459353 0 - 2 135794336 135794775 ENSE00001635523 0 - 2 135810168 135810279 ENSE00001438557 0 - 2 135820190 135820639 ENSE00001732580 0 + 2 135821674 135823087 ENSE00001695040 0 + 2 135836529 135837180 NM_002299.2.1 0 - 2 135833110 135833190 NM_002299.2.2 0 - 2 135829592 135829676 NM_002299.2.3 0 - 2 135823900 135824003 NM_002299.2.4 0 - 2 135822019 135822098 NM_002299.2.5 0 - 2 135817340 135818061 NM_002299.2.6 0 - 2 135812310 135812956 NM_002299.2.7 0 - 2 135808442 135809993 NM_002299.2.8 0 - 2 135807127 135807396 NM_002299.2.9 0 - 2 135804766 135805057 NM_002299.2.10 0 - 2 135803929 135804128 NM_002299.2.11 0 - 2 135800606 135800809 NM_002299.2.12 0 - 2 135798028 135798138 NM_002299.2.13 0 - 2 135794640 135794775 NM_002299.2.14 0 - 2 135790657 135790881 NM_002299.2.15 0 - 2 135789570 135789798 NM_002299.2.16 0 - 2 135787844 135788544 NM_002299.2.17 0 - 2 135836529 135837169 CCDS2178.117 0 - 2 135833110 135833190 CCDS2178.116 0 - 2 135829592 135829676 CCDS2178.115 0 - 2 135823900 135824003 CCDS2178.114 0 - 2 135822019 135822098 CCDS2178.113 0 - 2 135817340 135818061 CCDS2178.112 0 - 2 135812310 135812956 CCDS2178.111 0 - 2 135808442 135809993 CCDS2178.110 0 - 2 135807127 135807396 CCDS2178.19 0 - 2 135804766 135805057 CCDS2178.18 0 - 2 135803929 135804128 CCDS2178.17 0 - 2 135800606 135800809 CCDS2178.16 0 - 2 135798028 135798138 CCDS2178.15 0 - 2 135794640 135794775 CCDS2178.14 0 - 2 135790657 135790881 CCDS2178.13 0 - 
2 135789570 135789798 CCDS2178.12 0 -
2 135788323 135788544 CCDS2178.11 0 -

================================================
FILE: Chapter03/Processing_BED_with_HTSeq.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.4
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

from collections import defaultdict
import re

import HTSeq

lct_bed = HTSeq.BED_Reader('LCT.bed')

# +
feature_types = defaultdict(int)
for rec in lct_bed:
    last_rec = rec
    feature_types[re.search('([A-Z]+)', rec.name).group(0)] += 1
print(feature_types)
#Code specific to this dataset, document
# -

print(last_rec)
print(last_rec.name)
print(type(last_rec))
interval = last_rec.iv
print(interval)
print(type(interval))

# +
print(interval.chrom, interval.start, interval.end)
print(interval.strand)
print(interval.length)
print(interval.start_d)
print(interval.start_as_pos)
print(type(interval.start_as_pos))
#talk about overlaps
# -

exon_start = None
exon_end = None
sizes = []
for rec in lct_bed:
    if not rec.name.startswith('CCDS'):
        continue
    interval = rec.iv
    exon_start = min(interval.start, exon_start or interval.start)
    exon_end = max(interval.end, exon_end or interval.end)
    sizes.append(interval.length)
sizes.sort()
print("Num exons: %d / Begin: %d / End %d" % (len(sizes), exon_start, exon_end))
print("Smallest exon: %d / Largest exon: %d / Mean size: %.1f" % (sizes[0], sizes[-1], sum(sizes)/len(sizes)))

================================================
FILE: Chapter03/Working_with_BAM.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.4
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# # Getting the necessary data
# You only need to do this once

# !rm -f NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam 2>/dev/null
# !rm -f NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai 2>/dev/null
# !wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam
# !wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai

# # The recipe

# +
#pip install pysam
from collections import defaultdict

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pysam
# -

bam = pysam.AlignmentFile('NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam', 'rb')

headers = bam.header
for record_type, records in headers.items():
    print(record_type)
    for i, record in enumerate(records):
        if type(record) == dict:
            print('\t%d' % (i + 1))
            for field, value in record.items():
                print('\t\t%s\t%s' % (field, value))
        else:
            print('\t\t%s' % record)

#0-based
for rec in bam:
    if rec.cigarstring.find('M') > -1 and rec.cigarstring.find('S') > -1 and not rec.is_unmapped and not rec.mate_is_unmapped:
        break
print(rec.query_name, rec.reference_id, bam.getrname(rec.reference_id), rec.reference_start, rec.reference_end)
print(rec.cigarstring)
print(rec.query_alignment_start, rec.query_alignment_end, rec.query_alignment_length)
print(rec.next_reference_id, rec.next_reference_start, rec.template_length)
print(rec.is_paired, rec.is_proper_pair, rec.is_unmapped, rec.mapping_quality)
print(rec.query_qualities)
print(rec.query_alignment_qualities)
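# The CIGAR string printed above also has a pre-parsed form:
# rec.cigartuples gives (operation, length) pairs, with operation codes
# from the SAM spec (0=M, 1=I, 2=D, 4=S, ...). A small hedged aside,
# not part of the original recipe, to make the matched/soft-clipped
# structure of this read explicit:
op_names = {0: 'M', 1: 'I', 2: 'D', 3: 'N', 4: 'S', 5: 'H', 6: 'P', 7: '=', 8: 'X'}
for op, length in rec.cigartuples:
    print(op_names.get(op, '?'), length)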
print(rec.query_sequence)

counts = [0] * 76
for n, rec in enumerate(bam.fetch('20', 0, 10000000)):
    for i in range(rec.query_alignment_start, rec.query_alignment_end):
        counts[i] += 1
freqs = [100 * x / (n + 1) for x in counts]

fig, ax = plt.subplots(figsize=(16,9), dpi=300, tight_layout=True)
ax.plot(range(1, 77), freqs)
ax.set_xlabel('Read distance', fontsize='xx-large')
ax.set_ylabel('Percentage of mapped calls', fontsize='xx-large')
fig.suptitle('Percentage of mapped calls as a function of the position from the start of the sequencer read', fontsize='xx-large')
fig.savefig('map_perc.png')

phreds = defaultdict(list)
for rec in bam.fetch('20', 0, None):
    for i in range(rec.query_alignment_start, rec.query_alignment_end):
        phreds[i].append(rec.query_qualities[i])
maxs = [max(phreds[i]) for i in range(76)]
tops = [np.percentile(phreds[i], 95) for i in range(76)]
medians = [np.percentile(phreds[i], 50) for i in range(76)]
bottoms = [np.percentile(phreds[i], 5) for i in range(76)]
medians_fig = [x - y for x, y in zip(medians, bottoms)]
tops_fig = [x - y for x, y in zip(tops, medians)]
maxs_fig = [x - y for x, y in zip(maxs, tops)]

fig, ax = plt.subplots(figsize=(16,9), dpi=300, tight_layout=True)
ax.stackplot(range(1, 77), (bottoms, medians_fig, tops_fig, maxs_fig))
ax.plot(range(1, 77), maxs, 'k-')
ax.set_xlabel('Read distance', fontsize='xx-large')
ax.set_ylabel('PHRED score', fontsize='xx-large')
fig.suptitle('Distribution of PHRED scores as a function of the position in the read', fontsize='xx-large')
fig.savefig('phred2.png')

================================================
FILE: Chapter03/Working_with_FASTQ.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.4
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# # Getting the necessary data
# You only need to download this ~28 MB file once

# !rm -f SRR003265.filt.fastq.gz 2>/dev/null
# !wget -nd ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265.filt.fastq.gz

# # The recipe

# +
from collections import defaultdict
import gzip

import seaborn as sns
import matplotlib.pyplot as plt

from Bio import SeqIO
# -

recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='utf-8'), 'fastq')
rec = next(recs)
print(rec.id, rec.description, rec.seq)
print(rec.letter_annotations)

recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='utf-8'), 'fastq')
cnt = defaultdict(int)
for rec in recs:
    for letter in rec.seq:
        cnt[letter] += 1
tot = sum(cnt.values())
for letter, cnt_letter in cnt.items():
    print('%s: %.2f %d' % (letter, 100 * cnt_letter / tot, cnt_letter))

recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='UTF-8'), 'fastq')
n_cnt = defaultdict(int)
for rec in recs:
    for i, letter in enumerate(rec.seq):
        pos = i + 1
        if letter == 'N':
            n_cnt[pos] += 1
seq_len = max(n_cnt.keys())
positions = range(1, seq_len + 1)

fig, ax = plt.subplots(figsize=(16, 9), tight_layout=True, dpi=300)
fig.suptitle('Number of N calls as a function of the distance from the start of the sequencer read', fontsize='xx-large')
ax.plot(positions, [n_cnt[x] for x in positions])
ax.set_xlim(1, seq_len)
ax.set_xlabel('Read distance', fontsize='xx-large')
ax.set_ylabel('Number of N Calls', fontsize='xx-large')
fig.savefig('n_calls.png')
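# Aside (not part of the original recipe): PHRED scores map to error
# probabilities as P = 10 ** (-Q / 10), so Q=20 means a 1-in-100 error
# and Q=40 a 1-in-10000 error - worth keeping in mind when the analysis
# below drops the ubiquitous maximum score of 40.
def phred_to_error_prob(qual):
    return 10 ** (-qual / 10)

print(phred_to_error_prob(20), phred_to_error_prob(40))  # 0.01 0.0001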
recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='utf-8'), 'fastq')
cnt_qual = defaultdict(int)
for rec in recs:
    for i, qual in enumerate(rec.letter_annotations['phred_quality']):
        if i < 25:
            continue
        cnt_qual[qual] += 1
tot = sum(cnt_qual.values())
for qual, cnt_q in cnt_qual.items():
    print('%d: %.2f %d' % (qual, 100. * cnt_q / tot, cnt_q))

recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='utf-8'), 'fastq')
qual_pos = defaultdict(list)
for rec in recs:
    for i, qual in enumerate(rec.letter_annotations['phred_quality']):
        if i < 25 or qual == 40:
            continue
        pos = i + 1
        qual_pos[pos].append(qual)
vps = []
poses = list(qual_pos.keys())
poses.sort()
for pos in poses:
    vps.append(qual_pos[pos])

fig, ax = plt.subplots(figsize=(16,9), dpi=300, tight_layout=True)
sns.boxplot(data=vps, ax=ax)
ax.set_xticklabels([str(x) for x in range(26, max(qual_pos.keys()) + 1)])
ax.set_xlabel('Read distance', fontsize='xx-large')
ax.set_ylabel('PHRED score', fontsize='xx-large')
fig.suptitle('Distribution of PHRED scores as a function of read distance', fontsize='xx-large')
fig.savefig('phred.png')

# # There is more...
# ## Do this to download the paired end data
# Be careful as this will be 1GB of data (and fully optional)

# !rm -f SRR003265_1.filt.fastq.gz 2>/dev/null
# !rm -f SRR003265_2.filt.fastq.gz 2>/dev/null
# !wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_1.filt.fastq.gz
# !wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_2.filt.fastq.gz

# +
f1 = gzip.open('SRR003265_1.filt.fastq.gz', 'rt', encoding='utf8')
f2 = gzip.open('SRR003265_2.filt.fastq.gz', 'rt', encoding='utf8')
recs1 = SeqIO.parse(f1, 'fastq')
recs2 = SeqIO.parse(f2, 'fastq')
cnt = 0
for rec1, rec2 in zip(recs1, recs2):
    cnt += 1
print('Number of pairs: %d' % cnt)
# -

================================================
FILE: Chapter03/Working_with_VCF.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.4
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# # Getting the necessary data
# You only need to do this once

# !rm -f genotypes.vcf.gz 2>/dev/null
# !tabix -fh ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502/supporting/vcf_with_sample_level_annotation/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5_extra_anno.20130502.genotypes.vcf.gz 22:1-17000000|bgzip -c > genotypes.vcf.gz
# !tabix -p vcf genotypes.vcf.gz

# +
from collections import defaultdict

import seaborn as sns
import matplotlib.pyplot as plt
from cyvcf2 import VCF
# -

# +
v = VCF('genotypes.vcf.gz')
rec = next(v)
print('Variant Level information')
info = rec.INFO
for info in rec.INFO:
    print(info)
print('Sample Level information')
for fmt in rec.FORMAT:
    print(fmt)

# +
v = VCF('genotypes.vcf.gz')
samples = v.samples
print(len(samples))  # Order change
variant = next(v)
print(variant.CHROM, variant.POS, variant.ID, variant.REF, variant.ALT, variant.QUAL, variant.FILTER)
print(variant.INFO)
print(variant.FORMAT)
print(variant.is_snp)
#rec.format('DP')
#rec.format('GT')
str_alleles = variant.gt_bases[0]
alleles = variant.genotypes[0][0:2]
is_phased = variant.genotypes[0][2]
print(str_alleles, alleles, is_phased)
print(variant.format('DP')[0])

# +
f = VCF('genotypes.vcf.gz')
my_type = defaultdict(int)
num_alts = defaultdict(int)
for variant in f:
    my_type[variant.var_type, variant.var_subtype] += 1
    if variant.var_type == 'snp':
        num_alts[len(variant.ALT)] += 1
print(my_type)
print(num_alts)

# +
f =
VCF('genotypes.vcf.gz') sample_dp = defaultdict(int) for variant in f: if not variant.is_snp or len(variant.ALT) != 1: continue for dp in variant.format('DP'): dp = int(dp)  # convert the per-sample NumPy value to a hashable int before counting sample_dp[dp] += 1 # - dps = list(sample_dp.keys()) dps.sort() dp_dist = [sample_dp[x] for x in dps] fig, ax = plt.subplots(figsize=(16, 9)) ax.plot(dp_dist[:50], 'r') ax.axvline(dp_dist.index(max(dp_dist))) ================================================ FILE: Chapter04/2L.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + # %matplotlib inline from collections import defaultdict import gzip import numpy as np import matplotlib.pylab as plt # - num_parents = 8 dp_2L = np.load(gzip.open('DP_2L.npy.gz', 'rb')) dp_2L.shape for i in range(num_parents): print(np.median(dp_2L[:,i]), np.median(dp_2L[50000:150000,i])) window_size = 200000 parent_DP_windows = [defaultdict(list) for i in range(num_parents)] # + def insert_in_window(row): for parent in range(num_parents): parent_DP_windows[parent][row[-1] // window_size].append(row[parent]) insert_in_window_v = np.vectorize(insert_in_window, signature='(n)->()') _ = insert_in_window_v(dp_2L) # - fig, axs = plt.subplots(2, num_parents // 2, figsize=(16, 9), sharex=True, sharey=True, squeeze=True) for parent in range(num_parents): ax = axs[parent // 4][parent % 4] parent_data = parent_DP_windows[parent] ax.set_ylim(10, 40) ax.plot(*zip(*[(win*window_size, np.mean(lst)) for win, lst in parent_data.items()]), '.') ================================================ FILE: Chapter04/Exploration.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import gzip import pickle import random import numpy as np import matplotlib.pyplot as plt import pandas as pd from pandas.plotting import scatter_matrix # %matplotlib inline # - fit = np.load(gzip.open('balanced_fit.npy.gz', 'rb')) ordered_features = np.load(open('ordered_features', 'rb'), allow_pickle=True) num_features = len(ordered_features) fit_df = pd.DataFrame(fit, columns=ordered_features + ['pos', 'error']) num_samples = 80 del fit fig, ax = plt.subplots(figsize=(16,9)) _ = fit_df.hist(column=ordered_features, ax=ax) fit_df['MeanDP'] = fit_df['DP'] / 80 fig, ax = plt.subplots() _ = ax.hist(fit_df[fit_df['MeanDP']<50]['MeanDP'], bins=100) errors_df = fit_df[fit_df['error'] == 1] ok_df = fit_df[fit_df['error'] == 0] ok_qual_above_df = ok_df[ok_df['QUAL']>0.005] errors_qual_above_df = errors_df[errors_df['QUAL']>0.005] print(ok_df.size, errors_df.size, ok_qual_above_df.size, errors_qual_above_df.size) print(ok_qual_above_df.size / ok_df.size, errors_qual_above_df.size / errors_df.size) ok_qd_above_df = ok_df[ok_df['QD']>0.05] errors_qd_above_df = errors_df[errors_df['QD']>0.05] print(ok_df.size, errors_df.size, ok_qd_above_df.size, errors_qd_above_df.size) print(ok_qd_above_df.size / ok_df.size, errors_qd_above_df.size / errors_df.size) not_bad_area_errors_df = errors_df[(errors_df['QUAL']<0.005)&(errors_df['QD']<0.05)] _ = scatter_matrix(not_bad_area_errors_df[['FS', 'ReadPosRankSum', 'MQ', 'HRun']], diagonal='kde', figsize=(16, 9), alpha=0.02) not_bad_area_ok_df = 
ok_df[(ok_df['QUAL']<0.005)&(ok_df['QD']<0.05)] _ = scatter_matrix(not_bad_area_ok_df[['FS', 'ReadPosRankSum', 'MQ', 'HRun']], diagonal='kde', figsize=(16, 9), alpha=0.02) all_fit_df = pd.DataFrame(np.load(gzip.open('feature_fit.npy.gz', 'rb')), columns=ordered_features + ['pos', 'error']) potentially_good_corner_df = all_fit_df[(all_fit_df['QUAL']<0.005)&(all_fit_df['QD']<0.05)] all_errors_df=all_fit_df[all_fit_df['error'] == 1] print(len(all_fit_df), len(all_errors_df), len(all_errors_df) / len(all_fit_df)) potentially_good_corner_errors_df = potentially_good_corner_df[potentially_good_corner_df['error'] == 1] print(len(potentially_good_corner_df), len(potentially_good_corner_errors_df), len(potentially_good_corner_errors_df) / len(potentially_good_corner_df)) print(len(potentially_good_corner_df)/len(all_fit_df)) ================================================ FILE: Chapter04/Mendel.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- import random import matplotlib.pyplot as plt # # Mendelian simulations num_sims = 100000 num_ofs = 20 # + num_hets_AA_AT = [] for sim in range(num_sims): sim_hets = 0 for ofs in range(20): sim_hets += 1 if random.choice([0, 1]) == 1 else 0 num_hets_AA_AT.append(sim_hets) fig, ax = plt.subplots(1,1, figsize=(16,9)) ax.hist(num_hets_AA_AT, bins=range(20)) print(len([num_hets for num_hets in num_hets_AA_AT if num_hets==20])) # - num_AAs_AT_AT = [] num_hets_AT_AT = [] for sim in range(num_sims): sim_AAs = 0 sim_hets = 0 for ofs in range(20): derived_cnt = sum(random.choices([0, 1], k=2)) sim_AAs += 1 if derived_cnt == 0 else 0 sim_hets += 1 if derived_cnt == 1 else 0 num_AAs_AT_AT.append(sim_AAs) num_hets_AT_AT.append(sim_hets) fig, ax = plt.subplots(1,1, figsize=(16,9)) ax.hist([num_hets_AT_AT, num_AAs_AT_AT], histtype='step', fill=False, bins=range(20), label=['het', 'AA']) plt.legend() # # Balanced output # + import gzip import pickle import random import numpy as np # - mendelian_errors = pickle.load(gzip.open('mendelian_errors.pickle.gz', 'rb')) feature_fit = np.load(gzip.open('feature_fit.npy.gz', 'rb')) ordered_features = np.load(open('ordered_features', 'rb'), allow_pickle=True) num_features = len(ordered_features) len(mendelian_errors), len(list(filter(lambda x: x[0] > 0,mendelian_errors.values()))) total_observations = len(mendelian_errors) error_observations = len(list(filter(lambda x: x[0] > 0,mendelian_errors.values()))) ok_observations = total_observations - error_observations fraction_errors = error_observations/total_observations print (total_observations, ok_observations, error_observations, 100*fraction_errors) del mendelian_errors # + prob_ok_choice = error_observations / ok_observations def accept_entry(row): if row[-1] == 1: return True return random.random() <= prob_ok_choice accept_entry_v = np.vectorize(accept_entry, signature='(i)->()') accepted_entries = accept_entry_v(feature_fit) balanced_fit = feature_fit[accepted_entries] del feature_fit balanced_fit.shape len([x for x in balanced_fit if x[-1] == 1]), len([x for x in balanced_fit if x[-1] == 0]) # - np.save(gzip.open('balanced_fit.npy.gz', 'wb'), balanced_fit, allow_pickle=False, fix_imports=False) ================================================ FILE: Chapter04/Preparation.py ================================================ # --- # jupyter: # 
jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # !wget ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.3L.h5 # !wget ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.2L.h5 # + import pickle import gzip import random import numpy as np import h5py import pandas as pd # - samples = pd.read_csv('samples.tsv', sep='\t') print(len(samples)) print(samples['cross'].unique()) print(samples[samples['cross'] == 'cross-29-2'][['id', 'function']]) print(len(samples[samples['cross'] == 'cross-29-2'])) print(samples[samples['function'] == 'parent']) # # Chromosome arm 3L # + h5_3L = h5py.File('ag1000g.crosses.phase1.ar3sites.3L.h5', 'r') samples_hdf5 = list(map(lambda sample: sample.decode('utf-8'), h5_3L['/3L/samples'])) calldata_genotype = h5_3L['/3L/calldata/genotype'] MQ0 = h5_3L['/3L/variants/MQ0'] MQ = h5_3L['/3L/variants/MQ'] QD = h5_3L['/3L/variants/QD'] Coverage = h5_3L['/3L/variants/Coverage'] CoverageMQ0 = h5_3L['/3L/variants/CoverageMQ0'] HaplotypeScore = h5_3L['/3L/variants/HaplotypeScore'] QUAL = h5_3L['/3L/variants/QUAL'] FS = h5_3L['/3L/variants/FS'] DP = h5_3L['/3L/variants/DP'] HRun = h5_3L['/3L/variants/HRun'] ReadPosRankSum = h5_3L['/3L/variants/ReadPosRankSum'] my_features = { 'MQ': MQ, 'QD': QD, 'Coverage': Coverage, 'HaplotypeScore': HaplotypeScore, 'QUAL': QUAL, 'FS': FS, 'DP': DP, 'HRun': HRun, 'ReadPosRankSum': ReadPosRankSum } num_features = len(my_features) num_alleles = h5_3L['/3L/variants/num_alleles'] is_snp = h5_3L['/3L/variants/is_snp'] POS = h5_3L['/3L/variants/POS'] # - #compute mendelian errors (biallelic) def compute_mendelian_errors(mother, father, offspring): num_errors = 0 num_ofs_problems = 0 if len(mother.union(father)) == 1: # Mother and father are homo and the same for ofs in offspring: if len(ofs) == 2: # Offspring is het num_errors += 1 num_ofs_problems += 1 elif len(ofs.intersection(mother)) == 0: # Offspring is homo, but opposite from parents num_errors += 2 num_ofs_problems += 1 elif len(mother) == 1 and len(father) == 1: # Mother and father are homo and different for ofs in offspring: if len(ofs) == 1: # Homo, should be het num_errors += 1 num_ofs_problems += 1 elif len(mother) == 2 and len(father) == 2: # Both are het, individual offspring can be anything pass else: # One is het, the other is homo homo = mother if len(mother) == 1 else father for ofs in offspring: if len(ofs) == 1 and not ofs.intersection(homo): # homo, but not including the allele from parent that is homo num_errors += 1 num_ofs_problems += 1 return num_errors, num_ofs_problems # + def acceptable_position_to_genotype(): for i, genotype in enumerate(calldata_genotype): if is_snp[i] and num_alleles[i] == 2: if len(np.where(genotype == -1)[0]) > 1: # Missing data continue yield i def acumulate(fun): acumulator = {} for res in fun(): if res is not None: acumulator[res[0]] = res[1] return acumulator # + def get_family_indexes(samples_hdf5, cross_pd): offspring = [] for i, individual in cross_pd.T.iteritems(): index = samples_hdf5.index(individual.id) if individual.function == 'parent': if individual.sex == 'M': father = index else: mother = index else: offspring.append(index) return {'mother': mother, 'father': father, 'offspring': offspring} cross_pd = samples[samples['cross'] == 'cross-29-2'] 
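# The asserts below are an addition to the original recipe: a minimal sanity
# check of compute_mendelian_errors above, using toy genotypes (each
# individual is a set of integer-coded alleles). The expectations follow
# directly from Mendelian inheritance: two identical homozygous parents
# cannot produce a heterozygote (or the opposite homozygote), and two
# different homozygous parents can only produce heterozygotes.
assert compute_mendelian_errors({0}, {0}, [{0, 1}]) == (1, 1)  # AA x AA -> AT
assert compute_mendelian_errors({0}, {0}, [{1}]) == (2, 1)     # AA x AA -> TT (both alleles wrong)
assert compute_mendelian_errors({0}, {1}, [{0}]) == (1, 1)     # AA x TT -> AA
assert compute_mendelian_errors({0, 1}, {0, 1}, [{0}]) == (0, 0)  # het x het: anything is possible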
family_indexes = get_family_indexes(samples_hdf5, cross_pd) # + mother_index = family_indexes['mother'] father_index = family_indexes['father'] offspring_indexes = family_indexes['offspring'] all_errors = {} def get_mendelian_errors(): for i in acceptable_position_to_genotype(): genotype = calldata_genotype[i] mother = set(genotype[mother_index]) father = set(genotype[father_index]) offspring = [set(genotype[ofs_index]) for ofs_index in offspring_indexes] my_mendelian_errors = compute_mendelian_errors(mother, father, offspring) yield POS[i], my_mendelian_errors mendelian_errors = acumulate(get_mendelian_errors) pickle.dump(mendelian_errors, gzip.open('mendelian_errors.pickle.gz', 'wb')) # + ordered_positions = sorted(mendelian_errors.keys()) ordered_features = sorted(my_features.keys()) #XXX on code? num_features = len(ordered_features) feature_fit = np.empty((len(ordered_positions), len(my_features) + 2), dtype=float) for column, feature in enumerate(ordered_features): # 'Strange' order print(feature) current_hdf_row = 0 for row, genomic_position in enumerate(ordered_positions): while POS[current_hdf_row] < genomic_position: current_hdf_row += 1 feature_fit[row, column] = my_features[feature][current_hdf_row] for row, genomic_position in enumerate(ordered_positions): feature_fit[row, num_features] = genomic_position feature_fit[row, num_features + 1] = 1 if mendelian_errors[genomic_position][0] > 0 else 0 np.save(gzip.open('feature_fit.npy.gz', 'wb'), feature_fit, allow_pickle=False, fix_imports=False) pickle.dump(ordered_features, open('ordered_features', 'wb')) # - # # Chromosome arm 2L h5_2L = h5py.File('ag1000g.crosses.phase1.ar3sites.2L.h5', 'r') samples_hdf5 = list(map(lambda sample: sample.decode('utf-8'), h5_2L['/2L/samples'])) calldata_DP = h5_2L['/2L/calldata/DP'] POS = h5_2L['/2L/variants/POS'] # + def get_parent_indexes(samples_hdf5, parents_pd): parents = [] for i, individual in parents_pd.T.iteritems(): index = samples_hdf5.index(individual.id) parents.append(index) return parents parents_pd = samples[samples['function'] == 'parent'] parent_indexes = get_parent_indexes(samples_hdf5, parents_pd) # - all_dps = [] for i, pos in enumerate(POS): if random.random() > 0.01: continue pos_dp = calldata_DP[i] parent_pos_dp = [pos_dp[parent_index] for parent_index in parent_indexes] all_dps.append(parent_pos_dp + [pos]) all_dps = np.array(all_dps) np.save(gzip.open('DP_2L.npy.gz', 'wb'), all_dps, allow_pickle=False, fix_imports=False) ================================================ FILE: Chapter04/QIIME2_Metagenomics.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # # Important: Read this! # # This recipe does not work with the standard conda environment. # # If you are in the standard environment, do this: # # 1. Stop Jupyter # 2. Activate the QIIME2 conda environment # 3. Do `jupyter serverextension enable --py qiime2 --sys-prefix` # 4. Start Jupyter inside the QIIME2 environment (see the command sketch below) # # Note that other recipes will not work inside this environment. # # Check this out! # # This is based on the [QIIME2 Fecal Microbiota Transplant example](https://docs.qiime2.org/2018.8/tutorials/fmt/) (for the command line). You are strongly advised to read it before proceeding.
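# As a concrete version of the checklist above, the terminal session would
# look roughly like this (a sketch only; `qiime2-env` is a placeholder for
# whatever your QIIME2 conda environment is actually called):
#
#     conda activate qiime2-env
#     jupyter serverextension enable --py qiime2 --sys-prefix
#     jupyter notebook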
# # There is an [amazing example](http://nbviewer.jupyter.org/gist/tkosciol/29de5198a4be81559a075756c2490fde) of using the Artifact API with the "Moving Pictures" tutorial of QIIME 2, produced by Tomasz Kościółek. I use a more convoluted approach than Tomasz's in order to go a little deeper into the Python internals. That is more of a learning experience on the internals than a practical recommendation. **My recommendation is to use Tomasz's dialect, not mine**. # # # Getting the data # !wget https://data.qiime2.org/2018.8/tutorials/fmt/sample_metadata.tsv # !wget https://data.qiime2.org/2018.8/tutorials/fmt/fmt-tutorial-demux-1-10p.qza # !wget https://data.qiime2.org/2018.8/tutorials/fmt/fmt-tutorial-demux-2-10p.qza # # The recipe # + import pandas as pd from qiime2.metadata.metadata import Metadata from qiime2.metadata.metadata import CategoricalMetadataColumn from qiime2.sdk import Artifact from qiime2.sdk import PluginManager from qiime2.sdk import Result # - pm = PluginManager() demux_plugin = pm.plugins['demux'] #demux_emp_single = demux_plugin.actions['emp_single'] demux_summarize = demux_plugin.actions['summarize'] pm.plugins print(demux_summarize.description) demux_summarize_signature = demux_summarize.signature print(demux_summarize_signature.inputs) print(demux_summarize_signature.parameters) print(demux_summarize_signature.outputs) # + seqs1 = Result.load('fmt-tutorial-demux-1-10p.qza') sum_data1 = demux_summarize(seqs1) sum_data1.visualization # + seqs2 = Result.load('fmt-tutorial-demux-2-10p.qza') sum_data2 = demux_summarize(seqs2) print(dir(sum_data2)) print(type(sum_data2.visualization)) print(dir(sum_data2.visualization)) sum_data2.visualization # - #Quality control dada2_plugin = pm.plugins['dada2'] dada2_denoise_single = dada2_plugin.actions['denoise_single'] qual_control1 = dada2_denoise_single(demultiplexed_seqs=seqs1, trunc_len=150, trim_left=13) qual_control2 = dada2_denoise_single(demultiplexed_seqs=seqs2, trunc_len=150, trim_left=13) metadata_plugin = pm.plugins['metadata'] metadata_tabulate = metadata_plugin.actions['tabulate'] stats_meta1 = metadata_tabulate(input=qual_control1.denoising_stats.view(Metadata)) stats_meta1.visualization stats_meta2 = metadata_tabulate(input=qual_control2.denoising_stats.view(Metadata)) stats_meta2.visualization # + ft_plugin = pm.plugins['feature-table'] ft_merge = ft_plugin.actions['merge'] ft_merge_seqs = ft_plugin.actions['merge_seqs'] ft_summarize = ft_plugin.actions['summarize'] ft_tab_seqs = ft_plugin.actions['tabulate_seqs'] table_merge = ft_merge(tables=[qual_control1.table, qual_control2.table]) seqs_merge = ft_merge_seqs(data=[qual_control1.representative_sequences, qual_control2.representative_sequences]) # - ft_sum = ft_summarize(table=table_merge.merged_table) ft_sum.visualization tab_seqs = ft_tab_seqs(data=seqs_merge.merged_data) tab_seqs.visualization ================================================ FILE: Chapter04/samples.tsv ================================================ id cross sex function AD0231-C cross-29-2 F parent AD0232-C cross-29-2 M parent AD0234-C cross-29-2 F progeny AD0235-C cross-29-2 F progeny AD0236-C cross-29-2 F progeny AD0237-C cross-29-2 F progeny AD0238-C cross-29-2 F progeny AD0239-C cross-29-2 F progeny AD0240-C cross-29-2 M progeny AD0241-C cross-29-2 F progeny AD0242-C cross-29-2 M progeny AD0243-C cross-29-2 F progeny AD0244-C cross-29-2 F progeny AD0245-C cross-29-2 F progeny AD0246-C cross-29-2 F progeny AD0247-C cross-29-2 M progeny AD0248-C cross-29-2 F 
progeny AD0249-C cross-29-2 F progeny AD0250-C cross-29-2 F progeny AD0251-C cross-29-2 F progeny AD0252-C cross-29-2 F progeny AD0253-C cross-29-2 M progeny AD0254-C cross-36-9 F parent AD0255-C cross-36-9 M parent AD0259-C cross-36-9 M progeny AD0260-C cross-36-9 F progeny AD0261-C cross-36-9 F progeny AD0262-C cross-36-9 M progeny AD0263-C cross-36-9 M progeny AD0265-C cross-36-9 F progeny AD0266-C cross-36-9 M progeny AD0267-C cross-36-9 F progeny AD0268-C cross-36-9 M progeny AD0269-C cross-36-9 F progeny AD0270-C cross-36-9 M progeny AD0271-C cross-36-9 M progeny AD0272-C cross-36-9 F progeny AD0273-C cross-36-9 M progeny AD0274-C cross-36-9 F progeny AD0275-C cross-36-9 M progeny AD0276-C cross-36-9 F progeny AD0305-C cross-42-4 F parent AD0306-C cross-42-4 M parent AD0309-C cross-42-4 M progeny AD0310-C cross-42-4 M progeny AD0311-C cross-42-4 M progeny AD0312-C cross-42-4 M progeny AD0313-C cross-42-4 M progeny AD0314-C cross-42-4 M progeny AD0315-C cross-42-4 M progeny AD0316-C cross-42-4 F progeny AD0317-C cross-42-4 M progeny AD0318-C cross-42-4 M progeny AD0319-C cross-42-4 F progeny AD0320-C cross-42-4 F progeny AD0322-C cross-42-4 F progeny AD0323-C cross-42-4 F progeny AD0347-C cross-46-9 F parent AD0348-C cross-46-9 M parent AD0351-C cross-46-9 M progeny AD0352-C cross-46-9 F progeny AD0353-C cross-46-9 F progeny AD0354-C cross-46-9 F progeny AD0355-C cross-46-9 F progeny AD0356-C cross-46-9 M progeny AD0357-C cross-46-9 F progeny AD0358-C cross-46-9 F progeny AD0359-C cross-46-9 M progeny AD0360-C cross-46-9 F progeny AD0361-C cross-46-9 F progeny AD0362-C cross-46-9 M progeny AD0363-C cross-46-9 F progeny AD0364-C cross-46-9 M progeny AD0365-C cross-46-9 M progeny AD0366-C cross-46-9 F progeny AD0367-C cross-46-9 F progeny AD0368-C cross-46-9 F progeny AD0369-C cross-46-9 F progeny AD0370-C cross-46-9 F progeny AD0438-C cross-36-9 F progeny ================================================ FILE: Chapter05/.gitignore ================================================ *.fasta ag.db *gz *png ================================================ FILE: Chapter05/Annotations.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + #pip install gffutils from collections import defaultdict import gffutils import sqlite3 # - # !rm -f ag.db # !wget https://vectorbase.org/common/downloads/release-55/AgambiaePEST/gff/data/VectorBase-55_AgambiaePEST.gff -O gambiae.gff # !gzip -9 gambiae.gff try: db = gffutils.create_db('gambiae.gff.gz', 'ag.db') except sqlite3.OperationalError: db = gffutils.FeatureDB('ag.db') print(list(db.featuretypes())) for feat_type in db.featuretypes(): print(feat_type, db.count_features_of_type(feat_type)) seqids = set() for e in db.all_features(): seqids.add(e.seqid) for seqid in seqids: print(seqid) num_mRNAs = defaultdict(int) num_exons = defaultdict(int) max_exons = 0 max_span = 0 for seqid in seqids: cnt = 0 for gene in db.region(seqid=seqid, featuretype='protein_coding_gene'): cnt += 1 span = abs(gene.start - gene.end) # strand if span > max_span: max_span = span max_span_gene = gene my_mRNAs = list(db.children(gene, featuretype='mRNA')) num_mRNAs[len(my_mRNAs)] += 1 if len(my_mRNAs) == 0: exon_check = [gene] else: exon_check = my_mRNAs for check in exon_check: my_exons = list(db.children(check, featuretype='exon')) 
num_exons[len(my_exons)] += 1 if len(my_exons) > max_exons: max_exons = len(my_exons) max_exons_gene = gene print(f'seqid {seqid}, number of genes {cnt}') print('Max number of exons: %s (%d)' % (max_exons_gene.id, max_exons)) print('Max span: %s (%d)' % (max_span_gene.id, max_span)) print(num_mRNAs) print(num_exons) ================================================ FILE: Chapter05/Gene_Ontology.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- #use pip install as conda install requires a lot of downgrades at this stage import pygraphviz as pgv from IPython.core.display import Image # ## The cell below comes from the Orthology notebook # + import requests ensembl_server = 'http://rest.ensembl.org' def do_request(server, service, *args, **kwargs): params = '' for a in args: if a is not None: params += '/' + a req = requests.get('%s/%s%s' % (server, service, params), params=kwargs, headers={'Content-Type': 'application/json'}) if not req.ok: req.raise_for_status() return req.json() # - lct_id = 'ENSG00000115850' refs = do_request(ensembl_server, 'xrefs/id', lct_id, external_db='GO', all_levels='1') print(len(refs)) print(refs[0].keys()) for ref in refs: go_id = ref['primary_id'] details = do_request(ensembl_server, 'ontology/id', go_id) print('%s %s %s' % (go_id, details['namespace'], ref['description'])) print('%s\n' % details['definition']) go_id = 'GO:0000016' my_data = do_request(ensembl_server, 'ontology/id', go_id) for k, v in my_data.items(): if k == 'parents': for parent in v: print(parent) parent_id = parent['accession'] else: print('%s: %s' % (k, str(v))) print() parent_data = do_request(ensembl_server, 'ontology/id', parent_id) print(parent_id, len(parent_data['children'])) refs = do_request(ensembl_server, 'ontology/ancestors/chart', go_id) for go, entry in refs.items(): print(go) term = entry['term'] print('%s %s' % (term['name'], term['definition'])) is_a = entry.get('is_a', []) print('\t is a: %s\n' % ', '.join([x['accession'] for x in is_a])) def get_upper(go_id): parents = {} node_data = {} refs = do_request(ensembl_server, 'ontology/ancestors/chart', go_id) for ref, entry in refs.items(): my_data = do_request(ensembl_server, 'ontology/id', ref) node_data[ref] = {'name': entry['term']['name'], 'children': my_data['children']} try: parents[ref] = [x['accession'] for x in entry['is_a']] except KeyError: pass # Top of hierarchy return parents, node_data parents, node_data = get_upper(go_id) g = pgv.AGraph(directed=True) for ofs, ofs_parents in parents.items(): ofs_text = '%s\n(%s)' % (node_data[ofs]['name'].replace(', ', '\n'), ofs) for parent in ofs_parents: parent_text = '%s\n(%s)' % (node_data[parent]['name'].replace(', ', '\n'), parent) children = node_data[parent]['children'] if len(children) < 3: for child in children: if child['accession'] in node_data: continue g.add_edge(parent_text, child['accession']) else: g.add_edge(parent_text, '...%d...' 
% (len(children) - 1)) g.add_edge(parent_text, ofs_text) print(g) g.graph_attr['label']='Ontology tree for Lactase activity' g.node_attr['shape']='rectangle' g.layout(prog='dot') g.draw('graph.png') Image("graph.png") print(go_id) refs = do_request(ensembl_server, 'ontology/descendants', go_id) for go in refs: print(go['accession'], go['name'], go['definition']) ================================================ FILE: Chapter05/Getting_Gene.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- import gffutils import gzip from Bio import Seq, SeqIO # ## Retrieving data # !rm -f ag.db # !wget https://vectorbase.org/common/downloads/release-55/AgambiaePEST/gff/data/VectorBase-55_AgambiaePEST.gff -O gambiae.gff # !gzip -9 gambiae.gff db = gffutils.FeatureDB('ag.db') # # Getting a gene gene_id = 'AGAP004707' gene = db[gene_id] print(gene) print(gene.seqid, gene.strand) recs = SeqIO.parse(gzip.open('gambiae.fa.gz', 'rt', encoding='utf-8'), 'fasta') for rec in recs: print(rec.description) if rec.id == gene.seqid: my_seq = rec.seq break # + def get_sequence(chrom_seq, CDSs, strand): seq = Seq.Seq('') for CDS in CDSs: # #FRAME??? my_cds = Seq.Seq(str(chrom_seq[CDS.start - 1: CDS.end])) seq += my_cds return seq if strand == '+' else seq.reverse_complement() # + mRNAs = db.children(gene, featuretype='mRNA') for mRNA in mRNAs: print(mRNA.id) if mRNA.id.endswith('RA'): break CDSs = db.children(mRNA, featuretype='CDS', order_by='start') gene_seq = get_sequence(my_seq, CDSs, gene.strand) print(len(gene_seq), gene_seq) prot = gene_seq.translate() print(len(prot), prot) # - # # Reverse strand reverse_transcript_id = 'AGAP004708-RA' # + reverse_CDSs = db.children(reverse_transcript_id, featuretype='CDS', order_by='start') reverse_seq = get_sequence(my_seq, reverse_CDSs, '-') print(len(reverse_seq), reverse_seq) reverse_prot = reverse_seq.translate() print(len(reverse_prot), reverse_prot) # - ================================================ FILE: Chapter05/Low_Quality.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import gzip import numpy as np import matplotlib.pyplot as plt from Bio import SeqIO, SeqUtils # - # !rm -f atroparvus.fa.gz gambiae.fa.gz 2>/dev/null # !wget https://vectorbase.org/common/downloads/Current_Release/AgambiaePEST/fasta/data/VectorBase-67_AgambiaePEST_Genome.fasta -O gambiae.fa # !gzip -9 gambiae.fa # !wget https://vectorbase.org/common/downloads/Current_Release/AatroparvusEBRO/fasta/data/VectorBase-67_AatroparvusEBRO_Genome.fasta -O atroparvus.fa # !gzip -9 atroparvus.fa gambiae_name = 'gambiae.fa.gz' atroparvus_name = 'atroparvus.fa.gz' recs = SeqIO.parse(gzip.open(gambiae_name, 'rt', encoding='utf-8'), 'fasta') for rec in recs: print(rec.description) #Do not do this with atroparvus recs = SeqIO.parse(gzip.open(gambiae_name, 'rt', encoding='utf-8'), 'fasta') chrom_Ns = {} chrom_sizes = {} for rec in recs: if rec.description.find('supercontig') > -1: continue print(rec.description, rec.id, rec) chrom = rec.id.split('_')[1] if chrom in ['UNKN']:#, 'Y_unplaced']: continue chrom_Ns[chrom] = [] on_N = False curr_size = 0 for 
pos, nuc in enumerate(rec.seq): if nuc in ['N', 'n']: curr_size += 1 on_N = True else: if on_N: chrom_Ns[chrom].append(curr_size) curr_size = 0 on_N = False if on_N: chrom_Ns[chrom].append(curr_size) chrom_sizes[chrom] = len(rec.seq) for chrom, Ns in chrom_Ns.items(): size = chrom_sizes[chrom] if len(Ns) > 0: max_Ns = max(Ns) else: max_Ns = 'NA' print(f'{chrom} ({size}): %Ns ({round(100 * sum(Ns) / size, 1)}), num Ns: {len(Ns)}, max N: {max_Ns}') # ## Atroparvus super-contigs recs = SeqIO.parse(gzip.open(atroparvus_name, 'rt', encoding='utf-8'), 'fasta') sizes = [] size_N = [] for rec in recs: size = len(rec.seq) sizes.append(size) count_N = 0 for nuc in rec.seq: if nuc in ['n', 'N']: count_N += 1 size_N.append((size, count_N / size)) print(len(sizes), np.median(sizes), np.mean(sizes), max(sizes), min(sizes), np.percentile(sizes, 10), np.percentile(sizes, 90)) small_split = 4800 large_split = 540000 fig, axs = plt.subplots(1, 3, figsize=(16, 9), dpi=300, squeeze=False, sharey=True) xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x <= small_split]) axs[0, 0].plot(xs, ys, '.') xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > small_split and x <= large_split]) axs[0, 1].plot(xs, ys, '.') axs[0, 1].set_xlim(small_split, large_split) xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > large_split]) axs[0, 2].plot(xs, ys, '.') axs[0, 0].set_ylabel('Fraction of Ns', fontsize=12) axs[0, 1].set_xlabel('Contig size', fontsize=12) fig.suptitle('Fraction of Ns per contig size', fontsize=26) fig.savefig('frac.png') ================================================ FILE: Chapter05/Orthology.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import requests ensembl_server = 'http://rest.ensembl.org' def do_request(server, service, *args, **kwargs): url_params = '' for a in args: if a is not None: url_params += '/' + a req = requests.get('%s/%s%s' % (server, service, url_params), params=kwargs, headers={'Content-Type': 'application/json'}) if not req.ok: req.raise_for_status() return req.json() # - answer = do_request(ensembl_server, 'info/species') for i, sp in enumerate(answer['species']): print(i, sp['name']) ext_dbs = do_request(ensembl_server, 'info/external_dbs', 'homo_sapiens', filter='HGNC%') print(ext_dbs) answer = do_request(ensembl_server, 'lookup/symbol', 'homo_sapiens', 'LCT') print(answer) lct_id = answer['id'] lct_seq = do_request(ensembl_server, 'sequence/id', lct_id) print(lct_seq) lct_xrefs = do_request(ensembl_server, 'xrefs/id', lct_id) for xref in lct_xrefs: print(xref['db_display_name']) print(xref) refs = do_request(ensembl_server, 'xrefs/id', lct_id, external_db='GO', all_levels='1') print(lct_id, refs) hom_response = do_request(ensembl_server, 'homology/id', lct_id, type='orthologues', sequence='none') #print(hom_response['data'][0]['homologies']) homologies = hom_response['data'][0]['homologies'] for homology in homologies: print(homology['target']['species']) if homology['target']['species'] != 'equus_caballus': continue print(homology) print(homology['taxonomy_level']) horse_id = homology['target']['id'] horse_req = do_request(ensembl_server, 'lookup/id', horse_id) print(horse_req) # + #maybe synteny of MCM6 and LCT with caballus and gorilla ================================================ FILE: Chapter05/Reference_Genome.py 
================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + from IPython.core.display import Image from reportlab.lib import colors from reportlab.lib.units import cm from Bio import SeqIO from Bio.Graphics import BasicChromosome # - # !rm -f PlasmoDB-9.3_Pfalciparum3D7_Genome.fasta 2>/dev/null # vvvv 13.0 # !wget http://plasmodb.org/common/downloads/release-13.0/Pfalciparum3D7/fasta/data/PlasmoDB-13.0_Pfalciparum3D7_Genome.fasta genome_name = 'PlasmoDB-13.0_Pfalciparum3D7_Genome.fasta' recs = SeqIO.parse(genome_name, 'fasta') chroms = {} for rec in recs: print(rec.description) # + from Bio import SeqUtils chrom_sizes = {} chrom_GC = {} recs = SeqIO.parse(genome_name, 'fasta') block_size = 50000 min_GC = 100.0 max_GC = 0.0 for rec in recs: if rec.description.find('SO=chromosome') == -1: continue chrom = int(rec.description.split('_')[1]) chrom_GC[chrom] = [] size = len(rec.seq) chrom_sizes[chrom] = size num_blocks = size // block_size + 1 for block in range(num_blocks): start = block_size * block if block == num_blocks - 1: end = size else: end = block_size + start + 1 block_seq = rec.seq[start:end] block_GC = SeqUtils.GC(block_seq) if block_GC < min_GC: min_GC = block_GC if block_GC > max_GC: max_GC = block_GC chrom_GC[chrom].append(block_GC) print(min_GC, max_GC) # + chroms = list(chrom_sizes.keys()) chroms.sort() biggest_chrom = max(chrom_sizes.values()) my_genome = BasicChromosome.Organism(output_format="png") my_genome.page_size = (29.7*cm, 21*cm) # check telomere_length = 10 bottom_GC = 17.5 top_GC = 22.0 for chrom in chroms: chrom_size = chrom_sizes[chrom] chrom_representation = BasicChromosome.Chromosome('Cr %d' % chrom) chrom_representation.scale_num = biggest_chrom tel = BasicChromosome.TelomereSegment() tel.scale = telomere_length chrom_representation.add(tel) num_blocks = len(chrom_GC[chrom]) for block, gc in enumerate(chrom_GC[chrom]): my_GC = chrom_GC[chrom][block] body = BasicChromosome.ChromosomeSegment() if my_GC > top_GC: body.fill_color = colors.Color(1, 0, 0) elif my_GC < bottom_GC: body.fill_color = colors.Color(1, 1, 0) else: my_color = (my_GC - bottom_GC) / (top_GC - bottom_GC) body.fill_color = colors.Color(my_color, my_color, 1) if block < num_blocks - 1: body.scale = block_size else: body.scale = chrom_size % block_size chrom_representation.add(body) tel = BasicChromosome.TelomereSegment(inverted=True) tel.scale = telomere_length chrom_representation.add(tel) my_genome.add(chrom_representation) my_genome.draw("falciparum.png", "Plasmodium falciparum") Image("falciparum.png") # - ================================================ FILE: Chapter06/.gitignore ================================================ *.log *.ped *.map *.bed *.bim *.fam exclude*.txt relationships_w_pops_041510.txt *.in *.out ================================================ FILE: Chapter06/Admixture.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.3 # kernelspec: # display_name: Python 3 # language: python # name: python3 # --- # + from collections import defaultdict import os import matplotlib.pyplot as plt from genomics.popgen.admix import cluster, plot # %matplotlib notebook # - k_range = range(2, 10) # 2..9 # ### The next cell is 
very slow. Example outputs are provided (so you can avoid running it) # + #for k in k_range: # os.system('admixture --cv=10 hapmap10_auto_noofs_ld.bed %d > admix.%d' % (k, k)) # - # ## Individual order f = open('hapmap10_auto_noofs_ld.fam') ind_order = [] for l in f: toks = l.rstrip().replace(' ', '\t').split('\t') fam_id = toks[0] ind_id = toks[1] ind_order.append((fam_id, ind_id)) f.close() # ## CV-plot CVs = [] for k in k_range: f = open('admix.%d' % k) for l in f: if l.find('CV error') > -1: CVs.append(float(l.rstrip().split(' ')[-1])) break f.close() fig = plt.figure(figsize=(16, 9)) ax = fig.add_subplot(111) ax.plot(k_range, CVs) ax.set_title('Cross-Validation error') ax.set_xlabel('K') # ## Load meta-data f = open('relationships_w_pops_121708.txt') pop_ind = defaultdict(list) f.readline() # header for l in f: toks = l.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] if (fam_id, ind_id) not in ind_order: continue mom = toks[2] dad = toks[3] if mom != '0' or dad != '0': continue pop = toks[-1] pop_ind[pop].append((fam_id, ind_id)) #ind_pop[('2469', 'NA20281')] = ind_pop[('2805', 'NA20281')] f.close() def load_Q(fname, ind_order): ind_comps = {} f = open(fname) for i, l in enumerate(f): comps = [float(x) for x in l.rstrip().split(' ')] ind_comps[ind_order[i]] = comps f.close() return ind_comps comps = {} for k in k_range: comps[k] = load_Q('hapmap10_auto_noofs_ld.%d.Q' % k, ind_order) ordering = {} for k in k_range: ordering[k] = cluster(comps[k], pop_ind) fig = plt.figure(figsize=(9, 9)) plot.single(comps[4], ordering[4], fig) None fig = plt.figure(figsize=(16, 9)) plot.stacked(comps, ordering[7], fig) # ## Q files? # ## Log-likelihood ================================================ FILE: Chapter06/Data_Formats.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # ## Data download # + # !wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz # !wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz # !wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/relationships_w_pops_041510.txt # - # !gzip -d hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz # !gzip -d hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz # # Preparation import os from collections import defaultdict # ## Loading HapMap meta-data f = open('relationships_w_pops_041510.txt') pop_ind = defaultdict(list) f.readline() # header offspring = [] for l in f: toks = l.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] mom = toks[2] dad = toks[3] if mom != '0' or dad != '0': offspring.append((fam_id, ind_id)) pop = toks[-1] pop_ind[pop].append((fam_id, ind_id)) f.close() # ## Sub-sampling os.system('plink2 --pedmap hapmap3_r3_b36_fwd.consensus.qc.poly --out hapmap10 --thin 0.1 --geno 0.1 --export ped') os.system('plink2 --pedmap hapmap3_r3_b36_fwd.consensus.qc.poly --out hapmap1 --thin 0.01 --geno 0.1 --export ped') # ## Getting only autosomal data def get_non_auto_SNPs(map_file, exclude_file): f = open(map_file) w = open(exclude_file, 'w') for l in f: toks = l.rstrip().split('\t') try: chrom = int(toks[0]) except ValueError: rs = toks[1] w.write('%s\n' % rs) w.close() get_non_auto_SNPs('hapmap10.map', 'exclude10.txt') get_non_auto_SNPs('hapmap1.map', 
'exclude1.txt') # !plink2 --pedmap hapmap10 --out hapmap10_auto --exclude exclude10.txt --export ped # !plink2 --pedmap hapmap1 --out hapmap1_auto --exclude exclude1.txt --export ped # ## Removing offspring # !plink2 --pedmap hapmap10_auto --filter-founders --out hapmap10_auto_noofs --export ped # ## LD-pruning # !plink2 --pedmap hapmap10_auto_noofs --indep-pairwise 50 10 0.1 --out keep --export ped # !plink2 --pedmap hapmap10_auto_noofs --extract keep.prune.in --out hapmap10_auto_noofs_ld --export ped # ## Different encoding # !plink2 --pedmap hapmap10_auto_noofs_ld --out hapmap10_auto_noofs_ld_12 --export ped 12 # !plink2 --make-bed --pedmap hapmap10_auto_noofs_ld --out hapmap10_auto_noofs_ld # ## Single chromosome # !plink2 --pedmap hapmap10_auto_noofs --chr 2 --out hapmap10_auto_noofs_2 --export ped ================================================ FILE: Chapter06/Exploratory_Analysis.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # ## Loading HapMap data # + import numpy as np import xarray as xr import sgkit as sg from sgkit.io import plink data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\t') # - data print(data.dims) variant_stats = sg.variant_stats(data) variant_stats variant_stats.variant_call_rate.to_series().describe() print(type(variant_stats.variant_call_rate.to_series())) sample_stats = sg.sample_stats(data) sample_stats sample_stats.sample_call_rate.to_series().hist() data['sample_cohort'] = xr.DataArray( np.zeros(data.dims['samples'], dtype=np.int64), dims='samples') # data["sample_cohort"] = xr.DataArray(np.repeat([0, 1], data.dims["samples"] // 2), dims="samples") sg.cohort_allele_frequencies(data)['cohort_allele_frequency'][:,:,0].values sg.cohort_allele_frequencies(data)['cohort_allele_frequency'][:,:,0].to_series().hist() # # MAF cohort_allele_frequency = sg.cohort_allele_frequencies(data)['cohort_allele_frequency'].values min_freqs = map( lambda x: x if x < 0.5 else 1 - x, filter( lambda x: x not in [0, 1], cohort_allele_frequency[:, 0, 0])) ================================================ FILE: Chapter06/PCA.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.3 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + jupyter={"outputs_hidden": false} import os from genomics.popgen.plink.convert import to_eigen from genomics.popgen.pca import plot, smart # %matplotlib inline # - # ## Meta-data load # + jupyter={"outputs_hidden": false} f = open('relationships_w_pops_121708.txt') ind_pop = {} f.readline() # header for l in f: toks = l.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] pop = toks[-1] ind_pop['/'.join([fam_id, ind_id])] = pop f.close() ind_pop['2469/NA20281'] = ind_pop['2805/NA20281'] # - # ## Requires plink from data preparation # + jupyter={"outputs_hidden": false} to_eigen('hapmap10_auto_noofs_ld_12', 'hapmap10_auto_noofs_ld_12') # - # ## Running smartpca # + jupyter={"outputs_hidden": false} ctrl = smart.SmartPCAController('hapmap10_auto_noofs_ld_12') ctrl.run() # + jupyter={"outputs_hidden": false} wei, wei_perc, ind_comp = smart.parse_evec('hapmap10_auto_noofs_ld_12.evec', 'hapmap10_auto_noofs_ld_12.eval') # + 
jupyter={"outputs_hidden": false} plot.render_pca(ind_comp, 1, 2, cluster=ind_pop) #put weights # + jupyter={"outputs_hidden": false} plot.render_pca_eight(ind_comp, cluster=ind_pop) # + jupyter={"outputs_hidden": false} markers = { 'CHB': '*', 'CHD': '*', 'JPT': '*', 'GIH': '*', 'CEU': 'v', 'TSI': 'v', 'MEX': 'v', 'ASW': 'o', 'LWK': 'o', 'YRI': 'o', 'MKK': 'o' } # - # ## With scikit-learn # + jupyter={"outputs_hidden": false} from sklearn.decomposition import PCA import numpy as np # + jupyter={"outputs_hidden": false} f = open('hapmap10_auto_noofs_ld_12.ped') ninds = 0 ind_order = [] for line in f: ninds += 1 toks = line[:100].replace(' ', '\t').split('\t') # for speed fam_id = toks[0] ind_id = toks[1] ind_order.append('%s/%s' % (fam_id, ind_id)) nsnps = (len(line.replace(' ', '\t').split('\t')) - 6) // 2 print(nsnps) f.close() # + jupyter={"outputs_hidden": false} pca_array = np.empty((ninds, nsnps), dtype=int) print(pca_array.shape) f = open('hapmap10_auto_noofs_ld_12.ped') for ind, line in enumerate(f): snps = line.replace(' ', '\t').split('\t')[6:] for pos in range(len(snps) // 2): a1 = int(snps[2 * pos]) a2 = int(snps[2 * pos + 1]) my_code = a1 + a2 - 2 pca_array[ind, pos] = my_code f.close() #slow # + jupyter={"outputs_hidden": false} my_pca = PCA(n_components=8) my_pca.fit(pca_array) trans = my_pca.transform(pca_array) #Memory required # + jupyter={"outputs_hidden": false} sc_ind_comp = {} for i, ind_pca in enumerate(trans): sc_ind_comp[ind_order[i]] = ind_pca plot.render_pca_eight(sc_ind_comp, cluster=ind_pop) # + jupyter={"outputs_hidden": false} ================================================ FILE: Chapter06/Pop_Stats.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # ## Loading HapMap meta-data # + from collections import defaultdict from pprint import pprint import numpy as np import matplotlib.pyplot as plt import seaborn as sns import pandas as pd import xarray as xr import sgkit as sg from sgkit.io import plink data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\t') # - data f = open('relationships_w_pops_041510.txt') pop_ind = defaultdict(list) f.readline() # header for line in f: toks = line.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] pop = toks[-1] pop_ind[pop].append((fam_id, ind_id)) pops = list(pop_ind.keys()) def assign_cohort(pops, pop_ind, sample_family_id, sample_id): cohort = [] for fid, sid in zip(sample_family_id, sample_id): processed = False for i, pop in enumerate(pops): if (fid, sid) in pop_ind[pop]: processed = True cohort.append(i) break if not processed: raise Exception(f'Not processed {fid}, {sid}') return cohort cohort = assign_cohort(pops, pop_ind, data.sample_family_id.values, data.sample_id.values) data['sample_cohort'] = xr.DataArray( cohort, dims='samples') # # monomorphic positions per pop cohort_allele_frequency = sg.cohort_allele_frequencies(data)['cohort_allele_frequency'].values monom = {} for i, pop in enumerate(pops): monom[pop] = len(list(filter(lambda x: x, np.isin(cohort_allele_frequency[:, i, 0], [0, 1])))) pprint(monom) # # MAF mafs = {} for i, pop in enumerate(pops): min_freqs = map( lambda x: x if x < 0.5 else 1 - x, filter( lambda x: x not in [0, 1], cohort_allele_frequency[:, i, 0])) mafs[pop] = pd.Series(min_freqs) maf_plot, maf_ax = plt.subplots(nrows=2, sharey=True) 
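# Aside (not part of the original recipe): the map/filter pipeline above can
# also be written with NumPy; demo_freqs is a made-up array of per-variant
# allele frequencies used only for illustration.
demo_freqs = np.array([0.0, 0.1, 0.8, 1.0])
demo_mafs = np.minimum(demo_freqs, 1 - demo_freqs)  # fold frequencies around 0.5
demo_mafs = demo_mafs[(demo_freqs != 0) & (demo_freqs != 1)]  # drop monomorphic sites
print(demo_mafs)  # [0.1 0.2]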
mafs['YRI'].hist(ax=maf_ax[0], bins=50) maf_ax[0].set_title('*YRI*') mafs['JPT'].hist(ax=maf_ax[1], bins=50) maf_ax[1].set_title('*JPT*') maf_ax[1].set_xlabel('MAF') # # Fst fst = sg.Fst(data) fst = fst.assign_coords({"cohorts_0": pops, "cohorts_1": pops}) remove_nan = lambda data: filter(lambda x: not np.isnan(x), data) ceu_chb = pd.Series(remove_nan(fst.stat_Fst.sel(cohorts_0='CEU', cohorts_1='CHB').values)) chb_chd = pd.Series(remove_nan(fst.stat_Fst.sel(cohorts_0='CHB', cohorts_1='CHD').values)) ceu_chb.describe() chb_chd.describe() mean_fst = {} for i, pop_i in enumerate(pops): for j, pop_j in enumerate(pops): if j <= i: continue pair_fst = pd.Series(remove_nan(fst.stat_Fst.sel(cohorts_0=pop_i, cohorts_1=pop_j).values)) mean = pair_fst.mean() mean_fst[(pop_i, pop_j)] = mean min_pair = min(mean_fst.values()) max_pair = max(mean_fst.values()) sns.set_style("white") num_pops = len(pops) arr = np.ones((num_pops - 1, num_pops - 1, 3), dtype=float) fig = plt.figure(figsize=(16, 9)) ax = fig.add_subplot(111) for row in range(num_pops - 1): pop_i = pops[row] for col in range(row + 1, num_pops): pop_j = pops[col] val = mean_fst[(pop_i, pop_j)] norm_val = (val - min_pair) / (max_pair - min_pair) ax.text(col - 1, row, '%.3f' % val, ha='center') if norm_val == 0.0: arr[row, col - 1, 0] = 1 arr[row, col - 1, 1] = 1 arr[row, col - 1, 2] = 0 elif norm_val == 1.0: arr[row, col - 1, 0] = 1 arr[row, col - 1, 1] = 0 arr[row, col - 1, 2] = 1 else: arr[row, col - 1, 0] = 1 - norm_val arr[row, col - 1, 1] = 1 arr[row, col - 1, 2] = 1 ax.imshow(arr, interpolation='none') ax.set_title('Multilocus Pairwise FST') ax.set_xticks(range(num_pops - 1)) ax.set_xticklabels(pops[1:]) ax.set_yticks(range(num_pops - 1)) ax.set_yticklabels(pops[:-1]) ================================================ FILE: Chapter06/Sgkit.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- import os from collections import defaultdict # ## Loading HapMap data # + import numpy as np from sgkit.io import plink data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\t') # - data print(data.dims) print(len(data.sample_id.values)) print(data.sample_id.values) print(data.sample_family_id.values) print(data.sample_sex.values) print(data.contigs) print(len(data.variant_contig.values)) print(data.variant_contig.values) print(data.variant_position.values) print(data.variant_allele.values) print(data.variant_id.values) data.call_genotype call_genotype = data.call_genotype.values print(call_genotype.shape) first_individual = call_genotype[:,0,:] first_variant = call_genotype[0,:,:] first_variant_of_first_individual = call_genotype[0,0,:] print(first_variant_of_first_individual) print(data.sample_family_id.values[0], data.sample_id.values[0]) print(data.variant_allele.values[0]) ================================================ FILE: Chapter07/.gitignore ================================================ *fasta trim.fasta.reduced *nex bp_rx ================================================ FILE: Chapter07/Alignment.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import os import 
dendropy # - # ## Genome alignment from Bio.Align.Applications import MafftCommandline mafft_cline = MafftCommandline(input='sample.fasta', ep=0.123, reorder=True, maxiterate=1000, localpair=True) print(mafft_cline) stdout, stderr = mafft_cline() with open('align.fasta', 'w') as w: w.write(stdout) os.system('trimal -automated1 -in align.fasta -out trim.fasta -fasta') # ## Protein alignment # + from Bio.Align.Applications import MuscleCommandline my_genes = ['NP', 'L', 'VP35', 'VP40'] for gene in my_genes: muscle_cline = MuscleCommandline(input='%s_P.fasta' % gene) print(muscle_cline) stdout, stderr = muscle_cline() with open('%s_P_align.fasta' % gene, 'w') as w: w.write(stdout) # + from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord # XXX vvv # from Bio.Alphabet import generic_protein for gene in my_genes: gene_seqs = {} unal_gene = SeqIO.parse('%s.fasta' % gene, 'fasta') for rec in unal_gene: gene_seqs[rec.id] = rec.seq al_prot = SeqIO.parse('%s_P_align.fasta' % gene, 'fasta') al_genes = [] for protein in al_prot: my_id = protein.id seq = '' pos = 0 for c in protein.seq: if c == '-': seq += '---' else: seq += str(gene_seqs[my_id][pos:pos + 3]) pos += 3 al_genes.append(SeqRecord(Seq(seq), id=my_id)) SeqIO.write(al_genes, '%s_align.fasta' % gene, 'fasta') # - ================================================ FILE: Chapter07/Comparison.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.6 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import os from collections import OrderedDict import numpy as np import pandas as pd import dendropy from dendropy.calculate import popgenstat # - # ## Genes # + genes_species = OrderedDict() my_species = ['RESTV', 'SUDV'] my_genes = ['NP', 'L', 'VP35', 'VP40'] for name in my_genes: gene_name = name.split('.')[0] char_mat = dendropy.DnaCharacterMatrix.get_from_path('%s_align.fasta' % name, 'fasta') genes_species[gene_name] = {} for species in my_species: genes_species[gene_name][species] = dendropy.DnaCharacterMatrix() for taxon, char_map in char_mat.items(): species = taxon.label.split('_')[0] if species in my_species: genes_species[gene_name][species].taxon_namespace.add_taxon(taxon) genes_species[gene_name][species][taxon] = char_map # - summary = np.ndarray(shape=(len(genes_species), 4 * len(my_species))) stats = ['seg_sites', 'nuc_div', 'taj_d', 'wat_theta'] for row, (gene, species_data) in enumerate(genes_species.items()): for col_base, species in enumerate(my_species): summary[row, col_base * 4] = popgenstat.num_segregating_sites(species_data[species]) summary[row, col_base * 4 + 1] = popgenstat.nucleotide_diversity(species_data[species]) summary[row, col_base * 4 + 2] = popgenstat.tajimas_d(species_data[species]) summary[row, col_base * 4 + 3] = popgenstat.wattersons_theta(species_data[species]) columns = [] for species in my_species: columns.extend(['%s (%s)' % (stat, species) for stat in stats]) df = pd.DataFrame(summary, index=genes_species.keys(), columns=columns) df # vs print(df) # ## Genomes def do_basic_popgen(seqs): num_seg_sites = popgenstat.num_segregating_sites(seqs) avg_pair = popgenstat.average_number_of_pairwise_differences(seqs) nuc_div = popgenstat.nucleotide_diversity(seqs) print('Segregating sites: %d, Avg pairwise diffs: %.2f, Nucleotide diversity %.6f' % (num_seg_sites, avg_pair, nuc_div)) print("Watterson's theta: %s" 
% popgenstat.wattersons_theta(seqs)) print("Tajima's D: %s" % popgenstat.tajimas_d(seqs)) #XXX change ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path( 'trim.fasta', schema='fasta', data_type='dna') sl_2014 = [] drc_2007 = [] ebov2007_set = dendropy.DnaCharacterMatrix() ebov2014_set = dendropy.DnaCharacterMatrix() for taxon, char_map in ebov_seqs.items(): print(taxon.label) if taxon.label.startswith('EBOV_2014') and len(sl_2014) < 8: sl_2014.append(char_map) ebov2014_set.taxon_namespace.add_taxon(taxon) ebov2014_set[taxon] = char_map elif taxon.label.startswith('EBOV_2007'): drc_2007.append(char_map) ebov2007_set.taxon_namespace.add_taxon(taxon) ebov2007_set[taxon] = char_map #ebov2007_set.extend_map({taxon: char_map}) del ebov_seqs # + print('2007 outbreak:') print('Number of individuals: %s' % len(ebov2007_set.taxon_namespace)) do_basic_popgen(ebov2007_set) print('\n2014 outbreak:') print('Number of individuals: %s' % len(ebov2014_set.taxon_namespace)) do_basic_popgen(ebov2014_set) # - print(len(sl_2014)) print(len(drc_2007)) pair_stats = popgenstat.PopulationPairSummaryStatistics(sl_2014, drc_2007) print('Average number of pairwise differences irrespective of population: %.2f' % pair_stats.average_number_of_pairwise_differences) print('Average number of pairwise differences between populations: %.2f' % pair_stats.average_number_of_pairwise_differences_between) print('Average number of pairwise differences within populations: %.2f' % pair_stats.average_number_of_pairwise_differences_within) print('Average number of net pairwise differences : %.2f' % pair_stats.average_number_of_pairwise_differences_net) print('Number of segregating sites: %d' % pair_stats.num_segregating_sites) print("Watterson's theta: %.2f" % pair_stats.wattersons_theta) print("Wakeley's Psi: %.3f" % pair_stats.wakeleys_psi) print("Tajima's D: %.2f" % pair_stats.tajimas_d) ================================================ FILE: Chapter07/Exploration.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.6 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- import dendropy from dendropy.interop import genbank # ## Getting the data # + def get_ebov_2014_sources(): #EBOV_2014 #yield 'EBOV_2014', genbank.GenBankDna(id_range=(233036, 233118), prefix='KM') yield 'EBOV_2014', genbank.GenBankDna(id_range=(34549, 34563), prefix='KM0') def get_other_ebov_sources(): #EBOV other yield 'EBOV_1976', genbank.GenBankDna(ids=['AF272001', 'KC242801']) yield 'EBOV_1995', genbank.GenBankDna(ids=['KC242796', 'KC242799']) yield 'EBOV_2007', genbank.GenBankDna(id_range=(84, 90), prefix='KC2427') def get_other_ebolavirus_sources(): #BDBV yield 'BDBV', genbank.GenBankDna(id_range=(3, 6), prefix='KC54539') yield 'BDBV', genbank.GenBankDna(ids=['FJ217161']) #RESTV yield 'RESTV', genbank.GenBankDna(ids=['AB050936', 'JX477165', 'JX477166', 'FJ621583', 'FJ621584', 'FJ621585']) #SUDV yield 'SUDV', genbank.GenBankDna(ids=['KC242783', 'AY729654', 'EU338380', 'JN638998', 'FJ968794', 'KC589025', 'JN638998']) #yield 'SUDV', genbank.GenBankDna(id_range=(89, 92), prefix='KC5453') #TAFV yield 'TAFV', genbank.GenBankDna(ids=['FJ217162']) # + other = open('other.fasta', 'w') sampled = open('sample.fasta', 'w') for species, recs in get_other_ebolavirus_sources(): tn = dendropy.TaxonNamespace() char_mat = recs.generate_char_matrix(taxon_namespace=tn, gb_to_taxon_fn=lambda gb: 
tn.require_taxon(label='%s_%s' % (species, gb.accession))) char_mat.write_to_stream(other, 'fasta') char_mat.write_to_stream(sampled, 'fasta') other.close() ebov_2014 = open('ebov_2014.fasta', 'w') ebov = open('ebov.fasta', 'w') for species, recs in get_ebov_2014_sources(): tn = dendropy.TaxonNamespace() char_mat = recs.generate_char_matrix(taxon_namespace=tn, gb_to_taxon_fn=lambda gb: tn.require_taxon(label='EBOV_2014_%s' % gb.accession)) char_mat.write_to_stream(ebov_2014, 'fasta') char_mat.write_to_stream(sampled, 'fasta') char_mat.write_to_stream(ebov, 'fasta') ebov_2014.close() ebov_2007 = open('ebov_2007.fasta', 'w') for species, recs in get_other_ebov_sources(): tn = dendropy.TaxonNamespace() char_mat = recs.generate_char_matrix(taxon_namespace=tn, gb_to_taxon_fn=lambda gb: tn.require_taxon(label='%s_%s' % (species, gb.accession))) char_mat.write_to_stream(ebov, 'fasta') char_mat.write_to_stream(sampled, 'fasta') if species == 'EBOV_2007': char_mat.write_to_stream(ebov_2007, 'fasta') ebov.close() ebov_2007.close() sampled.close() # - # ## Genes # + my_genes = ['NP', 'L', 'VP35', 'VP40'] def dump_genes(species, recs, g_hdls, p_hdls): for rec in recs: for feature in rec.feature_table: if feature.key == 'CDS': gene_name = None for qual in feature.qualifiers: if qual.name == 'gene': if qual.value in my_genes: gene_name = qual.value elif qual.name == 'translation': protein_translation = qual.value if gene_name is not None: locs = feature.location.split('.') start, end = int(locs[0]), int(locs[-1]) g_hdls[gene_name].write('>%s_%s\n' % (species, rec.accession)) p_hdls[gene_name].write('>%s_%s\n' % (species, rec.accession)) g_hdls[gene_name].write('%s\n' % rec.sequence_text[start - 1 : end]) p_hdls[gene_name].write('%s\n' % protein_translation) g_hdls = {} p_hdls = {} for gene in my_genes: g_hdls[gene] = open('%s.fasta' % gene, 'w') p_hdls[gene] = open('%s_P.fasta' % gene, 'w') for species, recs in get_other_ebolavirus_sources(): if species in ['RESTV', 'SUDV']: dump_genes(species, recs, g_hdls, p_hdls) for gene in my_genes: g_hdls[gene].close() p_hdls[gene].close() # - # ## Genome exploration def describe_seqs(seqs): print('Number of sequences: %d' % len(seqs.taxon_namespace)) print('First 10 taxon sets: %s' % ' '.join([taxon.label for taxon in seqs.taxon_namespace[:10]])) lens = [] for tax, seq in seqs.items(): lens.append(len([x for x in seq.symbols_as_list() if x != '-'])) print('Genome length: min %d, mean %.1f, max %d' % (min(lens), sum(lens) / len(lens), max(lens))) ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path('ebov.fasta', schema='fasta', data_type='dna') print('EBOV') describe_seqs(ebov_seqs) del ebov_seqs print('ebolavirus sequences') ebolav_seqs = dendropy.DnaCharacterMatrix.get_from_path('other.fasta', schema='fasta', data_type='dna') describe_seqs(ebolav_seqs) from collections import defaultdict species = defaultdict(int) for taxon in ebolav_seqs.taxon_namespace: toks = taxon.label.split('_') my_species = toks[0] if my_species == 'EBOV': ident = '%s (%s)' % (my_species, toks[1]) else: ident = my_species species[ident] += 1 for my_species, cnt in species.items(): print("%20s: %d" % (my_species, cnt)) del ebolav_seqs # ## Genes # + import os gene_length = {} my_genes = ['NP', 'L', 'VP35', 'VP40'] for name in my_genes: gene_name = name.split('.')[0] seqs = dendropy.DnaCharacterMatrix.get_from_path('%s.fasta' % name, schema='fasta', data_type='dna') gene_length[gene_name] = [] for tax, seq in seqs.items(): gene_length[gene_name].append(len([x for x in seq.symbols_as_list() 
# ## Genome exploration

def describe_seqs(seqs):
    print('Number of sequences: %d' % len(seqs.taxon_namespace))
    print('First 10 taxon sets: %s' % ' '.join([taxon.label for taxon in seqs.taxon_namespace[:10]]))
    lens = []
    for tax, seq in seqs.items():
        lens.append(len([x for x in seq.symbols_as_list() if x != '-']))
    print('Genome length: min %d, mean %.1f, max %d' % (min(lens), sum(lens) / len(lens), max(lens)))

ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path('ebov.fasta', schema='fasta', data_type='dna')
print('EBOV')
describe_seqs(ebov_seqs)
del ebov_seqs

print('ebolavirus sequences')
ebolav_seqs = dendropy.DnaCharacterMatrix.get_from_path('other.fasta', schema='fasta', data_type='dna')
describe_seqs(ebolav_seqs)

from collections import defaultdict
species = defaultdict(int)
for taxon in ebolav_seqs.taxon_namespace:
    toks = taxon.label.split('_')
    my_species = toks[0]
    if my_species == 'EBOV':
        ident = '%s (%s)' % (my_species, toks[1])
    else:
        ident = my_species
    species[ident] += 1
for my_species, cnt in species.items():
    print("%20s: %d" % (my_species, cnt))
del ebolav_seqs

# ## Genes

# +
import os

gene_length = {}
my_genes = ['NP', 'L', 'VP35', 'VP40']
for name in my_genes:
    gene_name = name.split('.')[0]
    seqs = dendropy.DnaCharacterMatrix.get_from_path('%s.fasta' % name, schema='fasta', data_type='dna')
    gene_length[gene_name] = []
    for tax, seq in seqs.items():
        gene_length[gene_name].append(len([x for x in seq.symbols_as_list() if x != '-']))
for gene, lens in gene_length.items():
    print('%6s: %.1f' % (gene, sum(lens) / len(lens)))  # mean length is a float
# -

================================================
FILE: Chapter07/Reconstruction.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.6
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# +
import os
import random
import shutil
import sys

import dendropy
from dendropy.interop import raxml
# -

ebola_data = dendropy.DnaCharacterMatrix.get_from_path('trim.fasta', 'fasta')
rx = raxml.RaxmlRunner()
ebola_tree = rx.estimate_tree(ebola_data, ['-m', 'GTRGAMMA', '-N', '10'])
print('RAxML temporary directory: %s' % rx.working_dir_path)
del ebola_data

ebola_tree.write_to_path('my_ebola.nex', 'nexus')

# +
import matplotlib.pyplot as plt
from Bio import Phylo

# # %matplotlib inline

my_ebola_tree = Phylo.read('my_ebola.nex', 'nexus')
my_ebola_tree.name = 'Our Ebolavirus tree'

fig = plt.figure(figsize=(16, 18))
ax = fig.add_subplot(1, 1, 1)
Phylo.draw(my_ebola_tree, axes=ax)
# -

# ## RAxML with Biopython

# XXX change
from Bio.Phylo.Applications import RaxmlCommandline
raxml_cline = RaxmlCommandline(sequences='trim.fasta', model='GTRGAMMA', name='biopython',
                               num_replicates='10',
                               parsimony_seed=random.randint(0, sys.maxsize),
                               working_dir=os.getcwd() + os.sep + 'bp_rx')
print(raxml_cline)
try:
    os.mkdir('bp_rx')
except OSError:
    shutil.rmtree('bp_rx')
    os.mkdir('bp_rx')
out, err = raxml_cline()

from Bio import Phylo
biopython_tree = Phylo.read('bp_rx/RAxML_bestTree.biopython', 'newick')
print(biopython_tree)
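# +
# Bio.Phylo.Applications (used above) is deprecated in recent Biopython releases.
# A minimal sketch of the same run driven by subprocess instead; the raxmlHPC
# binary name and its presence on the PATH are assumptions, not something this
# recipe installs:
import subprocess

raxml_args = [
    'raxmlHPC',
    '-m', 'GTRGAMMA',
    '-N', '10',
    '-p', str(random.randint(0, sys.maxsize)),
    '-s', 'trim.fasta',
    '-n', 'subprocess_run',   # hypothetical run name
    '-w', os.getcwd() + os.sep + 'bp_rx',
]
# subprocess.run(raxml_args, check=True)  # uncomment to actually execute
# -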
================================================
FILE: Chapter07/Selection.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# +
### XXX This is probably to remove
# -

import dendropy
from dendropy.calculate import popgenstat

# The code below assumed ebola_seqs was already in memory; loading it here from the
# FASTA written by the Exploration recipe (an assumption about the intended input)
ebola_seqs = dendropy.DnaCharacterMatrix.get_from_path('ebov.fasta', schema='fasta', data_type='dna')

sl_2014 = []
drc_2007 = []
for seq in ebola_seqs.taxon_namespace:
    if seq.label.startswith('EBOV_2014') and len(sl_2014) < 8:
        sl_2014.append(ebola_seqs[seq])
    elif seq.label.startswith('EBOV_2007'):
        drc_2007.append(ebola_seqs[seq])
print(len(sl_2014))
print(len(drc_2007))

pair_stats = popgenstat.PopulationPairSummaryStatistics(sl_2014, drc_2007)

print('Average number of pairwise differences (total): %s' % pair_stats.average_number_of_pairwise_differences)
print('Average number of pairwise differences between populations: %s' % pair_stats.average_number_of_pairwise_differences_between)
print('Average number of pairwise differences within populations: %s' % pair_stats.average_number_of_pairwise_differences_within)
print('Average number of net pairwise differences: %s' % pair_stats.average_number_of_pairwise_differences_net)
print('Number of segregating sites: %s' % pair_stats.num_segregating_sites)
print("Watterson's theta: %s" % pair_stats.wattersons_theta)
print("Wakeley's Psi: %s" % pair_stats.wakeleys_psi)
print("Tajima's D: %s" % pair_stats.tajimas_d)

================================================
FILE: Chapter07/Trees.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.6
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

import dendropy

ebola_raxml = dendropy.Tree.get_from_path('my_ebola.nex', 'nexus')

# +
def compute_level(node, level=0):
    for child in node.child_nodes():
        compute_level(child, level + 1)
    if node.taxon is not None:
        print("%s: %d %d" % (node.taxon, node.level(), level))

compute_level(ebola_raxml.seed_node)

# +
def compute_height(node):
    children = node.child_nodes()
    if len(children) == 0:
        height = 0
    else:
        height = 1 + max(map(lambda x: compute_height(x), children))
    desc = node.taxon or 'Internal'
    print("%s: %d %d" % (desc, height, node.level()))
    return height

compute_height(ebola_raxml.seed_node)

# +
def compute_nofs(node):
    children = node.child_nodes()
    nofs = len(children)
    for child in children:  # map is lazy in Python 3, so recurse explicitly
        compute_nofs(child)
    desc = node.taxon or 'Internal'
    print("%s: %d %d" % (desc, nofs, node.level()))

compute_nofs(ebola_raxml.seed_node)

# +
def print_nodes(node):
    for child in node.child_nodes():
        print_nodes(child)
    if node.taxon is not None:
        print('%s (%d)' % (node.taxon, node.level()))

print_nodes(ebola_raxml.seed_node)

# +
from collections import deque

def print_breadth(tree):
    queue = deque()
    queue.append(tree.seed_node)
    while len(queue) > 0:
        process_node = queue.popleft()
        if process_node.taxon is not None:
            print('%s (%d)' % (process_node.taxon, process_node.level()))
        else:
            for child in process_node.child_nodes():
                queue.append(child)

print_breadth(ebola_raxml)

# +
from copy import deepcopy

simple_ebola = deepcopy(ebola_raxml)

def simplify_tree(node):
    prefs = set()
    for leaf in node.leaf_nodes():
        my_toks = leaf.taxon.label.split(' ')[0].split('_')
        if my_toks[0] == 'EBOV':
            prefs.add('EBOV' + my_toks[1])
        else:
            prefs.add(my_toks[0])
    if len(prefs) == 1:
        print(prefs, len(node.leaf_nodes()))
        node.taxon = dendropy.Taxon(label=list(prefs)[0])
        #node.collapse_clade()
        node.set_child_nodes([])
    else:
        for child in node.child_nodes():
            simplify_tree(child)

simplify_tree(simple_ebola.seed_node)
simple_ebola.ladderize()
simple_ebola.write_to_path('ebola_simple.nex', 'nexus')
# -

================================================
FILE: Chapter07/Visualization.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.6
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

from copy import deepcopy
import matplotlib.pyplot as plt
from Bio import Phylo

ebola_tree = Phylo.read('my_ebola.nex', 'nexus')
ebola_tree.name = 'Ebolavirus tree'

ebola_simple_tree = Phylo.read('ebola_simple.nex', 'nexus')
ebola_simple_tree.name = 'Ebolavirus simplified tree'

Phylo.draw_ascii(ebola_simple_tree)

Phylo.draw_ascii(ebola_tree)

fig = plt.figure(figsize=(16, 22))
ax = fig.add_subplot(111)
Phylo.draw(ebola_simple_tree, axes=ax,
           branch_labels=lambda c: c.branch_length if c.branch_length > 0.02 else None)

# +
fig = plt.figure(figsize=(16, 22))
ax = fig.add_subplot(111)

from collections import OrderedDict
my_colors = OrderedDict({
    'EBOV_2014': 'red',
    'EBOV': 'magenta',
    'BDBV': 'cyan',
    'SUDV': 'blue',
    'RESTV': 'green',
    'TAFV': 'yellow'
})

def get_color(name):
    for pref, color in my_colors.items():
        if name.find(pref) > -1:
            return color
    return 'grey'

def color_tree(node, fun_color=get_color):
    if node.is_terminal():
        node.color = fun_color(node.name)
    else:
        my_children = set()
        for child in node.clades:
            color_tree(child, fun_color)
            my_children.add(child.color.to_hex())
        if len(my_children) == 1:
            node.color = child.color
        else:
            node.color = 'grey'

ebola_color_tree = deepcopy(ebola_tree)
color_tree(ebola_color_tree.root) Phylo.draw(ebola_color_tree, axes=ax, label_func= lambda x: x.name.split(' ')[0][1:] if x.name is not None else None) # - ================================================ FILE: Chapter08/.gitignore ================================================ *ent *fasta ================================================ FILE: Chapter08/Distance.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import math import timeit from Bio import PDB # - repository = PDB.PDBList() parser = PDB.PDBParser() repository.retrieve_pdb_file('1TUP', file_format='pdb', pdir='.') # XXX p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent') zns = [] for atom in p53_1tup.get_atoms(): if atom.element == 'ZN': #print(atom, dir(atom), atom.mass, atom.element, atom.coord[0]) zns.append(atom) for zn in zns: print(zn, zn.coord) # + #Suggest a pymol viewing # - #Try this in numba? def get_closest_atoms(pdb_struct, ref_atom, distance): atoms = {} rx, ry, rz = ref_atom.coord for atom in pdb_struct.get_atoms(): if atom == ref_atom: continue x, y, z = atom.coord my_dist = math.sqrt((x - rx)**2 + (y - ry)**2 + (z - rz)**2) if my_dist < distance: atoms[atom] = my_dist return atoms for zn in zns: print() print(zn.coord) atoms = get_closest_atoms(p53_1tup, zn, 4) for atom, distance in atoms.items(): print(atom.element, distance, atom.coord) for distance in [1, 2, 4, 8, 16, 32, 64, 128]: my_atoms = [] for zn in zns: atoms = get_closest_atoms(p53_1tup, zn, distance) my_atoms.append(len(atoms)) print(distance, my_atoms) nexecs = 10 print(timeit.timeit('get_closest_atoms(p53_1tup, zns[0], 4.0)', 'from __main__ import get_closest_atoms, p53_1tup, zns', number=nexecs) / nexecs * 1000) def get_closest_alternative(pdb_struct, ref_atom, distance): atoms = {} rx, ry, rz = ref_atom.coord for atom in pdb_struct.get_atoms(): if atom == ref_atom: continue x, y, z = atom.coord if abs(x - rx) > distance or abs(y - ry) > distance or abs(z - rz) > distance: continue my_dist = math.sqrt((x - rx)**2 + (y - ry)**2 + (z - rz)**2) if my_dist < distance: atoms[atom] = my_dist return atoms print(timeit.timeit('get_closest_alternative(p53_1tup, zns[0], 4.0)', 'from __main__ import get_closest_alternative, p53_1tup, zns', number=nexecs) / nexecs * 1000) print('Standard') for distance in [1, 4, 16, 64, 128]: print(timeit.timeit('get_closest_atoms(p53_1tup, zns[0], distance)', 'from __main__ import get_closest_atoms, p53_1tup, zns, distance', number=nexecs) / nexecs * 1000) print('Optimized') for distance in [1, 4, 16, 64, 128]: print(timeit.timeit('get_closest_alternative(p53_1tup, zns[0], distance)', 'from __main__ import get_closest_alternative, p53_1tup, zns, distance', number=nexecs) / nexecs * 1000) # + #for interesting distances ================================================ FILE: Chapter08/Intro.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + from collections import defaultdict import requests from Bio import ExPASy, SwissProt # - #explain why not biopython server = 'https://rest.uniprot.org/uniprotkb/search' def do_request(server, **kwargs): params = '' 
req = requests.get(server, params=kwargs) if not req.ok: req.raise_for_status() return req req = do_request(server, # 1. Filtering human p53, reviewed entries query='gene:p53 AND reviewed:true AND organism_id:9606', format='tsv', # 2. Specifying output columns with REST API field names fields='accession,id,protein_name,gene_names,organism_name,length', size=50 ) print(req.text) #We might revisit this for KEGG # + #XXX - stringio import pandas as pd import io uniprot_list = pd.read_table(io.StringIO(req.text)) uniprot_list.rename(columns={'Organism ID': 'ID'}, inplace=True) print(uniprot_list) # - p53_human = uniprot_list[ (uniprot_list.Entry == 'P04637') & (uniprot_list['Entry Name'].str.contains('P53_HUMAN'))]['Entry'].iloc[0] handle = ExPASy.get_sprot_raw(p53_human) sp_rec = SwissProt.read(handle) print(sp_rec.entry_name, sp_rec.sequence_length, sp_rec.gene_name) print(sp_rec.description) print(sp_rec.organism, sp_rec.seqinfo) print(sp_rec.sequence) print(sp_rec.comments) print(sp_rec.keywords) help(sp_rec) done_features = set() print('Total features:', len(sp_rec.features)) for feature in sp_rec.features: if feature in done_features: continue else: done_features.add(feature) print(feature) print('Cross references: ',len(sp_rec.cross_references)) per_source = defaultdict(list) for xref in sp_rec.cross_references: source = xref[0] per_source[source].append(xref[1:]) print(per_source.keys()) done_GOs = set() print('Annotation SOURCES:', len(per_source['GO'])) for annot in per_source['GO']: if annot[1][0] in done_GOs: continue else: done_GOs.add(annot[1][0]) print(annot) ================================================ FILE: Chapter08/Mass.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import numpy as np import pandas as pd from Bio import PDB # + # #!rm -f 1tup.cif 2>/dev/null # #!wget "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=cif&compression=NO&structureId=1TUP" -O 1tup.cif #parser = PDB.MMCIFParser() #p53_1tup = parser.get_structure('P53', '1tup.cif') # - repository = PDB.PDBList() parser = PDB.PDBParser() repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent') my_residues = set() for residue in p53_1tup.get_residues(): my_residues.add(residue.id[0]) print(my_residues) # + def get_mass(atoms, accept_fun=lambda atom: atom.parent.id[0] != 'W'): return sum([atom.mass for atom in atoms if accept_fun(atom)]) chain_names = [chain.id for chain in p53_1tup.get_chains()] my_mass = np.ndarray((len(chain_names), 3)) for i, chain in enumerate(p53_1tup.get_chains()): my_mass[i, 0] = get_mass(chain.get_atoms()) my_mass[i, 1] = get_mass(chain.get_atoms(), accept_fun=lambda atom: atom.parent.id[0] not in [' ', 'W']) my_mass[i, 2] = get_mass(chain.get_atoms(), accept_fun=lambda atom: atom.parent.id[0] == 'W') masses = pd.DataFrame(my_mass, index=chain_names, columns=['No Water', 'Zincs', 'Water']) masses # - def get_center(atoms, weight_fun=lambda atom: 1 if atom.parent.id[0] != 'W' else 0): xsum = ysum = zsum = 0.0 acum = 0.0 for atom in atoms: x, y, z = atom.coord weight = weight_fun(atom) acum += weight xsum += weight * x ysum += weight * y zsum += weight * z return xsum / acum, ysum / acum, zsum / acum print(get_center(p53_1tup.get_atoms())) print(get_center(p53_1tup.get_atoms(), 
weight_fun=lambda atom: atom.mass if atom.parent.id[0] != 'W' else 0)) my_center = np.ndarray((len(chain_names), 6)) for i, chain in enumerate(p53_1tup.get_chains()): x, y, z = get_center(chain.get_atoms()) my_center[i, 0] = x my_center[i, 1] = y my_center[i, 2] = z x, y, z = get_center(chain.get_atoms(), weight_fun=lambda atom: atom.mass if atom.parent.id[0] != 'W' else 0) my_center[i, 3] = x my_center[i, 4] = y my_center[i, 5] = z weights = pd.DataFrame(my_center, index=chain_names, columns=['X', 'Y', 'Z', 'X (Mass)', 'Y (Mass)', 'Z (Mass)']) weights # + #Pymol viz ================================================ FILE: Chapter08/PDB.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- from Bio import PDB repository = PDB.PDBList() repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') repository.retrieve_pdb_file('1OLG', pdir='.', file_format='pdb') repository.retrieve_pdb_file('1YCQ', pdir='.', file_format='pdb') parser = PDB.PDBParser() p53_1tup = parser.get_structure('P 53 - DNA Binding', 'pdb1tup.ent') p53_1olg = parser.get_structure('P 53 - Tetramerization', 'pdb1olg.ent') p53_1ycq = parser.get_structure('P 53 - Transactivation', 'pdb1ycq.ent') # + def print_pdb_headers(headers, indent=0): ind_text = ' ' * indent for header, content in headers.items(): if type(content) == dict: print('\n%s%20s:' % (ind_text, header)) print_pdb_headers(content, indent + 4) print() elif type(content) == list: print('%s%20s:' % (ind_text, header)) for elem in content: print('%s%21s %s' % (ind_text, '->', elem)) else: print('%s%20s: %s' % (ind_text, header, content)) print_pdb_headers(p53_1tup.header) # - print(p53_1tup.header['compound']) print(p53_1olg.header['compound']) print(p53_1ycq.header['compound']) def describe_model(name, pdb): print() for model in pdb: for chain in model: print('%s - Chain: %s. Number of residues: %d. Number of atoms: %d.' 
% (name, chain.id, len(chain), len(list(chain.get_atoms())))) describe_model('1TUP', p53_1tup) describe_model('1OLG', p53_1olg) describe_model('1YCQ', p53_1ycq) #will go deep in a next recipe (bottom up) for residue in p53_1tup.get_residues(): if residue.id[0] in [' ', 'W']: continue print(residue.id) res = next(p53_1tup[0]['A'].get_residues()) print(res) for atom in res: print(atom, atom.serial_number, atom.element) print(p53_1tup[0]['A'][94]['CA']) # + from Bio.SeqIO import PdbIO, FastaIO from Bio import SeqIO def get_fasta(pdb_file, fasta_file, transfer_ids=None): records = list(PdbIO.PdbSeqresIterator(pdb_file)) if transfer_ids is not None: records = [rec for rec in records if rec.id in transfer_ids and len(rec.seq) > 0] else: records = [rec for rec in records if len(rec.seq) > 0] with open(fasta_file, 'w') as out_handle: SeqIO.write(records, out_handle, 'fasta') for rec in records: print(rec.id, rec.seq, len(rec.seq)) get_fasta('pdb1tup.ent', '1tup.fasta', transfer_ids=['1TUP:B']) get_fasta('pdb1olg.ent', '1olg.fasta', transfer_ids=['1OLG:B']) get_fasta('pdb1ycq.ent', '1ycq.fasta', transfer_ids=['1YCQ:B']) # - ================================================ FILE: Chapter08/Parser.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.3 # kernelspec: # display_name: Python 3 # language: python # name: python3 # --- from Bio import PDB #XXX repository = PDB.PDBList() repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') # + rec_types = { #single line 'HEADER': [(str, 11, 49), (str, 50, 58), (str, 62, 65)], #multi_line 'SOURCE': [(int, 7, 9), (str, 10, 78)], #multi_rec 'LINK' : [(str, 12, 15), (str, 16, 16), (str, 17, 19), (str, 21, 21), (int, 22, 25), (str, 26, 26), (str, 42, 45), (str, 46, 46), (str, 47, 49), (str, 51, 51), (int, 52, 55), (str, 56, 56), (str, 59, 64), (str, 66, 71), (float, 73, 77)], 'HELIX': [(int, 7, 9), (str, 11, 13), (str, 15, 17), (str, 19, 19), (int, 21, 24), (str, 25, 25), (str, 27, 29), (str, 31, 31), (int, 33, 36), (str, 37 ,37), (int, 38, 39), (str, 40, 69), (int, 71, 75)], 'SHEET': [(int, 7, 9), (str, 11, 13), (int, 14, 15), (str, 17, 19), (str, 21, 21), (int, 22, 24), (str, 26, 26), (str, 28, 30), (str, 32, 32), (int, 33, 36), (str, 37, 37), (int, 38, 39), (str, 41, 44), (str, 45, 47), (str, 49, 49), (int, 50, 53), (str, 54, 54), (str, 56, 59), (str, 60, 62), (str, 64, 64), (int, 65, 68), (str, 69, 69)], } def parse_pdb(hdl): for line in hdl: line = line[:-1] # remove \n but not other whitespace toks = [] for section, elements in rec_types.items(): if line.startswith(section): for fun, start, end in elements: try: toks.append(fun(line[start: end + 1])) except ValueError: toks.append(None) # eg continuation yield (section, toks) if len(toks) == 0: yield ('UNKNOWN', line) # - hdl = open('pdb1tup.ent') done_rec = set() for rec in parse_pdb(hdl): if rec[0] == 'UNKNOWN' or rec[0] in done_rec: continue print(rec) done_rec.add(rec[0]) # + multi_lines = ['SOURCE'] #assume multi is just a string def process_multi_lines(hdl): current_multi = '' current_multi_name = None for rec_type, toks in parse_pdb(hdl): if current_multi_name is not None and current_multi_name != rec_type: yield current_multi_name, [current_multi] current_multi = '' current_multi_name = None if rec_type in multi_lines: current_multi += toks[1].strip().rstrip() + ' ' current_multi_name = rec_type else: if len(current_multi) != 0: yield 
current_multi_name, [current_multi]
                current_multi = ''
                current_multi_name = None
            yield rec_type, toks
    if len(current_multi) != 0:
        yield current_multi_name, [current_multi]
# -

hdl = open('pdb1tup.ent')
done_rec = set()
for rec in process_multi_lines(hdl):
    if rec[0] == 'UNKNOWN' or rec[0] in done_rec:
        continue
    print(rec)
    done_rec.add(rec[0])

# +
def get_spec_list(my_str):
    #ignoring escape characters
    spec_list = {}
    elems = my_str.strip().split(';')
    for elem in elems:
        toks = elem.split(':')
        spec_list[toks[0].strip()] = toks[1].strip()
    return spec_list

struct_types = {
    'SOURCE': [get_spec_list]
}

def process_struct_types(hdl):
    for rec_type, toks in process_multi_lines(hdl):
        if rec_type in struct_types.keys():
            funs = struct_types[rec_type]
            struct_toks = []
            for tok, fun in zip(toks, funs):
                struct_toks.append(fun(tok))
            yield rec_type, struct_toks
        else:
            yield rec_type, toks
# -

hdl = open('pdb1tup.ent')
for rec in process_struct_types(hdl):
    if rec[0] != 'SOURCE':
        continue
    print(rec)

================================================
FILE: Chapter08/PyMol_Intro.py
================================================
import threading

def dump_thread():
    print()
    for thr in threading.enumerate():
        print(thr)

dump_thread()

import pymol
pymol.pymol_launch = 4
pymol.pymol_argv = ['pymol', '-qc']  # Quiet / no GUI
from pymol import cmd
pymol.finish_launching()

dump_thread()

#cmd.fetch('1TUP', async=False)
cmd.fetch('1TUP')
cmd.disable('all')
cmd.enable('1TUP')
cmd.bg_color('white')
cmd.hide('all')
cmd.show('cartoon')
#cmd.hide('cartoon', 'chain E+F')
#cmd.show('ribbon', 'chain E+F')
cmd.select('zinc', 'name zn')
cmd.show('sphere', 'zinc')

cmd.set('ray_trace_mode', 3)
cmd.png('1TUP.png', width=1980, height=1080, quiet=0, ray=1, prior=False)

dump_thread()

cmd.set('ray_trace_mode', 1)
cmd.png('TUP.png', width=1980, height=1080, quiet=0, ray=1, prior=False)

cmd.quit()

================================================
FILE: Chapter08/PyMol_Movie.py
================================================
import pymol
from pymol import cmd

#pymol.pymol_argv = [ 'pymol', '-qc'] # Quiet / no GUI
pymol.finish_launching()

#cmd.fetch('1TUP', async=False)
cmd.fetch('1TUP')
cmd.disable('all')
cmd.enable('1TUP')
cmd.hide('all')
cmd.show('sphere', 'name zn')
cmd.show('surface', 'chain A+B+C')
cmd.show('cartoon', 'chain E+F')
cmd.scene('S0', action='store', view=0, frame=0, animate=-1)
cmd.show('cartoon')
cmd.hide('surface')
cmd.scene('S1', action='store', view=0, frame=0, animate=-1)
cmd.hide('cartoon', 'chain A+B+C')
cmd.show('mesh', 'chain A')
cmd.show('sticks', 'chain A+B+C')
cmd.scene('S2', action='store', view=0, frame=0, animate=-1)
cmd.set('ray_trace_mode', 0)
cmd.mset(1, 500)
cmd.frame(0)
cmd.scene('S0')
cmd.mview()
cmd.frame(60)
cmd.set_view((-0.175534308, -0.331560850, -0.926960170,
              0.541812420, 0.753615797, -0.372158051,
              0.821965039, -0.567564785, 0.047358301,
              0.000000000, 0.000000000, -249.619018555,
              58.625568390, 15.602619171, 77.781631470,
              196.801528931, 302.436492920, -20.000000000))
cmd.mview()
cmd.frame(90)
cmd.set_view((-0.175534308, -0.331560850, -0.926960170,
              0.541812420, 0.753615797, -0.372158051,
              0.821965039, -0.567564785, 0.047358301,
              -0.000067875, 0.000017881, -249.615447998,
              54.029174805, 26.956727982, 77.124832153,
              196.801528931, 302.436492920, -20.000000000))
cmd.mview()
cmd.frame(150)
cmd.set_view((-0.175534308, -0.331560850, -0.926960170,
              0.541812420, 0.753615797, -0.372158051,
              0.821965039, -0.567564785, 0.047358301,
              -0.000067875, 0.000017881, -55.406421661,
              54.029174805, 26.956727982, 77.124832153,
2.592475891, 108.227416992, -20.000000000)) cmd.mview() cmd.frame(200) cmd.scene('S1') cmd.mview() cmd.frame(350) cmd.scene('S1') cmd.set_view((0.395763457, -0.173441306, 0.901825786, 0.915456235, 0.152441502, -0.372427106, -0.072881661, 0.972972929, 0.219108686, 0.000070953, 0.000013039, -37.689743042, 57.748500824, 14.325904846, 77.241867065, -15.123448372, 90.511535645, -20.000000000)) cmd.mview() cmd.frame(351) cmd.scene('S2') cmd.mview() cmd.frame(500) cmd.scene('S2') cmd.mview() cmd.mplay() cmd.mpng('p53_1tup') cmd.quit() ================================================ FILE: Chapter08/Stats.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + from collections import defaultdict import sys import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D # #%matplotlib inline from Bio import PDB # - repository = PDB.PDBList() parser = PDB.PDBParser() repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') #XXX p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent') # + atom_cnt = defaultdict(int) atom_chain = defaultdict(int) atom_res_types = defaultdict(int) for atom in p53_1tup.get_atoms(): my_residue = atom.parent my_chain = my_residue.parent atom_chain[my_chain.id] += 1 if my_residue.resname != 'HOH': atom_cnt[atom.element] += 1 atom_res_types[my_residue.resname] += 1 print(dict(atom_res_types)) print(dict(atom_chain)) print(dict(atom_cnt)) # - res_types = defaultdict(int) res_per_chain = defaultdict(int) for residue in p53_1tup.get_residues(): res_types[residue.resname] += 1 res_per_chain[residue.parent.id] +=1 print(dict(res_types)) print(dict(res_per_chain)) def get_bounds(my_atoms): my_min = [sys.maxsize] * 3 my_max = [-sys.maxsize] * 3 for atom in my_atoms: for i, coord in enumerate(atom.coord): if coord < my_min[i]: my_min[i] = coord if coord > my_max[i]: my_max[i] = coord return my_min, my_max chain_bounds = {} for chain in p53_1tup.get_chains(): print(chain.id, get_bounds(chain.get_atoms())) chain_bounds[chain.id] = get_bounds(chain.get_atoms()) print(get_bounds(p53_1tup.get_atoms())) #matplotlib 3d plot fig = plt.figure(figsize=(16, 9)) ax3d = fig.add_subplot(111, projection='3d') ax_xy = fig.add_subplot(331) ax_xy.set_title('X/Y') ax_xz = fig.add_subplot(334) ax_xz.set_title('X/Z') ax_zy = fig.add_subplot(337) ax_zy.set_title('Z/Y') color = {'A': 'r', 'B': 'g', 'C': 'b', 'E': '0.5', 'F': '0.75'} zx, zy, zz = [], [], [] for chain in p53_1tup.get_chains(): xs, ys, zs = [], [], [] for residue in chain.get_residues(): ref_atom = next(residue.get_iterator()) x, y, z = ref_atom.coord if ref_atom.element == 'ZN': zx.append(x) zy.append(y) zz.append(z) continue xs.append(x) ys.append(y) zs.append(z) ax3d.scatter(xs, ys, zs, color=color[chain.id]) ax_xy.scatter(xs, ys, marker='.', color=color[chain.id]) ax_xz.scatter(xs, zs, marker='.', color=color[chain.id]) ax_zy.scatter(zs, ys, marker='.', color=color[chain.id]) ax3d.set_xlabel('X') ax3d.set_ylabel('Y') ax3d.set_zlabel('Z') ax3d.scatter(zx, zy, zz, color='k', marker='v', s=300) ax_xy.scatter(zx, zy, color='k', marker='v', s=80) ax_xz.scatter(zx, zz, color='k', marker='v', s=80) ax_zy.scatter(zz, zy, color='k', marker='v', s=80) for ax in [ax_xy, ax_xz, ax_zy]: ax.get_yaxis().set_visible(False) ax.get_xaxis().set_visible(False) ================================================ 
FILE: Chapter08/mmCIF.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

from Bio import PDB

# !rm -f 1tup.cif 2>/dev/null
# !wget "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=cif&compression=NO&structureId=1TUP" -O 1tup.cif

parser = PDB.MMCIFParser()
p53_1tup = parser.get_structure('P53_HUMAN', '1tup.cif')

def describe_model(name, pdb):
    print()
    for model in pdb:
        for chain in model:
            print('%s - Chain: %s. Number of residues: %d. Number of atoms: %d.' %
                  (name, chain.id, len(chain), len(list(chain.get_atoms()))))

describe_model('1TUP', p53_1tup)

done_chain = set()
for residue in p53_1tup.get_residues():
    chain = residue.parent
    if chain.id in done_chain:
        continue
    done_chain.add(chain.id)
    print(chain.id, residue.id)

mmcif_dict = PDB.MMCIF2Dict.MMCIF2Dict('1tup.cif')
for k, v in mmcif_dict.items():
    print(k, v)
    print()
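# MMCIF2Dict is a flat dictionary keyed by mmCIF category.item names, so single
# fields can be pulled out directly instead of dumping everything. A short sketch;
# the keys below are standard mmCIF names, but whether each one is present depends
# on the entry:
for key in ['_struct.title', '_exptl.method', '_refine.ls_d_res_high']:
    print(key, mmcif_dict.get(key, 'not present'))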
================================================
FILE: Chapter09/galaxy/.gitignore
================================================
galaxy.yaml.enc
tool
salt

================================================
FILE: Chapter09/galaxy/LCT.bed
================================================
track name=gene description="Gene information"
2 135836529 135837180 ENSE00002202258 0 -
2 135833110 135833190 ENSE00001660765 0 -
2 135829592 135829676 ENSE00001731451 0 -
2 135823900 135824003 ENSE00001659892 0 -
2 135822019 135822098 ENSE00001777620 0 -
2 135817340 135818061 ENSE00001602826 0 -
2 135812310 135812956 ENSE00000776576 0 -
2 135808442 135809993 ENSE00001008768 0 -
2 135807127 135807396 ENSE00000776573 0 -
2 135804766 135805057 ENSE00000776572 0 -
2 135803929 135804128 ENSE00000776571 0 -
2 135800606 135800809 ENSE00000776570 0 -
2 135798028 135798138 ENSE00003515081 0 -
2 135794640 135794775 ENSE00001630333 0 -
2 135790657 135790881 ENSE00001667885 0 -
2 135789570 135789798 ENSE00001728878 0 -
2 135787839 135788544 ENSE00001653704 0 -
2 135812310 135812959 ENSE00001745158 0 -
2 135808442 135809993 ENSE00001008768 0 -
2 135807127 135807396 ENSE00000776573 0 -
2 135804766 135805057 ENSE00000776572 0 -
2 135803929 135804128 ENSE00000776571 0 -
2 135798028 135798138 ENSE00003459353 0 -
2 135794336 135794775 ENSE00001635523 0 -
2 135810168 135810279 ENSE00001438557 0 -
2 135820190 135820639 ENSE00001732580 0 +
2 135821674 135823087 ENSE00001695040 0 +
2 135836529 135837180 NM_002299.2.1 0 -
2 135833110 135833190 NM_002299.2.2 0 -
2 135829592 135829676 NM_002299.2.3 0 -
2 135823900 135824003 NM_002299.2.4 0 -
2 135822019 135822098 NM_002299.2.5 0 -
2 135817340 135818061 NM_002299.2.6 0 -
2 135812310 135812956 NM_002299.2.7 0 -
2 135808442 135809993 NM_002299.2.8 0 -
2 135807127 135807396 NM_002299.2.9 0 -
2 135804766 135805057 NM_002299.2.10 0 -
2 135803929 135804128 NM_002299.2.11 0 -
2 135800606 135800809 NM_002299.2.12 0 -
2 135798028 135798138 NM_002299.2.13 0 -
2 135794640 135794775 NM_002299.2.14 0 -
2 135790657 135790881 NM_002299.2.15 0 -
2 135789570 135789798 NM_002299.2.16 0 -
2 135787844 135788544 NM_002299.2.17 0 -
2 135836529 135837169 CCDS2178.117 0 -
2 135833110 135833190 CCDS2178.116 0 -
2 135829592 135829676 CCDS2178.115 0 -
2 135823900 135824003 CCDS2178.114 0 -
2 135822019 135822098 CCDS2178.113 0 -
2 135817340 135818061 CCDS2178.112 0 -
2 135812310 135812956 CCDS2178.111 0 -
2 135808442 135809993 CCDS2178.110 0 -
2 135807127 135807396 CCDS2178.19 0 -
2 135804766 135805057 CCDS2178.18 0 -
2 135803929 135804128 CCDS2178.17 0 -
2 135800606 135800809 CCDS2178.16 0 -
2 135798028 135798138 CCDS2178.15 0 -
2 135794640 135794775 CCDS2178.14 0 -
2 135790657 135790881 CCDS2178.13 0 -
2 135789570 135789798 CCDS2178.12 0 -
2 135788323 135788544 CCDS2178.11 0 -

================================================
FILE: Chapter09/galaxy/api.py
================================================
import base64
from collections import defaultdict
#import ftplib
import getpass
import pprint
import warnings

from ruamel.yaml import YAML

from cryptography.fernet import Fernet
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC

import pandas as pd

from bioblend.galaxy import GalaxyInstance
import paramiko

pp = pprint.PrettyPrinter()
warnings.filterwarnings('ignore')
# explain above, and warn

with open('galaxy.yaml.enc', 'rb') as f:
    enc_conf = f.read()
password = getpass.getpass('Please enter the password:').encode()
with open('salt', 'rb') as f:
    salt = f.read()
kdf = PBKDF2HMAC(algorithm=hashes.SHA256(), length=32, salt=salt, iterations=100000, backend=default_backend())
key = base64.urlsafe_b64encode(kdf.derive(password))
fernet = Fernet(key)
yaml = YAML()
conf = yaml.load(fernet.decrypt(enc_conf).decode())

server = conf['server']
rest_protocol = conf['rest_protocol']
rest_port = conf['rest_port']
user = conf['user']
password = conf['password']
sftp_port = int(conf['sftp_port'])
api_key = conf['api_key']

rest_url = '%s://%s:%d' % (rest_protocol, server, rest_port)

history_name = 'bioinf_example'

gi = GalaxyInstance(url=rest_url, key=api_key)
gi.verify = False

histories = gi.histories
print('Existing histories:')
for history in histories.get_histories():
    if history['name'] == history_name:
        histories.delete_history(history['id'])
    print(' - ' + history['name'])
print()
ds_history = histories.create_history(history_name)

print('Uploading file')
transport = paramiko.Transport((server, sftp_port))
transport.connect(None, user, password)
sftp = paramiko.SFTPClient.from_transport(transport)
sftp.put('LCT.bed', 'LCT.bed')
sftp.close()
transport.close()

#ftp = ftplib.FTP()
#ftp.connect(host=server, port=ftp_port)
#ftp.login(user=user, passwd=password)
#f = open('LCT.bed', 'rb')
#ftp.set_pasv(True) # explain
##ftp.storbinary('STOR LCT.bed', f)
#s = ftp.transfercmd('STOR LCT.bed')
#s.send(f.read())
#s.close()
#f.close()
#ftp.close()

gi.tools.upload_from_ftp('LCT.bed', ds_history['id'])
print()

contents = gi.histories.show_history(ds_history['id'], contents=True)

def summarize_contents(contents):
    summary = defaultdict(list)
    for item in contents:
        summary['id'].append(item['id'])
        summary['hid'].append(item['hid'])
        summary['name'].append(item['name'])
        summary['type'].append(item['type'])
        summary['extension'].append(item['extension'])
    return pd.DataFrame.from_dict(summary)

print('History contents:')
pd_contents = summarize_contents(contents)
print(pd_contents)
print()

print('Metadata for LCT.bed')
bed_ds = contents[0]
pp.pprint(bed_ds)
print()

print('Metadata about all tools')
all_tools = gi.tools.get_tools()
pp.pprint(all_tools)
print()

bed2gff = gi.tools.get_tools(name='Convert BED to GFF')[0]
print("Convert BED to GFF metadata:")
pp.pprint(gi.tools.show_tool(bed2gff['id'], io_details=True, link_details=True))
print()

def dataset_to_param(dataset):
    return dict(src='hda', id=dataset['id'])

tool_inputs = {
    'input1': dataset_to_param(bed_ds)  #hid!
}
gi.tools.run_tool(ds_history['id'], bed2gff['id'], tool_inputs=tool_inputs)
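# run_tool returns immediately; the conversion job runs asynchronously on the
# Galaxy server. A short, hedged sketch of how one might wait for the history to
# settle and download the converted dataset with bioblend (the output extension
# and the 'LCT.gff' file name are assumptions, not part of the original script):
import time

while gi.histories.show_history(ds_history['id'])['state'] not in ('ok', 'error'):
    time.sleep(5)
for item in gi.histories.show_history(ds_history['id'], contents=True):
    if item['extension'] == 'gff':
        gi.datasets.download_dataset(item['id'], file_path='LCT.gff', use_default_filename=False)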
================================================
FILE: Chapter09/galaxy/encrypt.py
================================================
"Encrypt a YAML file with the script configuration"
import base64
import getpass
from io import StringIO
import os

from ruamel.yaml import YAML

from cryptography.fernet import Fernet
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC

password = getpass.getpass('Please enter the password:').encode()
salt = os.urandom(16)
kdf = PBKDF2HMAC(algorithm=hashes.SHA256(), length=32, salt=salt, iterations=100000, backend=default_backend())
key = base64.urlsafe_b64encode(kdf.derive(password))
fernet = Fernet(key)
with open('salt', 'wb') as w:
    w.write(salt)

yaml = YAML()
content = yaml.load(open('galaxy.yaml', 'rt', encoding='utf-8'))
print(type(content), content)
output = StringIO()
yaml.dump(content, output)
print('Encrypting:\n%s' % output.getvalue())
enc_output = fernet.encrypt(output.getvalue().encode())
with open('galaxy.yaml.enc', 'wb') as w:
    w.write(enc_output)
print("Complete, the clear version should be deleted now")

================================================
FILE: Chapter09/galaxy/galaxy.yaml
================================================
rest_protocol: http
server: localhost
rest_port: 8080
sftp_port: 8022
user: admin@galaxy.org
password: password
api_key: fakekey

================================================
FILE: Chapter09/nextflow/.gitignore
================================================
data
pca.png
work
.nextflow*
report

================================================
FILE: Chapter09/nextflow/pipeline.nf
================================================
nextflow.enable.dsl=2

download_root = "https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3"

process plink_download {
    output:
    path 'hapmap.map.gz'//, emit: mapgz
    path 'hapmap.ped.gz'//, emit: pedgz

    script:
    """
    wget $download_root/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz -O hapmap.map.gz
    wget $download_root/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz -O hapmap.ped.gz
    """
}

process uncompress_plink {
    publishDir 'data', glob: '*', mode: 'copy'

    input:
    path mapgz
    path pedgz

    output:
    path 'hapmap.map'
    path 'hapmap.ped'

    script:
    """
    gzip -dc $mapgz > hapmap.map
    gzip -dc $pedgz > hapmap.ped
    """
}

//DSL 2 and docs
//conda
process subsample_1p {
    input:
    path 'hapmap.map'
    path 'hapmap.ped'

    output:
    path 'hapmap1.map'
    path 'hapmap1.ped'

    script:
    """
    plink2 --pedmap hapmap --out hapmap1 --thin 0.01 --geno 0.1 --export ped
    """
}

process plink_pca {
    input:
    path 'hapmap.map'
    path 'hapmap.ped'

    output:
    path 'hapmap.eigenvec'
    path 'hapmap.eigenval'

    script:
    """
    plink2 --pca --pedmap hapmap -out hapmap
    """
}

process plot_pca {
    publishDir '.', glob: '*', mode: 'copy'

    input:
    path 'hapmap.eigenvec'
    path 'hapmap.eigenval'

    output:
    path 'pca.png'

    script:
    """
    #!/usr/bin/env python
    import pandas as pd

    pca_df = pd.read_csv('hapmap.eigenvec', sep='\t')
    ax = pca_df.plot.scatter(x=2, y=3, figsize=(16, 9))
    ax.figure.savefig('pca.png')
    """
}

/*
workflow {
    plink_download | uncompress_plink
}
*/

/*
workflow {
    ped_file = file('data/hapmap.ped')
    map_file = file('data/hapmap.map')
    if (!ped_file.exists() | !map_file.exists()) {
        plink_download | uncompress_plink
    }
}
*/

workflow {
    ped_file = file('data/hapmap.ped')
    map_file = file('data/hapmap.map')
    if (!ped_file.exists() |
!map_file.exists()) { plink_download | uncompress_plink | subsample_1p | plink_pca | plot_pca } else { subsample_1p( Channel.fromPath('data/hapmap.map'), Channel.fromPath('data/hapmap.ped')) | plink_pca | plot_pca } } ================================================ FILE: Chapter09/snakemake/.gitignore ================================================ data scratch .snakemake pca.png dag.svg bio.png bio.svg ================================================ FILE: Chapter09/snakemake/Snakefile ================================================ rule all: input: "pca.png" rule plink_download: output: map="scratch/hapmap.map.gz", ped="scratch/hapmap.ped.gz", rel="data/relationships.txt" shell: """ python -c "import urllib.request; urllib.request.urlretrieve( 'https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz', '{output.map}')" python -c "import urllib.request; urllib.request.urlretrieve( 'https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz', '{output.ped}')" python -c "import urllib.request; urllib.request.urlretrieve( 'https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/relationships_w_pops_041510.txt', '{output.rel}')" """ PLINKEXTS = ['ped', 'map'] rule uncompress_plink: input: "scratch/hapmap.{plinkext}.gz" output: "data/hapmap.{plinkext}" shell: "gzip -dc {input} > {output}" rule subsample_1p: input: "data/hapmap.ped", "data/hapmap.map" output: "data/hapmap1.ped", "data/hapmap1.map" run: shell(f"plink2 --pedmap {input[0][:-4]} --out {output[0][:-4]} --thin 0.01 --geno 0.1 --export ped") # snakemake and software requirements # https://snakemake.readthedocs.io/en/stable/tutorial/additional_features.html#automatic-deployment-of-software-dependencies #plink2 --pedmap data/hapmap --out data/hapmap10 --thin 0.1 --geno 0.1 --export ped rule plink_pca: input: "data/hapmap1.ped", "data/hapmap1.map" output: "data/hapmap1.eigenvec", "data/hapmap1.eigenval" shell: "plink2 --pca --pedmap data/hapmap1 -out data/hapmap1" rule plot_pca: input: "data/hapmap1.eigenvec", "data/hapmap1.eigenval" output: "pca.png" script: "./plot_pca.py" ================================================ FILE: Chapter09/snakemake/plot_pca.py ================================================ import pandas as pd eigen_fname = snakemake.input[0] if snakemake.input[0].endswith('eigenvec') else snakemake.input[1] pca_df = pd.read_csv(eigen_fname, sep='\t') ax = pca_df.plot.scatter(x=2, y=3, figsize=(16, 9)) ax.figure.savefig(snakemake.output[0]) ================================================ FILE: Chapter10/Clustering.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + jupyter={"outputs_hidden": false} import os import matplotlib.pyplot as plt from sklearn.cluster import KMeans from sklearn.decomposition import PCA import numpy as np from genomics.popgen.pca import plot # - # ## Meta-data load # + jupyter={"outputs_hidden": false} f = open('../Chapter06/relationships_w_pops_041510.txt') ind_pop = {} f.readline() # header for l in f: toks = l.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] pop = toks[-1] ind_pop['/'.join([fam_id, ind_id])] = pop f.close() # - # ## With scikit-learn # + jupyter={"outputs_hidden": false} f = 
open('../Chapter06/hapmap10_auto_noofs_ld_12.ped')
ninds = 0
ind_order = []
for line in f:
    ninds += 1
    toks = line[:100].replace(' ', '\t').split('\t')  # for speed
    fam_id = toks[0]
    ind_id = toks[1]
    ind_order.append('%s/%s' % (fam_id, ind_id))
    nsnps = (len(line.replace(' ', '\t').split('\t')) - 6) // 2
print(nsnps)
f.close()

# + jupyter={"outputs_hidden": false}
all_array = np.empty((ninds, nsnps), dtype=int)
f = open('../Chapter06/hapmap10_auto_noofs_ld_12.ped')
for ind, line in enumerate(f):
    snps = line.replace(' ', '\t').split('\t')[6:]
    for pos in range(len(snps) // 2):
        a1 = int(snps[2 * pos])
        a2 = int(snps[2 * pos + 1])  # second allele of the genotype
        my_code = a1 + a2 - 2
        all_array[ind, pos] = my_code
f.close()
#slow
# -

predict_case = all_array[-1, :]
pca_array = all_array[:-1,:]

last_ind = ind_order[-1]
last_ind, ind_pop[last_ind]

my_pca = PCA(n_components=2)
my_pca.fit(pca_array)
trans = my_pca.transform(pca_array)

sc_ind_comp = {}
for i, ind_pca in enumerate(trans):
    sc_ind_comp[ind_order[i]] = ind_pca
plot.render_pca(sc_ind_comp, cluster=ind_pop)

# + jupyter={"outputs_hidden": false}
def plot_kmeans_pca(trans, kmeans):
    x_min, x_max = trans[:, 0].min() - 1, trans[:, 0].max() + 1
    y_min, y_max = trans[:, 1].min() - 1, trans[:, 1].max() + 1
    mesh_x, mesh_y = np.meshgrid(np.arange(x_min, x_max, 0.5), np.arange(y_min, y_max, 0.5))
    k_surface = kmeans.predict(np.c_[mesh_x.ravel(), mesh_y.ravel()]).reshape(mesh_x.shape)
    fig, ax = plt.subplots(1, 1, dpi=300)
    ax.imshow(
        k_surface, origin="lower", cmap=plt.cm.Pastel1,
        extent=(mesh_x.min(), mesh_x.max(), mesh_y.min(), mesh_y.max()),
    )
    ax.plot(trans[:, 0], trans[:, 1], "k.", markersize=2)
    ax.set_title("KMeans clustering of PCA data")
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    return ax

# + jupyter={"outputs_hidden": false}
kmeans11 = KMeans(n_clusters=11).fit(trans)
plot_kmeans_pca(trans, kmeans11)
# -

kmeans4 = KMeans(n_clusters=4).fit(trans)
plot_kmeans_pca(trans, kmeans4)

pca_predict = my_pca.transform([predict_case])
kmeans4.predict(pca_predict)

last_train = ind_order[-2]
last_train, ind_pop[last_train]

kmeans4.predict(trans)[0]
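# + jupyter={"outputs_hidden": false}
# The choice between 4 and 11 clusters above is eyeballed; a hedged sketch of a
# more principled check with the silhouette score (higher is better), using the
# same scikit-learn stack as the rest of this recipe:
from sklearn.metrics import silhouette_score

for k in [2, 4, 8, 11]:
    labels = KMeans(n_clusters=k).fit_predict(trans)
    print(k, silhouette_score(trans, labels))
# -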
================================================
FILE: Chapter10/Decision_Tree.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.14.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# + jupyter={"outputs_hidden": false}
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import tree

# + [markdown] jupyter={"outputs_hidden": false}
# http://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28diagnostic%29

# + jupyter={"outputs_hidden": false}
# !wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
# !wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names
# -

# ## With scikit-learn

# + jupyter={"outputs_hidden": false}
f = open('breast-cancer-wisconsin.data')
w = open('clean.data', 'w')
for line in f:
    if line.find('?') > -1:
        continue
    w.write(line)
f.close()
w.close()

# + jupyter={"outputs_hidden": false}
column_names = [
    'sample_id', 'clump_thickness', 'uniformity_cell_size',
    'uniformity_cell_shape', 'marginal_adhesion',
    'single_epithelial_cell_size', 'bare_nuclei', 'bland_chromatin',
    'normal_nucleoli', 'mitoses', 'class'
]
samples = pd.read_csv('clean.data', header=None, names=column_names, index_col=0)
samples

# + jupyter={"outputs_hidden": false}
training_input = samples.iloc[:,:-1]
target = samples.iloc[:,-1].apply(lambda x: 0 if x == 2 else 1)

# + jupyter={"outputs_hidden": false}
clf = tree.DecisionTreeClassifier(max_depth=3)

# + jupyter={"outputs_hidden": false}
clf.fit(training_input, target)

# + jupyter={"outputs_hidden": false}
importances = pd.Series(
    clf.feature_importances_ * 100,
    index=training_input.columns).sort_values(ascending=False)
importances

# + jupyter={"outputs_hidden": false}
100 * clf.score(training_input, target)

# + jupyter={"outputs_hidden": false}
fig, ax = plt.subplots(1, dpi=300)
tree.plot_tree(clf, ax=ax, feature_names=training_input.columns, class_names=['Benign', 'Malignant'])
# -

================================================
FILE: Chapter10/PCA.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.14.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# + jupyter={"outputs_hidden": false}
import os

from sklearn.decomposition import PCA
import numpy as np

from genomics.popgen.pca import plot
# -

# ## Meta-data load

# + jupyter={"outputs_hidden": false}
f = open('../Chapter06/relationships_w_pops_041510.txt')
ind_pop = {}
f.readline()  # header
for l in f:
    toks = l.rstrip().split('\t')
    fam_id = toks[0]
    ind_id = toks[1]
    pop = toks[-1]
    ind_pop['/'.join([fam_id, ind_id])] = pop
f.close()
# -

# ## With scikit-learn

# + jupyter={"outputs_hidden": false}
f = open('../Chapter06/hapmap10_auto_noofs_ld_12.ped')
ninds = 0
ind_order = []
for line in f:
    ninds += 1
    toks = line[:100].replace(' ', '\t').split('\t')  # for speed
    fam_id = toks[0]
    ind_id = toks[1]
    ind_order.append('%s/%s' % (fam_id, ind_id))
    nsnps = (len(line.replace(' ', '\t').split('\t')) - 6) // 2
f.close()

# + jupyter={"outputs_hidden": false}
pca_array = np.empty((ninds, nsnps), dtype=int)
print(pca_array.shape)
f = open('../Chapter06/hapmap10_auto_noofs_ld_12.ped')
for ind, line in enumerate(f):
    snps = line.replace(' ', '\t').split('\t')[6:]
    for pos in range(len(snps) // 2):
        a1 = int(snps[2 * pos])
        a2 = int(snps[2 * pos + 1])  # second allele of the genotype
        my_code = a1 + a2 - 2
        pca_array[ind, pos] = my_code
f.close()

# + jupyter={"outputs_hidden": false}
my_pca = PCA(n_components=8)
my_pca.fit(pca_array)
trans = my_pca.transform(pca_array)
#Memory required

# + jupyter={"outputs_hidden": false}
sc_ind_comp = {}
for i, ind_pca in enumerate(trans):
    sc_ind_comp[ind_order[i]] = ind_pca
plot.render_pca_eight(sc_ind_comp, cluster=ind_pop)

# + jupyter={"outputs_hidden": false}
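# A quick, hedged look at how much variance the eight components actually
# capture, using scikit-learn's standard explained_variance_ratio_ attribute
# (not part of the original recipe):
print(my_pca.explained_variance_ratio_)
print('First two components: %.1f%%' % (100 * my_pca.explained_variance_ratio_[:2].sum()))
# -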
================================================
FILE: Chapter10/Random_Forest.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.14.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# + jupyter={"outputs_hidden": false}
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz

# + [markdown] jupyter={"outputs_hidden": false}
# http://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28diagnostic%29

# + jupyter={"outputs_hidden": false}
# !wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
# !wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names
# -

# ## With scikit-learn

# + jupyter={"outputs_hidden": false}
f = open('breast-cancer-wisconsin.data')
w = open('clean.data', 'w')
for line in f:
    if line.find('?') > -1:
        continue
    w.write(line)
f.close()
w.close()

# + jupyter={"outputs_hidden": false}
column_names = [
    'sample_id', 'clump_thickness', 'uniformity_cell_size',
    'uniformity_cell_shape', 'marginal_adhesion',
    'single_epithelial_cell_size', 'bare_nuclei', 'bland_chromatin',
    'normal_nucleoli', 'mitoses', 'class'
]
samples = pd.read_csv('clean.data', header=None, names=column_names, index_col=0)
samples

# + jupyter={"outputs_hidden": false}
training_input = samples.iloc[:,:-1]
target = samples.iloc[:,-1]

# + jupyter={"outputs_hidden": false}
clf = RandomForestClassifier(max_depth=3, n_estimators=200)

# + jupyter={"outputs_hidden": false}
clf.fit(training_input, target)

# + jupyter={"outputs_hidden": false}
importances = pd.Series(
    clf.feature_importances_ * 100,
    index=training_input.columns).sort_values(ascending=False)
importances
# -

100 * clf.score(training_input, target)

for test_size in [0.01, 0.1, 0.2, 0.5, 0.8, 0.9, 0.99]:
    X_train, X_test, y_train, y_test = train_test_split(
        training_input, target, test_size=test_size)
    tclf = RandomForestClassifier(max_depth=3)
    tclf.fit(X_train, y_train)
    score = tclf.score(X_test, y_test)
    print(f'{1 - test_size:.1%} {score:.2%}')
# Random number generator

================================================
FILE: Chapter11/.gitignore
================================================
dask-worker-space
data
mydask.png
x.png

================================================
FILE: Chapter11/Dask_Distributed.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# +
#import dask
#from dask.base import get_scheduler
#import dask.array as da
#
#mosquito = da.from_zarr('data/AG1000G-AO/2L/calldata/GT')
#print(get_scheduler(collections=[mosquito]).__module__)

# +
import zarr

import dask.dataframe as dd
from dask.distributed import Client

#client = Client('127.0.0.1:8786')
client = Client()
client

# +
import numpy as np
import dask.array as da

mosquito = da.from_zarr('data/AG1000G-AO/2L/calldata/GT')
# -

mosquito

mosquito.shape[0]

mosquito = mosquito.rechunk((mosquito.shape[0]//8, 81, 2))

mosquito = mosquito.persist()

mosquito.visualize()

mosquito

mosquito.chunks

def calc_stats(my_chunk):
    num_miss = np.sum(np.equal(my_chunk[0][0][:,:,0], -1), axis=1)
    return num_miss

stats = da.blockwise(calc_stats, 'i', mosquito, 'ijk', dtype=np.uint8)

stats.visualize()

stat_results = stats.compute()

stat_results

================================================
FILE: Chapter11/Dask_Intro.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

import zarr

mosquito = zarr.open('data/AG1000G-AO/2L/calldata/GT')
mosquito

zarr.array(mosquito, chunks=(1 + 48525747 // 4, 81, 2), store='data/rechunk')

mosquito = zarr.open('data/rechunk')
mosquito.chunks

# +
import numpy as np
import dask.array as da

mosquito = da.from_zarr('data/rechunk')
#mosquito =
da.from_zarr('data/AG1000G-AO/2L/calldata/GT') # ^^^ load array # - mosquito print(mosquito[0]) mosquito[0].compute() mosquito.visualize(rankdir='TB') def calc_stats(variant): variant = variant.reshape(variant.shape[0] // 2, 2) num_misses = np.sum(np.equal(variant, -1)) // 2 return num_misses mosquito_2d = mosquito.reshape(mosquito.shape[0], mosquito.shape[1] * mosquito.shape[2]) mosquito_2d.visualize(rankdir='TB') mosquito_2d max_pos = 10000000 stats = da.apply_along_axis( calc_stats, 1, mosquito_2d[:max_pos,:], shape=(max_pos,), dtype=np.int64) stats.visualize('x.png',rankdir='TB') a = stats.compute() a ================================================ FILE: Chapter11/MP_intro.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # # Downloading data # https://malariagen.github.io/vector-data/ag3/download.html # !mkdir -p data/AG1000G-AO/ # !gsutil -m rsync -r \ # -x '.*/calldata/(AD|GQ|MQ)/.*' \ # gs://vo_agam_release/v3/snp_genotypes/all/AG1000G-AO/ \ # data/AG1000G-AO/ > /dev/null # !mkdir -p data/metadata/ # !gsutil -m rsync -r gs://vo_agam_release/v3/metadata/ data/metadata/ # # BLA # + import numpy as np import zarr mosquito = zarr.open('data/AG1000G-AO') print(mosquito.tree()) gt_2l = mosquito['/2L/calldata/GT'] gt_2l.info dir(gt_2l) gt_2l.shape[0] # + from math import ceil from multiprocessing import Pool def calc_stats(my_chunk): num_miss = np.sum(np.equal(my_chunk[:,:,0], -1), axis=1) num_anc_hom = np.sum( np.all([ np.equal(my_chunk[:,:,0], 0), np.equal(my_chunk[:,:,0], my_chunk[:,:,1])], axis=0), axis=1) num_het = np.sum( np.not_equal( my_chunk[:,:,0], my_chunk[:,:,1]), axis=1) return num_miss, num_anc_hom, num_het chunk_pos_size = gt_2l.chunks[0] max_pos = gt_2l.shape[0] intervals = [] for chunk_pos in range(ceil(max_pos / chunk_pos_size)): start_pos = chunk_pos * chunk_pos_size end_pos = min(max_pos + 1, (chunk_pos + 1) * chunk_pos_size) intervals.append((start_pos, end_pos)) def compute_interval(interval): start_pos, end_pos = interval my_chunk = gt_2l[start_pos:end_pos, :, :] num_samples = my_chunk.shape[1] num_miss, num_anc_hom, num_het = calc_stats(my_chunk) chunk_complete_data = np.sum(np.equal(num_miss, 0)) chunk_more_anc_hom = np.sum(num_anc_hom > num_het) return chunk_complete_data, chunk_more_anc_hom with Pool() as p: print(p) chunk_returns = p.map(compute_interval, intervals) complete_data = sum(map(lambda x: x[0], chunk_returns)) more_anc_hom = sum(map(lambda x: x[1], chunk_returns)) print(complete_data, more_anc_hom) # - ================================================ FILE: Chapter11/Zarr_Intro.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # # Downloading data # https://malariagen.github.io/vector-data/ag3/download.html # !mkdir -p data/AG1000G-AO/ # !gsutil -m rsync -r \ # -x '.*/calldata/(AD|GQ|MQ)/.*' \ # gs://vo_agam_release/v3/snp_genotypes/all/AG1000G-AO/ \ # data/AG1000G-AO/ > /dev/null # !mkdir -p data/metadata/ # !gsutil -m rsync -r gs://vo_agam_release/v3/metadata/ data/metadata/ # # BLA # + import numpy as np import zarr mosquito = zarr.open('data/AG1000G-AO') print(mosquito.tree()) # - 
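# The genotype arrays come compressed on disk; a quick, hedged look at how much
# space the codec saves, using standard zarr array attributes (nbytes is the
# uncompressed size, nbytes_stored what actually sits on disk; gt_check is a
# throwaway name, not part of the original recipe):
gt_check = mosquito['/2L/calldata/GT']
print('compression ratio: %.1fx' % (gt_check.nbytes / gt_check.nbytes_stored))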
mosquito['samples']

np.array(mosquito['samples'])

gt_2l = mosquito['/2L/calldata/GT']
gt_2l

gt_2l.info

gt_2l[400000,:,:]

# +
# Do not do np.array(gt_2l)
# -

dir(gt_2l)

gt_2l.shape[0]

# +
from math import ceil

chunk_pos_size = gt_2l.chunks[0]
max_pos = gt_2l.shape[0]

def calc_stats(my_chunk):
    num_miss = np.sum(np.equal(my_chunk[:,:,0], -1), axis=1)
    num_anc_hom = np.sum(
        np.all([
            np.equal(my_chunk[:,:,0], 0),
            np.equal(my_chunk[:,:,0], my_chunk[:,:,1])], axis=0), axis=1)
    num_het = np.sum(
        np.not_equal(
            my_chunk[:,:,0],
            my_chunk[:,:,1]), axis=1)
    return num_miss, num_anc_hom, num_het

complete_data = 0
more_anc_hom = 0
total_pos = 0
for chunk_pos in range(ceil(max_pos / chunk_pos_size)):
    start_pos = chunk_pos * chunk_pos_size
    end_pos = min(max_pos + 1, (chunk_pos + 1) * chunk_pos_size)
    my_chunk = gt_2l[start_pos:end_pos, :, :]
    #print(start_pos, end_pos, my_chunk.shape)
    num_samples = my_chunk.shape[1]
    num_miss, num_anc_hom, num_het = calc_stats(my_chunk)
    chunk_complete_data = np.sum(np.equal(num_miss, 0))
    #print(end_pos - start_pos, my_chunk.shape, num_anc_hom.shape, num_het.shape)
    chunk_more_anc_hom = np.sum(num_anc_hom > num_het)
    print(np.sum(num_anc_hom > num_het))
    complete_data += chunk_complete_data
    more_anc_hom += chunk_more_anc_hom
    total_pos += (end_pos - start_pos)
print(complete_data, more_anc_hom, total_pos)
# -

================================================
FILE: Chapter12/Builtin.py
================================================
import functools

@functools.cache
def fibo(n):
    if n == 0:
        return 0
    if n == 1:
        return 1
    return fibo(n - 1) + fibo(n - 2)

fibo(1000)

def gene_min_reads(source, min_reads):
    return map(
        lambda x: x[0],
        filter(
            lambda x: x[1] >= min_reads,
            source.items()))

list(gene_min_reads({'LCT': 10, 'MRAP2': 1}, 2))

multiplication = lambda x, y: x * y
double = functools.partial(multiplication, 2)
double(3)
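# functools also provides reduce, which folds an iterable into a single value; a
# tiny, hedged sketch in the same spirit as the examples above (the read counts
# are made up for illustration):
from functools import reduce

total_reads = reduce(lambda acc, count: acc + count, {'LCT': 10, 'MRAP2': 1}.values(), 0)
print(total_reads)  # 11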
================================================
FILE: Chapter12/Mutability.py
================================================
import shutil
import pandas as pd


def restore_db(file_name):
    shutil.copyfile(f'{file_name}.base', file_name)


def load(file_name):
    df = pd.read_csv(file_name).set_index('gene')
    return dict(df['count'])


def save(dict_db, file_name):
    pd.Series(dict_db).to_csv(
        file_name, index_label='gene', header=['count'])


def add_sample_dict(dict_db, gene_list):
    for gene in gene_list:
        dict_db[gene] = dict_db.get(gene, 0) + 1


def add_sample_new_dict(dict_db, gene_list):
    my_dict_db = dict(dict_db)  # next recipe
    for gene in gene_list:
        my_dict_db[gene] = my_dict_db.get(gene, 0) + 1
    return my_dict_db


gene_count = load('my_genes.csv')
add_sample_dict(gene_count, ['DEPP'])
new_gene_count = add_sample_new_dict(gene_count, ['DEPP'])


================================================
FILE: Chapter12/Persistence1.py
================================================
import shutil
import pandas as pd


def restore_db(file_name):
    shutil.copyfile(f'{file_name}.base', file_name)


def load(file_name):
    df = pd.read_csv(file_name).set_index('gene')
    return dict(df['count'])


def save(dict_db, file_name):
    pd.Series(dict_db).to_csv(
        file_name, index_label='gene', header=['count'])


def add_sample_csv(gene_list):
    gene_count = load('my_genes.csv')
    for gene in gene_list:
        gene_count[gene] = gene_count.get(gene, 0) + 1
    save(gene_count, 'my_genes.csv')


restore_db('my_genes.csv')
add_sample_csv(['MC4R', 'TYR'])
add_sample_csv(['LCT', 'HLA-A'])
add_sample_csv(['HLA-B', 'HLA-C'])


================================================
FILE: Chapter12/Persistence2.py
================================================
import shutil
import pandas as pd


def restore_db(file_name):
    shutil.copyfile(f'{file_name}.base', file_name)


def load(file_name):
    df = pd.read_csv(file_name).set_index('gene')
    return dict(df['count'])


def save(dict_db, file_name):
    pd.Series(dict_db).to_csv(
        file_name, index_label='gene', header=['count'])


def add_sample_new_dict(dict_db, gene_list):
    my_dict_db = dict(dict_db)  # next recipe
    for gene in gene_list:
        my_dict_db[gene] = my_dict_db.get(gene, 0) + 1
    return my_dict_db


restore_db('my_genes.csv')
gene_count = load('my_genes.csv')
gene_count = add_sample_new_dict(gene_count, ['MC4R', 'TYR'])
gene_count = add_sample_new_dict(gene_count, ['LCT', 'HLA-A'])
gene_count = add_sample_new_dict(gene_count, ['HLA-B', 'HLA-C'])
save(gene_count, 'my_genes.csv')


================================================
FILE: Chapter12/Pure.py
================================================
import shutil
import pandas as pd


def restore_db(file_name):
    shutil.copyfile(f'{file_name}.base', file_name)


def load(file_name):
    df = pd.read_csv(file_name).set_index('gene')
    return dict(df['count'])


def save(dict_db, file_name):
    pd.Series(dict_db).to_csv(
        file_name, index_label='gene', header=['count'])


def add_sample_csv(gene_list):
    gene_count = load('my_genes.csv')
    for gene in gene_list:
        gene_count[gene] = gene_count.get(gene, 0) + 1
    save(gene_count, 'my_genes.csv')


def add_sample_global_dict(gene_list):
    global gene_count
    for gene in gene_list:
        gene_count[gene] = gene_count.get(gene, 0) + 1


def add_sample_dict(dict_db, gene_list):
    for gene in gene_list:
        dict_db[gene] = dict_db.get(gene, 0) + 1


gene_count = load('my_genes.csv')
add_sample_csv(['MC4R', 'TYR'])
add_sample_dict(gene_count, ['MC4R', 'TYR'])
save(gene_count, 'my_genes.csv')


================================================
FILE: Chapter12/Recursion.py
================================================
def fibo_iter(n):
    if n < 2:
        return n
    last = 1
    second_last = 0
    for _i in range(2, n + 1):
        result = second_last + last
        second_last = last
        last = result
    return result


def fibo_naive(n):
    if n == 0:
        return 0
    if n == 1:
        return 1
    return fibo_naive(n - 1) + fibo_naive(n - 2)


fibo_iter(0)
fibo_iter(1)
fibo_iter(2)
fibo_iter(3)
fibo_iter(4)
fibo_iter(5)
fibo_iter(6)

# fibo_naive(1000)  # do not run: the naive recursion is exponential and would never finish


def factorial(n):
    if n == 1:
        return 1
    return n * factorial(n - 1)


factorial(5)

# factorial(20000)  # raises RecursionError: 20,000 nested calls, far above CPython's default limit of 1000
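# Editor's sketch (not part of the original recipe): the RecursionError noted
# above comes from CPython's recursion limit, not from the algorithm itself.
# Raising the limit lets the same definition finish, although a very deep
# Python stack can still exhaust the underlying C stack on some platforms.
# sys.setrecursionlimit is standard library; the limit value below is an
# arbitrary illustrative choice.
import sys

sys.setrecursionlimit(30000)
print(len(str(factorial(20000))))  # print the digit count rather than the tens-of-thousands-digit number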
================================================
FILE: Chapter12/Tools.py
================================================
import functools


def fibo_iter(n):
    if n < 2:
        return n
    last = 1
    second_last = 0
    for _i in range(2, n + 1):
        result = second_last + last
        second_last = last
        last = result
    return result


def fibo_naive(n):
    if n == 0:
        return 0
    if n == 1:
        return 1
    return fibo_naive(n - 1) + fibo_naive(n - 2)


@functools.lru_cache
def fibo(n):
    if n == 0:
        return 0
    if n == 1:
        return 1
    return fibo(n - 1) + fibo(n - 2)


# `%time` is an IPython magic, so run these in IPython/Jupyter:
# %time fibo_iter(100)
# %time fibo_naive(1000)  # left commented out: it would never finish
# %time fibo(1000)


def factorial(n):
    if n == 1:
        return 1
    return n * factorial(n - 1)


# factorial(20000)  # raises RecursionError: CPython's default recursion limit is far lower than 20,000
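# Editor's sketch (not part of the original recipe): lru_cache-decorated
# functions expose their memoization counters through cache_info(), which is
# part of functools and makes the saving over fibo_naive measurable.
fibo(100)
print(fibo.cache_info())  # cold cache: 101 misses (n = 0..100), plus hits for every reused subproblem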
================================================
FILE: Chapter12/my_genes.csv
================================================
gene,count
LCT,5
LEPR,4
MRAP2,1


================================================
FILE: Chapter12/my_genes.csv.base
================================================
gene,count
LCT,5
LEPR,4
MRAP2,1


================================================
FILE: Datasets.py
================================================
# # Datasets for the book
#
# Here we provide links to the datasets used in the book.
#
# Important Notes:
#
# 1. These datasets are provided on external servers by third parties
#
# # Python and the Surrounding Software Ecology
#
# ## R sections
#
# http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/20130502.phase3.sequence.index
#
# # PDB
#
# ## Parsing mmCIF files with Biopython
#
# [1TUP.cif](http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=cif&compression=NO&structureId=1TUP)


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2021 Packt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# Bioinformatics-with-Python-Cookbook-third-edition

Bioinformatics with Python Cookbook - Third Edition

This is the code repository for [Bioinformatics with Python Cookbook - Third Edition](https://www.packtpub.com/product/bioinformatics-with-python-cookbook-third-edition/9781803236421), published by Packt.

**Use modern Python libraries and applications to solve real-world computational biology problems**

## What is this book about?

Bioinformatics is an active research field that uses a range of simple-to-advanced computations to extract valuable information from biological data, and this book will show you how to manage these tasks using Python.

This updated third edition of the Bioinformatics with Python Cookbook begins with a quick overview of the various tools and libraries in the Python ecosystem that will help you convert, analyze, and visualize biological datasets. Next, you'll cover key techniques for next-generation sequencing, single-cell analysis, genomics, metagenomics, population genetics, phylogenetics, and proteomics with the help of real-world examples. You'll learn how to work with important pipeline systems, such as Galaxy servers and Snakemake, and understand the various modules in Python for functional and asynchronous programming. This book will also help you explore topics such as SNP discovery using statistical approaches under high-performance computing frameworks, including Dask and Spark. In addition, you'll explore the application of machine learning algorithms in bioinformatics.

By the end of this bioinformatics Python book, you'll be equipped with the knowledge you need to implement the latest programming techniques and frameworks, empowering you to deal with bioinformatics data at every scale.

This book covers the following exciting features:
* Become well-versed with data processing libraries such as NumPy, pandas, Arrow, and Zarr in the context of bioinformatic analysis
* Interact with genomic databases
* Solve real-world problems in the fields of population genetics, phylogenetics, and proteomics
* Build bioinformatics pipelines using a Galaxy server and Snakemake
* Work with functools and itertools for functional programming
* Perform parallel processing with Dask on biological data
* Explore principal component analysis (PCA) techniques with scikit-learn

If you feel this book is for you, get your [copy](https://www.amazon.in/Bioinformatics-Python-Cookbook-bioinformatics-computational/dp/1789344697/ref=sr_1_2?keywords=Bioinformatics+with+Python+Cookbook+-+Third+Edition&qid=1665382032&sr=8-2) today!

https://www.packtpub.com/

## Instructions and Navigations

All of the code is organized into folders. The code will look like the following:
```
from Bio import SeqIO
genome_name = 'PlasmoDB-9.3_Pfalciparum3D7_Genome.fasta'
recs = SeqIO.parse(genome_name, 'fasta')
for rec in recs:
    print(rec.description)
```

**Following is what you need for this book:**
This book is for bioinformatics analysts, data scientists, computational biologists, researchers, and Python developers who want to address intermediate-to-advanced biological and bioinformatics problems. Working knowledge of the Python programming language is expected. Basic knowledge of biology will also be helpful. With the following software and hardware list you can run all code files present in the book (Chapters 1-12).

### Software and Hardware List

| Chapter | Software required | OS required |
| ------- | ----------------------------- | ----------- |
| 1-12 | Python 3.9 | Any OS |
| 1-12 | NumPy, pandas, and Matplotlib | Any OS |
| 1-12 | Biopython | Any OS |
| 1-12 | Dask, Zarr, scikit-learn | Any OS |

We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://packt.link/3KQQO).

## Get to Know the Author

**Tiago Antao** is a bioinformatician who is currently working in the field of genomics.
A former computer scientist, Tiago moved into computational biology with an MSc in bioinformatics from the Faculty of Sciences at the University of Porto, Portugal, and a PhD on the spread of drug-resistant malaria from the Liverpool School of Tropical Medicine, UK. After his doctorate, Tiago worked with human datasets at the University of Cambridge, UK, and with mosquito whole-genome sequencing data at the University of Oxford, UK, before helping to set up the bioinformatics infrastructure at the University of Montana, USA. He currently works as a data engineer in the biotechnology field in Boston, MA. He is one of the co-authors of Biopython, a major bioinformatics package written in Python.

### Download a free PDF

If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.

https://packt.link/free-ebook/9781803236421

================================================
FILE: Welcome.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3040340b-bd0b-4266-a7a6-8b48d9a94625",
   "metadata": {},
   "source": [
    "# Python for Bioinformatics\n",
    "\n",
    "## Datasets\n",
    "\n",
    "[Click here](Datasets.py) for the datasets used in the book. You only need this if you do not use the notebooks (as the notebooks will take care of the data)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee3697db-cdfe-41c2-ae06-8dc1633b5701",
   "metadata": {},
   "source": [
    "## Python and the surrounding software ecology\n",
    "\n",
    "- [Interfacing with R](Chapter01/Interfacing_R.py)\n",
    "- [R Magic](Chapter01/R_magic.py)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b2663bc-8efe-4bb0-9ac5-f9e2eb09cc5e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "",
   "name": ""
  },
  "language_info": {
   "name": ""
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: docker/Chapter01/Dockerfile
================================================
FROM tiagoantao/bio3
MAINTAINER Tiago Antao

# RUN conda create -n bioinformatics_r --clone bioinformatics_base
#RUN conda install -n bioinformatics_r r-base=4.1.3 r-ggplot2=3.3.5 r-lazyeval=0.2.2 r-gridextra=2.3 rpy2
RUN conda create -n bioinformatics_r jupyterlab jupytext pandas
RUN conda install -n bioinformatics_r r-base r-ggplot2 r-lazyeval r-gridextra rpy2

CMD conda run --no-capture-output -n bioinformatics_r jupyter-lab --ip=0.0.0.0 --no-browser --allow-root --port=9875 --NotebookApp.token='' --NotebookApp.password=''


================================================
FILE: docker/main/Dockerfile
================================================
FROM continuumio/anaconda3:2021.05
MAINTAINER Tiago Antao

#ENV DEBIAN_FRONTEND noninteractive
#RUN apt-get update && apt-get upgrade -y && apt-get install -y git wget build-essential unzip graphviz libgraphviz-dev pkg-config swig libx11-dev libgsl0-dev libopenblas-dev liblapacke-dev
#RUN apt-get install -y samtools mafft muscle raxml tabix

RUN git clone https://github.com/PacktPublishing/Bioinformatics-with-Python-Cookbook-third-Edition.git

#RUN conda upgrade -n base conda
RUN conda config --add channels conda-forge
RUN conda config --add channels bioconda
RUN conda create -n bioinformatics_base --file /Bioinformatics-with-Python-Cookbook-third-Edition/Chapter01/bioinformatics_base.txt
RUN pip install pyarrow==8.0.0
RUN conda init bash

EXPOSE 9875

WORKDIR /Bioinformatics-with-Python-Cookbook-third-Edition
RUN echo setterm -foreground magenta >> /etc/bash.bashrc

CMD conda run --no-capture-output -n bioinformatics_base jupyter-lab --ip=0.0.0.0 --no-browser --allow-root --port=9875 --NotebookApp.token='' --NotebookApp.password=''
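# Editor's note - typical usage, as a sketch (the image tag "bio3e" is an
# arbitrary illustrative choice, not from the book):
#   docker build -t bio3e -f docker/main/Dockerfile .
#   docker run -p 9875:9875 bio3e
# JupyterLab then answers on http://localhost:9875, matching the EXPOSE and
# CMD lines above.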