Showing preview only (209K chars total). Download the full file or copy to clipboard to get everything.
Repository: PacktPublishing/Bioinformatics-with-Python-Cookbook-third-edition
Branch: main
Commit: 9b10894b1a19
Files: 93
Total size: 188.9 KB
Directory structure:
gitextract_jf5fqbmn/
├── .gitignore
├── Chapter01/
│ ├── Interfacing_R.py
│ ├── R_magic.py
│ ├── base_setup.sh
│ └── bioinformatics_base.txt
├── Chapter02/
│ ├── .gitignore
│ ├── Arrow.py
│ ├── Matplotlib.py
│ ├── NumPy.py
│ ├── Pandas_Basic.py
│ ├── Pandas_Join.py
│ └── Pandas_Memory.py
├── Chapter03/
│ ├── Accessing_Databases.py
│ ├── Basic_Sequence_Processing.py
│ ├── Filtering_SNPs.py
│ ├── LCT.bed
│ ├── Processing_BED_with_HTSeq.py
│ ├── Working_with_BAM.py
│ ├── Working_with_FASTQ.py
│ └── Working_with_VCF.py
├── Chapter04/
│ ├── 2L.py
│ ├── Exploration.py
│ ├── Mendel.py
│ ├── Preparation.py
│ ├── QIIME2_Metagenomics.py
│ └── samples.tsv
├── Chapter05/
│ ├── .gitignore
│ ├── Annotations.py
│ ├── Gene_Ontology.py
│ ├── Getting_Gene.py
│ ├── Low_Quality.py
│ ├── Orthology.py
│ └── Reference_Genome.py
├── Chapter06/
│ ├── .gitignore
│ ├── Admixture.py
│ ├── Data_Formats.py
│ ├── Exploratory_Analysis.py
│ ├── PCA.py
│ ├── Pop_Stats.py
│ └── Sgkit.py
├── Chapter07/
│ ├── .gitignore
│ ├── Alignment.py
│ ├── Comparison.py
│ ├── Exploration.py
│ ├── Reconstruction.py
│ ├── Selection.py
│ ├── Trees.py
│ └── Visualization.py
├── Chapter08/
│ ├── .gitignore
│ ├── Distance.py
│ ├── Intro.py
│ ├── Mass.py
│ ├── PDB.py
│ ├── Parser.py
│ ├── PyMol_Intro.py
│ ├── PyMol_Movie.py
│ ├── Stats.py
│ └── mmCIF.py
├── Chapter09/
│ ├── galaxy/
│ │ ├── .gitignore
│ │ ├── LCT.bed
│ │ ├── api.py
│ │ ├── encrypt.py
│ │ └── galaxy.yaml
│ ├── nextflow/
│ │ ├── .gitignore
│ │ └── pipeline.nf
│ └── snakemake/
│ ├── .gitignore
│ ├── Snakefile
│ └── plot_pca.py
├── Chapter10/
│ ├── Clustering.py
│ ├── Decision_Tree.py
│ ├── PCA.py
│ └── Random_Forest.py
├── Chapter11/
│ ├── .gitignore
│ ├── Dask_Distributed.py
│ ├── Dask_Intro.py
│ ├── MP_intro.py
│ └── Zarr_Intro.py
├── Chapter12/
│ ├── Builtin.py
│ ├── Lazy.py
│ ├── Mutability.py
│ ├── Persistence1.py
│ ├── Persistence2.py
│ ├── Pure.py
│ ├── Recursion.py
│ ├── Tools.py
│ ├── my_genes.csv
│ └── my_genes.csv.base
├── Datasets.py
├── LICENSE
├── README.md
├── Welcome.ipynb
└── docker/
├── Chapter01/
│ └── Dockerfile
└── main/
└── Dockerfile
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
.ipynb_checkpoints
.Rhistory
__pycache__
================================================
FILE: Chapter01/Interfacing_R.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---
# %% [markdown]
# ## The next cell will get a ~65 MB data file 'sequence.index', you only need to run the cell once
# %%
# !rm sequence.index 2>/dev/null
# !wget -nd http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/20130502.phase3.sequence.index -O sequence.index
# %% [markdown]
# # Interfacing with R
# %%
# rpy2 embeds an R session in this Python process; the ggplot2 wrapper lets
# us build R plots from Python objects.
import os
from IPython.display import Image
import rpy2.robjects as robjects
import rpy2.robjects.lib.ggplot2 as ggplot2
from rpy2.robjects.functions import SignatureTranslatedFunction
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
# %%
# Call R's read.delim from Python to load the 1000 Genomes sequence index;
# seq_data is an rpy2 proxy for the resulting R data frame.
read_delim = robjects.r('read.delim')
seq_data = read_delim('sequence.index', header=True, stringsAsFactors=False)
#In R:
# seq.data <- read.delim('sequence.index', header=TRUE, stringsAsFactors=FALSE)
# %%
print('This data frame has %d columns and %d rows' % (seq_data.ncol, seq_data.nrow))
print(seq_data.colnames)
#In R:
# print(colnames(seq.data))
# print(nrow(seq.data))
# print(ncol(seq.data))
print('Columns in Python %d ' % robjects.r.ncol(seq_data)[0])
#access some functions
as_integer = robjects.r('as.integer')
match = robjects.r.match
# R's match() returns a 1-based position (as a vector), hence the [0] to
# unwrap it and the "- 1" when indexing the data frame from Python.
my_col = match('READ_COUNT', seq_data.colnames)[0] # Vector returned
print('Type of read count before as.integer: %s' % seq_data[my_col - 1].rclass[0])
seq_data[my_col - 1] = as_integer(seq_data[my_col - 1])
print('Type of read count after as.integer: %s' % seq_data[my_col - 1].rclass[0])
my_col = match('BASE_COUNT', seq_data.colnames)[0] # Vector returned
seq_data[my_col - 1] = as_integer(seq_data[my_col - 1])
my_col = match('CENTER_NAME', seq_data.colnames)[0]
seq_data[my_col - 1] = robjects.r.toupper(seq_data[my_col - 1])
# Publish the modified frame into R's global environment as seq.data, then
# clean it up with plain R expressions.
robjects.r.assign('seq.data', seq_data)
robjects.r('print(c("Column names in R: ",colnames(seq.data)))')
robjects.r('seq.data <- seq.data[seq.data$WITHDRAWN==0, ]')
#Lets remove all withdrawn sequences
robjects.r("seq.data <- seq.data[, c('STUDY_ID', 'STUDY_NAME', 'CENTER_NAME', 'SAMPLE_ID', 'SAMPLE_NAME', 'POPULATION', 'INSTRUMENT_PLATFORM', 'LIBRARY_LAYOUT', 'PAIRED_FASTQ', 'READ_COUNT', 'BASE_COUNT', 'ANALYSIS_GROUP')]")
#Lets shorten the dataframe
#Population as factor
robjects.r('seq.data$POPULATION <- as.factor(seq.data$POPULATION)')
# %%
# ggplot2's axis.text.x is not a valid Python keyword; register a
# translation so it can be passed as axis_text_x.
ggplot2.theme = SignatureTranslatedFunction(ggplot2.theme,
    init_prm_translate = {'axis_text_x': 'axis.text.x'})
# Bar chart of records per sequencing center, rendered by R to a PNG and
# displayed inline in the notebook.
bar = ggplot2.ggplot(seq_data) + ggplot2.geom_bar() + ggplot2.aes_string(x='CENTER_NAME') + ggplot2.theme(axis_text_x=ggplot2.element_text(angle=90, hjust=1, size=40), axis_text_y=ggplot2.element_text(size=40), text=ggplot2.element_text(size=40))
robjects.r.png('out.png', width=16, height=9, units="in", res=600)
bar.plot()
dev_off = robjects.r('dev.off')
dev_off()
Image(filename='out.png')
# %%
#Get Yoruba and CEU
robjects.r('yri_ceu <- seq.data[seq.data$POPULATION %in% c("YRI", "CEU") & seq.data$BASE_COUNT < 2E9 & seq.data$READ_COUNT < 3E7, ]')
yri_ceu = robjects.r('yri_ceu')
# %%
# Scatter of base count vs read count, point shape by population and
# colour by analysis group.
scatter = ggplot2.ggplot(yri_ceu) + ggplot2.aes_string(x='BASE_COUNT', y='READ_COUNT', shape='factor(POPULATION)', col='factor(ANALYSIS_GROUP)') + ggplot2.geom_point()
robjects.r.png('out.png', width=16, height=9, units="in", res=600)
scatter.plot()
dev_off = robjects.r('dev.off')
dev_off()
Image(filename='out.png')
# %%
# Convert the R frame to pandas, drop the PAIRED_FASTQ column, convert the
# reduced frame back, and hand it to R as no.paired.
with localconverter(ro.default_converter + pandas2ri.converter):
    pd_yri_ceu = ro.conversion.rpy2py(yri_ceu)
del pd_yri_ceu['PAIRED_FASTQ']
# no_paired = pandas2ri.py2ri(pd_yri_ceu)
with localconverter(ro.default_converter + pandas2ri.converter):
    no_paired = ro.conversion.py2rpy(pd_yri_ceu)
robjects.r.assign('no.paired', no_paired)
robjects.r("print(colnames(no.paired))")
# %%
================================================
FILE: Chapter01/R_magic.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---
# %% [markdown]
# ## The cell below will get the data file, you only need to run it once
# %% [markdown]
# (you do not need to do this if you have done it in the Interfacing_R notebook)
# %%
# !rm sequence.index 2>/dev/null
# !wget -nd http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/20130502.phase3.sequence.index -O sequence.index
# %%
# This notebook drives R via IPython's %R / %%R magics instead of the
# explicit rpy2 calls used in Interfacing_R.py.
import rpy2.robjects as robjects
import rpy2.robjects.lib.ggplot2 as ggplot2
# %load_ext rpy2.ipython
# %% language="R"
# seq.data <- read.delim('sequence.index', header=TRUE, stringsAsFactors=FALSE)
# seq.data$READ_COUNT <- as.integer(seq.data$READ_COUNT)
# seq.data$BASE_COUNT <- as.integer(seq.data$BASE_COUNT)
# %%
# Pull the R data frame into Python (the %R line only runs as a notebook cell).
# seq_data = %R seq.data
print(type(seq_data))  # NOTE(review): expected to be a pandas DataFrame via rpy2's conversion — confirm
# %%
# NOTE(review): my_col is computed here but never used afterwards.
my_col = list(seq_data.columns).index("CENTER_NAME")
seq_data['CENTER_NAME'] = seq_data['CENTER_NAME'].apply(lambda x: x.upper())
# %%
# Push the modified frame back into R and inspect its columns.
# %R -i seq_data
# %R print(colnames(seq_data))
# %% language="R"
# seq_data <- seq_data[seq_data$WITHDRAWN==0, ]
# seq_data$POPULATION <- as.factor(seq_data$POPULATION)
# %% language="R"
# bar <- ggplot(seq_data) + aes(factor(CENTER_NAME)) + geom_bar() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
# print(bar)
# %% language="R"
# seq_data$POPULATION <- as.factor(seq_data$POPULATION)
# yri_ceu <- seq_data[seq_data$POPULATION %in% c("YRI", "CEU") & seq_data$BASE_COUNT < 2E9 & seq_data$READ_COUNT < 3E7, ]
# %% language="R"
# scatter <- ggplot(yri_ceu, aes(x=BASE_COUNT, y=READ_COUNT, col=factor(ANALYSIS_GROUP), shape=POPULATION)) + geom_point()
# print(scatter)
# %% language="R"
# library(gridExtra)
# library(grid)
# g <- grid.arrange(bar, scatter, ncol=1)
# g
# %% language="R"
# png('fig.png')
# g
# dev.off()
================================================
FILE: Chapter01/base_setup.sh
================================================
# Create and activate a dedicated conda environment for the book's base tooling.
conda create -n bioinformatics_base python=3.9.7
conda activate bioinformatics_base
# Register the channels that host the bioinformatics packages
# (conda-forge last so it takes highest priority).
conda config --add channels bioconda
conda config --add channels conda-forge
# Install the pinned core packages the recipes depend on.
conda install \
  biopython==1.79 \
  jupyterlab==3.2.1 \
  jupytext==1.13 \
  matplotlib==3.4.3 \
  numpy==1.21.3 \
  pandas==1.3.4 \
  scipy==1.7.1
# Export an explicit spec so the environment can be reproduced exactly.
conda list --explicit > bioinformatics_base.txt
================================================
FILE: Chapter01/bioinformatics_base.txt
================================================
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
@EXPLICIT
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2021.10.8-ha878542_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.36.1-hea4e1c9_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-11.2.0-h5c6108e_11.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-11.2.0-he4da1e4_11.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/mysql-common-8.0.27-ha770c72_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pandoc-2.15-h7f98852_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/tzdata-2021e-he74cb21_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-11.2.0-h69a702a_11.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libgomp-11.2.0-h1d223b6_11.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-1_gnu.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-11.2.0-h1d223b6_11.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.3-h516909a_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/expat-2.4.1-h9c3ff4c_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/icu-68.2-h9c3ff4c_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/jbig-2.1-h7f98852_2003.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/jpeg-9d-h36c2ea0_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/lerc-3.0-h9c3ff4c_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.8-h7f98852_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h9c3ff4c_4.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.16-h516909a_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.4-h7f98852_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.18-pthreads_h8fe5266_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libopus-1.3.1-h7f98852_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libsodium-1.0.18-h36c2ea0_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.32.1-h7f98852_1000.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.2.1-h7f98852_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.11-h36c2ea0_1013.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.9.3-h9c3ff4c_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.2-h58526e2_4.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/nspr-4.32-h9c3ff4c_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/openssl-1.1.1l-h7f98852_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pcre-8.45-h9c3ff4c_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-h36c2ea0_1001.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.9-h7f98852_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.3-h7f98852_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.5-h516909a_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h516909a_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/gettext-0.19.8.1-h73d1719_1008.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-12_linux64_openblas.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.10-h9b69904_4.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libvorbis-1.3.7-h9c3ff4c_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.13-h7f98852_1003.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/readline-8.1-h46c0cb4_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/zeromq-4.3.4-h9c3ff4c_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.11-h36c2ea0_1013.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-12_linux64_openblas.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libglib-2.70.0-h174f98d_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-12_linux64_openblas.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libllvm11-11.1.0-hf817b99_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.37-h21135ba_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.9.12-h72842e0_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.36.0-h9cd32fc_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.11-h27826a3_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.0-ha95c52a_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/freetype-2.10.4-h0708190_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/glib-tools-2.70.0-h780b84a_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/gstreamer-1.18.5-h76c114f_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/krb5-1.19.2-hcc1bbae_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libclang-11.1.0-default_ha53f305_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.3.0-h6f004c6_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.0.3-he3ba5ed_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/mysql-libs-8.0.27-hfa10184_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/nss-3.69-hb5efdd6_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/python-3.9.7-hb7a2778_3_cpython.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/async_generator-1.10-py_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/attrs-21.2.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/backcall-0.2.0-pyh9f0ad1d_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/backports-1.0-py_2.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-2.0.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/decorator-5.1.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/defusedxml-0.7.1-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/entrypoints-0.3-pyhd8ed1ab_1003.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.13.1-hba837de_1005.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/glib-2.70.0-h780b84a_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/gst-plugins-base-1.18.5-hf529b03_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/idna-3.1-pyhd3deb0d_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/json5-0.9.5-pyh9f0ad1d_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.12-hddcbb42_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libpq-13.3-hd57d9b9_1.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/nest-asyncio-1.5.1-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/olefile-0.46-pyh9f0ad1d_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.4.0-hb52868f_1.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/pandocfilters-1.5.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/parso-0.8.2-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/pickleshare-0.7.5-py_1003.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/prometheus_client-0.11.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.7.0-pyhd3deb0d_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/pycparser-2.20-pyh9f0ad1d_2.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.0.3-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.9-2_cp39.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/pytz-2021.3-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/send2trash-1.8.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/testpath-0.5.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/traitlets-5.1.1-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/typing_extensions-3.10.0.2-pyha770c72_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-py_1.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/wheel-0.37.0-pyhd8ed1ab_1.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/zipp-3.6.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/babel-2.9.1-pyh44b312d_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/certifi-2021.10.8-py39hf3d152e_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/cffi-1.14.6-py39h4bc2ebd_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/chardet-4.0.0-py39hf3d152e_1.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/cycler-0.10.0-py_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/dbus-1.13.6-h48d8840_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/debugpy-1.4.1-py39he80948d_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/importlib-metadata-4.8.1-py39hf3d152e_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/jedi-0.18.0-py39hf3d152e_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/jupyter_core-4.9.1-py39hf3d152e_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.3.2-py39h1a9c180_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-1.1.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/markupsafe-2.0.1-py39h3811e60_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.1.3-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/mistune-0.8.4-py39h3811e60_1004.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/numpy-1.21.3-py39hdbf815f_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/packaging-21.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/pexpect-4.8.0-pyh9f0ad1d_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pillow-8.3.2-py39ha612740_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pyqt5-sip-4.19.18-py39he80948d_7.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pyrsistent-0.17.3-py39h3811e60_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pysocks-1.7.1-py39hf3d152e_3.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.2-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0-py39h3811e60_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pyzmq-22.3.0-py39h37b5a0c_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/setuptools-58.2.0-py39hf3d152e_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/sniffio-1.2.0-py39hf3d152e_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/tornado-6.1-py39h3811e60_1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/websocket-client-0.57.0-py39hf3d152e_4.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/anyio-3.3.4-py39hf3d152e_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/argon2-cffi-21.1.0-py39h3811e60_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/backports.functools_lru_cache-1.6.4-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/biopython-1.79-py39h3811e60_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/bleach-4.1.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/brotlipy-0.7.0-py39h3811e60_1001.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/cryptography-35.0.0-py39h95dcef6_1.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/jinja2-3.0.2-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.1.2-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/jupyter_client-7.0.6-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.4.3-py39h2fa2bec_1.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.2.8-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pandas-1.3.4-py39hde0f152_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/pip-21.3.1-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/pygments-2.10.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/qt-5.12.9-hda022c4_4.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/scipy-1.7.1-py39hee8e79c_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/terminado-0.12.1-py39hf3d152e_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/jupyterlab_pygments-0.1.2-pyh9f0ad1d_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/nbformat-5.1.3-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/pyopenssl-21.0.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pyqt-impl-5.12.3-py39h0fcd23e_7.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.5-pyh9f0ad1d_2.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/jupytext-1.13.0-pyh6002c4b_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/nbclient-0.5.4-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/prompt-toolkit-3.0.21-pyha770c72_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pyqtchart-5.12-py39h0fcd23e_7.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pyqtwebengine-5.12.1-py39h0fcd23e_7.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/urllib3-1.26.7-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/ipython-7.28.0-py39hef51801_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/nbconvert-6.2.0-py39hf3d152e_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pyqt-5.12.3-py39hf3d152e_7.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/requests-2.26.0-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/ipykernel-6.4.2-py39hef51801_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.4.3-py39hf3d152e_1.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/requests-unixsocket-0.2.0-py_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/jupyter_server-1.11.1-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/notebook-6.4.5-pyha770c72_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/jupyterlab_server-2.8.2-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/nbclassic-0.3.4-pyhd8ed1ab_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/jupyterlab-3.2.1-pyhd8ed1ab_0.tar.bz2
================================================
FILE: Chapter02/.gitignore
================================================
*png
VAERSDataUseGuide_en_September2021.pdf
================================================
FILE: Chapter02/Arrow.py
================================================
# Compare pandas and Apache Arrow for loading the 2021 VAERS data file,
# both in load time and in memory footprint.
import gzip
import pandas as pd
from pyarrow import csv
import pyarrow.compute as pc

# pandas load; the VAERS files are latin-1 encoded, not UTF-8.
vdata_pd = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1")
columns = list(vdata_pd.columns)
vdata_pd.info(memory_usage="deep")

# Same file through Arrow's CSV reader.
vdata_arrow = csv.read_csv("2021VAERSDATA.csv.gz")

# Total memory held by the Arrow table, in MB.
tot_bytes = sum(
    vdata_arrow[name].nbytes
    for name in vdata_arrow.column_names)
print(f"Total {tot_bytes // (1024 ** 2)} MB")

# Per-column comparison: type and footprint (MB) in Arrow vs pandas.
for name in vdata_arrow.column_names:
    arr_bytes = vdata_arrow[name].nbytes
    arr_type = vdata_arrow[name].type
    pd_bytes = vdata_pd[name].memory_usage(index=False, deep=True)
    pd_type = vdata_pd[name].dtype
    print(
        name,
        arr_type, arr_bytes // (1024 ** 2),
        pd_type, pd_bytes // (1024 ** 2),)

# %timeit pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1")
# %timeit csv.read_csv("2021VAERSDATA.csv.gz")

# REMOVE SYMPTOM_TEXT: reload without the large free-text column.
vdata_pd = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1", usecols=lambda x: x != "SYMPTOM_TEXT")
# FIX: the original called data_pd.info(...) — a NameError; the variable
# is vdata_pd.
vdata_pd.info(memory_usage="deep")
# FIX: this removal was commented out, so the Arrow read below still
# included SYMPTOM_TEXT and the comparison with pandas was not like-for-like.
columns.remove("SYMPTOM_TEXT")
vdata_arrow = csv.read_csv(
    "2021VAERSDATA.csv.gz",
    convert_options=csv.ConvertOptions(include_columns=columns))
vdata_arrow.nbytes
# %timeit pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1", usecols=lambda x: x != "SYMPTOM_TEXT")
# %timeit csv.read_csv("2021VAERSDATA.csv.gz", convert_options=csv.ConvertOptions(include_columns=columns))

# Convert the Arrow table into a pandas DataFrame (copies the data).
vdata = vdata_arrow.to_pandas()
vdata.info(memory_usage="deep")
# There's more: self_destruct releases the Arrow-side memory as each column
# is converted, lowering peak memory use.
vdata = vdata_arrow.to_pandas(self_destruct=True)
================================================
FILE: Chapter02/Matplotlib.py
================================================
# Chart the 2021 VAERS adverse-event data: missing values per column, plus a
# 2x2 summary figure about reported deaths.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# SYMPTOM_TEXT is bulky free text that is never plotted, so skip it on load.
vdata = pd.read_csv(
    "2021VAERSDATA.csv.gz", encoding="iso-8859-1",
    usecols=lambda name: name != "SYMPTOM_TEXT")

# Percentage of missing (NaN) values per column.
num_rows = len(vdata)
perc_nan = {}
for col_name in vdata.columns:
    num_nans = len(vdata[col_name][vdata[col_name].isna()])
    perc_nan[col_name] = 100 * num_nans / num_rows
labels = perc_nan.keys()
bar_values = list(perc_nan.values())
x_positions = np.arange(len(labels))

# Naive chart: default size and unrotated labels — hard to read on purpose.
fig = plt.figure()
fig.suptitle("Fraction of empty values per column")
ax = fig.add_subplot()
ax.bar(x_positions, bar_values)
ax.set_ylabel("Percent of empty values")
ax.set_xlabel("Column")
ax.set_xticks(x_positions)
ax.set_xticklabels(labels)
# FIX: dropped the original ax.legend() call — no artist here has a label,
# so it only produced a "no artists with labels" warning.
fig.savefig("naive_chart.png")

# Cleaner chart using the object-oriented interface: explicit size/DPI,
# rotated labels, and per-bar value annotations.
fig = plt.figure(figsize=(16, 9), tight_layout=True, dpi=600)
fig.suptitle("Fraction of empty values per column", fontsize="48")
ax = fig.add_subplot()
b1 = ax.bar(x_positions, bar_values)
ax.set_ylabel("Percent of empty values", fontsize="xx-large")
ax.set_xticks(x_positions)
ax.set_xticklabels(labels, rotation=45, ha="right")
ax.set_ylim(0, 100)
ax.set_xlim(-0.5, len(labels))
for i, x in enumerate(x_positions):
    ax.text(
        x, 2, "%.1f" % bar_values[i], rotation=90,
        va="bottom", ha="center",
        backgroundcolor="white")
fig.text(0.2, 0.01, "Column", fontsize="xx-large")
fig.savefig("cleaner_chart.png")

# Join death reports with the vaccine table; VAERS_ID is the shared key
# (a column in vdata, the index of vax), inner join keeps matched rows only.
dead = vdata[vdata.DIED == "Y"]
vax = pd.read_csv("2021VAERSVAX.csv.gz", encoding="iso-8859-1").set_index("VAERS_ID")
vax.groupby("VAX_TYPE").size().sort_values()
vax_dead = dead.join(vax, on="VAERS_ID", how="inner")
vax_dead.iloc[0]

# Vaccine types involved in deaths; lump rare types (<10 cases) into "OTHER".
dead_counts = vax_dead["VAX_TYPE"].value_counts()
large_values = dead_counts[dead_counts >= 10]
other_sum = dead_counts[dead_counts < 10].sum()
# FIX: Series.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat builds the same extended Series.
large_values = pd.concat([large_values, pd.Series({"OTHER": other_sum})])

# Days between vaccination and death, restricted to internally-consistent
# 2021 dates. .copy() avoids chained-assignment warnings on the
# to_datetime assignments below.
distance_df = vax_dead[vax_dead.DATEDIED.notna() & vax_dead.VAX_DATE.notna()].copy()
distance_df["DATEDIED"] = pd.to_datetime(distance_df["DATEDIED"])
distance_df["VAX_DATE"] = pd.to_datetime(distance_df["VAX_DATE"])
distance_df = distance_df[distance_df.DATEDIED >= "2021"]
distance_df = distance_df[distance_df.VAX_DATE >= "2021"]
distance_df = distance_df[distance_df.DATEDIED >= distance_df.VAX_DATE]
time_distances = distance_df["DATEDIED"] - distance_df["VAX_DATE"]
time_distances_d = time_distances.astype(int) / (10**9 * 60 * 60 * 24)  # nanoseconds -> days

# Daily and cumulative deaths across 2021.
date_died = pd.to_datetime(vax_dead[vax_dead.DATEDIED.notna()]["DATEDIED"])
date_died = date_died[date_died >= "2021"]
date_died_counts = date_died.value_counts().sort_index()
cum_deaths = date_died_counts.cumsum()

# Deaths stratified by sex for the ten most-reported states.
state_dead = vax_dead[vax_dead["STATE"].notna()][["STATE", "SEX"]]
top_states = sorted(state_dead["STATE"].value_counts().head(10).index)
top_state_dead = state_dead[state_dead["STATE"].isin(top_states)].groupby(["STATE", "SEX"]).size()#.reset_index()
# HACK: one (state, sex) combination is missing from the grouped counts, so
# insert an explicit 0 to keep the F/M/U series aligned across all states.
# NOTE(review): hard-coded to ("MN", "U") — verify against other data years.
top_state_dead.loc["MN", "U"] = 0
top_state_dead = top_state_dead.sort_index().reset_index()
top_state_females = top_state_dead[top_state_dead.SEX == "F"][0]
top_state_males = top_state_dead[top_state_dead.SEX == "M"][0]
top_state_unk = top_state_dead[top_state_dead.SEX == "U"][0]

# Assemble the 2x2 summary figure.
fig, ((vax_cnt, time_dist), (death_time, state_reps)) = plt.subplots(
    2, 2,
    figsize=(16, 9), tight_layout=True, dpi=600)
vax_cnt.set_title("Vaccines involved in deaths")
wedges, texts = vax_cnt.pie(large_values)
vax_cnt.legend(wedges, large_values.index, loc="lower left")
time_dist.hist(time_distances_d, bins=50)
time_dist.set_title("Days between vaccine administration and death")
time_dist.set_xlabel("Days")
time_dist.set_ylabel("Observations")
death_time.plot(date_died_counts.index, date_died_counts, ".")
death_time.set_title("Deaths over time")
death_time.set_ylabel("Daily deaths")
death_time.set_xlabel("Date")
# Second y-axis carries the cumulative total.
tw = death_time.twinx()
tw.plot(cum_deaths.index, cum_deaths)
tw.set_ylabel("Cumulative deaths")  # FIX: typo "Cummulative"
state_reps.set_title("Deaths per state stratified by sex")
state_reps.bar(top_states, top_state_females, label="Females")
state_reps.bar(top_states, top_state_males, label="Males", bottom=top_state_females)
state_reps.bar(top_states, top_state_unk, label="Unknown",
               bottom=top_state_females.values + top_state_males.values)
state_reps.legend()
state_reps.set_xlabel("State")
state_reps.set_ylabel("Deaths")
fig.savefig("summary.png")
fig
================================================
FILE: Chapter02/NumPy.py
================================================
# Cross-tabulate VAERS 2021 adverse events by US state and 20-year age
# bracket, building the matrix with NumPy.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

vdata = pd.read_csv(
    "2021VAERSDATA.csv.gz", encoding="iso-8859-1")
# Normalize state codes before grouping (raw data mixes letter case).
vdata["STATE"] = vdata["STATE"].str.upper()

# The five states with the most reports, ranked 0..4 by report count.
top_states = pd.DataFrame({
    "size": vdata.groupby("STATE").size().sort_values(ascending=False).head(5)}).reset_index()
top_states["rank"] = top_states.index
top_states = top_states.set_index("STATE")

# FIX: .copy() — the original assigned a new column to a view of vdata,
# which raises SettingWithCopyWarning and may fail to stick.
top_vdata = vdata[vdata["STATE"].isin(top_states.index)].copy()
top_vdata["state_code"] = top_vdata["STATE"].apply(
    lambda state: top_states["rank"].at[state]
).astype(np.uint8)
# Keep only rows with a known age; .copy() again so the .loc write below
# targets an owned frame, not a view.
top_vdata = top_vdata[top_vdata["AGE_YRS"].notna()].copy()
top_vdata.loc[:, "AGE_YRS"] = top_vdata["AGE_YRS"].astype(int)
top_states

# Pull the two columns of interest out as raw NumPy arrays.
age_state = top_vdata[["state_code", "AGE_YRS"]]
age_state["state_code"]
state_code_arr = age_state["state_code"].values
type(state_code_arr), state_code_arr.shape, state_code_arr.dtype
age_state["AGE_YRS"]
age_arr = age_state["AGE_YRS"].values
type(age_arr), age_arr.shape, age_arr.dtype
age_arr.max()

# Count matrix: 5 states x 6 twenty-year age brackets (ages 0-119).
age_state_mat = np.zeros((5, 6), dtype=np.uint64)
for row in age_state.itertuples():
    age_state_mat[row.state_code, row.AGE_YRS // 20] += 1
age_state_mat
cal = age_state_mat[0, :]   # row of the top-ranked state (name suggests California — TODO confirm)
kids = age_state_mat[:, 0]  # 0-19 age bracket across all five states
def compute_frac(arr_1d):
    """Return each element of a 1-D array as a fraction of the array's total."""
    total = arr_1d.sum()
    return arr_1d / total
# Convert counts to row-wise fractions, then to integer percentages
frac_age_stat_mat = np.apply_along_axis(compute_frac, 1, age_state_mat)
perc_age_stat_mat = frac_age_stat_mat * 100
perc_age_stat_mat = perc_age_stat_mat.astype(np.uint8)
perc_age_stat_mat
# Keep only the first five age bins for the plot
perc_age_stat_mat = perc_age_stat_mat[:, :5]
perc_age_stat_mat
# Grey-scale matrix plot: one row per state, one column per age bin
fig = plt.figure()
ax = fig.add_subplot()
ax.matshow(perc_age_stat_mat, cmap=plt.get_cmap("Greys"))
ax.set_yticks(range(5))
ax.set_yticklabels(top_states.index)
ax.set_xticks(range(6))
ax.set_xticklabels(["0-19", "20-39", "40-59", "60-79", "80-99", "100-119"])
fig.savefig("matrix.png")
================================================
FILE: Chapter02/Pandas_Basic.py
================================================
# # Using Pandas to process vaccine adverse events
#
# ## Data Access
#
# Go to https://vaers.hhs.gov/data/datasets.html and Download 2021 **zip** Data. Please do not download only the CSV File.
#
# Drop it on the directory where this notebook is.
# !unzip 2021VAERSData.zip
# !gzip -9 *csv
import pandas as pd
import matplotlib.pyplot as plt
# Main VAERS table: one row per adverse-event report
vdata = pd.read_csv(
    "2021VAERSDATA.csv.gz", encoding="iso-8859-1")
# First look at the structure of the data
vdata.columns
vdata.dtypes
vdata.shape
vdata.iloc[0]
# Index by report identifier so individual reports can be fetched with .loc
vdata = vdata.set_index("VAERS_ID")
vdata.loc[916600]
vdata.head(3)
vdata.iloc[:3]
vdata.iloc[:5, 2:4]
vdata["AGE_YRS"].max()
vdata.AGE_YRS.max()
# NOTE(review): the next line is duplicated — presumably a notebook leftover
vdata["AGE_YRS"].sort_values().plot(use_index=False)
vdata["AGE_YRS"].sort_values().plot(use_index=False)
# Side-by-side: sorted ages and a horizontal age histogram
fig, ax = plt.subplots(1, 2, sharey=True, dpi=300)
fig.suptitle("Age of adverse events")
vdata["AGE_YRS"].sort_values().plot(
    use_index=False, ax=ax[0],
    xlabel="Obervation", ylabel="Age")
# NOTE(review): no ax= given here, so the histogram goes to the current axes
vdata["AGE_YRS"].plot.hist(bins=20, orientation="horizontal")
fig.savefig("adverse.png")
vdata["AGE_YRS"].dropna().apply(lambda x: int(x)).value_counts()
# not documented
vdata.DIED.value_counts(dropna=False)
# NA is a problem, how to be implemented
vdata["is_dead"] = (vdata.DIED == "Y")
dead = vdata[vdata.is_dead]
# Vaccine table: which vaccine(s) each report refers to
vax = pd.read_csv("2021VAERSVAX.csv.gz", encoding="iso-8859-1").set_index("VAERS_ID")
print(vax.columns)
print(vax.shape)
print(vax.VAX_TYPE.unique())
vax.groupby("VAX_TYPE").size().sort_values()
vax19 = vax[vax.VAX_TYPE == "COVID19"]
# Deaths joined with their COVID-19 vaccine records (index join on VAERS_ID)
vax19_dead = dead.join(vax19)
# join on id, discuss
vax19_dead.index.value_counts()
# Vaccine lots with the most associated deaths, plus distinct states per lot
baddies = vax19_dead.groupby("VAX_LOT").size().sort_values(ascending=False)
for i, (lot, cnt) in enumerate(baddies.items()):
    print(lot, cnt, len(vax19_dead[vax19_dead.VAX_LOT == lot].groupby("STATE")))
    if i == 10:
        break
# The data above is not totally correct - at least in terms of interpretation, but for that we need to check the next recipe
================================================
FILE: Chapter02/Pandas_Join.py
================================================
# # Pandas advanced
import numpy as np
import pandas as pd
# # Code to sample original data
#
# ```
# vdata = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1")
# vdata.sample(frac=0.9).to_csv("vdata_sample.csv.gz", index=False)
# vax = pd.read_csv("2021VAERSVAX.csv.gz", encoding="iso-8859-1")
# vax.sample(frac=0.9).to_csv("vax_sample.csv.gz", index=False)
# ```
# 90% samples of both tables, so some rows of each side lack a match in the other
vdata = pd.read_csv("vdata_sample.csv.gz")  # No encoding
vax = pd.read_csv("vax_sample.csv.gz")
# Inner join: only reports present in both samples survive
vdata_with_vax = vdata.join(
    vax.set_index("VAERS_ID"),
    on="VAERS_ID",
    how="inner")
len(vdata), len(vax), len(vdata_with_vax)
# Rows lost by the inner join, inspected from each side
lost_vdata = vdata.loc[~vdata.index.isin(vdata_with_vax.index)]
lost_vdata
lost_vax = vax[~vax["VAERS_ID"].isin(vdata_with_vax["VAERS_ID"])]
lost_vax
# Left, Right and outer caveats
vdata_with_vax_left = vdata.join(
    vax.set_index("VAERS_ID"),
    on="VAERS_ID")
# A left join repeats a report whenever it matches several vaccine rows
vdata_with_vax_left.groupby("VAERS_ID").size().sort_values()
len(vdata_with_vax_left), len(vdata_with_vax_left.VAERS_ID.unique())
# +
#vdata_all = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1")
#vax_all = pd.read_csv("2021VAERSVAX.csv.gz", encoding="iso-8859-1")
# -
dead = vdata[vdata.DIED == "Y"]
vax19 = vax[vax.VAX_TYPE == "COVID19"]
# Right join keeps every death, with or without a matching vaccine row
vax19_dead = vax19.join(dead.set_index("VAERS_ID"), on="VAERS_ID", how="right")
# join on id, discuss
len(vax19), len(dead), len(vax19_dead)
len(vax19_dead[vax19_dead.VAERS_ID.duplicated()])
len(vax19_dead) - len(dead)
vax19_dead["STATE"] = vax19_dead["STATE"].str.upper()
# De-duplicate (report, lot) pairs before counting deaths per lot
dead_lot = vax19_dead[["VAERS_ID", "VAX_LOT", "STATE"]].set_index(["VAERS_ID", "VAX_LOT"])
dead_lot_clean = dead_lot[~dead_lot.index.duplicated()]
dead_lot_clean = dead_lot_clean.reset_index()
dead_lot_clean[dead_lot_clean.VAERS_ID.isna()]
baddies = dead_lot_clean.groupby("VAX_LOT").size().sort_values(ascending=False)
for i, (lot, cnt) in enumerate(baddies.items()):
    print(lot, cnt, len(dead_lot_clean[dead_lot_clean.VAX_LOT == lot].groupby("STATE")))
    if i == 10:
        break
================================================
FILE: Chapter02/Pandas_Memory.py
================================================
# # Pandas advanced
import numpy as np
import pandas as pd
vdata = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1")
vdata.info(memory_usage="deep")
# Per-column memory cost in MiB ("deep" accounts for the Python string objects)
for name in vdata.columns:
    col_bytes = vdata[name].memory_usage(index=False, deep=True)
    col_type = vdata[name].dtype
    print(
        name,
        col_type, col_bytes // (1024 ** 2))
vdata.DIED.memory_usage(index=False, deep=True)
# A bool column is far cheaper than the original Y/NaN object column
vdata.DIED.fillna(False).astype(bool).memory_usage(index=False, deep=True)
vdata.STATE.unique()
vdata["STATE"] = vdata.STATE.str.upper()
states = list(vdata["STATE"].unique())
states
# Replace state strings by a small integer code (manual label encoding)
vdata["encoded_state"] = vdata.STATE.apply(lambda state: states.index(state))
vdata["encoded_state"] = vdata["encoded_state"].astype(np.uint8)
vdata[["encoded_state", "STATE"]].head(10)
# Compare memory of the string column vs. its uint8 encoding
vdata["STATE"].memory_usage(index=False, deep=True)
vdata["encoded_state"].memory_usage(index=False, deep=True)
vdata.index
# Apply the same savings at load time: converters, a real index,
# and dropping the heavy free-text SYMPTOM_TEXT column
states = list(pd.read_csv(
    "vdata_sample.csv.gz",
    converters={
        "STATE": lambda state: state.upper()  # You need to know the states in advance
    },
    usecols=["STATE"]
)["STATE"].unique())
vdata = pd.read_csv(
    "vdata_sample.csv.gz",
    index_col="VAERS_ID",
    converters={
        "DIED": lambda died: died == "Y",
        "STATE": lambda state: states.index(state.upper())
    },
    usecols=lambda name: name != "SYMPTOM_TEXT"
)
vdata["STATE"] = vdata["STATE"].astype(np.uint8)
vdata.info(memory_usage="deep")
================================================
FILE: Chapter03/Accessing_Databases.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.4
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---
from Bio import Entrez, Medline, SeqIO
# ### Do not forget to inform NCBI of your email address (change below)
Entrez.email = "put@your_email.here"
#This gives you the list of available databases
handle = Entrez.einfo()
rec = Entrez.read(handle)
print(rec)
# Search the nucleotide database for the P. falciparum CRT gene
handle = Entrez.esearch(db="nucleotide", term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]')
rec_list = Entrez.read(handle)
# If the default page (RetMax) did not cover all hits, repeat asking for every hit
if int(rec_list['RetMax']) < int(rec_list['Count']):
    handle = Entrez.esearch(db="nucleotide", term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]',
                            retmax=rec_list['Count'])
    rec_list = Entrez.read(handle)
# Download all matching GenBank records in a single efetch call
id_list = rec_list['IdList']
hdl = Entrez.efetch(db='nucleotide', id=id_list, rettype='gb', retmax=rec_list['Count'])
recs = list(SeqIO.parse(hdl, 'gb'))
# Pick one well-known record to inspect in detail
for rec in recs:
    if rec.name == 'KM288867':
        break
print(rec.name)
print(rec.description)
# Walk the record's features, printing gene names and exon coordinates
for feature in rec.features:
    if feature.type == 'gene':
        print(feature.qualifiers['gene'])
    elif feature.type == 'exon':
        loc = feature.location
        print('Exon', loc.start, loc.end, loc.strand)
    else:
        print('not processed:\n%s' % feature)
for name, value in rec.annotations.items():
    print('%s=%s' % (name, value))
print(len(rec.seq))
# Follow the record's literature references into PubMed/Medline
refs = rec.annotations['references']
print(refs)
for ref in refs:
    if ref.pubmed_id != '':
        print(ref.pubmed_id)
        handle = Entrez.efetch(db="pubmed", id=[ref.pubmed_id],
                               rettype="medline", retmode="text")
        records = Medline.parse(handle)
        for med_rec in records:
            for k, v in med_rec.items():
                print('%s: %s' % (k, v))
================================================
FILE: Chapter03/Basic_Sequence_Processing.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.4
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---
from Bio import Entrez, Seq, SeqIO, SeqRecord
Entrez.email = "put@your_email.here"
hdl = Entrez.efetch(db='nucleotide', id=['NM_002299'], rettype='gb')  # Lactase gene
#for l in hdl:
#    print l
gb_rec = SeqIO.read(hdl, 'gb')
# Locate the coding sequence (CDS) feature; `location` keeps the last CDS found
for feature in gb_rec.features:
    if feature.type == 'CDS':
        location = feature.location  # Note translation existing
# Build a record holding only the CDS slice and write it out as FASTA
cds = SeqRecord.SeqRecord(gb_rec.seq[location.start:location.end], 'NM_002299', description='LCT CDS only')
w_hdl = open('example.fasta', 'w')
SeqIO.write([cds], w_hdl, 'fasta')
w_hdl.close()
# Read the FASTA back and inspect the sequence
recs = SeqIO.parse('example.fasta', 'fasta')
for rec in recs:
    seq = rec.seq
    print(rec.description)
print(seq[:10])
print((seq[:12], seq[-12:]))
# Transcribe to RNA and translate to protein
rna = seq.transcribe()
rna
prot = seq.translate()
prot
================================================
FILE: Chapter03/Filtering_SNPs.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.4
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# # Getting the necessary data
# You will need to do this only once
# !rm -rf centro.vcf.gz 2>/dev/null
# !rm -rf standard.vcf.gz 2>/dev/null
# !tabix -fh ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/preview/ag1000g.AC.phase1.AR1.vcf.gz 3L:1-200000 |bgzip -c > centro.vcf.gz
# !tabix -fh ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/preview/ag1000g.AC.phase1.AR1.vcf.gz 3L:21000000-21200000 |bgzip -c > standard.vcf.gz
# !tabix -p vcf centro.vcf.gz
# !tabix -p vcf standard.vcf.gz
# # Recipe
# +
from collections import defaultdict
import functools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from cyvcf2 import VCF
# -
def do_window(recs, size, fun):
    """Bucket biallelic SNP records into consecutive windows of `size` bp.

    The first qualifying record anchors window 0. For every biallelic SNP,
    `fun(rec)` must return a list, whose items are appended to that
    record's window. Returns the list of windows (each a list of values);
    intermediate empty windows are materialized as [].
    """
    windows = []
    first_pos = None
    for rec in recs:
        # Only biallelic SNPs participate
        if not rec.is_snp or len(rec.ALT) > 1:
            continue
        if first_pos is None:
            first_pos = rec.POS
        win_idx = (rec.POS - first_pos) // size
        # Grow the window list up to (and including) this record's window
        while len(windows) <= win_idx:
            windows.append([])
        windows[win_idx].extend(fun(rec))
    return windows
def apply_win_funs(wins, funs):
    """Apply each named statistic in `funs` to every window in `wins`.

    wins -- list of windows (each a list of values), e.g. from do_window
    funs -- mapping of statistic name -> callable taking one window

    Returns a list (one dict per window) mapping each statistic name to its
    value, or to None when the statistic raised (e.g. max()/median of an
    empty window).
    """
    fun_results = []
    for win in wins:
        my_funs = {}
        for name, fun in funs.items():
            try:
                my_funs[name] = fun(win)
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit are no longer swallowed; a failing statistic
                # is still recorded as None instead of aborting the scan.
                my_funs[name] = None
        fun_results.append(my_funs)
    return fun_results
# Count biallelic SNPs per 2 kb window in both downloaded regions
wins = {}
size = 2000
names = ['centro.vcf.gz', 'standard.vcf.gz']
for name in names:
    recs = VCF(name)
    wins[name] = do_window(recs, size, lambda x: [1])
stats = {}
fig, ax = plt.subplots(figsize=(16, 9), dpi=300, tight_layout=True)
for name, nwins in wins.items():
    stats[name] = apply_win_funs(nwins, {'sum': sum})
    x_lim = [i * size for i in range(len(stats[name]))]
    ax.plot(x_lim, [x['sum'] for x in stats[name]], label=name)
ax.legend()
ax.set_xlabel('Genomic location in the downloaded segment', fontsize='xx-large')
ax.set_ylabel('Number of variant sites (bi-allelic SNPs)', fontsize='xx-large')
fig.suptitle('Number of bi-allelic SNPs along the genome', fontsize='xx-large')
fig.savefig('bi.png')
# +
# MQ0 (reads with mapping quality zero) per 5 kb window
mq0_wins = {}
size = 5000
def get_sample(rec, annot, my_type):
    """Return per-sample values of FORMAT field `annot` for one record,
    dropping entries equal to the integer "missing" sentinel of `my_type`
    (the minimum value of that integer dtype)."""
    sentinel = np.iinfo(my_type).min
    return [value for value in rec.format(annot) if value > sentinel]
for name in names:
    recs = VCF(name)
    mq0_wins[name] = do_window(recs, size, functools.partial(get_sample, annot='MQ0', my_type=np.int32))
# -
stats = {}
colors = ['b', 'g']
i = 0
fig, ax = plt.subplots(figsize=(16, 9))
for name, nwins in mq0_wins.items():
    # NOTE(review): the key is named '75' but the statistic computed is the
    # 95th percentile (q=95) — confirm which was intended
    stats[name] = apply_win_funs(nwins, {'median': np.median, '75': functools.partial(np.percentile, q=95)})
    x_lim = [j * size for j in range(len(stats[name]))]
    ax.plot(x_lim, [x['median'] for x in stats[name]], label=name, color=colors[i])
    ax.plot(x_lim, [x['75'] for x in stats[name]], '--', color=colors[i])
    i += 1
#ax.set_ylim(0, 40)
ax.legend()
ax.set_xlabel('Genomic location in the downloaded segment', fontsize='xx-large')
ax.set_ylabel('MQ0', fontsize='xx-large')
fig.suptitle('Distribution of MQ0 along the genome', fontsize='xx-large')
fig.savefig('MQ0.png')
def get_sample_relation(recs, f1, f2):
    """Count co-occurrences of two per-sample metrics across SNP records.

    For every SNP record and every sample index, evaluates
    f1(rec, sample_idx) and f2(rec, sample_idx) and increments the counter
    for that value pair. Pairs where f1 yields None, or f2 yields the
    integer "missing" sentinel (the minimum of its own integer type), are
    skipped. Returns a defaultdict mapping (v1, v2) -> count.
    """
    counts = defaultdict(int)
    for rec in recs:
        if not rec.is_snp:
            continue
        for sample_idx in range(len(rec.genotypes)):
            first = f1(rec, sample_idx)
            second = f2(rec, sample_idx)
            # Skip missing data from either metric
            if first is None or second == np.iinfo(type(second)).min:
                continue
            counts[(first, second)] += 1
            # careful with the size, floats: round?
    return counts
# Relate heterozygosity (1 = het, 0 = hom) to per-sample read depth (DP)
rels = {}
for name in names:
    recs = VCF(name)
    rels[name] = get_sample_relation(
        recs,
        lambda rec, pos: 1 if rec.genotypes[pos][0] != rec.genotypes[pos][1] else 0,
        lambda rec, pos: rec.format('DP')[pos][0])
# +
fig, ax = plt.subplots(figsize=(16, 9), dpi=300, tight_layout=True)
def plot_hz_rel(dps, ax, ax2, name, rel):
    """Plot, per sample read depth, the fraction of heterozygous calls
    (solid line on ax) and the total number of calls (dashed, on ax2).

    dps  -- sorted list of depth values to evaluate
    rel  -- mapping (is_het, dp) -> call count, as built by get_sample_relation
    name -- legend label for this series
    """
    frac_hz = []
    cnt_dp = []
    for dp in dps:
        hz = 0.0
        cnt = 0
        # Sum calls at this depth over both het (khz == 1) and hom keys
        for khz, kdp in rel.keys():
            if kdp != dp:
                continue
            cnt += rel[(khz, dp)]
            if khz == 1:
                hz += rel[(khz, dp)]
        # cnt > 0 because every dp in dps comes from a key of rel
        frac_hz.append(hz / cnt)
        cnt_dp.append(cnt)
    ax.plot(dps, frac_hz, label=name)
    ax2.plot(dps, cnt_dp, '--', label=name)
# Twin y-axis: fraction heterozygous (left) vs. number of calls (right)
ax2 = ax.twinx()
for name, rel in rels.items():
    dps = list(set([x[1] for x in rel.keys()]))
    dps.sort()
    plot_hz_rel(dps, ax, ax2, name, rel)
ax.set_xlim(0, 75)
ax.set_ylim(0, 0.2)
ax2.set_ylabel('Quantity of calls', fontsize='xx-large')
ax.set_ylabel('Fraction of Heterozygote calls', fontsize='xx-large')
ax.set_xlabel('Sample Read Depth (DP)', fontsize='xx-large')
ax.legend()
fig.suptitle('Number of calls per depth and fraction of calls which are Hz',
             fontsize='xx-large')
fig.savefig('hz.png')
# -
def get_variant_relation(recs, f1, f2):
    """Count co-occurrences of two per-variant metrics across SNP records.

    For every SNP record, evaluates f1(rec) and f2(rec) and increments the
    counter for that value pair. Pairs where either metric is None, or where
    evaluating a metric raises (typically a missing INFO field), are skipped.
    Returns a defaultdict mapping (v1, v2) -> count.
    """
    rel = defaultdict(int)
    for rec in recs:
        if not rec.is_snp:
            continue
        try:
            v1 = f1(rec)
            v2 = f2(rec)
            if v1 is None or v2 is None:
                continue  # We ignore Nones
            rel[(v1, v2)] += 1
            # careful with the size, floats: round?
        except Exception:
            # Metric outside its domain (typically a missing annotation).
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            pass
    return rel
# +
accepted_eff = ['INTERGENIC', 'INTRON', 'NON_SYNONYMOUS_CODING', 'SYNONYMOUS_CODING']
def eff_to_int(rec):
    """Map a variant's EFF annotation to an index into accepted_eff.

    The effect class is the text before the first '(' of INFO['EFF'].
    Classes not in accepted_eff map to len(accepted_eff) (the "OTHER"
    bucket); a missing EFF field propagates its exception to the caller.
    """
    try:
        effect_class = rec.INFO['EFF'].split('(')[0]
        return accepted_eff.index(effect_class)
    except ValueError:
        # Not one of the accepted effect classes
        return len(accepted_eff)
# -
# Relate snpEff effect class to variant-level read depth (DP)
eff_mq0s = {}
for name in names:
    recs = VCF(name)
    eff_mq0s[name] = get_variant_relation(
        recs,
        lambda r: eff_to_int(r), lambda r: int(r.INFO['DP']))
fig, ax = plt.subplots(figsize=(16,9), dpi=300, tight_layout=True)
name = 'standard.vcf.gz'
# One list of DP values per effect class, plus the trailing "OTHER" bucket
bp_vals = [[] for x in range(len(accepted_eff) + 1)]
for k, cnt in eff_mq0s[name].items():
    my_eff, mq0 = k
    bp_vals[my_eff].extend([mq0] * cnt)
#memory usage
#print(bp_vals[-2])
sns.boxplot(data=bp_vals, sym='', ax=ax)
ax.set_xticklabels(accepted_eff + ['OTHER'])
ax.set_ylabel('DP (variant)', fontsize='xx-large')
fig.suptitle('Distribution of variant DP per SNP type',
             fontsize='xx-large')
fig.savefig('eff.png')
================================================
FILE: Chapter03/LCT.bed
================================================
track name=gene description="Gene information"
2 135836529 135837180 ENSE00002202258 0 -
2 135833110 135833190 ENSE00001660765 0 -
2 135829592 135829676 ENSE00001731451 0 -
2 135823900 135824003 ENSE00001659892 0 -
2 135822019 135822098 ENSE00001777620 0 -
2 135817340 135818061 ENSE00001602826 0 -
2 135812310 135812956 ENSE00000776576 0 -
2 135808442 135809993 ENSE00001008768 0 -
2 135807127 135807396 ENSE00000776573 0 -
2 135804766 135805057 ENSE00000776572 0 -
2 135803929 135804128 ENSE00000776571 0 -
2 135800606 135800809 ENSE00000776570 0 -
2 135798028 135798138 ENSE00003515081 0 -
2 135794640 135794775 ENSE00001630333 0 -
2 135790657 135790881 ENSE00001667885 0 -
2 135789570 135789798 ENSE00001728878 0 -
2 135787839 135788544 ENSE00001653704 0 -
2 135812310 135812959 ENSE00001745158 0 -
2 135808442 135809993 ENSE00001008768 0 -
2 135807127 135807396 ENSE00000776573 0 -
2 135804766 135805057 ENSE00000776572 0 -
2 135803929 135804128 ENSE00000776571 0 -
2 135798028 135798138 ENSE00003459353 0 -
2 135794336 135794775 ENSE00001635523 0 -
2 135810168 135810279 ENSE00001438557 0 -
2 135820190 135820639 ENSE00001732580 0 +
2 135821674 135823087 ENSE00001695040 0 +
2 135836529 135837180 NM_002299.2.1 0 -
2 135833110 135833190 NM_002299.2.2 0 -
2 135829592 135829676 NM_002299.2.3 0 -
2 135823900 135824003 NM_002299.2.4 0 -
2 135822019 135822098 NM_002299.2.5 0 -
2 135817340 135818061 NM_002299.2.6 0 -
2 135812310 135812956 NM_002299.2.7 0 -
2 135808442 135809993 NM_002299.2.8 0 -
2 135807127 135807396 NM_002299.2.9 0 -
2 135804766 135805057 NM_002299.2.10 0 -
2 135803929 135804128 NM_002299.2.11 0 -
2 135800606 135800809 NM_002299.2.12 0 -
2 135798028 135798138 NM_002299.2.13 0 -
2 135794640 135794775 NM_002299.2.14 0 -
2 135790657 135790881 NM_002299.2.15 0 -
2 135789570 135789798 NM_002299.2.16 0 -
2 135787844 135788544 NM_002299.2.17 0 -
2 135836529 135837169 CCDS2178.117 0 -
2 135833110 135833190 CCDS2178.116 0 -
2 135829592 135829676 CCDS2178.115 0 -
2 135823900 135824003 CCDS2178.114 0 -
2 135822019 135822098 CCDS2178.113 0 -
2 135817340 135818061 CCDS2178.112 0 -
2 135812310 135812956 CCDS2178.111 0 -
2 135808442 135809993 CCDS2178.110 0 -
2 135807127 135807396 CCDS2178.19 0 -
2 135804766 135805057 CCDS2178.18 0 -
2 135803929 135804128 CCDS2178.17 0 -
2 135800606 135800809 CCDS2178.16 0 -
2 135798028 135798138 CCDS2178.15 0 -
2 135794640 135794775 CCDS2178.14 0 -
2 135790657 135790881 CCDS2178.13 0 -
2 135789570 135789798 CCDS2178.12 0 -
2 135788323 135788544 CCDS2178.11 0 -
================================================
FILE: Chapter03/Processing_BED_with_HTSeq.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.4
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
from collections import defaultdict
import re
import HTSeq
lct_bed = HTSeq.BED_Reader('LCT.bed')
# +
# Count records per feature family (leading upper-case prefix of the name)
feature_types = defaultdict(int)
for rec in lct_bed:
    last_rec = rec
    feature_types[re.search('([A-Z]+)', rec.name).group(0)] += 1
print(feature_types)
#Code specific to this dataset, document
# -
print(last_rec)
print(last_rec.name)
print(type(last_rec))
# A BED record's coordinates live in its GenomicInterval (.iv)
interval = last_rec.iv
print(interval)
print(type(interval))
# +
print(interval.chrom, interval.start, interval.end)
print(interval.strand)
print(interval.length)
print(interval.start_d)
print(interval.start_as_pos)
print(type(interval.start_as_pos))
#talk about overlaps
# -
# Span and size statistics of the CCDS (consensus coding sequence) exons
exon_start = None
exon_end = None
sizes = []
for rec in lct_bed:
    if not rec.name.startswith('CCDS'):
        continue
    interval = rec.iv
    exon_start = min(interval.start, exon_start or interval.start)
    # BUG FIX: the original computed max(interval.length, ...), comparing an
    # exon LENGTH against genomic END coordinates; the overall end of the
    # span must be the maximum of the exon ends.
    exon_end = max(interval.end, exon_end or interval.end)
    sizes.append(interval.length)
sizes.sort()
print("Num exons: %d / Begin: %d / End %d" % (len(sizes), exon_start, exon_end))
print("Smaller exon: %d / Larger exon: %d / Mean size: %.1f" % (sizes[0], sizes[-1], sum(sizes)/len(sizes)))
================================================
FILE: Chapter03/Working_with_BAM.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.4
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# # Getting the necessary data
# You just need to do this only once
# !rm -f NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam 2>/dev/null
# !rm -f NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai 2>/dev/null
# !wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam
# !wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai
# # The recipe
# +
#pip install pysam
from collections import defaultdict
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pysam
# -
bam = pysam.AlignmentFile('NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam', 'rb')
# Dump the BAM header: record types and their fields
headers = bam.header
for record_type, records in headers.items():
    print (record_type)
    for i, record in enumerate(records):
        if type(record) == dict:
            print('\t%d' % (i + 1))
            for field, value in record.items():
                print('\t\t%s\t%s' % (field, value))
        else:
            print('\t\t%s' % record)
#0-based
# Find a mapped, pair-mapped read whose CIGAR contains both a match (M)
# and a soft clip (S), then inspect its attributes
for rec in bam:
    if rec.cigarstring.find('M') > -1 and rec.cigarstring.find('S') > -1 and not rec.is_unmapped and not rec.mate_is_unmapped:
        break
print(rec.query_name, rec.reference_id, bam.getrname(rec.reference_id), rec.reference_start, rec.reference_end)
print(rec.cigarstring)
print(rec.query_alignment_start, rec.query_alignment_end, rec.query_alignment_length)
print(rec.next_reference_id, rec.next_reference_start, rec.template_length)
print(rec.is_paired, rec.is_proper_pair, rec.is_unmapped, rec.mapping_quality)
print(rec.query_qualities)
print(rec.query_alignment_qualities)
print(rec.query_sequence)
# Percentage of reads (first 10 Mbp of chr20) whose alignment covers each
# of the 76 read positions
counts = [0] * 76
for n, rec in enumerate(bam.fetch('20', 0, 10000000)):
    for i in range(rec.query_alignment_start, rec.query_alignment_end):
        counts[i] += 1
freqs = [100 * x / (n + 1) for x in counts]
fig, ax = plt.subplots(figsize=(16,9), dpi=300, tight_layout=True)
ax.plot(range(1, 77), freqs)
ax.set_xlabel('Read distance', fontsize='xx-large')
ax.set_ylabel('PHRED score', fontsize='xx-large')
fig.suptitle('Percentage of mapped calls as a function of the position from the start of the sequencer read', fontsize='xx-large')
fig.savefig('map_perc.png')
# PHRED quality distribution per aligned read position
phreds = defaultdict(list)
for rec in bam.fetch('20', 0, None):
    for i in range(rec.query_alignment_start, rec.query_alignment_end):
        phreds[i].append(rec.query_qualities[i])
maxs = [max(phreds[i]) for i in range(76)]
tops = [np.percentile(phreds[i], 95) for i in range(76)]
medians = [np.percentile(phreds[i], 50) for i in range(76)]
bottoms = [np.percentile(phreds[i], 5) for i in range(76)]
# Stack heights: deltas between consecutive percentile bands
medians_fig = [x - y for x, y in zip(medians, bottoms)]
tops_fig = [x - y for x, y in zip(tops, medians)]
maxs_fig = [x - y for x, y in zip(maxs, tops)]
fig, ax = plt.subplots(figsize=(16,9),dpi=300, tight_layout=True)
ax.stackplot(range(1, 77), (bottoms, medians_fig, tops_fig, maxs_fig))
ax.plot(range(1, 77), maxs, 'k-')
ax.set_xlabel('Read distance', fontsize='xx-large')
ax.set_ylabel('PHRED score', fontsize='xx-large')
fig.suptitle('Distribution of PHRED scores as a function of the position in the read', fontsize='xx-large')
fig.savefig('phred2.png')
================================================
FILE: Chapter03/Working_with_FASTQ.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.4
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# # Getting the necessary data
# You just need to download this ~28 MB file only once
# !rm -f SRR003265.filt.fastq.gz 2>/dev/null
# !wget -nd ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265.filt.fastq.gz
# # The recipe
# +
from collections import defaultdict
import gzip
import seaborn as sns
import matplotlib.pyplot as plt
from Bio import SeqIO
# -
# FASTQ records are parsed lazily; per-base qualities live in letter_annotations
recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='utf-8'), 'fastq')
rec = next(recs)
print(rec.id, rec.description, rec.seq)
print(rec.letter_annotations)
# Nucleotide composition over the whole file
recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='utf-8'), 'fastq')
cnt = defaultdict(int)
for rec in recs:
    for letter in rec.seq:
        cnt[letter] += 1
tot = sum(cnt.values())
for letter, cnt in cnt.items():
    print('%s: %.2f %d' % (letter, 100 * cnt / tot, cnt))
# Where along the read do the uncalled bases (N) occur?
recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='UTF-8'), 'fastq')
n_cnt = defaultdict(int)
for rec in recs:
    for i, letter in enumerate(rec.seq):
        pos = i + 1
        if letter == 'N':
            n_cnt[pos] += 1
seq_len = max(n_cnt.keys())
positions = range(1, seq_len + 1)
fig, ax = plt.subplots(figsize=(16, 9), tight_layout=True, dpi=300)
fig.suptitle('Number of N calls as a function of the distance from the start of the sequencer read', fontsize='xx-large')
ax.plot(positions, [n_cnt[x] for x in positions])
ax.set_xlim(1, seq_len)
ax.set_xlabel('Read distance', fontsize='xx-large')
ax.set_ylabel('Number of N Calls', fontsize='xx-large')
fig.savefig('n_calls.png')
# PHRED quality distribution, skipping the first 25 read positions
recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='utf-8'), 'fastq')
cnt_qual = defaultdict(int)
for rec in recs:
    for i, qual in enumerate(rec.letter_annotations['phred_quality']):
        if i < 25:
            continue
        cnt_qual[qual] += 1
tot = sum(cnt_qual.values())
for qual, cnt in cnt_qual.items():
    print('%d: %.2f %d' % (qual, 100. * cnt / tot, cnt))
# Qualities per position (positions > 25; the value 40 is excluded)
recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='utf-8'), 'fastq')
qual_pos = defaultdict(list)
for rec in recs:
    for i, qual in enumerate(rec.letter_annotations['phred_quality']):
        if i < 25 or qual == 40:
            continue
        pos = i + 1
        qual_pos[pos].append(qual)
vps = []
poses = list(qual_pos.keys())
poses.sort()
for pos in poses:
    vps.append(qual_pos[pos])
fig, ax = plt.subplots(figsize=(16,9), dpi=300, tight_layout=True)
sns.boxplot(data=vps, ax=ax)
ax.set_xticklabels([str(x) for x in range(26, max(qual_pos.keys()) + 1)])
ax.set_xlabel('Read distance', fontsize='xx-large')
ax.set_ylabel('PHRED score', fontsize='xx-large')
fig.suptitle('Distribution of PHRED scores as a function of read distance', fontsize='xx-large')
fig.savefig('phred.png')
# # There is more...
# ## Do this to download the paired end data
# Be careful as this will be 1GB of data (and fully optional)
# !rm -f SRR003265_1.filt.fastq.gz 2>/dev/null
# !rm -f SRR003265_2.filt.fastq.gz 2>/dev/null
# !wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_1.filt.fastq.gz
# !wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_2.filt.fastq.gz
# +
# Paired-end files: reads come in matching order across the two files
f1 = gzip.open('SRR003265_1.filt.fastq.gz', 'rt', encoding='utf8')
f2 = gzip.open('SRR003265_2.filt.fastq.gz', 'rt', encoding='utf8')
recs1 = SeqIO.parse(f1, 'fastq')
recs2 = SeqIO.parse(f2, 'fastq')
cnt = 0
for rec1, rec2 in zip(recs1, recs2):
    cnt +=1
print('Number of pairs: %d' % cnt)
# -
================================================
FILE: Chapter03/Working_with_VCF.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.4
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# # Getting the necessary data
# You just need to do this only once
# !rm -f genotypes.vcf.gz 2>/dev/null
# !tabix -fh ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502/supporting/vcf_with_sample_level_annotation/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5_extra_anno.20130502.genotypes.vcf.gz 22:1-17000000|bgzip -c > genotypes.vcf.gz
# !tabix -p vcf genotypes.vcf.gz
# +
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
from cyvcf2 import VCF
# +
# First record: variant-level INFO keys and sample-level FORMAT keys
v = VCF('genotypes.vcf.gz')
rec = next(v)
print('Variant Level information')
info = rec.INFO
for info in rec.INFO:
    print(info)
print('Sample Level information')
for fmt in rec.FORMAT:
    print(fmt)
# +
v = VCF('genotypes.vcf.gz')
samples = v.samples
print(len(samples))  # Order change
variant = next(v)
print(variant.CHROM, variant.POS, variant.ID, variant.REF, variant.ALT, variant.QUAL, variant.FILTER)
print(variant.INFO)
print(variant.FORMAT)
print(variant.is_snp)
#rec.format('DP')
#rec.format('GT')
# First sample's genotype: allele string, allele indices, phasing flag
str_alleles = variant.gt_bases[0]
alleles = variant.genotypes[0][0:2]
is_phased = variant.genotypes[0][2]
print(str_alleles, alleles, is_phased)
print(variant.format('DP')[0])
# +
# Tally variant (type, subtype) pairs and, for SNPs, the ALT allele counts
f = VCF('genotypes.vcf.gz')
my_type = defaultdict(int)
num_alts = defaultdict(int)
for variant in f:
    my_type[variant.var_type, variant.var_subtype] += 1
    if variant.var_type == 'snp':
        num_alts[len(variant.ALT)] += 1
print(my_type)
print(num_alts)
# +
# Distribution of per-sample read depth (DP) over biallelic SNPs
f = VCF('genotypes.vcf.gz')
sample_dp = defaultdict(int)
for variant in f:
    if not variant.is_snp or len(variant.ALT) != 1:
        continue
    for dp in variant.format('DP'):
        #dp = int(dp)
        sample_dp[dp] += 1
# -
dps = list(sample_dp.keys())
dps.sort()
dp_dist = [sample_dp[x] for x in dps]
fig, ax = plt.subplots(figsize=(16, 9))
ax.plot(dp_dist[:50], 'r')
# Vertical line marks the modal depth
ax.axvline(dp_dist.index(max(dp_dist)))
================================================
FILE: Chapter04/2L.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.14.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
# %matplotlib inline
from collections import defaultdict
import gzip
import numpy as np
import matplotlib.pylab as plt
# -
num_parents = 8
# Read depth (DP) matrix for chromosome arm 2L: one column per parent,
# with the genomic position assumed in the last column (see insert_in_window)
dp_2L = np.load(gzip.open('DP_2L.npy.gz', 'rb'))
dp_2L.shape
# Whole-arm vs. mid-region median depth per parent
for i in range(num_parents):
    print(np.median(dp_2L[:,i]), np.median(dp_2L[50000:150000,i]))
window_size = 200000
# One accumulator per parent, mapping window index -> list of depths
parent_DP_windows = [defaultdict(list) for i in range(num_parents)]
# +
def insert_in_window(row):
    """Append each parent's depth from one table row into that parent's
    window bucket.

    `row` holds the depths of the `num_parents` parents followed by the
    genomic position in its last element; the position selects the
    `window_size`-bp window. Fills the module-level `parent_DP_windows`
    accumulator as a side effect.
    """
    window_idx = row[-1] // window_size
    for parent_idx in range(num_parents):
        parent_DP_windows[parent_idx][window_idx].append(row[parent_idx])
# Vectorized row-wise application: fills parent_DP_windows as a side effect
insert_in_window_v = np.vectorize(insert_in_window, signature='(n)->()')
_ = insert_in_window_v(dp_2L)
# -
# 2x4 grid: mean depth per window for each parent
fig, axs = plt.subplots(2, num_parents // 2, figsize=(16, 9), sharex=True, sharey=True, squeeze=True)
for parent in range(num_parents):
    ax = axs[parent // 4][parent % 4]
    parent_data = parent_DP_windows[parent]
    ax.set_ylim(10, 40)
    ax.plot(*zip(*[(win*window_size, np.mean(lst)) for win, lst in parent_data.items()]), '.')
================================================
FILE: Chapter04/Exploration.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.14.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
import gzip
import pickle
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import scatter_matrix
# %matplotlib inline
# -
# Balanced training matrix: one row per variant, annotation features plus
# position and a 0/1 Mendelian-error flag as the last two columns
fit = np.load(gzip.open('balanced_fit.npy.gz', 'rb'))
ordered_features = np.load(open('ordered_features', 'rb'), allow_pickle=True)
num_features = len(ordered_features)
fit_df = pd.DataFrame(fit, columns=ordered_features + ['pos', 'error'])
num_samples = 80
del fit
fig,ax = plt.subplots(figsize=(16,9))
_ = fit_df.hist(column=ordered_features, ax=ax)
# DP divided by 80 — presumably the per-sample mean depth given
# num_samples = 80 above; confirm against the data-preparation step
fit_df['MeanDP'] = fit_df['DP'] / 80
fig, ax = plt.subplots()
_ = ax.hist(fit_df[fit_df['MeanDP']<50]['MeanDP'], bins=100)
# Split by the error flag and evaluate simple QUAL / QD cut-offs
errors_df = fit_df[fit_df['error'] == 1]
ok_df = fit_df[fit_df['error'] == 0]
ok_qual_above_df = ok_df[ok_df['QUAL']>0.005]
errors_qual_above_df = errors_df[errors_df['QUAL']>0.005]
print(ok_df.size, errors_df.size, ok_qual_above_df.size, errors_qual_above_df.size)
print(ok_qual_above_df.size / ok_df.size, errors_qual_above_df.size / errors_df.size)
ok_qd_above_df = ok_df[ok_df['QD']>0.05]
errors_qd_above_df = errors_df[errors_df['QD']>0.05]
print(ok_df.size, errors_df.size, ok_qd_above_df.size, errors_qd_above_df.size)
print(ok_qd_above_df.size / ok_df.size, errors_qd_above_df.size / errors_df.size)
# Inspect the remaining feature space inside the low-QUAL/low-QD corner
not_bad_area_errors_df = errors_df[(errors_df['QUAL']<0.005)&(errors_df['QD']<0.05)]
_ = scatter_matrix(not_bad_area_errors_df[['FS', 'ReadPosRankSum', 'MQ', 'HRun']], diagonal='kde', figsize=(16, 9), alpha=0.02)
not_bad_area_ok_df = ok_df[(ok_df['QUAL']<0.005)&(ok_df['QD']<0.05)]
_ = scatter_matrix(not_bad_area_ok_df[['FS', 'ReadPosRankSum', 'MQ', 'HRun']], diagonal='kde', figsize=(16, 9), alpha=0.02)
# Repeat the corner analysis on the full (unbalanced) data set
all_fit_df = pd.DataFrame(np.load(gzip.open('feature_fit.npy.gz', 'rb')), columns=ordered_features + ['pos', 'error'])
potentially_good_corner_df = all_fit_df[(all_fit_df['QUAL']<0.005)&(all_fit_df['QD']<0.05)]
all_errors_df=all_fit_df[all_fit_df['error'] == 1]
print(len(all_fit_df), len(all_errors_df), len(all_errors_df) / len(all_fit_df))
potentially_good_corner_errors_df = potentially_good_corner_df[potentially_good_corner_df['error'] == 1]
print(len(potentially_good_corner_df), len(potentially_good_corner_errors_df), len(potentially_good_corner_errors_df) / len(potentially_good_corner_df))
print(len(potentially_good_corner_df)/len(all_fit_df))
================================================
FILE: Chapter04/Mendel.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.14.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
import random
import matplotlib.pyplot as plt
# # Mendelian simulations
num_sims = 100000
num_ofs = 20
# +
# Cross AA x AT: each offspring is heterozygous with probability 1/2.
# Simulate num_sims families of 20 offspring and histogram the het counts.
# NOTE(review): the inner loops use the literal 20 rather than num_ofs.
num_hets_AA_AT = []
for sim in range(num_sims):
    sim_hets = 0
    for ofs in range(20):
        sim_hets += 1 if random.choice([0, 1]) == 1 else 0
    num_hets_AA_AT.append(sim_hets)
fig, ax = plt.subplots(1,1, figsize=(16,9))
ax.hist(num_hets_AA_AT, bins=range(20))
# How many simulated families had all 20 offspring heterozygous?
print(len([num_hets for num_hets in num_hets_AA_AT if num_hets==20]))
# -
# Cross AT x AT: each offspring draws one allele from each parent, so the
# derived-allele count per offspring is 0 (AA), 1 (het) or 2 (TT).
num_AAs_AT_AT = []
num_hets_AT_AT = []
for sim in range(num_sims):
    sim_AAs = 0
    sim_hets = 0
    for ofs in range(20):
        derived_cnt = sum(random.choices([0, 1], k=2))
        sim_AAs += 1 if derived_cnt == 0 else 0
        sim_hets += 1 if derived_cnt == 1 else 0
    num_AAs_AT_AT.append(sim_AAs)
    num_hets_AT_AT.append(sim_hets)
fig, ax = plt.subplots(1,1, figsize=(16,9))
ax.hist([num_hets_AT_AT, num_AAs_AT_AT], histtype='step', fill=False, bins=range(20), label=['het', 'AA'])
plt.legend()
# # Balanced output
# +
import gzip
import pickle
import random
import numpy as np
# -
# Load the per-position Mendelian-error map and the feature matrix
# produced by the Preparation notebook.
mendelian_errors = pickle.load(gzip.open('mendelian_errors.pickle.gz', 'rb'))
feature_fit = np.load(gzip.open('feature_fit.npy.gz', 'rb'))
ordered_features = np.load(open('ordered_features', 'rb'), allow_pickle=True)
num_features = len(ordered_features)
len(mendelian_errors), len(list(filter(lambda x: x[0] > 0,mendelian_errors.values())))
# Count error vs error-free positions; the two classes are unbalanced.
total_observations = len(mendelian_errors)
error_observations = len(list(filter(lambda x: x[0] > 0,mendelian_errors.values())))
ok_observations = total_observations - error_observations
fraction_errors = error_observations/total_observations
print (total_observations, ok_observations, error_observations, 100*fraction_errors)
del mendelian_errors
# +
# Keep an error-free row with this probability so that, in expectation,
# the kept OK rows match the number of error rows.
prob_ok_choice = error_observations / ok_observations
def accept_entry(row):
    """Decide whether a feature row joins the balanced set.

    Rows flagged as errors (last column == 1) are always kept; error-free
    rows are kept with probability prob_ok_choice (module global).
    """
    if row[-1] != 1:
        return random.random() <= prob_ok_choice
    return True
# Apply accept_entry row-wise over the feature matrix and subsample it.
accept_entry_v = np.vectorize(accept_entry, signature='(i)->()')
accepted_entries = accept_entry_v(feature_fit)
balanced_fit = feature_fit[accepted_entries]
del feature_fit  # free the large unbalanced matrix
balanced_fit.shape
# Sanity check: error vs OK row counts should now be comparable.
len([x for x in balanced_fit if x[-1] == 1]), len([x for x in balanced_fit if x[-1] == 0])
# -
np.save(gzip.open('balanced_fit.npy.gz', 'wb'), balanced_fit, allow_pickle=False, fix_imports=False)
================================================
FILE: Chapter04/Preparation.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.14.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# Download the Ag1000G phase 1 crosses HDF5 data for chromosome arms 3L and 2L.
# !wget ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.3L.h5
# !wget ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.2L.h5
# +
import pickle
import gzip
import random
import numpy as np
import h5py
import pandas as pd
# -
# Sample metadata: one row per mosquito with its cross, sex and role
# (parent or progeny).
samples = pd.read_csv('samples.tsv', sep='\t')
print(len(samples))
print(samples['cross'].unique())
print(samples[samples['cross'] == 'cross-29-2'][['id', 'function']])
print(len(samples[samples['cross'] == 'cross-29-2']))
print(samples[samples['function'] == 'parent'])
# # Chromosome arm 3L
# +
# Open the 3L HDF5 file and keep handles to the genotype array plus the
# per-variant annotation datasets later used as classifier features.
h5_3L = h5py.File('ag1000g.crosses.phase1.ar3sites.3L.h5', 'r')
samples_hdf5 = list(map(lambda sample: sample.decode('utf-8'), h5_3L['/3L/samples']))
calldata_genotype = h5_3L['/3L/calldata/genotype']
MQ0 = h5_3L['/3L/variants/MQ0']
MQ = h5_3L['/3L/variants/MQ']
QD = h5_3L['/3L/variants/QD']
Coverage = h5_3L['/3L/variants/Coverage']
CoverageMQ0 = h5_3L['/3L/variants/CoverageMQ0']
HaplotypeScore = h5_3L['/3L/variants/HaplotypeScore']
QUAL = h5_3L['/3L/variants/QUAL']
FS = h5_3L['/3L/variants/FS']
DP = h5_3L['/3L/variants/DP']
HRun = h5_3L['/3L/variants/HRun']
ReadPosRankSum = h5_3L['/3L/variants/ReadPosRankSum']
# Features that will form the columns of the fitted matrix (MQ0 and
# CoverageMQ0 are loaded above but deliberately not included here).
my_features = {
    'MQ': MQ,
    'QD': QD,
    'Coverage': Coverage,
    'HaplotypeScore': HaplotypeScore,
    'QUAL': QUAL,
    'FS': FS,
    'DP': DP,
    'HRun': HRun,
    'ReadPosRankSum': ReadPosRankSum
}
num_features = len(my_features)
num_alleles = h5_3L['/3L/variants/num_alleles']
is_snp = h5_3L['/3L/variants/is_snp']
POS = h5_3L['/3L/variants/POS']
# -
# Compute Mendelian errors (biallelic sites only)
def compute_mendelian_errors(mother, father, offspring):
    """Count Mendelian inheritance violations for one biallelic site.

    mother, father: sets with the distinct alleles of each parent
    (1 element = homozygous, 2 = heterozygous). offspring: list of such
    sets, one per child. Returns (num_errors, num_ofs_problems), where
    the first counts wrong alleles and the second the offspring affected.
    """
    errors = 0
    problem_offspring = 0
    if len(mother.union(father)) == 1:
        # Both parents homozygous for the same allele.
        for child in offspring:
            if len(child) == 2:
                # Child is heterozygous: one allele cannot come from a parent.
                errors += 1
                problem_offspring += 1
            elif not child.intersection(mother):
                # Child homozygous for the opposite allele: both are wrong.
                errors += 2
                problem_offspring += 1
    elif len(mother) == 1 and len(father) == 1:
        # Parents homozygous for different alleles: every child must be het.
        for child in offspring:
            if len(child) == 1:
                errors += 1
                problem_offspring += 1
    elif len(mother) == 2 and len(father) == 2:
        # Both parents heterozygous: any child genotype is consistent.
        pass
    else:
        # Exactly one parent is heterozygous; the other is homozygous.
        homo_parent = father if len(father) == 1 else mother
        for child in offspring:
            if len(child) == 1 and not child.intersection(homo_parent):
                # Child homozygous without the obligatory allele from
                # the homozygous parent.
                errors += 1
                problem_offspring += 1
    return errors, problem_offspring
# +
def acceptable_position_to_genotype():
    """Yield indexes of biallelic SNPs whose genotypes are usable.

    Positions with more than one -1 allele call (missing data) are skipped.
    Reads the module-level calldata_genotype, is_snp and num_alleles handles.
    """
    for i, genotype in enumerate(calldata_genotype):
        if is_snp[i] and num_alleles[i] == 2:
            if len(np.where(genotype == -1)[0]) > 1:
                # Missing data
                continue
            yield i
def acumulate(fun):
    """Drain the iterable returned by fun() into a dict.

    Each non-None result is a (key, value) pair; None results are dropped.
    """
    return {item[0]: item[1] for item in fun() if item is not None}
# +
def get_family_indexes(samples_hdf5, cross_pd):
    """Locate one family's members inside the HDF5 sample list.

    samples_hdf5: list of sample ids in HDF5 order. cross_pd: DataFrame of
    one cross with 'id', 'function' and 'sex' columns. Returns
    {'mother': index, 'father': index, 'offspring': [indexes...]}.
    """
    offspring = []
    # iterrows() replaces DataFrame.T.iteritems(), which was removed in
    # pandas 2.0; both iterate one row (as a Series) at a time.
    for _, individual in cross_pd.iterrows():
        index = samples_hdf5.index(individual['id'])
        if individual['function'] == 'parent':
            if individual['sex'] == 'M':
                father = index
            else:
                mother = index
        else:
            offspring.append(index)
    return {'mother': mother, 'father': father, 'offspring': offspring}
# Select one family (cross-29-2) and locate its members in the HDF5 samples.
cross_pd = samples[samples['cross'] == 'cross-29-2']
family_indexes = get_family_indexes(samples_hdf5, cross_pd)
# +
mother_index = family_indexes['mother']
father_index = family_indexes['father']
offspring_indexes = family_indexes['offspring']
all_errors = {}
def get_mendelian_errors():
    """Yield (position, (num_errors, num_ofs_problems)) for each usable SNP."""
    for i in acceptable_position_to_genotype():
        genotype = calldata_genotype[i]
        # A set collapses a sample's allele calls to the distinct alleles
        # (1 element = homozygous, 2 = heterozygous).
        mother = set(genotype[mother_index])
        father = set(genotype[father_index])
        offspring = [set(genotype[ofs_index]) for ofs_index in offspring_indexes]
        my_mendelian_errors = compute_mendelian_errors(mother, father, offspring)
        yield POS[i], my_mendelian_errors
mendelian_errors = acumulate(get_mendelian_errors)
pickle.dump(mendelian_errors, gzip.open('mendelian_errors.pickle.gz', 'wb'))
# +
# Build the feature matrix: one row per analyzed position, columns in
# sorted feature-name order plus trailing 'pos' and 'error' columns.
ordered_positions = sorted(mendelian_errors.keys())
ordered_features = sorted(my_features.keys()) #XXX on code?
num_features = len(ordered_features)
feature_fit = np.empty((len(ordered_positions), len(my_features) + 2), dtype=float)
for column, feature in enumerate(ordered_features): # 'Strange' order
    print(feature)
    current_hdf_row = 0
    # Positions are sorted, so advance a single cursor through the HDF5
    # rows instead of searching from scratch for every position.
    for row, genomic_position in enumerate(ordered_positions):
        while POS[current_hdf_row] < genomic_position:
            current_hdf_row +=1
        feature_fit[row, column] = my_features[feature][current_hdf_row]
# Last two columns: the position itself and a binary error label.
for row, genomic_position in enumerate(ordered_positions):
    feature_fit[row, num_features] = genomic_position
    feature_fit[row, num_features + 1] = 1 if mendelian_errors[genomic_position][0] > 0 else 0
np.save(gzip.open('feature_fit.npy.gz', 'wb'), feature_fit, allow_pickle=False, fix_imports=False)
pickle.dump(ordered_features, open('ordered_features', 'wb'))
# -
# # Chromosome arm 2L
# Re-point the sample list, depth array and positions at the 2L file.
h5_2L = h5py.File('ag1000g.crosses.phase1.ar3sites.2L.h5', 'r')
samples_hdf5 = list(map(lambda sample: sample.decode('utf-8'), h5_2L['/2L/samples']))
calldata_DP = h5_2L['/2L/calldata/DP']
POS = h5_2L['/2L/variants/POS']
# +
def get_parent_indexes(samples_hdf5, parents_pd):
    """Return the HDF5 sample index of every parent row in parents_pd.

    samples_hdf5: list of sample ids in HDF5 order. parents_pd: DataFrame
    with an 'id' column (parents only).
    """
    parents = []
    # iterrows() replaces DataFrame.T.iteritems(), which was removed in
    # pandas 2.0; both iterate one row (as a Series) at a time.
    for _, individual in parents_pd.iterrows():
        index = samples_hdf5.index(individual['id'])
        parents.append(index)
    return parents
parents_pd = samples[samples['function'] == 'parent']
parent_indexes = get_parent_indexes(samples_hdf5, parents_pd)
# -
all_dps = []
# Sample ~1% of 2L positions and record the parents' read depths (DP)
# with the position appended as the last element of each row.
for i, pos in enumerate(POS):
    if random.random() > 0.01:
        continue
    pos_dp = calldata_DP[i]
    parent_pos_dp = [pos_dp[parent_index] for parent_index in parent_indexes]
    all_dps.append(parent_pos_dp + [pos])
all_dps = np.array(all_dps)
np.save(gzip.open('DP_2L.npy.gz', 'wb'), all_dps, allow_pickle=False, fix_imports=False)
================================================
FILE: Chapter04/QIIME2_Metagenomics.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.14.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# # Important: Read this!
#
# This recipe does not work with the standard conda environment.
#
# If you are in the standard environment, do this:
#
# 1. Stop Jupyter
# 2. Activate QIIME2 environment on conda
# 3. Do `jupyter serverextension enable --py qiime2 --sys-prefix`
# 4. Start Jupyter inside QIIME2 environment
#
# Note that other recipes will not work inside this environment.
# # Check this out!
#
# This is based on the [QIIME2 Fecal Microbiota Transplant example](https://docs.qiime2.org/2018.8/tutorials/fmt/) (for the command line). You are strongly advised to read it before proceeding.
#
# There is an [amazing example](http://nbviewer.jupyter.org/gist/tkosciol/29de5198a4be81559a075756c2490fde) of using the Artifact API using the "Moving Pictures" tutorial of QIIME 2 produced by Tomasz Kościółek. I use a more convoluted approach than Tomasz's in order to go a little deeper in terms of understanding of the Python internals. That is more of a learning experience on the internals than a practical recommendation. **My recommendation is to use Tomasz's dialect, not mine**.
#
# # Getting the data
# !wget https://data.qiime2.org/2018.8/tutorials/fmt/sample_metadata.tsv
# !wget https://data.qiime2.org/2018.8/tutorials/fmt/fmt-tutorial-demux-1-10p.qza
# !wget https://data.qiime2.org/2018.8/tutorials/fmt/fmt-tutorial-demux-2-10p.qza
# # The recipe
# +
import pandas as pd
from qiime2.metadata.metadata import Metadata
from qiime2.metadata.metadata import CategoricalMetadataColumn
from qiime2.sdk import Artifact
from qiime2.sdk import PluginManager
from qiime2.sdk import Result
# -
# Discover the installed QIIME2 plugins and grab demux's summarize action.
pm = PluginManager()
demux_plugin = pm.plugins['demux']
#demux_emp_single = demux_plugin.actions['emp_single']
demux_summarize = demux_plugin.actions['summarize']
pm.plugins
# Introspect the action: its description and typed signature.
print(demux_summarize.description)
demux_summarize_signature = demux_summarize.signature
print(demux_summarize_signature.inputs)
print(demux_summarize_signature.parameters)
print(demux_summarize_signature.outputs)
# +
# Summarize the first demultiplexed run (10% subsample artifact).
seqs1 = Result.load('fmt-tutorial-demux-1-10p.qza')
sum_data1 = demux_summarize(seqs1)
sum_data1.visualization
# +
# Same for the second run; also poke at the result object's interface.
seqs2 = Result.load('fmt-tutorial-demux-2-10p.qza')
sum_data2 = demux_summarize(seqs2)
print(dir(sum_data2))
print(type(sum_data2.visualization))
print(dir(sum_data2.visualization))
sum_data2.visualization
# -
#Quality control
# Denoise each run with DADA2: truncate reads at 150 bp, trim the first 13.
dada2_plugin = pm.plugins['dada2']
dada2_denoise_single = dada2_plugin.actions['denoise_single']
qual_control1 = dada2_denoise_single(demultiplexed_seqs=seqs1,
                                     trunc_len=150, trim_left=13)
qual_control2 = dada2_denoise_single(demultiplexed_seqs=seqs2,
                                     trunc_len=150, trim_left=13)
# Tabulate the DADA2 denoising statistics for each run.
metadata_plugin = pm.plugins['metadata']
metadata_tabulate = metadata_plugin.actions['tabulate']
stats_meta1 = metadata_tabulate(input=qual_control1.denoising_stats.view(Metadata))
stats_meta1.visualization
stats_meta2 = metadata_tabulate(input=qual_control2.denoising_stats.view(Metadata))
stats_meta2.visualization
# +
# Merge the two runs' feature tables and representative sequences.
ft_plugin = pm.plugins['feature-table']
ft_merge = ft_plugin.actions['merge']
ft_merge_seqs = ft_plugin.actions['merge_seqs']
ft_summarize = ft_plugin.actions['summarize']
ft_tab_seqs = ft_plugin.actions['tabulate_seqs']
table_merge = ft_merge(tables=[qual_control1.table, qual_control2.table])
seqs_merge = ft_merge_seqs(data=[qual_control1.representative_sequences, qual_control2.representative_sequences])
# -
# Visual summaries of the merged table and merged sequences.
ft_sum = ft_summarize(table=table_merge.merged_table)
ft_sum.visualization
tab_seqs = ft_tab_seqs(data=seqs_merge.merged_data)
tab_seqs.visualization
================================================
FILE: Chapter04/samples.tsv
================================================
id cross sex function
AD0231-C cross-29-2 F parent
AD0232-C cross-29-2 M parent
AD0234-C cross-29-2 F progeny
AD0235-C cross-29-2 F progeny
AD0236-C cross-29-2 F progeny
AD0237-C cross-29-2 F progeny
AD0238-C cross-29-2 F progeny
AD0239-C cross-29-2 F progeny
AD0240-C cross-29-2 M progeny
AD0241-C cross-29-2 F progeny
AD0242-C cross-29-2 M progeny
AD0243-C cross-29-2 F progeny
AD0244-C cross-29-2 F progeny
AD0245-C cross-29-2 F progeny
AD0246-C cross-29-2 F progeny
AD0247-C cross-29-2 M progeny
AD0248-C cross-29-2 F progeny
AD0249-C cross-29-2 F progeny
AD0250-C cross-29-2 F progeny
AD0251-C cross-29-2 F progeny
AD0252-C cross-29-2 F progeny
AD0253-C cross-29-2 M progeny
AD0254-C cross-36-9 F parent
AD0255-C cross-36-9 M parent
AD0259-C cross-36-9 M progeny
AD0260-C cross-36-9 F progeny
AD0261-C cross-36-9 F progeny
AD0262-C cross-36-9 M progeny
AD0263-C cross-36-9 M progeny
AD0265-C cross-36-9 F progeny
AD0266-C cross-36-9 M progeny
AD0267-C cross-36-9 F progeny
AD0268-C cross-36-9 M progeny
AD0269-C cross-36-9 F progeny
AD0270-C cross-36-9 M progeny
AD0271-C cross-36-9 M progeny
AD0272-C cross-36-9 F progeny
AD0273-C cross-36-9 M progeny
AD0274-C cross-36-9 F progeny
AD0275-C cross-36-9 M progeny
AD0276-C cross-36-9 F progeny
AD0305-C cross-42-4 F parent
AD0306-C cross-42-4 M parent
AD0309-C cross-42-4 M progeny
AD0310-C cross-42-4 M progeny
AD0311-C cross-42-4 M progeny
AD0312-C cross-42-4 M progeny
AD0313-C cross-42-4 M progeny
AD0314-C cross-42-4 M progeny
AD0315-C cross-42-4 M progeny
AD0316-C cross-42-4 F progeny
AD0317-C cross-42-4 M progeny
AD0318-C cross-42-4 M progeny
AD0319-C cross-42-4 F progeny
AD0320-C cross-42-4 F progeny
AD0322-C cross-42-4 F progeny
AD0323-C cross-42-4 F progeny
AD0347-C cross-46-9 F parent
AD0348-C cross-46-9 M parent
AD0351-C cross-46-9 M progeny
AD0352-C cross-46-9 F progeny
AD0353-C cross-46-9 F progeny
AD0354-C cross-46-9 F progeny
AD0355-C cross-46-9 F progeny
AD0356-C cross-46-9 M progeny
AD0357-C cross-46-9 F progeny
AD0358-C cross-46-9 F progeny
AD0359-C cross-46-9 M progeny
AD0360-C cross-46-9 F progeny
AD0361-C cross-46-9 F progeny
AD0362-C cross-46-9 M progeny
AD0363-C cross-46-9 F progeny
AD0364-C cross-46-9 M progeny
AD0365-C cross-46-9 M progeny
AD0366-C cross-46-9 F progeny
AD0367-C cross-46-9 F progeny
AD0368-C cross-46-9 F progeny
AD0369-C cross-46-9 F progeny
AD0370-C cross-46-9 F progeny
AD0438-C cross-36-9 F progeny
================================================
FILE: Chapter05/.gitignore
================================================
*.fasta
ag.db
*gz
*png
================================================
FILE: Chapter05/Annotations.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.4
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
#pip install gffutils
from collections import defaultdict
import gffutils
import sqlite3
# -
# Download the Anopheles gambiae annotation and build a gffutils database.
# !rm -f ag.db
# !wget https://vectorbase.org/common/downloads/release-55/AgambiaePEST/gff/data/VectorBase-55_AgambiaePEST.gff -O gambiae.gff
# !gzip -9 gambiae.gff
try:
    db = gffutils.create_db('gambiae.gff.gz', 'ag.db')
except sqlite3.OperationalError:
    # ag.db already exists: reuse it instead of re-parsing the GFF.
    db = gffutils.FeatureDB('ag.db')
# Feature types present in the annotation and their counts.
print(list(db.featuretypes()))
for feat_type in db.featuretypes():
    print(feat_type, db.count_features_of_type(feat_type))
# Collect the distinct sequence ids (chromosome arms / contigs).
seqids = set()
for e in db.all_features():
    seqids.add(e.seqid)
for seqid in seqids:
    print(seqid)
# Per-seqid gene statistics: mRNAs per gene, exons per transcript, and the
# genes with the widest span and the most exons.
num_mRNAs = defaultdict(int)
num_exons = defaultdict(int)
max_exons = 0
max_span = 0
for seqid in seqids:
    cnt = 0
    for gene in db.region(seqid=seqid, featuretype='protein_coding_gene'):
        cnt += 1
        span = abs(gene.start - gene.end) # strand
        if span > max_span:
            max_span = span
            max_span_gene = gene
        my_mRNAs = list(db.children(gene, featuretype='mRNA'))
        num_mRNAs[len(my_mRNAs)] += 1
        # Genes without annotated mRNAs: count their exons directly.
        if len(my_mRNAs) == 0:
            exon_check = [gene]
        else:
            exon_check = my_mRNAs
        for check in exon_check:
            my_exons = list(db.children(check, featuretype='exon'))
            num_exons[len(my_exons)] += 1
            if len(my_exons) > max_exons:
                max_exons = len(my_exons)
                max_exons_gene = gene
    print(f'seqid {seqid}, number of genes {cnt}')
print('Max number of exons: %s (%d)' % (max_exons_gene.id, max_exons))
print('Max span: %s (%d)' % (max_span_gene.id, max_span))
print(num_mRNAs)
print(num_exons)
================================================
FILE: Chapter05/Gene_Ontology.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.4
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
#use pip install as conda install requires a lot of downgrades at this stage
import pygraphviz as pgv
from IPython.core.display import Image
# ## The cell below comes from the Orthology notebook
# +
import requests
ensembl_server = 'http://rest.ensembl.org'
def do_request(server, service, *args, **kwargs):
    """Call an Ensembl REST endpoint and return the decoded JSON payload.

    Positional args (skipping None) are appended to the URL path; keyword
    args become query parameters. Raises requests.HTTPError on failure.
    """
    path_suffix = ''.join('/' + arg for arg in args if arg is not None)
    response = requests.get('%s/%s%s' % (server, service, path_suffix),
                            params=kwargs,
                            headers={'Content-Type': 'application/json'})
    if not response.ok:
        response.raise_for_status()
    return response.json()
# -
# -
lct_id = 'ENSG00000115850'
refs = do_request(ensembl_server, 'xrefs/id', lct_id, external_db='GO', all_levels='1')
print(len(refs))
print(refs[0].keys())
for ref in refs:
go_id = ref['primary_id']
details = do_request(ensembl_server, 'ontology/id', go_id)
print('%s %s %s' % (go_id, details['namespace'], ref['description']))
print('%s\n' % details['definition'])
go_id = 'GO:0000016'
my_data = do_request(ensembl_server, 'ontology/id', go_id)
for k, v in my_data.items():
if k == 'parents':
for parent in v:
print(parent)
parent_id = parent['accession']
else:
print('%s: %s' % (k, str(v)))
print()
parent_data = do_request(ensembl_server, 'ontology/id', parent_id)
print(parent_id, len(parent_data['children']))
refs = do_request(ensembl_server, 'ontology/ancestors/chart', go_id)
for go, entry in refs.items():
print(go)
term = entry['term']
print('%s %s' % (term['name'], term['definition']))
is_a = entry.get('is_a', [])
print('\t is a: %s\n' % ', '.join([x['accession'] for x in is_a]))
def get_upper(go_id):
    """Fetch the ancestor chart of a GO term from Ensembl.

    Returns (parents, node_data): parents maps each term accession to the
    accessions of its is_a parents; node_data maps each accession to its
    name and children list.
    """
    parents = {}
    node_data = {}
    chart = do_request(ensembl_server, 'ontology/ancestors/chart', go_id)
    for term_id, entry in chart.items():
        term_details = do_request(ensembl_server, 'ontology/id', term_id)
        node_data[term_id] = {'name': entry['term']['name'],
                              'children': term_details['children']}
        try:
            parents[term_id] = [rel['accession'] for rel in entry['is_a']]
        except KeyError:
            pass  # Top of hierarchy: no is_a relations
    return parents, node_data
# Draw the ancestor hierarchy with graphviz: an edge from each parent to
# each child term; large sibling sets are collapsed into a '...N...' node.
parents, node_data = get_upper(go_id)
g = pgv.AGraph(directed=True)
for ofs, ofs_parents in parents.items():
    ofs_text = '%s\n(%s)' % (node_data[ofs]['name'].replace(', ', '\n'), ofs)
    for parent in ofs_parents:
        parent_text = '%s\n(%s)' % (node_data[parent]['name'].replace(', ', '\n'), parent)
        children = node_data[parent]['children']
        if len(children) < 3:
            for child in children:
                # Skip children already drawn as chart nodes.
                if child['accession'] in node_data:
                    continue
                g.add_edge(parent_text, child['accession'])
        else:
            g.add_edge(parent_text, '...%d...' % (len(children) - 1))
        g.add_edge(parent_text, ofs_text)
print(g)
g.graph_attr['label']='Ontology tree for Lactase activity'
g.node_attr['shape']='rectangle'
g.layout(prog='dot')
g.draw('graph.png')
Image("graph.png")
# Direct descendants of the term, for comparison.
print(go_id)
refs = do_request(ensembl_server, 'ontology/descendants', go_id)
for go in refs:
    print(go['accession'], go['name'], go['definition'])
================================================
FILE: Chapter05/Getting_Gene.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.4
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
import gffutils
import gzip
from Bio import Seq, SeqIO
# ## Retrieving data
# !rm -f ag.db
# !wget https://vectorbase.org/common/downloads/release-55/AgambiaePEST/gff/data/VectorBase-55_AgambiaePEST.gff -O gambiae.gff
# !gzip -9 gambiae.gff
# Reuse the gffutils database built in the Annotations notebook.
db = gffutils.FeatureDB('ag.db')
# # Getting a gene
gene_id = 'AGAP004707'
gene = db[gene_id]
print(gene)
print(gene.seqid, gene.strand)
# Scan the genome FASTA until we hit the record the gene lives on.
recs = SeqIO.parse(gzip.open('gambiae.fa.gz', 'rt', encoding='utf-8'), 'fasta')
for rec in recs:
    print(rec.description)
    if rec.id == gene.seqid:
        my_seq = rec.seq
        break
# +
def get_sequence(chrom_seq, CDSs, strand):
    """Splice the CDS intervals out of chrom_seq and return one Seq.

    Coordinates are 1-based inclusive (GFF convention). The concatenated
    sequence is reverse-complemented unless strand is '+'.
    """
    # #FRAME??? — CDS frame/phase is not taken into account here.
    pieces = [str(chrom_seq[cds.start - 1: cds.end]) for cds in CDSs]
    spliced = Seq.Seq(''.join(pieces))
    return spliced if strand == '+' else spliced.reverse_complement()
# +
# Pick the -RA transcript of the gene, splice its CDSs and translate.
mRNAs = db.children(gene, featuretype='mRNA')
for mRNA in mRNAs:
    print(mRNA.id)
    if mRNA.id.endswith('RA'):
        break
CDSs = db.children(mRNA, featuretype='CDS', order_by='start')
gene_seq = get_sequence(my_seq, CDSs, gene.strand)
print(len(gene_seq), gene_seq)
prot = gene_seq.translate()
print(len(prot), prot)
# -
# # Reverse strand
# Same procedure for a transcript on the reverse strand.
reverse_transcript_id = 'AGAP004708-RA'
# +
reverse_CDSs = db.children(reverse_transcript_id, featuretype='CDS', order_by='start')
reverse_seq = get_sequence(my_seq, reverse_CDSs, '-')
print(len(reverse_seq), reverse_seq)
reverse_prot = reverse_seq.translate()
print(len(reverse_prot), reverse_prot)
# -
================================================
FILE: Chapter05/Low_Quality.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.4
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
import gzip
import numpy as np
import matplotlib.pyplot as plt
from Bio import SeqIO, SeqUtils
# -
# Download the gambiae and atroparvus genome assemblies.
# !rm -f atroparvus.fa.gz gambiae.fa.gz 2>/dev/null
# !wget https://vectorbase.org/common/downloads/Current_Release/AgambiaePEST/fasta/data/VectorBase-67_AgambiaePEST_Genome.fasta -O gambiae.fa
# !gzip -9 gambiae.fa
# !wget https://vectorbase.org/common/downloads/Current_Release/AatroparvusEBRO/fasta/data/VectorBase-67_AatroparvusEBRO_Genome.fasta -O atroparvus.fa
# !gzip -9 atroparvus.fa
gambiae_name = 'gambiae.fa.gz'
atroparvus_name = 'atroparvus.fa.gz'
# List record descriptions for gambiae (few chromosome-level records).
recs = SeqIO.parse(gzip.open(gambiae_name, 'rt', encoding='utf-8'), 'fasta')
for rec in recs:
    print(rec.description)
#Do not do this with atroparvus
# For each gambiae chromosome, record the length of every run of N
# (unknown) bases plus the chromosome size.
recs = SeqIO.parse(gzip.open(gambiae_name, 'rt', encoding='utf-8'), 'fasta')
chrom_Ns = {}
chrom_sizes = {}
for rec in recs:
    if rec.description.find('supercontig') > -1:
        continue
    print(rec.description, rec.id, rec)
    chrom = rec.id.split('_')[1]
    if chrom in ['UNKN']:#, 'Y_unplaced']:
        continue
    chrom_Ns[chrom] = []
    on_N = False
    curr_size = 0
    for pos, nuc in enumerate(rec.seq):
        if nuc in ['N', 'n']:
            curr_size += 1
            on_N = True
        else:
            if on_N:
                # A run of Ns just ended: record its length.
                chrom_Ns[chrom].append(curr_size)
                curr_size = 0
                on_N = False
    if on_N:
        # Sequence ended inside a run of Ns.
        chrom_Ns[chrom].append(curr_size)
    chrom_sizes[chrom] = len(rec.seq)
# Summarize the N content per chromosome.
for chrom, Ns in chrom_Ns.items():
    size = chrom_sizes[chrom]
    if len(Ns) > 0:
        max_Ns = max(Ns)
    else:
        max_Ns = 'NA'
    print(f'{chrom} ({size}): %Ns ({round(100 * sum(Ns) / size, 1)}), num Ns: {len(Ns)}, max N: {max_Ns}')
# ## Atroparvus super-contigs
# For every contig record the size and the fraction of N bases.
recs = SeqIO.parse(gzip.open(atroparvus_name, 'rt', encoding='utf-8'), 'fasta')
sizes = []
size_N = []
for rec in recs:
    size = len(rec.seq)
    sizes.append(size)
    count_N = 0
    for nuc in rec.seq:
        if nuc in ['n', 'N']:
            count_N += 1
    size_N.append((size, count_N / size))
# Distribution of contig sizes.
print(len(sizes), np.median(sizes), np.mean(sizes), max(sizes), min(sizes),
      np.percentile(sizes, 10), np.percentile(sizes, 90))
# Plot %N vs contig size in three size bands (small / medium / large).
small_split = 4800
large_split = 540000
fig, axs = plt.subplots(1, 3, figsize=(16, 9), dpi=300, squeeze=False, sharey=True)
xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x <= small_split])
axs[0, 0].plot(xs, ys, '.')
xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > small_split and x <= large_split])
axs[0, 1].plot(xs, ys, '.')
axs[0, 1].set_xlim(small_split, large_split)
xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > large_split])
axs[0, 2].plot(xs, ys, '.')
axs[0, 0].set_ylabel('Fraction of Ns', fontsize=12)
axs[0, 1].set_xlabel('Contig size', fontsize=12)
fig.suptitle('Fraction of Ns per contig size', fontsize=26)
fig.savefig('frac.png')
================================================
FILE: Chapter05/Orthology.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.4
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
import requests
ensembl_server = 'http://rest.ensembl.org'
def do_request(server, service, *args, **kwargs):
    """Call an Ensembl REST endpoint and return the decoded JSON.

    Positional args (skipping None) are appended to the URL path; keyword
    args become query parameters. Raises requests.HTTPError on failure.
    """
    url_params = ''
    for a in args:
        if a is not None:
            url_params += '/' + a
    req = requests.get('%s/%s%s' % (server, service, url_params),
                       params=kwargs,
                       headers={'Content-Type': 'application/json'})
    if not req.ok:
        req.raise_for_status()
    return req.json()
# -
# -
# List all species known to this Ensembl server.
answer = do_request(ensembl_server, 'info/species')
for i, sp in enumerate(answer['species']):
    print(i, sp['name'])
# External databases matching HGNC for human.
ext_dbs = do_request(ensembl_server, 'info/external_dbs', 'homo_sapiens', filter='HGNC%')
print(ext_dbs)
# Resolve the LCT gene symbol to an Ensembl id and fetch its sequence.
answer = do_request(ensembl_server, 'lookup/symbol', 'homo_sapiens', 'LCT')
print(answer)
lct_id = answer['id']
lct_seq = do_request(ensembl_server, 'sequence/id', lct_id)
print(lct_seq)
# All cross-references for LCT, then just the GO ones.
lct_xrefs = do_request(ensembl_server, 'xrefs/id', lct_id)
for xref in lct_xrefs:
    print(xref['db_display_name'])
    print(xref)
refs = do_request(ensembl_server, 'xrefs/id', lct_id, external_db='GO', all_levels='1')
print(lct_id, refs)
# Orthologues of LCT; keep the horse (equus_caballus) one and look it up.
hom_response = do_request(ensembl_server, 'homology/id', lct_id, type='orthologues', sequence='none')
#print(hom_response['data'][0]['homologies'])
homologies = hom_response['data'][0]['homologies']
for homology in homologies:
    print(homology['target']['species'])
    if homology['target']['species'] != 'equus_caballus':
        continue
    print(homology)
    print(homology['taxonomy_level'])
    horse_id = homology['target']['id']
horse_req = do_request(ensembl_server, 'lookup/id', horse_id)
print(horse_req)
# +
#maybe synteny of MCM6 and LCT with caballus and gorilla
================================================
FILE: Chapter05/Reference_Genome.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.4
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
from IPython.core.display import Image
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio import SeqIO
from Bio.Graphics import BasicChromosome
# -
# Download the Plasmodium falciparum 3D7 genome (PlasmoDB release 13.0).
# !rm -f PlasmoDB-9.3_Pfalciparum3D7_Genome.fasta 2>/dev/null
# vvvv 13.0
# !wget http://plasmodb.org/common/downloads/release-13.0/Pfalciparum3D7/fasta/data/PlasmoDB-13.0_Pfalciparum3D7_Genome.fasta
genome_name = 'PlasmoDB-13.0_Pfalciparum3D7_Genome.fasta'
recs = SeqIO.parse(genome_name, 'fasta')
chroms = {}
for rec in recs:
    print(rec.description)
# +
# Compute GC% in 50 kbp windows for every chromosome-level record, and
# track the global min/max GC for the color scale used when drawing.
# NOTE(review): SeqUtils.GC was removed in recent Biopython releases
# (replaced by gc_fraction) — this code targets an older Biopython.
from Bio import SeqUtils
chrom_sizes = {}
chrom_GC = {}
recs = SeqIO.parse(genome_name, 'fasta')
block_size = 50000
min_GC = 100.0
max_GC = 0.0
for rec in recs:
    if rec.description.find('SO=chromosome') == -1:
        continue
    chrom = int(rec.description.split('_')[1])
    chrom_GC[chrom] = []
    size = len(rec.seq)
    chrom_sizes[chrom] = size
    num_blocks = size // block_size + 1
    for block in range(num_blocks):
        start = block_size * block
        # Last window is truncated at the chromosome end.
        if block == num_blocks - 1:
            end = size
        else:
            end = block_size + start + 1
        block_seq = rec.seq[start:end]
        block_GC = SeqUtils.GC(block_seq)
        if block_GC < min_GC:
            min_GC = block_GC
        if block_GC > max_GC:
            max_GC = block_GC
        chrom_GC[chrom].append(block_GC)
print(min_GC, max_GC)
# +
# Draw all chromosomes with BasicChromosome: each 50 kbp window is a
# segment colored by GC% (red above top_GC, yellow below bottom_GC,
# blue-scaled in between), with telomere caps at both ends.
chroms = list(chrom_sizes.keys())
chroms.sort()
biggest_chrom = max(chrom_sizes.values())
my_genome = BasicChromosome.Organism(output_format="png")
my_genome.page_size = (29.7*cm, 21*cm) # check
telomere_length = 10
bottom_GC = 17.5
top_GC = 22.0
for chrom in chroms:
    chrom_size = chrom_sizes[chrom]
    chrom_representation = BasicChromosome.Chromosome('Cr %d' % chrom)
    # Scale all chromosomes against the largest one.
    chrom_representation.scale_num = biggest_chrom
    tel = BasicChromosome.TelomereSegment()
    tel.scale = telomere_length
    chrom_representation.add(tel)
    num_blocks = len(chrom_GC[chrom])
    for block, gc in enumerate(chrom_GC[chrom]):
        my_GC = chrom_GC[chrom][block]
        body = BasicChromosome.ChromosomeSegment()
        if my_GC > top_GC:
            body.fill_color = colors.Color(1, 0, 0)
        elif my_GC < bottom_GC:
            body.fill_color = colors.Color(1, 1, 0)
        else:
            # Linear blue-to-white interpolation between the GC bounds.
            my_color = (my_GC - bottom_GC) / (top_GC - bottom_GC)
            body.fill_color = colors.Color(my_color, my_color, 1)
        if block < num_blocks - 1:
            body.scale = block_size
        else:
            # Last window: only the remainder of the chromosome.
            body.scale = chrom_size % block_size
        chrom_representation.add(body)
    tel = BasicChromosome.TelomereSegment(inverted=True)
    tel.scale = telomere_length
    chrom_representation.add(tel)
    my_genome.add(chrom_representation)
my_genome.draw("falciparum.png", "Plasmodium falciparum")
Image("falciparum.png")
# -
================================================
FILE: Chapter06/.gitignore
================================================
*.log
*.ped
*.map
*.bed
*.bim
*.fam
exclude*.txt
relationships_w_pops_041510.txt
*.in
*.out
================================================
FILE: Chapter06/Admixture.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.3
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
# +
from collections import defaultdict
import os
import matplotlib.pyplot as plt
from genomics.popgen.admix import cluster, plot
# %matplotlib notebook
# -
k_range = range(2, 10) # 2..9
# ### The next cell is very slow. Example outputs are provided (so you can avoid running it)
# +
# Run ADMIXTURE with 10-fold cross-validation for each K, keeping stdout
# (which contains the CV error) in admix.<K>.
#for k in k_range:
#    os.system('admixture --cv=10 hapmap10_auto_noofs_ld.bed %d > admix.%d' % (k, k))
# -
# ## Individual order
# Read (family id, individual id) pairs in the order PLINK stored them;
# ADMIXTURE's Q rows follow this same order.
f = open('hapmap10_auto_noofs_ld.fam')
ind_order = []
for l in f:
    toks = l.rstrip().replace(' ', '\t').split('\t')
    fam_id = toks[0]
    ind_id = toks[1]
    ind_order.append((fam_id, ind_id))
f.close()
# ## CV-plot
# Pull the 'CV error' line out of each ADMIXTURE log and plot error vs K.
CVs = []
for k in k_range:
    f = open('admix.%d' % k)
    for l in f:
        if l.find('CV error') > -1:
            CVs.append(float(l.rstrip().split(' ')[-1]))
            break
    f.close()
fig = plt.figure(figsize=(16, 9))
ax = fig.add_subplot(111)
ax.plot(k_range, CVs)
ax.set_title('Cross-Validation error')
ax.set_xlabel('K')
# ## Load meta-data
# Group individuals by population, keeping only founders (no known
# parents) that are present in the PLINK file.
f = open('relationships_w_pops_121708.txt')
pop_ind = defaultdict(list)
f.readline() # header
for l in f:
    toks = l.rstrip().split('\t')
    fam_id = toks[0]
    ind_id = toks[1]
    if (fam_id, ind_id) not in ind_order:
        continue
    mom = toks[2]
    dad = toks[3]
    if mom != '0' or dad != '0':
        # Offspring: skip, only founders are analyzed.
        continue
    pop = toks[-1]
    pop_ind[pop].append((fam_id, ind_id))
#ind_pop[('2469', 'NA20281')] = ind_pop[('2805', 'NA20281')]
f.close()
def load_Q(fname, ind_order):
    """Read an ADMIXTURE .Q file into a per-individual component dict.

    Each line holds one individual's space-separated ancestry fractions,
    in the same order as ind_order. Returns {ind_order[i]: [float, ...]}.
    """
    with open(fname) as qfile:
        return {ind_order[row]: [float(field) for field in line.rstrip().split(' ')]
                for row, line in enumerate(qfile)}
# Load the Q matrix for every K and order individuals within their
# population for plotting.
comps = {}
for k in k_range:
    comps[k] = load_Q('hapmap10_auto_noofs_ld.%d.Q' % k, ind_order)
ordering = {}
for k in k_range:
    ordering[k] = cluster(comps[k], pop_ind)
# Single-K plot (K=4) and a stacked plot across all Ks (ordered by K=7).
fig = plt.figure(figsize=(9, 9))
plot.single(comps[4], ordering[4], fig)
None
fig = plt.figure(figsize=(16, 9))
plot.stacked(comps, ordering[7], fig)
# ## Q files?
# ## Log-likelihood
================================================
FILE: Chapter06/Data_Formats.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.8
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# ## Data download
# +
# !wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz
# !wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz
# !wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/relationships_w_pops_041510.txt
# -
# !gzip -d hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz
# !gzip -d hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz
# # Preparation
import os
from collections import defaultdict
# ## Loading HapMap meta-data
# Group individuals by population and separately collect the offspring
# (rows with a known mother or father).
f = open('relationships_w_pops_041510.txt')
pop_ind = defaultdict(list)
f.readline() # header
offspring = []
for l in f:
    toks = l.rstrip().split('\t')
    fam_id = toks[0]
    ind_id = toks[1]
    mom = toks[2]
    dad = toks[3]
    if mom != '0' or dad != '0':
        offspring.append((fam_id, ind_id))
    pop = toks[-1]
    pop_ind[pop].append((fam_id, ind_id))
f.close()
# ## Sub-sampling
# Thin the dataset to 10% and 1% of markers, dropping SNPs with >10%
# missing genotypes.
os.system('plink2 --pedmap hapmap3_r3_b36_fwd.consensus.qc.poly --out hapmap10 --thin 0.1 --geno 0.1 --export ped')
os.system('plink2 --pedmap hapmap3_r3_b36_fwd.consensus.qc.poly --out hapmap1 --thin 0.01 --geno 0.1 --export ped')
# ## Getting only autosomal data
def get_non_auto_SNPs(map_file, exclude_file):
    """Write the rs ids of all non-autosomal SNPs in a PLINK .map file
    to exclude_file (one id per line), for use with plink --exclude.

    Autosomes are recognized by a purely numeric chromosome name;
    anything else (X, Y, XY, MT, ...) is treated as non-autosomal.
    """
    # Original leaked the input handle; 'with' closes both files.
    with open(map_file) as f, open(exclude_file, 'w') as w:
        for l in f:
            toks = l.rstrip().split('\t')
            try:
                int(toks[0])  # numeric chromosome -> autosome, keep it
            except ValueError:
                # Non-numeric chromosome name -> exclude this SNP id
                w.write('%s\n' % toks[1])
get_non_auto_SNPs('hapmap10.map', 'exclude10.txt')
get_non_auto_SNPs('hapmap1.map', 'exclude1.txt')
# !plink2 --pedmap hapmap10 --out hapmap10_auto --exclude exclude10.txt --export ped
# !plink2 --pedmap hapmap1 --out hapmap1_auto --exclude exclude1.txt --export ped
# ## Removing offspring
# !plink2 --pedmap hapmap10_auto --filter-founders --out hapmap10_auto_noofs --export ped
# ## LD-pruning
# !plink2 --pedmap hapmap10_auto_noofs --indep-pairwise 50 10 0.1 --out keep --export ped
# !plink2 --pedmap hapmap10_auto_noofs --extract keep.prune.in --out hapmap10_auto_noofs_ld --export ped
# ## Different encoding
# !plink2 --pedmap hapmap10_auto_noofs_ld --out hapmap10_auto_noofs_ld_12 --export ped 12
# !plink2 --make-bed --pedmap hapmap10_auto_noofs_ld --out hapmap10_auto_noofs_ld
# ## Single chromosome
# !plink2 --pedmap hapmap10_auto_noofs --chr 2 --out hapmap10_auto_noofs_2 --export ped
================================================
FILE: Chapter06/Exploratory_Analysis.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.8
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# ## Loading HapMap data
# +
import numpy as np
import xarray as xr
import sgkit as sg
from sgkit.io import plink
data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\t')
# -
data
print(data.dims)
variant_stats = sg.variant_stats(data)
variant_stats
variant_stats.variant_call_rate.to_series().describe()
print(type(variant_stats.variant_call_rate.to_series()))
sample_stats = sg.sample_stats(data)
sample_stats
sample_stats.sample_call_rate.to_series().hist()
data['sample_cohort'] = xr.DataArray(
np.zeros(data.dims['samples'], dtype=np.int64),
dims='samples')
# data["sample_cohort"] = xr.DataArray(np.repeat([0, 1], data.dims["samples"] // 2), dims="samples")
sg.cohort_allele_frequencies(data)['cohort_allele_frequency'][:,:,0].values
sg.cohort_allele_frequencies(data)['cohort_allele_frequency'][:,:,0].to_series().hist()
# # maf
cohort_allele_frequency = sg.cohort_allele_frequencies(data)['cohort_allele_frequency'].values
min_freqs = map(
lambda x: x if x < 0.5 else 1 - x,
filter(
lambda x: x not in [0, 1],
cohort_allele_frequency[:, 0, 0]))
================================================
FILE: Chapter06/PCA.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.3
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# + jupyter={"outputs_hidden": false}
import os
from genomics.popgen.plink.convert import to_eigen
from genomics.popgen.pca import plot, smart
# %matplotlib inline
# -
# ## Meta-data load
# + jupyter={"outputs_hidden": false}
f = open('relationships_w_pops_121708.txt')
ind_pop = {}
f.readline() # header
for l in f:
toks = l.rstrip().split('\t')
fam_id = toks[0]
ind_id = toks[1]
pop = toks[-1]
ind_pop['/'.join([fam_id, ind_id])] = pop
f.close()
ind_pop['2469/NA20281'] = ind_pop['2805/NA20281']
# -
# ## Requires plink from data preparation
# + jupyter={"outputs_hidden": false}
to_eigen('hapmap10_auto_noofs_ld_12', 'hapmap10_auto_noofs_ld_12')
# -
# ## Running smartpca
# + jupyter={"outputs_hidden": false}
ctrl = smart.SmartPCAController('hapmap10_auto_noofs_ld_12')
ctrl.run()
# + jupyter={"outputs_hidden": false}
wei, wei_perc, ind_comp = smart.parse_evec('hapmap10_auto_noofs_ld_12.evec', 'hapmap10_auto_noofs_ld_12.eval')
# + jupyter={"outputs_hidden": false}
plot.render_pca(ind_comp, 1, 2, cluster=ind_pop)
#put weights
# + jupyter={"outputs_hidden": false}
plot.render_pca_eight(ind_comp, cluster=ind_pop)
# + jupyter={"outputs_hidden": false}
markers = { 'CHB': '*', 'CHD': '*', 'JPT': '*', 'GIH': '*',
'CEU': 'v', 'TSI': 'v', 'MEX': 'v',
'ASW': 'o', 'LWK': 'o', 'YRI': 'o', 'MKK': 'o'
}
# -
# ## With scikit-learn
# + jupyter={"outputs_hidden": false}
from sklearn.decomposition import PCA
import numpy as np
# + jupyter={"outputs_hidden": false}
# Scan the 1/2-encoded PED once to learn its dimensions and row order.
f = open('hapmap10_auto_noofs_ld_12.ped')
ninds = 0  # number of individuals (= number of PED lines)
ind_order = []  # 'fam_id/ind_id' per line, in file order
for line in f:
    ninds += 1
    # Only the first 100 characters are parsed here: the family and
    # individual ids sit at the front and full PED lines are very long.
    toks = line[:100].replace(' ', '\t').split('\t') # for speed
    fam_id = toks[0]
    ind_id = toks[1]
    ind_order.append('%s/%s' % (fam_id, ind_id))
# After the loop 'line' still holds the LAST PED row; each SNP occupies
# two allele columns after the 6 leading metadata columns.
nsnps = (len(line.replace(' ', '\t').split('\t')) - 6) // 2
print (nsnps)
f.close()
# + jupyter={"outputs_hidden": false}
# Build the genotype matrix: one row per individual, one column per SNP.
pca_array = np.empty((ninds, nsnps), dtype=int)
print(pca_array.shape)
f = open('hapmap10_auto_noofs_ld_12.ped')
for ind, line in enumerate(f):
    snps = line.replace(' ', '\t').split('\t')[6:]
    for pos in range(len(snps) // 2):
        # Alleles are encoded 1/2; a1 + a2 - 2 yields the 0/1/2 code.
        a1 = int(snps[2 * pos])
        a2 = int(snps[2 * pos + 1])  # BUGFIX: was int(snps[2 * pos]), reading the first allele twice
        my_code = a1 + a2 - 2
        pca_array[ind, pos] = my_code
f.close()
#slow
# + jupyter={"outputs_hidden": false}
my_pca = PCA(n_components=8)
my_pca.fit(pca_array)
trans = my_pca.transform(pca_array)
#Memory required
# + jupyter={"outputs_hidden": false}
sc_ind_comp = {}
for i, ind_pca in enumerate(trans):
sc_ind_comp[ind_order[i]] = ind_pca
plot.render_pca_eight(sc_ind_comp, cluster=ind_pop)
# + jupyter={"outputs_hidden": false}
================================================
FILE: Chapter06/Pop_Stats.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.8
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# ## Loading HapMap meta-data
# +
from collections import defaultdict
from pprint import pprint
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import xarray as xr
import sgkit as sg
from sgkit.io import plink
data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\t')
# -
data
f = open('relationships_w_pops_041510.txt')
pop_ind = defaultdict(list)
f.readline() # header
for line in f:
toks = line.rstrip().split('\t')
fam_id = toks[0]
ind_id = toks[1]
pop = toks[-1]
pop_ind[pop].append((fam_id, ind_id))
pops = list(pop_ind.keys())
def assign_cohort(pops, pop_ind, sample_family_id, sample_id):
    """Map each sample to the index of the population containing it.

    pops: ordered list of population names.
    pop_ind: dict mapping population name -> list of
        (family_id, individual_id) tuples in that population.
    sample_family_id, sample_id: parallel sequences, one entry per
        sample, in the desired output order.

    Returns a list with, for every sample, the index into pops of the
    first population that lists it.

    Raises Exception if a sample is not found in any population.
    """
    # Invert pop_ind once so each sample is an O(1) dict lookup instead
    # of a linear scan over every population's membership list.
    ind_cohort = {}
    for i, pop in enumerate(pops):
        for key in pop_ind[pop]:
            ind_cohort.setdefault(key, i)  # first population wins
    cohort = []
    for fid, sid in zip(sample_family_id, sample_id):
        try:
            cohort.append(ind_cohort[(fid, sid)])
        except KeyError:
            raise Exception(f'Not processed {fid}, {sid}')
    return cohort
cohort = assign_cohort(pops, pop_ind, data.sample_family_id.values, data.sample_id.values)
data['sample_cohort'] = xr.DataArray(
cohort, dims='samples')
# # monomorphic positions per pop
cohort_allele_frequency = sg.cohort_allele_frequencies(data)['cohort_allele_frequency'].values
monom = {}
for i, pop in enumerate(pops):
monom[pop] = len(list(filter(lambda x: x, np.isin(cohort_allele_frequency[:, i, 0], [0, 1]))))
pprint(monom)
# # MAF
mafs = {}
for i, pop in enumerate(pops):
min_freqs = map(
lambda x: x if x < 0.5 else 1 - x,
filter(
lambda x: x not in [0, 1],
cohort_allele_frequency[:, i, 0]))
mafs[pop] = pd.Series(min_freqs)
maf_plot, maf_ax = plt.subplots(nrows=2, sharey=True)
mafs['YRI'].hist(ax=maf_ax[0], bins=50)
maf_ax[0].set_title('*YRI*')
mafs['JPT'].hist(ax=maf_ax[1], bins=50)
maf_ax[1].set_title('*JPT*')
maf_ax[1].set_xlabel('MAF')
# # Fst
fst = sg.Fst(data)
fst = fst.assign_coords({"cohorts_0": pops, "cohorts_1": pops})
remove_nan = lambda data: filter(lambda x: not np.isnan(x), data)
ceu_chb = pd.Series(remove_nan(fst.stat_Fst.sel(cohorts_0='CEU', cohorts_1='CHB').values))
chb_chd = pd.Series(remove_nan(fst.stat_Fst.sel(cohorts_0='CHB', cohorts_1='CHD').values))
ceu_chb.describe()
chb_chd.describe()
mean_fst = {}
for i, pop_i in enumerate(pops):
for j, pop_j in enumerate(pops):
if j <= i:
continue
pair_fst = pd.Series(remove_nan(fst.stat_Fst.sel(cohorts_0=pop_i, cohorts_1=pop_j).values))
mean = pair_fst.mean()
mean_fst[(pop_i, pop_j)] = mean
min_pair = min(mean_fst.values())
max_pair = max(mean_fst.values())
sns.set_style("white")
num_pops = len(pops)
arr = np.ones((num_pops - 1, num_pops - 1, 3), dtype=float)
fig = plt.figure(figsize=(16, 9))
ax = fig.add_subplot(111)
for row in range(num_pops - 1):
pop_i = pops[row]
for col in range(row + 1, num_pops):
pop_j = pops[col]
val = mean_fst[(pop_i, pop_j)]
norm_val = (val - min_pair) / (max_pair - min_pair)
ax.text(col - 1, row, '%.3f' % val, ha='center')
if norm_val == 0.0:
arr[row, col - 1, 0] = 1
arr[row, col - 1, 1] = 1
arr[row, col - 1, 2] = 0
elif norm_val == 1.0:
arr[row, col - 1, 0] = 1
arr[row, col - 1, 1] = 0
arr[row, col - 1, 2] = 1
else:
arr[row, col - 1, 0] = 1 - norm_val
arr[row, col - 1, 1] = 1
arr[row, col - 1, 2] = 1
ax.imshow(arr, interpolation='none')
ax.set_title('Multilocus Pairwise FST')
ax.set_xticks(range(num_pops - 1))
ax.set_xticklabels(pops[1:])
ax.set_yticks(range(num_pops - 1))
ax.set_yticklabels(pops[:-1])
================================================
FILE: Chapter06/Sgkit.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.8
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
import os
from collections import defaultdict
# ## Loading HapMap data
# +
import numpy as np
from sgkit.io import plink
data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\t')
# -
data
print(data.dims)
print(len(data.sample_id.values))
print(data.sample_id.values)
print(data.sample_family_id.values)
print(data.sample_sex.values)
print(data.contigs)
print(len(data.variant_contig.values))
print(data.variant_contig.values)
print(data.variant_position.values)
print(data.variant_allele.values)
print(data.variant_id.values)
data.call_genotype
call_genotype = data.call_genotype.values
print(call_genotype.shape)
first_individual = call_genotype[:,0,:]
first_variant = call_genotype[0,:,:]
first_variant_of_first_individual = call_genotype[0,0,:]
print(first_variant_of_first_individual)
print(data.sample_family_id.values[0], data.sample_id.values[0])
print(data.variant_allele.values[0])
================================================
FILE: Chapter07/.gitignore
================================================
*fasta
trim.fasta.reduced
*nex
bp_rx
================================================
FILE: Chapter07/Alignment.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.14.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
import os
import dendropy
# -
# ## Genome alignment
from Bio.Align.Applications import MafftCommandline
mafft_cline = MafftCommandline(input='sample.fasta', ep=0.123, reorder=True, maxiterate=1000, localpair=True)
print(mafft_cline)
stdout, stderr = mafft_cline()
with open('align.fasta', 'w') as w:
w.write(stdout)
os.system('trimal -automated1 -in align.fasta -out trim.fasta -fasta')
# ## Protein alignment
# +
from Bio.Align.Applications import MuscleCommandline
my_genes = ['NP', 'L', 'VP35', 'VP40']
for gene in my_genes:
muscle_cline = MuscleCommandline(input='%s_P.fasta' % gene)
print(muscle_cline)
stdout, stderr = muscle_cline()
with open('%s_P_align.fasta' % gene, 'w') as w:
w.write(stdout)
# +
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# XXX vvv
# from Bio.Alphabet import generic_protein
for gene in my_genes:
gene_seqs = {}
unal_gene = SeqIO.parse('%s.fasta' % gene, 'fasta')
for rec in unal_gene:
gene_seqs[rec.id] = rec.seq
al_prot = SeqIO.parse('%s_P_align.fasta' % gene, 'fasta')
al_genes = []
for protein in al_prot:
my_id = protein.id
seq = ''
pos = 0
for c in protein.seq:
if c == '-':
seq += '---'
else:
seq += str(gene_seqs[my_id][pos:pos + 3])
pos += 3
al_genes.append(SeqRecord(Seq(seq), id=my_id))
SeqIO.write(al_genes, '%s_align.fasta' % gene, 'fasta')
# -
================================================
FILE: Chapter07/Comparison.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.6
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
import os
from collections import OrderedDict
import numpy as np
import pandas as pd
import dendropy
from dendropy.calculate import popgenstat
# -
# ## Genes
# +
genes_species = OrderedDict()
my_species = ['RESTV', 'SUDV']
my_genes = ['NP', 'L', 'VP35', 'VP40']
for name in my_genes:
gene_name = name.split('.')[0]
char_mat = dendropy.DnaCharacterMatrix.get_from_path('%s_align.fasta' % name, 'fasta')
genes_species[gene_name] = {}
for species in my_species:
genes_species[gene_name][species] = dendropy.DnaCharacterMatrix()
for taxon, char_map in char_mat.items():
species = taxon.label.split('_')[0]
if species in my_species:
genes_species[gene_name][species].taxon_namespace.add_taxon(taxon)
genes_species[gene_name][species][taxon] = char_map
# -
summary = np.ndarray(shape=(len(genes_species), 4 * len(my_species)))
stats = ['seg_sites', 'nuc_div', 'taj_d', 'wat_theta']
for row, (gene, species_data) in enumerate(genes_species.items()):
for col_base, species in enumerate(my_species):
summary[row, col_base * 4] = popgenstat.num_segregating_sites(species_data[species])
summary[row, col_base * 4 + 1] = popgenstat.nucleotide_diversity(species_data[species])
summary[row, col_base * 4 + 2] = popgenstat.tajimas_d(species_data[species])
summary[row, col_base * 4 + 3] = popgenstat.wattersons_theta(species_data[species])
columns = []
for species in my_species:
columns.extend(['%s (%s)' % (stat, species) for stat in stats])
df = pd.DataFrame(summary, index=genes_species.keys(), columns=columns)
df # vs print(df)
# ## Genomes
def do_basic_popgen(seqs):
    """Print basic population-genetics summary statistics for a
    dendropy DNA character matrix."""
    seg_sites = popgenstat.num_segregating_sites(seqs)
    pairwise = popgenstat.average_number_of_pairwise_differences(seqs)
    diversity = popgenstat.nucleotide_diversity(seqs)
    print('Segregating sites: %d, Avg pairwise diffs: %.2f, Nucleotide diversity %.6f' % (seg_sites, pairwise, diversity))
    print("Watterson's theta: %s" % popgenstat.wattersons_theta(seqs))
    print("Tajima's D: %s" % popgenstat.tajimas_d(seqs))
#XXX change
ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path(
'trim.fasta', schema='fasta', data_type='dna')
sl_2014 = []
drc_2007 = []
ebov2007_set = dendropy.DnaCharacterMatrix()
ebov2014_set = dendropy.DnaCharacterMatrix()
for taxon, char_map in ebov_seqs.items():
print(taxon.label)
if taxon.label.startswith('EBOV_2014') and len(sl_2014) < 8:
sl_2014.append(char_map)
ebov2014_set.taxon_namespace.add_taxon(taxon)
ebov2014_set[taxon] = char_map
elif taxon.label.startswith('EBOV_2007'):
drc_2007.append(char_map)
ebov2007_set.taxon_namespace.add_taxon(taxon)
ebov2007_set[taxon] = char_map
#ebov2007_set.extend_map({taxon: char_map})
del ebov_seqs
# +
print('2007 outbreak:')
print('Number of individuals: %s' % len(ebov2007_set.taxon_namespace))
do_basic_popgen(ebov2007_set)
print('\n2014 outbreak:')
print('Number of individuals: %s' % len(ebov2014_set.taxon_namespace))
do_basic_popgen(ebov2014_set)
# -
print(len(sl_2014))
print(len(drc_2007))
pair_stats = popgenstat.PopulationPairSummaryStatistics(sl_2014, drc_2007)
print('Average number of pairwise differences irrespective of population: %.2f' %
pair_stats.average_number_of_pairwise_differences)
print('Average number of pairwise differences between populations: %.2f' %
pair_stats.average_number_of_pairwise_differences_between)
print('Average number of pairwise differences within populations: %.2f' %
pair_stats.average_number_of_pairwise_differences_within)
print('Average number of net pairwise differences : %.2f' %
pair_stats.average_number_of_pairwise_differences_net)
print('Number of segregating sites: %d' %
pair_stats.num_segregating_sites)
print("Watterson's theta: %.2f" %
pair_stats.wattersons_theta)
print("Wakeley's Psi: %.3f" % pair_stats.wakeleys_psi)
print("Tajima's D: %.2f" % pair_stats.tajimas_d)
================================================
FILE: Chapter07/Exploration.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.6
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
import dendropy
from dendropy.interop import genbank
# ## Getting the data
# +
def get_ebov_2014_sources():
#EBOV_2014
#yield 'EBOV_2014', genbank.GenBankDna(id_range=(233036, 233118), prefix='KM')
yield 'EBOV_2014', genbank.GenBankDna(id_range=(34549, 34563), prefix='KM0')
def get_other_ebov_sources():
#EBOV other
yield 'EBOV_1976', genbank.GenBankDna(ids=['AF272001', 'KC242801'])
yield 'EBOV_1995', genbank.GenBankDna(ids=['KC242796', 'KC242799'])
yield 'EBOV_2007', genbank.GenBankDna(id_range=(84, 90), prefix='KC2427')
def get_other_ebolavirus_sources():
#BDBV
yield 'BDBV', genbank.GenBankDna(id_range=(3, 6), prefix='KC54539')
yield 'BDBV', genbank.GenBankDna(ids=['FJ217161'])
#RESTV
yield 'RESTV', genbank.GenBankDna(ids=['AB050936', 'JX477165', 'JX477166', 'FJ621583', 'FJ621584', 'FJ621585'])
#SUDV
yield 'SUDV', genbank.GenBankDna(ids=['KC242783', 'AY729654', 'EU338380',
'JN638998', 'FJ968794', 'KC589025', 'JN638998'])
#yield 'SUDV', genbank.GenBankDna(id_range=(89, 92), prefix='KC5453')
#TAFV
yield 'TAFV', genbank.GenBankDna(ids=['FJ217162'])
# +
other = open('other.fasta', 'w')
sampled = open('sample.fasta', 'w')
for species, recs in get_other_ebolavirus_sources():
tn = dendropy.TaxonNamespace()
char_mat = recs.generate_char_matrix(taxon_namespace=tn,
gb_to_taxon_fn=lambda gb: tn.require_taxon(label='%s_%s' % (species, gb.accession)))
char_mat.write_to_stream(other, 'fasta')
char_mat.write_to_stream(sampled, 'fasta')
other.close()
ebov_2014 = open('ebov_2014.fasta', 'w')
ebov = open('ebov.fasta', 'w')
for species, recs in get_ebov_2014_sources():
tn = dendropy.TaxonNamespace()
char_mat = recs.generate_char_matrix(taxon_namespace=tn,
gb_to_taxon_fn=lambda gb: tn.require_taxon(label='EBOV_2014_%s' % gb.accession))
char_mat.write_to_stream(ebov_2014, 'fasta')
char_mat.write_to_stream(sampled, 'fasta')
char_mat.write_to_stream(ebov, 'fasta')
ebov_2014.close()
ebov_2007 = open('ebov_2007.fasta', 'w')
for species, recs in get_other_ebov_sources():
tn = dendropy.TaxonNamespace()
char_mat = recs.generate_char_matrix(taxon_namespace=tn,
gb_to_taxon_fn=lambda gb: tn.require_taxon(label='%s_%s' % (species, gb.accession)))
char_mat.write_to_stream(ebov, 'fasta')
char_mat.write_to_stream(sampled, 'fasta')
if species == 'EBOV_2007':
char_mat.write_to_stream(ebov_2007, 'fasta')
ebov.close()
ebov_2007.close()
sampled.close()
# -
# ## Genes
# +
my_genes = ['NP', 'L', 'VP35', 'VP40']

def dump_genes(species, recs, g_hdls, p_hdls):
    """Write nucleotide and protein sequences of the genes of interest
    (my_genes) found in GenBank records to per-gene FASTA handles.

    species: species label used in the FASTA headers.
    recs: iterable of GenBank records (dendropy genbank interop).
    g_hdls: dict gene name -> writable handle for the nucleotide FASTA.
        (BUGFIX: the parameter was misspelled 'g_dls' while the body
        used 'g_hdls', silently resolving to a module-level global.)
    p_hdls: dict gene name -> writable handle for the protein FASTA.
    """
    for rec in recs:
        for feature in rec.feature_table:
            if feature.key != 'CDS':
                continue
            gene_name = None
            for qual in feature.qualifiers:
                if qual.name == 'gene':
                    if qual.value in my_genes:
                        gene_name = qual.value
                elif qual.name == 'translation':
                    protein_translation = qual.value
            if gene_name is not None:
                # Location has the form 'start..end' (1-based, inclusive)
                locs = feature.location.split('.')
                start, end = int(locs[0]), int(locs[-1])
                g_hdls[gene_name].write('>%s_%s\n' % (species, rec.accession))
                p_hdls[gene_name].write('>%s_%s\n' % (species, rec.accession))
                g_hdls[gene_name].write('%s\n' % rec.sequence_text[start - 1 : end])
                p_hdls[gene_name].write('%s\n' % protein_translation)
g_hdls = {}
p_hdls = {}
for gene in my_genes:
g_hdls[gene] = open('%s.fasta' % gene, 'w')
p_hdls[gene] = open('%s_P.fasta' % gene, 'w')
for species, recs in get_other_ebolavirus_sources():
if species in ['RESTV', 'SUDV']:
dump_genes(species, recs, g_hdls, p_hdls)
for gene in my_genes:
g_hdls[gene].close()
p_hdls[gene].close()
# -
# ## Genome exploration
def describe_seqs(seqs):
    """Print a short summary (sequence count, first taxon labels, and
    ungapped length statistics) of a dendropy DnaCharacterMatrix."""
    taxa = seqs.taxon_namespace
    print('Number of sequences: %d' % len(taxa))
    first_labels = ' '.join(taxon.label for taxon in taxa[:10])
    print('First 10 taxon sets: %s' % first_labels)
    # Count only non-gap symbols so alignment padding is ignored.
    lens = [sum(1 for symbol in seq.symbols_as_list() if symbol != '-')
            for _, seq in seqs.items()]
    print('Genome length: min %d, mean %.1f, max %d' % (min(lens), sum(lens) / len(lens), max(lens)))
ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path('ebov.fasta', schema='fasta', data_type='dna')
print('EBOV')
describe_seqs(ebov_seqs)
del ebov_seqs
print('ebolavirus sequences')
ebolav_seqs = dendropy.DnaCharacterMatrix.get_from_path('other.fasta', schema='fasta', data_type='dna')
describe_seqs(ebolav_seqs)
from collections import defaultdict
species = defaultdict(int)
for taxon in ebolav_seqs.taxon_namespace:
toks = taxon.label.split('_')
my_species = toks[0]
if my_species == 'EBOV':
ident = '%s (%s)' % (my_species, toks[1])
else:
ident = my_species
species[ident] += 1
for my_species, cnt in species.items():
print("%20s: %d" % (my_species, cnt))
del ebolav_seqs
# ## Genes
# +
import os
gene_length = {}
my_genes = ['NP', 'L', 'VP35', 'VP40']
for name in my_genes:
gene_name = name.split('.')[0]
seqs = dendropy.DnaCharacterMatrix.get_from_path('%s.fasta' % name, schema='fasta', data_type='dna')
gene_length[gene_name] = []
for tax, seq in seqs.items():
gene_length[gene_name].append(len([x for x in seq.symbols_as_list() if x != '-']))
for gene, lens in gene_length.items():
print ('%6s: %d' % (gene, sum(lens) / len(lens)))
# -
================================================
FILE: Chapter07/Reconstruction.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.6
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
import os
import random
import shutil
import sys
import dendropy
from dendropy.interop import raxml
# -
ebola_data = dendropy.DnaCharacterMatrix.get_from_path('trim.fasta', 'fasta')
rx = raxml.RaxmlRunner()
ebola_tree = rx.estimate_tree(ebola_data, ['-m', 'GTRGAMMA', '-N', '10'])
print('RAxML temporary directory: %s' % rx.working_dir_path)
del ebola_data
ebola_tree.write_to_path('my_ebola.nex', 'nexus')
# +
import matplotlib.pyplot as plt
from Bio import Phylo
# # %matplotlib inline
my_ebola_tree = Phylo.read('my_ebola.nex', 'nexus')
my_ebola_tree.name = 'Our Ebolavirus tree'
fig = plt.figure(figsize=(16, 18))
ax = fig.add_subplot(1, 1, 1)
Phylo.draw(my_ebola_tree, axes=ax)
# -
# ## RAxML with Biopython
# XXX change
from Bio.Phylo.Applications import RaxmlCommandline
raxml_cline = RaxmlCommandline(sequences='trim.fasta',
model='GTRGAMMA', name='biopython',
num_replicates='10',
parsimony_seed=random.randint(0, sys.maxsize),
working_dir=os.getcwd() + os.sep + 'bp_rx')
print(raxml_cline)
try:
os.mkdir('bp_rx')
except OSError:
shutil.rmtree('bp_rx')
os.mkdir('bp_rx')
out, err = raxml_cline()
from Bio import Phylo
biopython_tree = Phylo.read('bp_rx/RAxML_bestTree.biopython', 'newick')
print(biopython_tree)
================================================
FILE: Chapter07/Selection.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
# +
### XXX This is probably to remove
# -
sl_2014 = []
drc_2007 = []
for seq in ebola_seqs.taxon_set:
if seq.label.startswith('EBOV_2014') and len(sl_2014) < 8:
sl_2014.append(ebola_seqs[seq])
elif seq.label.startswith('EBOV_2007'):
drc_2007.append(ebola_seqs[seq])
print(len(sl_2014))
print(len(drc_2007))
pair_stats = popgenstat.PopulationPairSummaryStatistics(sl_2014, drc_2007)
print('Average number of pairwise differences (total): %s' %
pair_stats.average_number_of_pairwise_differences)
print('Average number of pairwise differences between populations: %s' %
pair_stats.average_number_of_pairwise_differences_between)
print('Average number of pairwise differences within populations: %s' %
pair_stats.average_number_of_pairwise_differences_within)
print('Average number of new pairwise differences : %s' %
pair_stats.average_number_of_pairwise_differences_net)
print('Number of segregating sites: %s' %
pair_stats.num_segregating_sites)
print("Watterson's theta: %s" %
pair_stats.wattersons_theta)
print("Wakeley's Psi: %s" % pair_stats.wakeleys_psi)
print("Tajima's D: %s" % pair_stats.tajimas_d)
================================================
FILE: Chapter07/Trees.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.6
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
import dendropy
ebola_raxml = dendropy.Tree.get_from_path('my_ebola.nex', 'nexus')
# +
def compute_level(node, level=0):
    """Recursively print, for every taxon-bearing node, the taxon, the
    node's own level() and the depth accumulated by this traversal."""
    for kid in node.child_nodes():
        compute_level(kid, level + 1)
    taxon = node.taxon
    if taxon is not None:
        print("%s: %d %d" % (taxon, node.level(), level))
compute_level(ebola_raxml.seed_node)
# +
def compute_height(node):
    """Print '<taxon-or-Internal>: <height> <level>' for every node in
    the subtree and return this node's height (leaves have height 0)."""
    kids = node.child_nodes()
    if kids:
        height = 1 + max(compute_height(kid) for kid in kids)
    else:
        height = 0
    label = node.taxon or 'Internal'
    print("%s: %d %d" % (label, height, node.level()))
    return height
compute_height(ebola_raxml.seed_node)
# +
def compute_nofs(node):
    """Print, for every node in the subtree, the number of direct
    children together with the node's level."""
    children = node.child_nodes()
    nofs = len(children)
    # BUGFIX: the original used map(), which is lazy in Python 3, so
    # the recursion never ran and only the root node was reported.
    for child in children:
        compute_nofs(child)
    desc = node.taxon or 'Internal'
    print("%s: %d %d" % (desc, nofs, node.level()))
compute_nofs(ebola_raxml.seed_node)
# +
def print_nodes(node):
    """Depth-first print of every taxon-bearing node as 'taxon (level)'."""
    for descendant in node.child_nodes():
        print_nodes(descendant)
    if node.taxon is not None:
        print('%s (%d)' % (node.taxon, node.level()))
print_nodes(ebola_raxml.seed_node)
# +
from collections import deque
def print_breadth(tree):
    """Breadth-first print of taxon-bearing nodes as 'taxon (level)'.

    NOTE(review): children are only enqueued for nodes WITHOUT a taxon,
    so a labelled internal node would cut off its whole subtree.  For
    the RAxML output used here only leaves carry taxa, so every node is
    visited, but this differs from the recursive print_nodes — confirm
    before reusing on trees with labelled internal nodes.
    """
    queue = deque()
    queue.append(tree.seed_node)
    while len(queue) > 0:
        process_node = queue.popleft()
        if process_node.taxon is not None:
            print('%s (%d)' % (process_node.taxon, process_node.level()))
        else:
            for child in process_node.child_nodes():
                queue.append(child)
print_breadth(ebola_raxml)
# +
from copy import deepcopy
simple_ebola = deepcopy(ebola_raxml)
def simplify_tree(node):
    """Collapse any clade whose leaves all share one species prefix.

    Leaf labels look like 'EBOV_2014 KM034549' or 'SUDV KC242783'; the
    prefix is 'EBOV<year>' for EBOV samples and the species code
    otherwise.  When every leaf below `node` has the same prefix, the
    node becomes a single leaf labelled with that prefix and its
    children are dropped; otherwise recursion continues downwards.
    Mutates the tree in place.
    """
    prefs = set()
    for leaf in node.leaf_nodes():
        # First space-separated token is the species tag, e.g. 'EBOV_2014'
        my_toks = leaf.taxon.label.split(' ')[0].split('_')
        if my_toks[0] == 'EBOV':
            prefs.add('EBOV' + my_toks[1])
        else:
            prefs.add(my_toks[0])
    if len(prefs) == 1:
        # Homogeneous clade: report it and replace with a single leaf.
        print(prefs, len(node.leaf_nodes()))
        node.taxon = dendropy.Taxon(label=list(prefs)[0])
        #node.collapse_clade()
        node.set_child_nodes([])
    else:
        for child in node.child_nodes():
            simplify_tree(child)
simplify_tree(simple_ebola.seed_node)
simple_ebola.ladderize()
simple_ebola.write_to_path('ebola_simple.nex', 'nexus')
# -
================================================
FILE: Chapter07/Visualization.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.6
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
from copy import deepcopy
import matplotlib.pyplot as plt
from Bio import Phylo
ebola_tree = Phylo.read('my_ebola.nex', 'nexus')
ebola_tree.name = 'Ebolavirus tree'
ebola_simple_tree = Phylo.read('ebola_simple.nex', 'nexus')
ebola_simple_tree.name = 'Ebolavirus simplified tree'
Phylo.draw_ascii(ebola_simple_tree)
Phylo.draw_ascii(ebola_tree)
fig = plt.figure(figsize=(16, 22))
ax = fig.add_subplot(111)
Phylo.draw(ebola_simple_tree, axes=ax, branch_labels=
lambda c: c.branch_length if c.branch_length > 0.02 else None)
# +
fig = plt.figure(figsize=(16, 22))
ax = fig.add_subplot(111)
from collections import OrderedDict
# Species prefix -> branch colour; the more specific 'EBOV_2014' entry
# precedes the generic 'EBOV' so it is matched first.
my_colors = OrderedDict({
    'EBOV_2014': 'red',
    'EBOV': 'magenta',
    'BDBV': 'cyan',
    'SUDV': 'blue',
    'RESTV' : 'green',
    'TAFV' : 'yellow'
})

def get_color(name):
    """Return the colour of the first my_colors prefix occurring in
    `name`, or 'grey' when none matches."""
    for prefix, color in my_colors.items():
        if prefix in name:
            return color
    return 'grey'

def color_tree(node, fun_color=get_color):
    """Colour a Biopython clade tree bottom-up: terminals get the
    colour chosen by fun_color(name); an internal node inherits its
    children's colour when they all agree and becomes 'grey' otherwise."""
    if node.is_terminal():
        node.color = fun_color(node.name)
        return
    child_hexes = set()
    for child in node.clades:
        color_tree(child, fun_color)
        child_hexes.add(child.color.to_hex())
    # 'child' still names the last clade; when all hex values agree its
    # colour is representative of every sibling.
    node.color = child.color if len(child_hexes) == 1 else 'grey'
ebola_color_tree = deepcopy(ebola_tree)
color_tree(ebola_color_tree.root)
Phylo.draw(ebola_color_tree, axes=ax, label_func=
lambda x: x.name.split(' ')[0][1:] if x.name is not None else None)
# -
================================================
FILE: Chapter08/.gitignore
================================================
*ent
*fasta
================================================
FILE: Chapter08/Distance.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.8
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
import math
import timeit
from Bio import PDB
# -
repository = PDB.PDBList()
parser = PDB.PDBParser()
repository.retrieve_pdb_file('1TUP', file_format='pdb', pdir='.') # XXX
p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent')
zns = []
for atom in p53_1tup.get_atoms():
if atom.element == 'ZN':
#print(atom, dir(atom), atom.mass, atom.element, atom.coord[0])
zns.append(atom)
for zn in zns:
print(zn, zn.coord)
# +
#Suggest a pymol viewing
# -
#Try this in numba?
def get_closest_atoms(pdb_struct, ref_atom, distance):
    """Return {atom: euclidean distance} for every atom of pdb_struct
    lying strictly within `distance` of ref_atom (the reference atom
    itself is skipped)."""
    neighbors = {}
    ref_x, ref_y, ref_z = ref_atom.coord
    for candidate in pdb_struct.get_atoms():
        if candidate == ref_atom:
            continue
        cx, cy, cz = candidate.coord
        dist = math.sqrt((cx - ref_x)**2 + (cy - ref_y)**2 + (cz - ref_z)**2)
        if dist < distance:
            neighbors[candidate] = dist
    return neighbors
for zn in zns:
print()
print(zn.coord)
atoms = get_closest_atoms(p53_1tup, zn, 4)
for atom, distance in atoms.items():
print(atom.element, distance, atom.coord)
for distance in [1, 2, 4, 8, 16, 32, 64, 128]:
my_atoms = []
for zn in zns:
atoms = get_closest_atoms(p53_1tup, zn, distance)
my_atoms.append(len(atoms))
print(distance, my_atoms)
nexecs = 10
print(timeit.timeit('get_closest_atoms(p53_1tup, zns[0], 4.0)',
'from __main__ import get_closest_atoms, p53_1tup, zns',
number=nexecs) / nexecs * 1000)
def get_closest_alternative(pdb_struct, ref_atom, distance):
    """Variant of get_closest_atoms with a cheap axis-aligned
    bounding-box rejection before the sqrt; returns the same
    {atom: distance} mapping."""
    neighbors = {}
    ref_x, ref_y, ref_z = ref_atom.coord
    for candidate in pdb_struct.get_atoms():
        if candidate == ref_atom:
            continue
        cx, cy, cz = candidate.coord
        # Early reject: outside the cube implies outside the sphere.
        if (abs(cx - ref_x) > distance
                or abs(cy - ref_y) > distance
                or abs(cz - ref_z) > distance):
            continue
        dist = math.sqrt((cx - ref_x)**2 + (cy - ref_y)**2 + (cz - ref_z)**2)
        if dist < distance:
            neighbors[candidate] = dist
    return neighbors
# Time the bounding-box-filtered version (ms per call)
print(timeit.timeit('get_closest_alternative(p53_1tup, zns[0], 4.0)',
                    'from __main__ import get_closest_alternative, p53_1tup, zns',
                    number=nexecs) / nexecs * 1000)
# Compare both implementations across radii: the pre-filter only pays off
# while the radius prunes most atoms
print('Standard')
for distance in [1, 4, 16, 64, 128]:
    print(timeit.timeit('get_closest_atoms(p53_1tup, zns[0], distance)',
                        'from __main__ import get_closest_atoms, p53_1tup, zns, distance',
                        number=nexecs) / nexecs * 1000)
print('Optimized')
for distance in [1, 4, 16, 64, 128]:
    print(timeit.timeit('get_closest_alternative(p53_1tup, zns[0], distance)',
                        'from __main__ import get_closest_alternative, p53_1tup, zns, distance',
                        number=nexecs) / nexecs * 1000)
# +
#for interesting distances
================================================
FILE: Chapter08/Intro.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.8
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
from collections import defaultdict
import requests
from Bio import ExPASy, SwissProt
# -
#explain why not biopython
server = 'https://rest.uniprot.org/uniprotkb/search'
def do_request(server, **kwargs):
    """GET `server` with `kwargs` as the query parameters.

    Returns the requests.Response on success; raises requests.HTTPError
    (via raise_for_status) on an error status code.
    """
    # Removed dead local `params = ''` — requests builds the query string
    # directly from the kwargs dict.
    req = requests.get(server, params=kwargs)
    if not req.ok:
        req.raise_for_status()
    return req
# Query UniProtKB for reviewed human p53 entries, TSV output
req = do_request(server,
                 # 1. Filtering human p53, reviewed entries
                 query='gene:p53 AND reviewed:true AND organism_id:9606',
                 format='tsv',
                 # 2. Specifying output columns with REST API field names
                 fields='accession,id,protein_name,gene_names,organism_name,length',
                 size=50
                 )
print(req.text)
#We might revisit this for KEGG
# +
#XXX - stringio
import pandas as pd
import io
# Parse the TSV response into a DataFrame
uniprot_list = pd.read_table(io.StringIO(req.text))
uniprot_list.rename(columns={'Organism ID': 'ID'},
                    inplace=True)
print(uniprot_list)
# -
# Pick the canonical human p53 accession (P04637) and fetch the full
# SwissProt record through ExPASy
p53_human = uniprot_list[
    (uniprot_list.Entry == 'P04637') &
    (uniprot_list['Entry Name'].str.contains('P53_HUMAN'))]['Entry'].iloc[0]
handle = ExPASy.get_sprot_raw(p53_human)
sp_rec = SwissProt.read(handle)
print(sp_rec.entry_name, sp_rec.sequence_length, sp_rec.gene_name)
print(sp_rec.description)
print(sp_rec.organism, sp_rec.seqinfo)
print(sp_rec.sequence)
print(sp_rec.comments)
print(sp_rec.keywords)
help(sp_rec)
# Print each distinct feature once
done_features = set()
print('Total features:', len(sp_rec.features))
for feature in sp_rec.features:
    if feature in done_features:
        continue
    else:
        done_features.add(feature)
        print(feature)
# Group cross-references by their source database
print('Cross references: ',len(sp_rec.cross_references))
per_source = defaultdict(list)
for xref in sp_rec.cross_references:
    source = xref[0]
    per_source[source].append(xref[1:])
print(per_source.keys())
# annot[1][0] is the first character of the GO term text — presumably the
# aspect letter (C/F/P), so this prints one example per aspect; confirm intent
done_GOs = set()
print('Annotation SOURCES:', len(per_source['GO']))
for annot in per_source['GO']:
    if annot[1][0] in done_GOs:
        continue
    else:
        done_GOs.add(annot[1][0])
        print(annot)
================================================
FILE: Chapter08/Mass.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.8
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
import numpy as np
import pandas as pd
from Bio import PDB
# +
# #!rm -f 1tup.cif 2>/dev/null
# #!wget "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=cif&compression=NO&structureId=1TUP" -O 1tup.cif
#parser = PDB.MMCIFParser()
#p53_1tup = parser.get_structure('P53', '1tup.cif')
# -
# Download 1TUP in legacy PDB format and parse it
repository = PDB.PDBList()
parser = PDB.PDBParser()
repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb')
p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent')
# Collect the hetero-flags present (residue.id[0]): ' ' for standard
# residues, 'W' for waters; presumably 'H_…' entries for the zincs — confirm
my_residues = set()
for residue in p53_1tup.get_residues():
    my_residues.add(residue.id[0])
print(my_residues)
# +
def get_mass(atoms, accept_fun=lambda atom: atom.parent.id[0] != 'W'):
    """Sum the masses of `atoms` for which `accept_fun` returns true.

    The default filter drops waters (parent hetero-flag 'W').
    Returns 0 for an empty iterable.
    """
    # Generator instead of an intermediate list: same result, no allocation.
    return sum(atom.mass for atom in atoms if accept_fun(atom))
chain_names = [chain.id for chain in p53_1tup.get_chains()]
# One row per chain: [non-water mass, hetero (zinc) mass, water mass]
my_mass = np.ndarray((len(chain_names), 3))
for i, chain in enumerate(p53_1tup.get_chains()):
    my_mass[i, 0] = get_mass(chain.get_atoms())
    my_mass[i, 1] = get_mass(chain.get_atoms(), accept_fun=lambda atom: atom.parent.id[0] not in [' ', 'W'])
    my_mass[i, 2] = get_mass(chain.get_atoms(), accept_fun=lambda atom: atom.parent.id[0] == 'W')
masses = pd.DataFrame(my_mass, index=chain_names, columns=['No Water', 'Zincs', 'Water'])
masses
# -
def get_center(atoms, weight_fun=lambda atom: 1 if atom.parent.id[0] != 'W' else 0):
    """Weighted centroid (x, y, z) of `atoms`.

    The default weighting counts every non-water atom as 1 and ignores
    waters (parent hetero-flag 'W').
    """
    totals = [0.0, 0.0, 0.0]
    weight_sum = 0.0
    for atom in atoms:
        weight = weight_fun(atom)
        weight_sum += weight
        for axis, value in enumerate(atom.coord):
            totals[axis] += weight * value
    return totals[0] / weight_sum, totals[1] / weight_sum, totals[2] / weight_sum
# Geometric centre vs mass-weighted centre (waters excluded in both)
print(get_center(p53_1tup.get_atoms()))
print(get_center(p53_1tup.get_atoms(),
                 weight_fun=lambda atom: atom.mass if atom.parent.id[0] != 'W' else 0))
# Per chain: columns 0-2 geometric centre, columns 3-5 centre of mass
my_center = np.ndarray((len(chain_names), 6))
for i, chain in enumerate(p53_1tup.get_chains()):
    x, y, z = get_center(chain.get_atoms())
    my_center[i, 0] = x
    my_center[i, 1] = y
    my_center[i, 2] = z
    x, y, z = get_center(chain.get_atoms(), weight_fun=lambda atom: atom.mass if atom.parent.id[0] != 'W' else 0)
    my_center[i, 3] = x
    my_center[i, 4] = y
    my_center[i, 5] = z
weights = pd.DataFrame(my_center, index=chain_names, columns=['X', 'Y', 'Z', 'X (Mass)', 'Y (Mass)', 'Z (Mass)'])
weights
# +
#Pymol viz
================================================
FILE: Chapter08/PDB.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.8
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
from Bio import PDB
# Download three p53 structures (legacy PDB format) and parse them
repository = PDB.PDBList()
repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb')
repository.retrieve_pdb_file('1OLG', pdir='.', file_format='pdb')
repository.retrieve_pdb_file('1YCQ', pdir='.', file_format='pdb')
parser = PDB.PDBParser()
p53_1tup = parser.get_structure('P 53 - DNA Binding', 'pdb1tup.ent')
p53_1olg = parser.get_structure('P 53 - Tetramerization', 'pdb1olg.ent')
p53_1ycq = parser.get_structure('P 53 - Transactivation', 'pdb1ycq.ent')
# +
def print_pdb_headers(headers, indent=0):
    """Pretty-print a (possibly nested) PDB header mapping.

    Dict values recurse with a larger indent, list values print one
    element per line, everything else prints on a single line.
    """
    ind_text = ' ' * indent
    for header, content in headers.items():
        # isinstance instead of `type(...) == ...`: also accepts subclasses
        # and is the idiomatic type check.
        if isinstance(content, dict):
            print('\n%s%20s:' % (ind_text, header))
            print_pdb_headers(content, indent + 4)
            print()
        elif isinstance(content, list):
            print('%s%20s:' % (ind_text, header))
            for elem in content:
                print('%s%21s %s' % (ind_text, '->', elem))
        else:
            print('%s%20s: %s' % (ind_text, header, content))
print_pdb_headers(p53_1tup.header)
# -
# COMPND records for each structure
print(p53_1tup.header['compound'])
print(p53_1olg.header['compound'])
print(p53_1ycq.header['compound'])
def describe_model(name, pdb):
    """Print per-chain residue and atom counts for every model in `pdb`."""
    print()
    for model in pdb:
        for chain in model:
            print('%s - Chain: %s. Number of residues: %d. Number of atoms: %d.' %
                  (name, chain.id, len(chain), len(list(chain.get_atoms()))))
describe_model('1TUP', p53_1tup)
describe_model('1OLG', p53_1olg)
describe_model('1YCQ', p53_1ycq)
#will go deep in a next recipe (bottom up)
# Non-standard residues only (skip standard residues and waters)
for residue in p53_1tup.get_residues():
    if residue.id[0] in [' ', 'W']:
        continue
    print(residue.id)
# First residue of chain A, its atoms, and direct atom addressing
res = next(p53_1tup[0]['A'].get_residues())
print(res)
for atom in res:
    print(atom, atom.serial_number, atom.element)
print(p53_1tup[0]['A'][94]['CA'])
# +
from Bio.SeqIO import PdbIO, FastaIO
from Bio import SeqIO
def get_fasta(pdb_file, fasta_file, transfer_ids=None):
    """Extract SEQRES records from a PDB file into a FASTA file.

    pdb_file: path of the PDB-format file to read.
    fasta_file: path of the FASTA file to write.
    transfer_ids: optional collection of record ids (e.g. '1TUP:B') to
        keep; when None all non-empty sequences are kept.

    Side effect: prints id, sequence and length of every written record.
    """
    # Open an explicit handle: older Biopython releases only accept a
    # file handle (not a path) in PdbSeqresIterator.
    with open(pdb_file) as pdb_handle:
        records = list(PdbIO.PdbSeqresIterator(pdb_handle))
    if transfer_ids is not None:
        wanted = set(transfer_ids)  # O(1) membership tests
        records = [rec for rec in records if rec.id in wanted and len(rec.seq) > 0]
    else:
        records = [rec for rec in records if len(rec.seq) > 0]
    with open(fasta_file, 'w') as out_handle:
        SeqIO.write(records, out_handle, 'fasta')
    for rec in records:
        print(rec.id, rec.seq, len(rec.seq))
get_fasta('pdb1tup.ent', '1tup.fasta', transfer_ids=['1TUP:B'])
get_fasta('pdb1olg.ent', '1olg.fasta', transfer_ids=['1OLG:B'])
get_fasta('pdb1ycq.ent', '1ycq.fasta', transfer_ids=['1YCQ:B'])
# -
================================================
FILE: Chapter08/Parser.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.3
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
from Bio import PDB
#XXX
repository = PDB.PDBList()
repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb')
# +
# Fixed-column layouts for a few PDB record types: each field is
# (converter, first column, last column) with 0-based inclusive columns
rec_types = {
    #single line
    'HEADER': [(str, 11, 49), (str, 50, 58), (str, 62, 65)],
    #multi_line
    'SOURCE': [(int, 7, 9), (str, 10, 78)],
    #multi_rec
    'LINK' : [(str, 12, 15), (str, 16, 16), (str, 17, 19), (str, 21, 21), (int, 22, 25),
              (str, 26, 26), (str, 42, 45), (str, 46, 46), (str, 47, 49), (str, 51, 51),
              (int, 52, 55), (str, 56, 56), (str, 59, 64), (str, 66, 71), (float, 73, 77)],
    'HELIX': [(int, 7, 9), (str, 11, 13), (str, 15, 17), (str, 19, 19), (int, 21, 24),
              (str, 25, 25), (str, 27, 29), (str, 31, 31),
              (int, 33, 36), (str, 37 ,37), (int, 38, 39), (str, 40, 69), (int, 71, 75)],
    'SHEET': [(int, 7, 9), (str, 11, 13), (int, 14, 15), (str, 17, 19), (str, 21, 21),
              (int, 22, 24), (str, 26, 26), (str, 28, 30),
              (str, 32, 32), (int, 33, 36), (str, 37, 37), (int, 38, 39), (str, 41, 44),
              (str, 45, 47), (str, 49, 49), (int, 50, 53), (str, 54, 54), (str, 56, 59),
              (str, 60, 62), (str, 64, 64), (int, 65, 68), (str, 69, 69)],
}
def parse_pdb(hdl):
    """Tokenize PDB lines using the column layouts in rec_types.

    Yields (record_name, token_list) for recognized record types and
    ('UNKNOWN', raw_line) otherwise; fields that fail conversion
    (e.g. continuation lines) become None.
    """
    for raw in hdl:
        text = raw[:-1]  # drop only the trailing newline, keep other whitespace
        fields = []
        for record_name, layout in rec_types.items():
            if not text.startswith(record_name):
                continue
            for convert, first, last in layout:
                try:
                    fields.append(convert(text[first:last + 1]))
                except ValueError:
                    fields.append(None)  # eg continuation
            yield record_name, fields
        if not fields:
            yield 'UNKNOWN', text
# -
# Dump one example of each recognized record type
hdl = open('pdb1tup.ent')
done_rec = set()
for rec in parse_pdb(hdl):
    if rec[0] == 'UNKNOWN' or rec[0] in done_rec:
        continue
    print(rec)
    done_rec.add(rec[0])
# +
# +
multi_lines = ['SOURCE']
#assume multi is just a string
def process_multi_lines(hdl):
    """Wrap parse_pdb, concatenating consecutive records of the types in
    multi_lines into a single (name, [joined_text]) item; all other
    records pass through unchanged."""
    current_multi = ''
    current_multi_name = None
    for rec_type, toks in parse_pdb(hdl):
        # A different record type ends the multi-line record being built
        if current_multi_name is not None and current_multi_name != rec_type:
            yield current_multi_name, [current_multi]
            current_multi = ''
            current_multi_name = None
        if rec_type in multi_lines:
            # toks[1] holds the record text (toks[0] is the continuation number)
            current_multi += toks[1].strip().rstrip() + ' '
            current_multi_name = rec_type
        else:
            if len(current_multi) != 0:
                # NOTE(review): this branch looks unreachable — current_multi
                # is always flushed above before reaching here; confirm
                yield current_multi_name, [current_multi]
                current_multi = ''
                current_multi_name = None
            yield rec_type, toks
    # Flush a multi-line record that runs to end-of-file
    if len(current_multi) != 0:
        yield current_multi_name, [current_multi]
# -
# Dump one example of each record type, with SOURCE lines merged
hdl = open('pdb1tup.ent')
done_rec = set()
for rec in process_multi_lines(hdl):
    if rec[0] == 'UNKNOWN' or rec[0] in done_rec:
        continue
    print(rec)
    done_rec.add(rec[0])
# +
def get_spec_list(my_str):
    """Parse a PDB specification string ('KEY: value; KEY: value; ...')
    into a dict.

    Robustness fixes versus the original: empty fragments (e.g. from a
    trailing ';') and fragments without a ':' are skipped instead of
    raising IndexError; values containing ':' are kept whole; the
    redundant double `.strip().strip()` is gone.
    """
    #ignoring escape characters
    spec_list = {}
    for elem in my_str.strip().split(';'):
        # partition splits on the FIRST ':' only, preserving colons in values
        key, sep, value = elem.partition(':')
        if not sep or not key.strip():
            continue  # empty or malformed fragment
        spec_list[key.strip()] = value.strip()
    return spec_list
# Register per-token post-processors for structured record types
struct_types = {
    'SOURCE': [get_spec_list]
}
def process_struct_types(hdl):
    """Apply the parsers registered in struct_types to matching records
    from process_multi_lines; zip stops at the shorter of tokens/parsers."""
    for rec_type, toks in process_multi_lines(hdl):
        if rec_type in struct_types.keys():
            funs = struct_types[rec_type]
            struct_toks = []
            for tok, fun in zip(toks, funs):
                struct_toks.append(fun(tok))
            yield rec_type, struct_toks
        else:
            yield rec_type, toks
# -
# Show only the (now structured) SOURCE record
hdl = open('pdb1tup.ent')
for rec in process_struct_types(hdl):
    if rec[0] != 'SOURCE':
        continue
    print(rec)
================================================
FILE: Chapter08/PyMol_Intro.py
================================================
import threading
def dump_thread():
    """Print a blank line followed by every live thread (debug aid)."""
    # Bug fix: the original bare `print` (a Python 2 leftover) is just a
    # reference to the builtin in Python 3 and printed nothing.
    print()
    for thr in threading.enumerate():
        print(thr)
dump_thread()
import pymol
# Launch PyMOL quietly and without a GUI
pymol.pymol_launch=4
pymol.pymol_argv = [ 'pymol', '-qc'] # Quiet / no GUI
from pymol import cmd
pymol.finish_launching()
dump_thread()
# `async` became a reserved word in Python 3.7, hence the plain fetch call
#cmd.fetch('1TUP', async=False)
cmd.fetch('1TUP')
cmd.disable('all')
cmd.enable('1TUP')
cmd.bg_color('white')
cmd.hide('all')
cmd.show('cartoon')
#cmd.hide('cartoon', 'chain E+F')
#cmd.show('ribbon', 'chain E+F')
# Render the zinc ions as spheres
cmd.select('zinc', 'name zn')
cmd.show('sphere', 'zinc')
# Two ray-traced PNG renders with different trace modes
cmd.set('ray_trace_mode', 3)
cmd.png('1TUP.png', width=1980, height=1080, quiet=0, ray=1, prior=False)
dump_thread()
cmd.set('ray_trace_mode', 1)
# NOTE(review): 'TUP.png' may be a typo for '1TUP.png' — confirm
cmd.png('TUP.png', width=1980, height=1080, quiet=0, ray=1, prior=False)
cmd.quit()
================================================
FILE: Chapter08/PyMol_Movie.py
================================================
import pymol
from pymol import cmd
#pymol.pymol_argv = [ 'pymol', '-qc'] # Quiet / no GUI
pymol.finish_launching()
#cmd.fetch('1TUP', async=False)
cmd.fetch('1TUP')
cmd.disable('all')
cmd.enable('1TUP')
cmd.hide('all')
# Scene S0: zincs as spheres, chains A-C as surface, chains E-F as cartoon
cmd.show('sphere', 'name zn')
cmd.show('surface', 'chain A+B+C')
cmd.show('cartoon', 'chain E+F')
cmd.scene('S0', action='store', view=0, frame=0, animate=-1)
# Scene S1: everything as cartoon
cmd.show('cartoon')
cmd.hide('surface')
cmd.scene('S1', action='store', view=0, frame=0, animate=-1)
# Scene S2: chain A as mesh, chains A-C as sticks
cmd.hide('cartoon', 'chain A+B+C')
cmd.show('mesh', 'chain A')
cmd.show('sticks', 'chain A+B+C')
cmd.scene('S2', action='store', view=0, frame=0, animate=-1)
# Build a 500-frame movie interpolating between the stored scenes and
# hand-picked camera views (set_view matrices from the PyMOL GUI)
cmd.set('ray_trace_mode', 0)
cmd.mset(1, 500)
cmd.frame(0)
cmd.scene('S0')
cmd.mview()
cmd.frame(60)
cmd.set_view((-0.175534308, -0.331560850, -0.926960170,
              0.541812420, 0.753615797, -0.372158051,
              0.821965039, -0.567564785, 0.047358301,
              0.000000000, 0.000000000, -249.619018555,
              58.625568390, 15.602619171, 77.781631470,
              196.801528931, 302.436492920, -20.000000000))
cmd.mview()
cmd.frame(90)
cmd.set_view((-0.175534308, -0.331560850, -0.926960170,
              0.541812420, 0.753615797, -0.372158051,
              0.821965039, -0.567564785, 0.047358301,
              -0.000067875, 0.000017881, -249.615447998,
              54.029174805, 26.956727982, 77.124832153,
              196.801528931, 302.436492920, -20.000000000))
cmd.mview()
cmd.frame(150)
cmd.set_view((-0.175534308, -0.331560850, -0.926960170,
              0.541812420, 0.753615797, -0.372158051,
              0.821965039, -0.567564785, 0.047358301,
              -0.000067875, 0.000017881, -55.406421661,
              54.029174805, 26.956727982, 77.124832153,
              2.592475891, 108.227416992, -20.000000000))
cmd.mview()
cmd.frame(200)
cmd.scene('S1')
cmd.mview()
cmd.frame(350)
cmd.scene('S1')
cmd.set_view((0.395763457, -0.173441306, 0.901825786,
              0.915456235, 0.152441502, -0.372427106,
              -0.072881661, 0.972972929, 0.219108686,
              0.000070953, 0.000013039, -37.689743042,
              57.748500824, 14.325904846, 77.241867065,
              -15.123448372, 90.511535645, -20.000000000))
cmd.mview()
cmd.frame(351)
cmd.scene('S2')
cmd.mview()
cmd.frame(500)
cmd.scene('S2')
cmd.mview()
cmd.mplay()
# Write the movie frames out as p53_1tup####.png files
cmd.mpng('p53_1tup')
cmd.quit()
================================================
FILE: Chapter08/Stats.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.8
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
from collections import defaultdict
import sys
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# #%matplotlib inline
from Bio import PDB
# -
repository = PDB.PDBList()
parser = PDB.PDBParser()
repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') #XXX
p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent')
# +
# Count atoms per chain, per element (waters excluded) and per residue type
atom_cnt = defaultdict(int)
atom_chain = defaultdict(int)
atom_res_types = defaultdict(int)
for atom in p53_1tup.get_atoms():
    my_residue = atom.parent
    my_chain = my_residue.parent
    atom_chain[my_chain.id] += 1
    if my_residue.resname != 'HOH':
        atom_cnt[atom.element] += 1
    # NOTE(review): indentation was lost in extraction — residue-type counts
    # are assumed to include waters (loop level, not inside the if); confirm
    atom_res_types[my_residue.resname] += 1
print(dict(atom_res_types))
print(dict(atom_chain))
print(dict(atom_cnt))
# -
# Residue counts by type and by chain
res_types = defaultdict(int)
res_per_chain = defaultdict(int)
for residue in p53_1tup.get_residues():
    res_types[residue.resname] += 1
    res_per_chain[residue.parent.id] +=1
print(dict(res_types))
print(dict(res_per_chain))
def get_bounds(my_atoms):
    """Return (min_xyz, max_xyz) — two 3-element lists bounding the
    atoms' coordinates along each axis.

    Uses float infinities as sentinels: sys.maxsize was an integer
    sentinel, fragile for float coordinates and misleading for an
    empty iterable (infinities make "no atoms seen" explicit).
    """
    my_min = [float('inf')] * 3
    my_max = [float('-inf')] * 3
    for atom in my_atoms:
        for i, coord in enumerate(atom.coord):
            if coord < my_min[i]:
                my_min[i] = coord
            if coord > my_max[i]:
                my_max[i] = coord
    return my_min, my_max
# Bounding boxes per chain and for the whole structure
chain_bounds = {}
for chain in p53_1tup.get_chains():
    print(chain.id, get_bounds(chain.get_atoms()))
    chain_bounds[chain.id] = get_bounds(chain.get_atoms())
print(get_bounds(p53_1tup.get_atoms()))
#matplotlib 3d plot
# One 3-D scatter plus three 2-D projections (X/Y, X/Z, Z/Y)
fig = plt.figure(figsize=(16, 9))
ax3d = fig.add_subplot(111, projection='3d')
ax_xy = fig.add_subplot(331)
ax_xy.set_title('X/Y')
ax_xz = fig.add_subplot(334)
ax_xz.set_title('X/Z')
ax_zy = fig.add_subplot(337)
ax_zy.set_title('Z/Y')
# One color per chain id
color = {'A': 'r', 'B': 'g', 'C': 'b', 'E': '0.5', 'F': '0.75'}
zx, zy, zz = [], [], []
for chain in p53_1tup.get_chains():
    xs, ys, zs = [], [], []
    for residue in chain.get_residues():
        # Use the residue's first atom as its representative position
        ref_atom = next(residue.get_iterator())
        x, y, z = ref_atom.coord
        if ref_atom.element == 'ZN':
            # Collect the zincs separately so they can be highlighted
            zx.append(x)
            zy.append(y)
            zz.append(z)
            continue
        xs.append(x)
        ys.append(y)
        zs.append(z)
    ax3d.scatter(xs, ys, zs, color=color[chain.id])
    ax_xy.scatter(xs, ys, marker='.', color=color[chain.id])
    ax_xz.scatter(xs, zs, marker='.', color=color[chain.id])
    ax_zy.scatter(zs, ys, marker='.', color=color[chain.id])
ax3d.set_xlabel('X')
ax3d.set_ylabel('Y')
ax3d.set_zlabel('Z')
# Zinc ions as black triangles in every panel
ax3d.scatter(zx, zy, zz, color='k', marker='v', s=300)
ax_xy.scatter(zx, zy, color='k', marker='v', s=80)
ax_xz.scatter(zx, zz, color='k', marker='v', s=80)
ax_zy.scatter(zz, zy, color='k', marker='v', s=80)
for ax in [ax_xy, ax_xz, ax_zy]:
    ax.get_yaxis().set_visible(False)
    ax.get_xaxis().set_visible(False)
================================================
FILE: Chapter08/mmCIF.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.3
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
from Bio import PDB
# !rm -f 1tup.cif 2>/dev/null
# !wget "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=cif&compression=NO&structureId=1TUP" -O 1tup.cif
# Parse the mmCIF version of 1TUP
parser = PDB.MMCIFParser()
p53_1tup = parser.get_structure('P53_HUMAN', '1tup.cif')
def describe_model(name, pdb):
    """Print per-chain residue and atom counts for every model in `pdb`.

    Bug fix: the original iterated the global p53_1tup instead of the
    `pdb` parameter, so any other structure passed in was ignored.
    """
    print()
    for model in pdb:
        for chain in model:
            print('%s - Chain: %s. Number of residues: %d. Number of atoms: %d.' %
                  (name, chain.id, len(chain), len(list(chain.get_atoms()))))
describe_model('1TUP', p53_1tup)
# First residue id seen for each chain
done_chain = set()
for residue in p53_1tup.get_residues():
    chain = residue.parent
    if chain.id in done_chain:
        continue
    done_chain.add(chain.id)
    print(chain.id, residue.id)
# The raw mmCIF key/value dictionary, one entry per line
mmcif_dict = PDB.MMCIF2Dict.MMCIF2Dict('1tup.cif')
for k, v in mmcif_dict.items():
    print(k, v)
    print()
================================================
FILE: Chapter09/galaxy/.gitignore
================================================
galaxy.yaml.enc
tool
salt
================================================
FILE: Chapter09/galaxy/LCT.bed
================================================
track name=gene description="Gene information"
2 135836529 135837180 ENSE00002202258 0 -
2 135833110 135833190 ENSE00001660765 0 -
2 135829592 135829676 ENSE00001731451 0 -
2 135823900 135824003 ENSE00001659892 0 -
2 135822019 135822098 ENSE00001777620 0 -
2 135817340 135818061 ENSE00001602826 0 -
2 135812310 135812956 ENSE00000776576 0 -
2 135808442 135809993 ENSE00001008768 0 -
2 135807127 135807396 ENSE00000776573 0 -
2 135804766 135805057 ENSE00000776572 0 -
2 135803929 135804128 ENSE00000776571 0 -
2 135800606 135800809 ENSE00000776570 0 -
2 135798028 135798138 ENSE00003515081 0 -
2 135794640 135794775 ENSE00001630333 0 -
2 135790657 135790881 ENSE00001667885 0 -
2 135789570 135789798 ENSE00001728878 0 -
2 135787839 135788544 ENSE00001653704 0 -
2 135812310 135812959 ENSE00001745158 0 -
2 135808442 135809993 ENSE00001008768 0 -
2 135807127 135807396 ENSE00000776573 0 -
2 135804766 135805057 ENSE00000776572 0 -
2 135803929 135804128 ENSE00000776571 0 -
2 135798028 135798138 ENSE00003459353 0 -
2 135794336 135794775 ENSE00001635523 0 -
2 135810168 135810279 ENSE00001438557 0 -
2 135820190 135820639 ENSE00001732580 0 +
2 135821674 135823087 ENSE00001695040 0 +
2 135836529 135837180 NM_002299.2.1 0 -
2 135833110 135833190 NM_002299.2.2 0 -
2 135829592 135829676 NM_002299.2.3 0 -
2 135823900 135824003 NM_002299.2.4 0 -
2 135822019 135822098 NM_002299.2.5 0 -
2 135817340 135818061 NM_002299.2.6 0 -
2 135812310 135812956 NM_002299.2.7 0 -
2 135808442 135809993 NM_002299.2.8 0 -
2 135807127 135807396 NM_002299.2.9 0 -
2 135804766 135805057 NM_002299.2.10 0 -
2 135803929 135804128 NM_002299.2.11 0 -
2 135800606 135800809 NM_002299.2.12 0 -
2 135798028 135798138 NM_002299.2.13 0 -
2 135794640 135794775 NM_002299.2.14 0 -
2 135790657 135790881 NM_002299.2.15 0 -
2 135789570 135789798 NM_002299.2.16 0 -
2 135787844 135788544 NM_002299.2.17 0 -
2 135836529 135837169 CCDS2178.117 0 -
2 135833110 135833190 CCDS2178.116 0 -
2 135829592 135829676 CCDS2178.115 0 -
2 135823900 135824003 CCDS2178.114 0 -
2 135822019 135822098 CCDS2178.113 0 -
2 135817340 135818061 CCDS2178.112 0 -
2 135812310 135812956 CCDS2178.111 0 -
2 135808442 135809993 CCDS2178.110 0 -
2 135807127 135807396 CCDS2178.19 0 -
2 135804766 135805057 CCDS2178.18 0 -
2 135803929 135804128 CCDS2178.17 0 -
2 135800606 135800809 CCDS2178.16 0 -
2 135798028 135798138 CCDS2178.15 0 -
2 135794640 135794775 CCDS2178.14 0 -
2 135790657 135790881 CCDS2178.13 0 -
2 135789570 135789798 CCDS2178.12 0 -
2 135788323 135788544 CCDS2178.11 0 -
================================================
FILE: Chapter09/galaxy/api.py
================================================
import base64
from collections import defaultdict
#import ftplib
import getpass
import pprint
import warnings
from ruamel.yaml import YAML
from cryptography.fernet import Fernet
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
import pandas as pd
from bioblend.galaxy import GalaxyInstance
import paramiko
pp = pprint.PrettyPrinter()
warnings.filterwarnings('ignore')
# explain above, and warn
# Decrypt the configuration: the Fernet key is re-derived from an
# interactive password plus the stored salt (PBKDF2-HMAC-SHA256), the
# same scheme used by encrypt.py
with open('galaxy.yaml.enc', 'rb') as f:
    enc_conf = f.read()
password = getpass.getpass('Please enter the password:').encode()
with open('salt', 'rb') as f:
    salt = f.read()
kdf = PBKDF2HMAC(algorithm=hashes.SHA256(), length=32, salt=salt,
                 iterations=100000, backend=default_backend())
key = base64.urlsafe_b64encode(kdf.derive(password))
fernet = Fernet(key)
yaml = YAML()
conf = yaml.load(fernet.decrypt(enc_conf).decode())
server = conf['server']
rest_protocol = conf['rest_protocol']
rest_port = conf['rest_port']
user = conf['user']
password = conf['password']
sftp_port = int(conf['sftp_port'])
api_key = conf['api_key']
rest_url = '%s://%s:%d' % (rest_protocol, server, rest_port)
history_name = 'bioinf_example'
# Connect to Galaxy; TLS certificate verification is disabled
gi = GalaxyInstance(url=rest_url, key=api_key)
gi.verify = False
histories = gi.histories
# Delete any leftover history with our name, then create a fresh one
print('Existing histories:')
for history in histories.get_histories():
    if history['name'] == history_name:
        histories.delete_history(history['id'])
    print(' - ' + history['name'])
print()
ds_history = histories.create_history(history_name)
# Upload LCT.bed over SFTP, then register it from Galaxy's FTP directory
print('Uploading file')
transport = paramiko.Transport((server, sftp_port))
transport.connect(None, user, password)
sftp = paramiko.SFTPClient.from_transport(transport)
sftp.put('LCT.bed', 'LCT.bed')
sftp.close()
transport.close()
#ftp = ftplib.FTP()
#ftp.connect(host=server, port=ftp_port)
#ftp.login(user=user, passwd=password)
#f = open('LCT.bed', 'rb')
#ftp.set_pasv(True) # explain
##ftp.storbinary('STOR LCT.bed', f)
#s = ftp.transfercmd('STOR LCT.bed')
#s.send(f.read())
#s.close()
#f.close()
#ftp.close()
gi.tools.upload_from_ftp('LCT.bed', ds_history['id'])
print()
contents = gi.histories.show_history(ds_history['id'], contents=True)
def summarize_contents(contents):
    """Summarize Galaxy history items into a DataFrame with columns
    id, hid, name, type, extension (one row per item).

    Bug fix: the original used accented dict keys 'íd' and 'híd',
    producing misspelled DataFrame column names.
    """
    summary = defaultdict(list)
    for item in contents:
        summary['id'].append(item['id'])
        summary['hid'].append(item['hid'])
        summary['name'].append(item['name'])
        summary['type'].append(item['type'])
        summary['extension'].append(item['extension'])
    return pd.DataFrame.from_dict(summary)
print('History contents:')
pd_contents = summarize_contents(contents)
print(pd_contents)
print()
# The uploaded BED file is the first (and only) item in the new history
print('Metadata for LCT.bed')
bed_ds = contents[0]
pp.pprint(bed_ds)
print()
print('Metadata about all tools')
all_tools = gi.tools.get_tools()
pp.pprint(all_tools)
print()
# Locate the BED-to-GFF converter and inspect its interface
bed2gff = gi.tools.get_tools(name='Convert BED to GFF')[0]
print("Convert BED to GFF metadata:")
pp.pprint(gi.tools.show_tool(bed2gff['id'], io_details=True, link_details=True))
print()
def dataset_to_param(dataset):
    """Wrap a Galaxy dataset as a tool-input parameter (HDA source)."""
    return {'src': 'hda', 'id': dataset['id']}
# Run the BED-to-GFF converter on the uploaded dataset
tool_inputs = {
    'input1': dataset_to_param(bed_ds)
}
#hid!
gi.tools.run_tool(ds_history['id'], bed2gff['id'], tool_inputs=tool_inputs)
================================================
FILE: Chapter09/galaxy/encrypt.py
================================================
"Encrypt an YAML file with the script configuration"
import base64
import getpass
from io import StringIO
import os
from ruamel.yaml import YAML
from cryptography.fernet import Fernet
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
password = getpass.getpass('Please enter the password:').encode()
salt = os.urandom(16)
kdf = PBKDF2HMAC(algorithm=hashes.SHA256(), length=32, salt=salt,
iterations=100000, backend=default_backend())
key = base64.urlsafe_b64encode(kdf.derive(password))
fernet = Fernet(key)
with open('salt', 'wb') as w:
w.write(salt)
yaml = YAML()
content = yaml.load(open('galaxy.yaml', 'rt', encoding='utf-8'))
print(type(content), content)
output = StringIO()
yaml.dump(content, output)
print ('Encrypting:\n%s' % output.getvalue())
enc_output = fernet.encrypt(output.getvalue().encode())
with open('galaxy.yaml.enc', 'wb') as w:
w.write(enc_output)
print("Complete, the clear version should be deleted now")
================================================
FILE: Chapter09/galaxy/galaxy.yaml
================================================
rest_protocol: http
server: localhost
rest_port: 8080
sftp_port: 8022
user: admin@galaxy.org
password: password
api_key: fakekey
================================================
FILE: Chapter09/nextflow/.gitignore
================================================
data
pca.png
work
.nextflow*
report
================================================
FILE: Chapter09/nextflow/pipeline.nf
================================================
nextflow.enable.dsl=2
download_root = "https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3"
// Download the gzipped HapMap PLINK files (MAP + PED)
process plink_download {
    output:
    path 'hapmap.map.gz'//, emit: mapgz
    path 'hapmap.ped.gz'//, emit: pedgz
    script:
    """
    wget $download_root/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz -O hapmap.map.gz
    wget $download_root/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz -O hapmap.ped.gz
    """
}
// Uncompress the PLINK files and publish them into data/
process uncompress_plink {
    publishDir 'data', glob: '*', mode: 'copy'
    input:
    path mapgz
    path pedgz
    output:
    path 'hapmap.map'
    path 'hapmap.ped'
    script:
    """
    gzip -dc $mapgz > hapmap.map
    gzip -dc $pedgz > hapmap.ped
    """
}
//DSL 2 and docs
//conda
// Keep ~1% of SNPs (--thin 0.01) and drop variants with >10% missingness
process subsample_1p {
    input:
    path 'hapmap.map'
    path 'hapmap.ped'
    output:
    path 'hapmap1.map'
    path 'hapmap1.ped'
    script:
    """
    plink2 --pedmap hapmap --out hapmap1 --thin 0.01 --geno 0.1 --export ped
    """
}
// Run PCA with plink2, producing eigenvector/eigenvalue files
process plink_pca {
    input:
    path 'hapmap.map'
    path 'hapmap.ped'
    output:
    path 'hapmap.eigenvec'
    path 'hapmap.eigenval'
    script:
    """
    plink2 --pca --pedmap hapmap -out hapmap
    """
}
// Plot eigenvec columns 2 vs 3 with an inline Python script
process plot_pca {
    publishDir '.', glob: '*', mode: 'copy'
    input:
    path 'hapmap.eigenvec'
    path 'hapmap.eigenval'
    output:
    path 'pca.png'
    script:
    """
    #!/usr/bin/env python
    import pandas as pd
    pca_df = pd.read_csv('hapmap.eigenvec', sep='\t')
    ax = pca_df.plot.scatter(x=2, y=3, figsize=(16, 9))
    ax.figure.savefig('pca.png')
    """
}
/*
workflow {
    plink_download | uncompress_plink
}
*/
/*
workflow {
    ped_file = file('data/hapmap.ped')
    map_file = file('data/hapmap.map')
    if (!ped_file.exists() | !map_file.exists()) {
        plink_download | uncompress_plink
    }
}
*/
// Full pipeline: download only when the data is not already on disk,
// otherwise start from the published files in data/
workflow {
    ped_file = file('data/hapmap.ped')
    map_file = file('data/hapmap.map')
    if (!ped_file.exists() | !map_file.exists()) {
        plink_download | uncompress_plink | subsample_1p | plink_pca | plot_pca
    }
    else {
        subsample_1p(
            Channel.fromPath('data/hapmap.map'),
            Channel.fromPath('data/hapmap.ped')) | plink_pca | plot_pca
    }
}
================================================
FILE: Chapter09/snakemake/.gitignore
================================================
data
scratch
.snakemake
pca.png
dag.svg
bio.png
bio.svg
================================================
FILE: Chapter09/snakemake/Snakefile
================================================
# Final target: the PCA scatter plot
rule all:
    input:
        "pca.png"
# Download the gzipped HapMap PLINK files plus the population metadata
rule plink_download:
    output:
        map="scratch/hapmap.map.gz",
        ped="scratch/hapmap.ped.gz",
        rel="data/relationships.txt"
    shell:
        """
        python -c "import urllib.request; urllib.request.urlretrieve(
        'https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz',
        '{output.map}')"
        python -c "import urllib.request; urllib.request.urlretrieve(
        'https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz',
        '{output.ped}')"
        python -c "import urllib.request; urllib.request.urlretrieve(
        'https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/relationships_w_pops_041510.txt',
        '{output.rel}')"
        """
PLINKEXTS = ['ped', 'map']
# Wildcard rule: one uncompression job per PLINK extension
rule uncompress_plink:
    input:
        "scratch/hapmap.{plinkext}.gz"
    output:
        "data/hapmap.{plinkext}"
    shell:
        "gzip -dc {input} > {output}"
# Keep ~1% of SNPs (--thin 0.01) and drop variants with >10% missingness
rule subsample_1p:
    input:
        "data/hapmap.ped",
        "data/hapmap.map"
    output:
        "data/hapmap1.ped",
        "data/hapmap1.map"
    run:
        shell(f"plink2 --pedmap {input[0][:-4]} --out {output[0][:-4]} --thin 0.01 --geno 0.1 --export ped")
# snakemake and software requirements
# https://snakemake.readthedocs.io/en/stable/tutorial/additional_features.html#automatic-deployment-of-software-dependencies
#plink2 --pedmap data/hapmap --out data/hapmap10 --thin 0.1 --geno 0.1 --export ped
# Run PCA with plink2, producing eigenvector/eigenvalue files
rule plink_pca:
    input:
        "data/hapmap1.ped",
        "data/hapmap1.map"
    output:
        "data/hapmap1.eigenvec",
        "data/hapmap1.eigenval"
    shell:
        "plink2 --pca --pedmap data/hapmap1 -out data/hapmap1"
# Render the PCA scatter plot via the companion plot_pca.py script
rule plot_pca:
    input:
        "data/hapmap1.eigenvec",
        "data/hapmap1.eigenval"
    output:
        "pca.png"
    script:
        "./plot_pca.py"
================================================
FILE: Chapter09/snakemake/plot_pca.py
================================================
import pandas as pd
# Executed by Snakemake's `script:` directive; the injected global
# `snakemake` object carries the rule's input/output paths
eigen_fname = snakemake.input[0] if snakemake.input[0].endswith('eigenvec') else snakemake.input[1]
pca_df = pd.read_csv(eigen_fname, sep='\t')
# Columns 2 and 3 — presumably the first two principal components in the
# plink2 .eigenvec layout; confirm against the plink2 output format
ax = pca_df.plot.scatter(x=2, y=3, figsize=(16, 9))
ax.figure.savefig(snakemake.output[0])
================================================
FILE: Chapter10/Clustering.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.14.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# + jupyter={"outputs_hidden": false}
import os
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
from genomics.popgen.pca import plot
# -
# ## Meta-data load
# + jupyter={"outputs_hidden": false}
# Map 'family_id/individual_id' -> population from the HapMap metadata file
f = open('../Chapter06/relationships_w_pops_041510.txt')
ind_pop = {}
f.readline() # header
for l in f:
    toks = l.rstrip().split('\t')
    fam_id = toks[0]
    ind_id = toks[1]
    pop = toks[-1]
    ind_pop['/'.join([fam_id, ind_id])] = pop
f.close()
# -
# ## With scikit-learn
# + jupyter={"outputs_hidden": false}
# First pass over the PED file: count individuals and record their order
f = open('../Chapter06/hapmap10_auto_noofs_ld_12.ped')
ninds = 0
ind_order = []
for line in f:
    ninds += 1
    toks = line[:100].replace(' ', '\t').split('\t') # for speed
    fam_id = toks[0]
    ind_id = toks[1]
    ind_order.append('%s/%s' % (fam_id, ind_id))
# SNP count from the last line read: two alleles per SNP after the
# 6 fixed PED columns
nsnps = (len(line.replace(' ', '\t').split('\t')) - 6) // 2
print (nsnps)
f.close()
# + jupyter={"outputs_hidden": false}
# Second pass: load genotypes as one additive code per SNP
all_array = np.empty((ninds, nsnps), dtype=int)
f = open('../Chapter06/hapmap10_auto_noofs_ld_12.ped')
for ind, line in enumerate(f):
    snps = line.replace(' ', '\t').split('\t')[6:]
    for pos in range(len(snps) // 2):
        a1 = int(snps[2 * pos])
        # Bug fix: the second allele is at 2*pos + 1 — the original read
        # allele 1 twice, silently coding every heterozygote as a homozygote.
        a2 = int(snps[2 * pos + 1])
        # Alleles are coded 1/2, so a1 + a2 - 2 yields 0, 1 or 2
        my_code = a1 + a2 - 2
        all_array[ind, pos] = my_code
f.close()
#slow
# -
# Hold out the last individual for prediction; fit the PCA on the rest
predict_case = all_array[-1, :]
pca_array = all_array[:-1,:]
last_ind = ind_order[-1]
last_ind, ind_pop[last_ind]
my_pca = PCA(n_components=2)
my_pca.fit(pca_array)
trans = my_pca.transform(pca_array)
# Map each individual to its two PCA components for plotting
sc_ind_comp = {}
for i, ind_pca in enumerate(trans):
    sc_ind_comp[ind_order[i]] = ind_pca
plot.render_pca(sc_ind_comp, cluster=ind_pop)
# + jupyter={"outputs_hidden": false}
def plot_kmeans_pca(trans, kmeans):
    """Draw the KMeans decision regions behind the 2-D PCA scatter.

    trans:  (n, 2) array of PCA-transformed samples.
    kmeans: fitted scikit-learn KMeans model; its predictions color a
            0.5-spaced background grid spanning the data plus a margin.
    Returns the matplotlib Axes holding the plot.
    """
    pad = 1
    x_lo, x_hi = trans[:, 0].min() - pad, trans[:, 0].max() + pad
    y_lo, y_hi = trans[:, 1].min() - pad, trans[:, 1].max() + pad
    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, 0.5),
                                 np.arange(y_lo, y_hi, 0.5))
    flat_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    label_surface = kmeans.predict(flat_points).reshape(grid_x.shape)
    fig, ax = plt.subplots(1, 1, dpi=300)
    ax.imshow(
        label_surface, origin="lower", cmap=plt.cm.Pastel1,
        extent=(grid_x.min(), grid_x.max(), grid_y.min(), grid_y.max()),
    )
    ax.plot(trans[:, 0], trans[:, 1], "k.", markersize=2)
    ax.set_title("KMeans clustering of PCA data")
    ax.set_xlim(x_lo, x_hi)
    ax.set_ylim(y_lo, y_hi)
    ax.set_xticks(())
    ax.set_yticks(())
    return ax
# + jupyter={"outputs_hidden": false}
kmeans11 = KMeans(n_clusters=11).fit(trans)
plot_kmeans_pca(trans, kmeans11)
# -
kmeans4 = KMeans(n_clusters=4).fit(trans)
plot_kmeans_pca(trans, kmeans4)
# Project the held-out individual into PCA space and assign a cluster.
pca_predict = my_pca.transform([predict_case])
kmeans4.predict(pca_predict)
# Compare against the last individual that WAS in the training set
# (ind_order[-2], i.e. the last row of trans).
last_train = ind_order[-2]
last_train, ind_pop[last_train]
# Fix: index -1 is last_train's row; the original [0] showed the cluster
# of the FIRST training sample, not the one being compared.
kmeans4.predict(trans)[-1]
================================================
FILE: Chapter10/Decision_Tree.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.14.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# + jupyter={"outputs_hidden": false}
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import tree
# + [markdown] jupyter={"outputs_hidden": false}
# http://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28diagnostic%29
# + jupyter={"outputs_hidden": false}
# !wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
# !wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names
# -
# ## With scikit-learn
# + jupyter={"outputs_hidden": false}
# Drop rows containing missing values ('?') before loading the dataset.
# `with` closes both handles even if a write fails — the original leaked
# them on any error between open() and close().
with open('breast-cancer-wisconsin.data') as src, \
        open('clean.data', 'w') as dst:
    for line in src:
        if line.find('?') > -1:
            continue
        dst.write(line)
# + jupyter={"outputs_hidden": false}
# Column names for the UCI breast-cancer-wisconsin dataset (no header row
# in the file); the sample id becomes the index.
column_names = [
    'sample_id', 'clump_thickness', 'uniformity_cell_size',
    # fixed: 'uniformity_cell shape' had a space where the underscore belongs
    'uniformity_cell_shape', 'marginal_adhesion',
    'single_epithelial_cell_size', 'bare_nuclei',
    'bland_chromatin', 'normal_nucleoli', 'mitoses',
    'class'
]
samples = pd.read_csv('clean.data', header=None, names=column_names, index_col=0)
samples
# + jupyter={"outputs_hidden": false}
# Features are every column but the last; the label column maps the
# dataset's 2 (benign) / 4 (malignant) codes onto 0 / 1.
training_input = samples.iloc[:,:-1]
target = samples.iloc[:,-1].apply(lambda x: 0 if x == 2 else 1)
# + jupyter={"outputs_hidden": false}
clf = tree.DecisionTreeClassifier(max_depth=3)
# + jupyter={"outputs_hidden": false}
clf.fit(training_input, target)
# + jupyter={"outputs_hidden": false}
# Relative importance (%) of each feature in the fitted tree.
importances = pd.Series(
    clf.feature_importances_ * 100,
    index=training_input.columns).sort_values(ascending=False)
importances
# + jupyter={"outputs_hidden": false}
# Training-set accuracy (%) — optimistic, since nothing is held out.
100 * clf.score(training_input, target)
# + jupyter={"outputs_hidden": false}
fig, ax = plt.subplots(1, dpi=300)
tree.plot_tree(clf,ax=ax, feature_names=training_input.columns, class_names=['Benign', 'Malignant'])
# -
================================================
FILE: Chapter10/PCA.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.14.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# + jupyter={"outputs_hidden": false}
import os
from sklearn.decomposition import PCA
import numpy as np
from genomics.popgen.pca import plot
# -
# ## Meta-data load
# + jupyter={"outputs_hidden": false}
# Build a {family_id/individual_id: population} lookup from the HapMap
# relationships metadata (tab-separated, one header line).
f = open('../Chapter06/relationships_w_pops_041510.txt')
ind_pop = {}
f.readline() # header
for l in f:
    toks = l.rstrip().split('\t')
    fam_id = toks[0]
    ind_id = toks[1]
    pop = toks[-1]
    ind_pop['/'.join([fam_id, ind_id])] = pop
f.close()
# -
# ## With scikit-learn
# + jupyter={"outputs_hidden": false}
# First pass over the PED file: count individuals, remember their order,
# and derive the SNP count from the width of the (last) line — 6 leading
# metadata columns, then two allele tokens per SNP.
f = open('../Chapter06/hapmap10_auto_noofs_ld_12.ped')
ninds = 0
ind_order = []
for line in f:
    ninds += 1
    toks = line[:100].replace(' ', '\t').split('\t') # for speed
    fam_id = toks[0]
    ind_id = toks[1]
    ind_order.append('%s/%s' % (fam_id, ind_id))
nsnps = (len(line.replace(' ', '\t').split('\t')) - 6) // 2
f.close()
# + jupyter={"outputs_hidden": false}
# Second pass: encode each genotype as 0/1/2 (count of allele '2').
pca_array = np.empty((ninds, nsnps), dtype=int)
print(pca_array.shape)
with open('../Chapter06/hapmap10_auto_noofs_ld_12.ped') as ped:
    for ind, line in enumerate(ped):
        snps = line.replace(' ', '\t').split('\t')[6:]
        for pos in range(len(snps) // 2):
            a1 = int(snps[2 * pos])
            # Fix: read the SECOND allele of the pair; the original fetched
            # snps[2 * pos] twice, coding every genotype as homozygous.
            a2 = int(snps[2 * pos + 1])
            my_code = a1 + a2 - 2  # alleles are coded 1/2 -> codes 0..2
            pca_array[ind, pos] = my_code
# + jupyter={"outputs_hidden": false}
# Fit an 8-component PCA on the full genotype matrix.
my_pca = PCA(n_components=8)
my_pca.fit(pca_array)
trans = my_pca.transform(pca_array)
#Memory required
# + jupyter={"outputs_hidden": false}
# Map each individual ID to its 8-D coordinates and plot all components.
sc_ind_comp = {}
for i, ind_pca in enumerate(trans):
    sc_ind_comp[ind_order[i]] = ind_pca
plot.render_pca_eight(sc_ind_comp, cluster=ind_pop)
# + jupyter={"outputs_hidden": false}
================================================
FILE: Chapter10/Random_Forest.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.14.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# + jupyter={"outputs_hidden": false}
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
# + [markdown] jupyter={"outputs_hidden": false}
# http://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28diagnostic%29
# + jupyter={"outputs_hidden": false}
# !wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
# !wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names
# -
# ## With scikit-learn
# + jupyter={"outputs_hidden": false}
# Drop rows containing missing values ('?') before loading the dataset.
# Context managers close both files even on error; the original leaked
# the handles if any write raised.
with open('breast-cancer-wisconsin.data') as src, \
        open('clean.data', 'w') as dst:
    for line in src:
        if line.find('?') > -1:
            continue
        dst.write(line)
# + jupyter={"outputs_hidden": false}
# Column names for the UCI breast-cancer-wisconsin dataset (the file has
# no header row); the sample id becomes the index.
column_names = [
    'sample_id', 'clump_thickness', 'uniformity_cell_size',
    # fixed: 'uniformity_cell shape' had a space where the underscore belongs
    'uniformity_cell_shape', 'marginal_adhesion',
    'single_epithelial_cell_size', 'bare_nuclei',
    'bland_chromatin', 'normal_nucleoli', 'mitoses',
    'class'
]
samples = pd.read_csv('clean.data', header=None, names=column_names, index_col=0)
samples
# + jupyter={"outputs_hidden": false}
# Features are every column but the last; labels keep the dataset's raw
# 2 (benign) / 4 (malignant) codes. ('trainning' is a long-standing typo
# kept here because the name is reused throughout this script.)
trainning_input = samples.iloc[:,:-1]
target = samples.iloc[:,-1]
# + jupyter={"outputs_hidden": false}
clf = RandomForestClassifier(max_depth=3, n_estimators=200)
# + jupyter={"outputs_hidden": false}
clf.fit(trainning_input, target)
# + jupyter={"outputs_hidden": false}
# Percent importance of each feature, averaged over the forest's trees.
importances = pd.Series(
    clf.feature_importances_ * 100,
    index=trainning_input.columns).sort_values(ascending=False)
importances
# -
# Accuracy on the data the forest was trained on (optimistic).
100 * clf.score(trainning_input, target)
# Re-train with progressively smaller training fractions to see how the
# held-out accuracy degrades; prints "train-fraction  test-accuracy".
# Results vary run to run: train_test_split is not seeded.
for test_size in [0.01, 0.1, 0.2, 0.5, 0.8, 0.9, 0.99]:
    X_train, X_test, y_train, y_test = train_test_split(
        trainning_input, target, test_size=test_size)
    tclf = RandomForestClassifier(max_depth=3)
    tclf.fit(X_train, y_train)
    score = tclf.score(X_test, y_test)
    print(f'{1 - test_size:.1%} {score:.2%}')
# Random number generator
================================================
FILE: Chapter11/.gitignore
================================================
dask-worker-space
data
mydask.png
x.png
================================================
FILE: Chapter11/Dask_Distributed.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
#import dask
#from dask.base import get_scheduler
#import dask.array as da
#
#mosquito = da.from_zarr('data/AG1000G-AO/2L/calldata/GT')
#print(get_scheduler(collections=[mosquito]).__module__)
# +
import zarr
import dask.dataframe as dd
from dask.distributed import Client
# Start a local distributed scheduler; use the commented address form to
# attach to an externally launched scheduler instead.
#client = Client('127.0.0.1:8786')
client = Client()
client
# +
import numpy as np
import dask.array as da
# Lazily open the genotype calls as a dask array backed by zarr.
mosquito = da.from_zarr('data/AG1000G-AO/2L/calldata/GT')
# -
mosquito
mosquito.shape[0]
# Re-chunk along the variants axis only (8 chunks); samples and ploidy
# stay whole so each task sees all 81 samples.
mosquito = mosquito.rechunk((mosquito.shape[0]//8, 81, 2))
# persist() materializes the rechunked array in worker memory.
mosquito = mosquito.persist()
mosquito.visualize()
mosquito
mosquito.chunks
def calc_stats(my_chunk):
    """Count missing first-allele calls per variant in one block.

    NOTE(review): with da.blockwise, dimensions present in the input
    index ('jk') but absent from the output ('i') appear to arrive as
    nested lists of blocks, so my_chunk[0][0] should be the actual
    (variants, samples, ploidy) ndarray — confirm against the dask
    blockwise documentation.
    """
    num_miss = np.sum(np.equal(my_chunk[0][0][:,:,0], -1), axis=1)
    return num_miss
# Map calc_stats over every chunk: 'ijk' in, 'i' out — samples and ploidy
# are collapsed, leaving one number per variant.
stats = da.blockwise(calc_stats, 'i', mosquito, 'ijk', dtype=np.uint8)
stats.visualize()
stat_results = stats.compute()
stat_results
================================================
FILE: Chapter11/Dask_Intro.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
import zarr
mosquito = zarr.open('data/AG1000G-AO/2L/calldata/GT')
mosquito
# Rewrite the genotype calls with much larger chunks along the variant
# axis (4 chunks total) so downstream dask tasks are coarser.
zarr.array(mosquito, chunks=(1 + 48525747 // 4, 81, 2), store='data/rechunk')
mosquito = zarr.open('data/rechunk')
mosquito.chunks
# +
import numpy as np
import dask.array as da
mosquito = da.from_zarr('data/rechunk')
#mosquito = da.from_zarr('data/AG1000G-AO/2L/calldata/GT')
# ^^^ load array
# -
mosquito
# Printing the lazy object shows metadata only; compute() pulls values.
print(mosquito[0])
mosquito[0].compute()
mosquito.visualize(rankdir='TB')
def calc_stats(variant):
    """Number of missing genotypes in one flattened variant row.

    `variant` is a 1-D array of interleaved allele pairs; a missing call
    is the pair (-1, -1), so halving the count of -1 entries yields the
    number of missing genotypes.
    """
    pairs = variant.reshape(variant.shape[0] // 2, 2)
    missing_alleles = np.count_nonzero(pairs == -1)
    return missing_alleles // 2
# Flatten samples x ploidy into a single axis so each variant is one
# 1-D row that calc_stats can consume.
mosquito_2d = mosquito.reshape(mosquito.shape[0], mosquito.shape[1] * mosquito.shape[2])
mosquito_2d.visualize(rankdir='TB')
mosquito_2d
# Apply the per-variant statistic to the first 10M positions only.
max_pos = 10000000
stats = da.apply_along_axis(
    calc_stats, 1, mosquito_2d[:max_pos,:],
    shape=(max_pos,), dtype=np.int64)
stats.visualize('x.png',rankdir='TB')
a = stats.compute()
a
================================================
FILE: Chapter11/MP_intro.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# # Downloading data
# https://malariagen.github.io/vector-data/ag3/download.html
# !mkdir -p data/AG1000G-AO/
# !gsutil -m rsync -r \
# -x '.*/calldata/(AD|GQ|MQ)/.*' \
# gs://vo_agam_release/v3/snp_genotypes/all/AG1000G-AO/ \
# data/AG1000G-AO/ > /dev/null
# !mkdir -p data/metadata/
# !gsutil -m rsync -r gs://vo_agam_release/v3/metadata/ data/metadata/
# # Exploring the genotype data
# +
import numpy as np
import zarr
# Open the downloaded zarr hierarchy and grab the 2L genotype calls.
mosquito = zarr.open('data/AG1000G-AO')
print(mosquito.tree())
gt_2l = mosquito['/2L/calldata/GT']
gt_2l.info
dir(gt_2l)
gt_2l.shape[0]
# +
from math import ceil
from multiprocessing import Pool
def calc_stats(my_chunk):
    """Per-variant summary counts for one genotype chunk.

    my_chunk is a (variants, samples, 2) array of allele calls where -1
    marks a missing call. Returns three per-variant arrays: samples with
    a missing first allele, ancestral homozygotes (0/0), heterozygotes.
    """
    first_allele = my_chunk[:, :, 0]
    second_allele = my_chunk[:, :, 1]
    num_miss = (first_allele == -1).sum(axis=1)
    is_anc_hom = np.logical_and(first_allele == 0,
                                first_allele == second_allele)
    num_anc_hom = is_anc_hom.sum(axis=1)
    num_het = (first_allele != second_allele).sum(axis=1)
    return num_miss, num_anc_hom, num_het
# Split the variant axis into chunk-aligned (start, end) intervals so
# each worker reads exactly one zarr chunk.
chunk_pos_size = gt_2l.chunks[0]
max_pos = gt_2l.shape[0]
intervals = []
for chunk_pos in range(ceil(max_pos / chunk_pos_size)):
    start_pos = chunk_pos * chunk_pos_size
    # zarr clamps slices past the end, so max_pos + 1 is harmless here
    end_pos = min(max_pos + 1, (chunk_pos + 1) * chunk_pos_size)
    intervals.append((start_pos, end_pos))

def compute_interval(interval):
    """Load one chunk of gt_2l and summarize it (runs in a worker process;
    gt_2l is inherited from the parent via fork)."""
    start_pos, end_pos = interval
    my_chunk = gt_2l[start_pos:end_pos, :, :]
    num_samples = my_chunk.shape[1]
    num_miss, num_anc_hom, num_het = calc_stats(my_chunk)
    # variants genotyped in every sample
    chunk_complete_data = np.sum(np.equal(num_miss, 0))
    # variants where ancestral homozygotes outnumber heterozygotes
    chunk_more_anc_hom = np.sum(num_anc_hom > num_het)
    return chunk_complete_data, chunk_more_anc_hom

# Fan the intervals out over a process pool and aggregate the per-chunk
# partial sums.
with Pool() as p:
    print(p)
    chunk_returns = p.map(compute_interval, intervals)
    complete_data = sum(map(lambda x: x[0], chunk_returns))
    more_anc_hom = sum(map(lambda x: x[1], chunk_returns))
    print(complete_data, more_anc_hom)
# -
================================================
FILE: Chapter11/Zarr_Intro.py
================================================
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# # Downloading data
# https://malariagen.github.io/vector-data/ag3/download.html
# !mkdir -p data/AG1000G-AO/
# !gsutil -m rsync -r \
# -x '.*/calldata/(AD|GQ|MQ)/.*' \
# gs://vo_agam_release/v3/snp_genotypes/all/AG1000G-AO/ \
# data/AG1000G-AO/ > /dev/null
# !mkdir -p data/metadata/
# !gsutil -m rsync -r gs://vo_agam_release/v3/metadata/ data/metadata/
# # Exploring the genotype data
# +
import numpy as np
import zarr
# Open the zarr root and show the group/array hierarchy.
mosquito = zarr.open('data/AG1000G-AO')
print(mosquito.tree())
# -
mosquito['samples']
np.array(mosquito['samples'])
gt_2l = mosquito['/2L/calldata/GT']
gt_2l
gt_2l.info
# Random access to a single variant is cheap; whole-array reads are not.
gt_2l[400000,:,:]
# +
# Do not do np.array(gt_2l)
# -
dir(gt_2l)
gt_2l.shape[0]
# +
from math import ceil
# Iterate chunk-aligned slices so each zarr read touches a single chunk.
chunk_pos_size = gt_2l.chunks[0]
max_pos = gt_2l.shape[0]
def calc_stats(my_chunk):
    """Summaries for one (variants, samples, 2) chunk of allele calls.

    Returns three per-variant counts across samples: missing first
    alleles (coded -1), ancestral homozygotes (0/0), and heterozygotes.
    """
    a0 = my_chunk[:, :, 0]
    a1 = my_chunk[:, :, 1]
    num_miss = np.count_nonzero(a0 == -1, axis=1)
    num_anc_hom = np.count_nonzero((a0 == 0) & (a0 == a1), axis=1)
    num_het = np.count_nonzero(a0 != a1, axis=1)
    return num_miss, num_anc_hom, num_het
# Aggregate over all chunks sequentially — the single-process baseline
# that MP_intro.py later parallelizes.
complete_data = 0
more_anc_hom = 0
total_pos = 0
for chunk_pos in range(ceil(max_pos / chunk_pos_size)):
    start_pos = chunk_pos * chunk_pos_size
    # zarr clamps slices past the end, so max_pos + 1 is harmless here
    end_pos = min(max_pos + 1, (chunk_pos + 1) * chunk_pos_size)
    my_chunk = gt_2l[start_pos:end_pos, :, :]
    #print(start_pos, end_pos, my_chunk.shape)
    num_samples = my_chunk.shape[1]
    num_miss, num_anc_hom, num_het = calc_stats(my_chunk)
    # variants with a call in every sample
    chunk_complete_data = np.sum(np.equal(num_miss, 0))
    #print(end_pos - start_pos, my_chunk.shape, num_anc_hom.shape, num_het.shape)
    # variants where ancestral homozygotes outnumber heterozygotes
    chunk_more_anc_hom = np.sum(num_anc_hom > num_het)
    print(np.sum(num_anc_hom > num_het))
    complete_data += chunk_complete_data
    more_anc_hom += chunk_more_anc_hom
    total_pos += (end_pos - start_pos)
print(complete_data, more_anc_hom, total_pos)
# -
================================================
FILE: Chapter12/Builtin.py
================================================
import functools

@functools.cache
def fibo(n):
    """Return the n-th Fibonacci number, memoizing every sub-result."""
    if n == 0:
        return 0
    if n == 1:
        return 1
    return fibo(n - 1) + fibo(n - 2)

# Fill the cache bottom-up first: a cold recursive fibo(1000) stacks
# ~1000 frames and trips CPython's default recursion limit.
for _i in range(1001):
    fibo(_i)
fibo(1000)
def gene_min_reads(source, min_reads):
    """Yield the names of genes whose read count is at least min_reads.

    source: mapping of gene name -> read count.
    Returns a lazy iterator, like the original map/filter pipeline, but
    as a single generator expression instead of nested lambdas.
    """
    return (gene for gene, count in source.items() if count >= min_reads)

list(gene_min_reads({'LCT': 10, 'MRAP2': 1}, 2))

def multiplication(x, y):
    """Multiply two values (a def, not an assigned lambda — PEP 8 E731)."""
    return x * y

double = functools.partial(multiplication, 2)
double(3)
================================================
FILE: Chapter12/Lazy.py
================================================
import pandas as pd
def load(file_name):
    """Read a gene-count CSV and return it as a {gene: count} dict."""
    return pd.read_csv(file_name, index_col='gene')['count'].to_dict()
def get_min_reads(all_data, min_reads):
    """Filter a gene->count dict down to entries with count >= min_reads."""
    passing = {}
    for gene, count in all_data.items():
        if count >= min_reads:
            passing[gene] = count
    return passing
def has_min_observations(subset_data, min_observations):
    """True when the filtered data has at least min_observations entries."""
    size = len(subset_data)
    return size >= min_observations
# Eager pipeline: the whole file is loaded and filtered before the size
# check — every intermediate result is fully materialized in memory.
print(has_min_observations(
    get_min_reads(
        load('my_genes.csv'), 4
    ), 3))
def get_rec(file_name):
    """Lazily yield (gene, count) pairs from a gene-count CSV."""
    with open(file_name) as handle:
        next(handle)  # skip the header row
        for row in handle:
            fields = row.strip().split(',')
            yield fields[0], int(fields[1])
def gene_min_reads(source, min_reads):
    """Yield gene names from (gene, count) pairs meeting the read cutoff."""
    yield from (gene for gene, count in source if count >= min_reads)
def gene_min_observations(subset_source, min_observations):
    """Consume genes lazily, returning True as soon as min_observations
    items have been seen (without draining the rest of the source).

    Fix: the original compared with == only after incrementing, so a
    threshold of 0 (trivially satisfied) was never reported as met.
    """
    if min_observations <= 0:
        return True
    seen = 0
    for _gene in subset_source:
        seen += 1
        if seen >= min_observations:
            return True
    return False
# Lazy pipeline: generators stream records one at a time, and the search
# stops as soon as enough qualifying genes have been seen.
print(gene_min_observations(
    gene_min_reads(
        get_rec('my_genes.csv'), 4
    ), 2))
================================================
FILE: Chapter12/Mutability.py
================================================
import shutil
import pandas as pd
def restore_db(file_name):
    """Overwrite file_name with its pristine '<file_name>.base' backup."""
    backup = f'{file_name}.base'
    shutil.copyfile(backup, file_name)
def load(file_name):
    """Load the gene-count CSV into a plain {gene: count} dictionary."""
    frame = pd.read_csv(file_name)
    return dict(zip(frame['gene'], frame['count']))
def save(dict_db, file_name):
    """Persist a {gene: count} dict as a two-column gene,count CSV."""
    as_series = pd.Series(dict_db, name='count')
    as_series.rename_axis('gene').to_csv(file_name)
def add_sample_dict(dict_db, gene_list):
    """Increment each listed gene's count in-place (mutates dict_db).

    Fix: the original looked up dict_db.get(0) — integer key 0, no
    default — which returns None for these string-keyed dicts, so the
    `+ 1` raised TypeError; the intended lookup is .get(gene, 0).
    """
    for gene in gene_list:
        dict_db[gene] = dict_db.get(gene, 0) + 1
def add_sample_new_dict(dict_db, gene_list):
    """Return a NEW dict with each listed gene's count bumped by one,
    leaving the input untouched (pure-function style).

    Fix: .get(gene, 0) replaces the original .get(0), which returned
    None and made the increment raise TypeError.
    """
    my_dict_db = dict(dict_db)  # next recipe
    for gene in gene_list:
        my_dict_db[gene] = my_dict_db.get(gene, 0) + 1
    return my_dict_db
# Contrast: add_sample_dict mutates gene_count in place, while
# add_sample_new_dict leaves it alone and returns an updated copy.
gene_count = load('my_genes.csv')
add_sample_dict(gene_count, ['DEPP'])
new_gene_count = add_sample_new_dict(gene_count, ['DEPP'])
================================================
FILE: Chapter12/Persistence1.py
================================================
import shutil
import pandas as pd
def restore_db(file_name):
    """Reset the working CSV from its pristine '<name>.base' snapshot."""
    shutil.copyfile(f'{file_name}.base', file_name)

def load(file_name):
    """Return the gene-count CSV as a {gene: count} dict."""
    return pd.read_csv(file_name, index_col='gene')['count'].to_dict()

def save(dict_db, file_name):
    """Write a {gene: count} dict back out as a gene,count CSV."""
    pd.Series(dict_db, name='count').rename_axis('gene').to_csv(file_name)
def add_sample_csv(gene_list):
    """Load the CSV database, bump each listed gene's count, write it back.

    Fix: the original used gene_count.get(0) (integer key, no default),
    which returns None for this string-keyed dict and makes `None + 1`
    raise TypeError; the intended lookup is .get(gene, 0).
    """
    gene_count = load('my_genes.csv')
    for gene in gene_list:
        gene_count[gene] = gene_count.get(gene, 0) + 1
    save(gene_count, 'my_genes.csv')
# Start from the pristine CSV, then accumulate three batches of samples;
# every call re-reads and re-writes my_genes.csv (state lives on disk).
restore_db('my_genes.csv')
add_sample_csv(['MC4R', 'TYR'])
add_sample_csv(['LCT', 'HLA-A'])
add_sample_csv(['HLA-B', 'HLA-C'])
================================================
FILE: Chapter12/Persistence2.py
================================================
import shutil
import pandas as pd
def restore_db(file_name):
    """Recover the mutable CSV from its read-only '.base' copy."""
    pristine = f'{file_name}.base'
    shutil.copyfile(pristine, file_name)

def load(file_name):
    """Parse the gene-count CSV into a {gene: count} dict."""
    table = pd.read_csv(file_name)
    return dict(zip(table['gene'], table['count']))

def save(dict_db, file_name):
    """Serialize a {gene: count} dict as a two-column CSV."""
    as_series = pd.Series(dict_db)
    as_series.to_csv(file_name, index_label='gene', header=['count'])
def add_sample_new_dict(dict_db, gene_list):
    """Return a NEW dict with each listed gene's count bumped by one;
    the input dict is not modified.

    Fix: the original read dict_db.get(0) — integer key, no default —
    which is None for these string-keyed dicts and crashes on None + 1;
    the intended lookup is .get(gene, 0).
    """
    my_dict_db = dict(dict_db)  # next recipe
    for gene in gene_list:
        my_dict_db[gene] = my_dict_db.get(gene, 0) + 1
    return my_dict_db
# Functional variant: thread the dict through pure updates and persist
# once at the end, instead of round-tripping the CSV on every call.
restore_db('my_genes.csv')
gene_count = load('my_genes.csv')
gene_count = add_sample_new_dict(gene_count, ['MC4R', 'TYR'])
gene_count = add_sample_new_dict(gene_count, ['LCT', 'HLA-A'])
gene_count = add_sample_new_dict(gene_count, ['HLA-B', 'HLA-C'])
save(gene_count, 'my_genes.csv')
================================================
FILE: Chapter12/Pure.py
================================================
import shutil
import pandas as pd
def restore_db(file_name):
    """Put back the untouched '<file_name>.base' copy of the database."""
    shutil.copyfile('%s.base' % file_name, file_name)

def load(file_name):
    """Read the gene-count CSV into a {gene: count} dict."""
    counts = pd.read_csv(file_name).set_index('gene')['count']
    return counts.to_dict()

def save(dict_db, file_name):
    """Dump a {gene: count} dict to a gene,count CSV."""
    pd.Series(dict_db, name='count').rename_axis('gene').to_csv(file_name)
def add_sample_csv(gene_list):
    """Load the CSV database, bump each listed gene's count, write it back
    (impure: hidden file I/O on a hard-coded path).

    Fix: .get(gene, 0) replaces .get(0), which looked up integer key 0,
    returned None, and made the `+ 1` raise TypeError.
    """
    gene_count = load('my_genes.csv')
    for gene in gene_list:
        gene_count[gene] = gene_count.get(gene, 0) + 1
    save(gene_count, 'my_genes.csv')
def add_sample_global_dict(gene_list):
    """Bump counts in the module-global gene_count dict (impure: mutates
    hidden global state).

    Fix: .get(gene, 0) replaces .get(0), which returned None for these
    string-keyed dicts and made the increment raise TypeError.
    """
    global gene_count
    for gene in gene_list:
        gene_count[gene] = gene_count.get(gene, 0) + 1
def add_sample_dict(dict_db, gene_list):
    """Increment each listed gene's count in-place (mutates the argument).

    Fix: the original read dict_db.get(0) — key 0, no default — which is
    None here and crashes on None + 1; the intent is .get(gene, 0).
    """
    for gene in gene_list:
        dict_db[gene] = dict_db.get(gene, 0) + 1
# Demo of the different flavors of impurity: add_sample_csv round-trips
# through the file, add_sample_dict mutates the in-memory dict passed in.
gene_count = load('my_genes.csv')
add_sample_csv(['MC4R', 'TYR'])
add_sample_dict(gene_count, ['MC4R', 'TYR'])
save(gene_count, 'my_genes.csv')
================================================
FILE: Chapter12/Recursion.py
================================================
def fibo_iter(n):
    """Iteratively compute the n-th Fibonacci number in O(n) time."""
    if n < 2:
        return n
    prev, curr = 0, 1
    for _step in range(n - 1):
        prev, curr = curr, prev + curr
    return curr
def fibo_naive(n):
    """Textbook doubly-recursive Fibonacci — exponential running time."""
    if n in (0, 1):
        return n
    return fibo_naive(n - 2) + fibo_naive(n - 1)
fibo_iter(0)
fibo_iter(1)
fibo_iter(2)
fibo_iter(3)
fibo_iter(4)
fibo_iter(5)
fibo_iter(6)
# Fix: the naive doubly-recursive version is O(phi**n); fibo_naive(1000)
# would never finish, so the call stays commented out on purpose.
# fibo_naive(1000)
def factorial(n):
    """Return n! via linear recursion.

    Fix: the original base case only caught n == 1, so factorial(0)
    recursed without end; n <= 1 also covers 0 (0! == 1).
    """
    if n <= 1:
        return 1
    return n * factorial(n - 1)
factorial(5)
# Fix: factorial(20000) stacks ~20000 frames and raises RecursionError
# under CPython's default limit (~1000) — which is this demo's point.
# Raise sys.setrecursionlimit first if you actually want the value.
# factorial(20000)
================================================
FILE: Chapter12/Tools.py
================================================
import functools
def fibo_iter(n):
    """Iterative Fibonacci in O(n) time.

    Fix: the original seeded both trailing values with 1 and looped from
    3, leaving `result` unbound for n == 2 (NameError); seeding with
    fib(0)=0, fib(1)=1 and looping from 2 covers every n >= 0.
    """
    if n == 0:
        return 0
    if n == 1:
        return 1
    second_last, last = 0, 1
    for _i in range(2, n + 1):
        second_last, last = last, second_last + last
    return last
def fibo_naive(n):
    """Plain doubly-recursive Fibonacci, shown only to contrast with the
    cached and iterative versions (exponential time)."""
    if n in (0, 1):
        return n
    return fibo_naive(n - 2) + fibo_naive(n - 1)
@functools.lru_cache
def fibo(n):
    """Recursive Fibonacci with automatic memoization via lru_cache."""
    if n in (0, 1):
        return n
    return fibo(n - 2) + fibo(n - 1)
# Fix: bare `time ...` lines are IPython line magics and a SyntaxError in
# a plain .py file; the timings are reproduced with stdlib timeit instead.
import timeit
print('fibo_iter(100):', timeit.timeit(lambda: fibo_iter(100), number=1))
# fibo_naive(1000) stays disabled: the naive recursion is exponential.
# Warm fibo's cache in shallow steps first — a cold recursive fibo(1000)
# would exceed CPython's default recursion limit.
for _k in range(0, 1001, 500):
    fibo(_k)
print('fibo(1000):', timeit.timeit(lambda: fibo(1000), number=1))
def factorial(n):
    """Return n! recursively.

    Fix: base case widened to n <= 1 so factorial(0) returns 1 instead
    of recursing forever.
    """
    if n <= 1:
        return 1
    return n * factorial(n - 1)

# Fix: factorial(20000) needs ~20000 stack frames — far past CPython's
# default recursion limit — and raised RecursionError at import; bump
# sys.setrecursionlimit before uncommenting.
# factorial(20000)
================================================
FILE: Chapter12/my_genes.csv
================================================
gene,count
LCT,5
LEPR,4
MRAP2,1
================================================
FILE: Chapter12/my_genes.csv.base
================================================
gene,count
LCT,5
LEPR,4
MRAP2,1
================================================
FILE: Datasets.py
================================================
# # Datasets for the book
#
# Here we provide links to the datasets used in the book.
#
# Important Notes:
#
# 1. Note that these datasets are provided on external servers by third parties
# # Python and the Surrounding Software Ecology
#
# ## R sections
#
# http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/20130502.phase3.sequence.index
#
# # PDB
#
#
# ## Parsing mmCIF files with Biopython
#
# [1TUP.cif](http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=cif&compression=NO&structureId=1TUP)
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2021 Packt
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# Bioinformatics-with-Python-Cookbook-third-edition
<a href="https://www.packtpub.com/product/bioinformatics-with-python-cookbook-third-edition/9781803236421"><img src="https://static.packt-cdn.com/products/9781803236421/cover/smaller" alt="Bioinformatics with Python Cookbook - Third Edition" height="256px" align="right"></a>
This is the code repository for [Bioinformatics with Python Cookbook - Third Edition](https://www.packtpub.com/product/bioinformatics-with-python-cookbook-third-edition/9781803236421), published by Packt.
**Use modern Python libraries and applications to solve real-world computational biology problems**
## What is this book about?
Bioinformatics is an active research field that uses a range of simple-to-advanced computations to extract valuable information from biological data, and this book will show you how to manage these tasks using Python.
This updated third edition of the Bioinformatics with Python Cookbook begins with a quick overview of the various tools and libraries in the Python ecosystem that will help you convert, analyze, and visualize biological datasets. Next, you'll cover key techniques for next-generation sequencing, single-cell analysis, genomics, metagenomics, population genetics, phylogenetics, and proteomics with the help of real-world examples. You'll learn how to work with important pipeline systems, such as Galaxy servers and Snakemake, and understand the various modules in Python for functional and asynchronous programming. This book will also help you explore topics such as SNP discovery using statistical approaches under high-performance computing frameworks, including Dask and Spark. In addition to this, you’ll explore the application of machine learning algorithms in bioinformatics.
By the end of this bioinformatics Python book, you'll be equipped with the knowledge you need to implement the latest programming techniques and frameworks, empowering you to deal with bioinformatics data on every scale.
This book covers the following exciting features:
* Become well-versed with data processing libraries such as NumPy, pandas, arrow, and zarr in the context of bioinformatic analysis
* Interact with genomic databases
* Solve real-world problems in the fields of population genetics, phylogenetics, and proteomics
* Build bioinformatics pipelines using a Galaxy server and Snakemake
* Work with functools and itertools for functional programming
* Perform parallel processing with Dask on biological data
* Explore principal component analysis (PCA) techniques with scikit-learn
If you feel this book is for you, get your [copy](https://www.amazon.in/Bioinformatics-Python-Cookbook-bioinformatics-computational/dp/1789344697/ref=sr_1_2?keywords=Bioinformatics+with+Python+Cookbook+-+Third+Edition&qid=1665382032&sr=8-2) today!
<a href="https://www.packtpub.com/product/bioinformatics-with-python-cookbook-third-edition/9781803236421"><img src="https://raw.githubusercontent.com/PacktPublishing/GitHub/master/GitHub.png" alt="https://www.packtpub.com/" border="5" /></a>
## Instructions and Navigations
All of the code is organized into folders.
The code will look like the following:
```
from
gitextract_jf5fqbmn/
├── .gitignore
├── Chapter01/
│ ├── Interfacing_R.py
│ ├── R_magic.py
│ ├── base_setup.sh
│ └── bioinformatics_base.txt
├── Chapter02/
│ ├── .gitignore
│ ├── Arrow.py
│ ├── Matplotlib.py
│ ├── NumPy.py
│ ├── Pandas_Basic.py
│ ├── Pandas_Join.py
│ └── Pandas_Memory.py
├── Chapter03/
│ ├── Accessing_Databases.py
│ ├── Basic_Sequence_Processing.py
│ ├── Filtering_SNPs.py
│ ├── LCT.bed
│ ├── Processing_BED_with_HTSeq.py
│ ├── Working_with_BAM.py
│ ├── Working_with_FASTQ.py
│ └── Working_with_VCF.py
├── Chapter04/
│ ├── 2L.py
│ ├── Exploration.py
│ ├── Mendel.py
│ ├── Preparation.py
│ ├── QIIME2_Metagenomics.py
│ └── samples.tsv
├── Chapter05/
│ ├── .gitignore
│ ├── Annotations.py
│ ├── Gene_Ontology.py
│ ├── Getting_Gene.py
│ ├── Low_Quality.py
│ ├── Orthology.py
│ └── Reference_Genome.py
├── Chapter06/
│ ├── .gitignore
│ ├── Admixture.py
│ ├── Data_Formats.py
│ ├── Exploratory_Analysis.py
│ ├── PCA.py
│ ├── Pop_Stats.py
│ └── Sgkit.py
├── Chapter07/
│ ├── .gitignore
│ ├── Alignment.py
│ ├── Comparison.py
│ ├── Exploration.py
│ ├── Reconstruction.py
│ ├── Selection.py
│ ├── Trees.py
│ └── Visualization.py
├── Chapter08/
│ ├── .gitignore
│ ├── Distance.py
│ ├── Intro.py
│ ├── Mass.py
│ ├── PDB.py
│ ├── Parser.py
│ ├── PyMol_Intro.py
│ ├── PyMol_Movie.py
│ ├── Stats.py
│ └── mmCIF.py
├── Chapter09/
│ ├── galaxy/
│ │ ├── .gitignore
│ │ ├── LCT.bed
│ │ ├── api.py
│ │ ├── encrypt.py
│ │ └── galaxy.yaml
│ ├── nextflow/
│ │ ├── .gitignore
│ │ └── pipeline.nf
│ └── snakemake/
│ ├── .gitignore
│ ├── Snakefile
│ └── plot_pca.py
├── Chapter10/
│ ├── Clustering.py
│ ├── Decision_Tree.py
│ ├── PCA.py
│ └── Random_Forest.py
├── Chapter11/
│ ├── .gitignore
│ ├── Dask_Distributed.py
│ ├── Dask_Intro.py
│ ├── MP_intro.py
│ └── Zarr_Intro.py
├── Chapter12/
│ ├── Builtin.py
│ ├── Lazy.py
│ ├── Mutability.py
│ ├── Persistence1.py
│ ├── Persistence2.py
│ ├── Pure.py
│ ├── Recursion.py
│ ├── Tools.py
│ ├── my_genes.csv
│ └── my_genes.csv.base
├── Datasets.py
├── LICENSE
├── README.md
├── Welcome.ipynb
└── docker/
├── Chapter01/
│ └── Dockerfile
└── main/
└── Dockerfile
SYMBOL INDEX (94 symbols across 37 files) FILE: Chapter02/NumPy.py function compute_frac (line 40) | def compute_frac(arr_1d): FILE: Chapter03/Filtering_SNPs.py function do_window (line 42) | def do_window(recs, size, fun): function apply_win_funs (line 57) | def apply_win_funs(wins, funs): function get_sample (line 93) | def get_sample(rec, annot, my_type): function get_sample_relation (line 119) | def get_sample_relation(recs, f1, f2): function plot_hz_rel (line 146) | def plot_hz_rel(dps, ax, ax2, name, rel): function get_variant_relation (line 181) | def get_variant_relation(recs, f1, f2): function eff_to_int (line 202) | def eff_to_int(rec): FILE: Chapter04/2L.py function insert_in_window (line 37) | def insert_in_window(row): FILE: Chapter04/Mendel.py function accept_entry (line 78) | def accept_entry(row): FILE: Chapter04/Preparation.py function compute_mendelian_errors (line 76) | def compute_mendelian_errors(mother, father, offspring): function acceptable_position_to_genotype (line 112) | def acceptable_position_to_genotype(): function acumulate (line 120) | def acumulate(fun): function get_family_indexes (line 129) | def get_family_indexes(samples_hdf5, cross_pd): function get_mendelian_errors (line 152) | def get_mendelian_errors(): function get_parent_indexes (line 196) | def get_parent_indexes(samples_hdf5, parents_pd): FILE: Chapter05/Gene_Ontology.py function do_request (line 26) | def do_request(server, service, *args, **kwargs): function get_upper (line 75) | def get_upper(go_id): FILE: Chapter05/Getting_Gene.py function get_sequence (line 45) | def get_sequence(chrom_seq, CDSs, strand): FILE: Chapter05/Orthology.py function do_request (line 20) | def do_request(server, service, *args, **kwargs): FILE: Chapter06/Admixture.py function load_Q (line 83) | def load_Q(fname, ind_order): FILE: Chapter06/Data_Formats.py function get_non_auto_SNPs (line 58) | def get_non_auto_SNPs(map_file, exclude_file): FILE: Chapter06/Pop_Stats.py function assign_cohort 
(line 46) | def assign_cohort(pops, pop_ind, sample_family_id, sample_id): FILE: Chapter07/Comparison.py function do_basic_popgen (line 65) | def do_basic_popgen(seqs): FILE: Chapter07/Exploration.py function get_ebov_2014_sources (line 22) | def get_ebov_2014_sources(): function get_other_ebov_sources (line 27) | def get_other_ebov_sources(): function get_other_ebolavirus_sources (line 33) | def get_other_ebolavirus_sources(): function dump_genes (line 92) | def dump_genes(species, recs, g_dls, p_hdls): function describe_seqs (line 129) | def describe_seqs(seqs): FILE: Chapter07/Trees.py function compute_level (line 21) | def compute_level(node, level=0): function compute_height (line 31) | def compute_height(node): function compute_nofs (line 45) | def compute_nofs(node): function print_nodes (line 56) | def print_nodes(node): function print_breadth (line 67) | def print_breadth(tree): function simplify_tree (line 84) | def simplify_tree(node): FILE: Chapter07/Visualization.py function get_color (line 45) | def get_color(name): function color_tree (line 51) | def color_tree(node, fun_color=get_color): FILE: Chapter08/Distance.py function get_closest_atoms (line 41) | def get_closest_atoms(pdb_struct, ref_atom, distance): function get_closest_alternative (line 74) | def get_closest_alternative(pdb_struct, ref_atom, distance): FILE: Chapter08/Intro.py function do_request (line 25) | def do_request(server, **kwargs): FILE: Chapter08/Mass.py function get_mass (line 40) | def get_mass(atoms, accept_fun=lambda atom: atom.parent.id[0] != 'W'): function get_center (line 55) | def get_center(atoms, weight_fun=lambda atom: 1 if atom.parent.id[0] != ... 
FILE: Chapter08/PDB.py function print_pdb_headers (line 29) | def print_pdb_headers(headers, indent=0): function describe_model (line 51) | def describe_model(name, pdb): function get_fasta (line 77) | def get_fasta(pdb_file, fasta_file, transfer_ids=None): FILE: Chapter08/Parser.py function parse_pdb (line 41) | def parse_pdb(hdl): function process_multi_lines (line 72) | def process_multi_lines(hdl): function get_spec_list (line 105) | def get_spec_list(my_str): function process_struct_types (line 118) | def process_struct_types(hdl): FILE: Chapter08/PyMol_Intro.py function dump_thread (line 2) | def dump_thread(): FILE: Chapter08/Stats.py function get_bounds (line 57) | def get_bounds(my_atoms): FILE: Chapter08/mmCIF.py function describe_model (line 24) | def describe_model(name, pdb): FILE: Chapter09/galaxy/api.py function summarize_contents (line 92) | def summarize_contents(contents): function dataset_to_param (line 122) | def dataset_to_param(dataset): FILE: Chapter10/Clustering.py function plot_kmeans_pca (line 89) | def plot_kmeans_pca(trans, kmeans): FILE: Chapter11/Dask_Distributed.py function calc_stats (line 54) | def calc_stats(my_chunk): FILE: Chapter11/Dask_Intro.py function calc_stats (line 42) | def calc_stats(variant): FILE: Chapter11/MP_intro.py function calc_stats (line 47) | def calc_stats(my_chunk): function compute_interval (line 71) | def compute_interval(interval): FILE: Chapter11/Zarr_Intro.py function calc_stats (line 61) | def calc_stats(my_chunk): FILE: Chapter12/Builtin.py function fibo (line 5) | def fibo(n): function gene_min_reads (line 16) | def gene_min_reads(source, min_reads): FILE: Chapter12/Lazy.py function load (line 4) | def load(file_name): function get_min_reads (line 9) | def get_min_reads(all_data, min_reads): function has_min_observations (line 17) | def has_min_observations(subset_data, min_observations): function get_rec (line 27) | def get_rec(file_name): function gene_min_reads (line 35) | def 
gene_min_reads(source, min_reads): function gene_min_observations (line 41) | def gene_min_observations(subset_source, min_observations): FILE: Chapter12/Mutability.py function restore_db (line 5) | def restore_db(file_name): function load (line 9) | def load(file_name): function save (line 14) | def save(dict_db, file_name): function add_sample_dict (line 19) | def add_sample_dict(dict_db, gene_list): function add_sample_new_dict (line 24) | def add_sample_new_dict(dict_db, gene_list): FILE: Chapter12/Persistence1.py function restore_db (line 5) | def restore_db(file_name): function load (line 9) | def load(file_name): function save (line 14) | def save(dict_db, file_name): function add_sample_csv (line 19) | def add_sample_csv(gene_list): FILE: Chapter12/Persistence2.py function restore_db (line 5) | def restore_db(file_name): function load (line 9) | def load(file_name): function save (line 14) | def save(dict_db, file_name): function add_sample_new_dict (line 19) | def add_sample_new_dict(dict_db, gene_list): FILE: Chapter12/Pure.py function restore_db (line 5) | def restore_db(file_name): function load (line 9) | def load(file_name): function save (line 14) | def save(dict_db, file_name): function add_sample_csv (line 19) | def add_sample_csv(gene_list): function add_sample_global_dict (line 26) | def add_sample_global_dict(gene_list): function add_sample_dict (line 32) | def add_sample_dict(dict_db, gene_list): FILE: Chapter12/Recursion.py function fibo_iter (line 1) | def fibo_iter(n): function fibo_naive (line 13) | def fibo_naive(n): function factorial (line 31) | def factorial(n): FILE: Chapter12/Tools.py function fibo_iter (line 4) | def fibo_iter(n): function fibo_naive (line 18) | def fibo_naive(n): function fibo (line 27) | def fibo(n): function factorial (line 40) | def factorial(n):
Condensed preview — 93 files, each showing its path, character count, and a content snippet. Download the .json file, or copy it to the clipboard, to get the full structured content (207K chars).
[
{
"path": ".gitignore",
"chars": 40,
"preview": ".ipynb_checkpoints\n.Rhistory\n__pycache__"
},
{
"path": "Chapter01/Interfacing_R.py",
"chars": 4021,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: percent\n# fo"
},
{
"path": "Chapter01/R_magic.py",
"chars": 1995,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: percent\n# fo"
},
{
"path": "Chapter01/base_setup.sh",
"chars": 355,
"preview": "conda create -n bioinformatics_base python=3.9.7 \n\nconda activate bioinformatics_base\nconda config --add channels biocon"
},
{
"path": "Chapter01/bioinformatics_base.txt",
"chars": 14664,
"preview": "# This file may be used to create an environment using:\n# $ conda create --name <env> --file <this file>\n# platform: lin"
},
{
"path": "Chapter02/.gitignore",
"chars": 43,
"preview": "*png\nVAERSDataUseGuide_en_September2021.pdf"
},
{
"path": "Chapter02/Arrow.py",
"chars": 1555,
"preview": "import gzip\nimport pandas as pd\nfrom pyarrow import csv\nimport pyarrow.compute as pc\n\nvdata_pd = pd.read_csv(\"2021VAERSD"
},
{
"path": "Chapter02/Matplotlib.py",
"chars": 4409,
"preview": "import numpy as np\nimport pandas as pd\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\n\nvdata = pd.read_csv(\n "
},
{
"path": "Chapter02/NumPy.py",
"chars": 1812,
"preview": "import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nvdata = pd.read_csv(\n \"2021VAERSDATA.csv.gz\","
},
{
"path": "Chapter02/Pandas_Basic.py",
"chars": 1943,
"preview": "# # Using Pandas to process vaccine adverse events\n#\n# ## Data Access\n#\n# Go to https://vaers.hhs.gov/data/datasets.html"
},
{
"path": "Chapter02/Pandas_Join.py",
"chars": 2004,
"preview": "# # Pandas advanced\n\nimport numpy as np\nimport pandas as pd\n\n# # Code to sample original data\n#\n# ```\n# vdata = pd.read_"
},
{
"path": "Chapter02/Pandas_Memory.py",
"chars": 1446,
"preview": "# # Pandas advanced\n\nimport numpy as np\nimport pandas as pd\n\nvdata = pd.read_csv(\"2021VAERSDATA.csv.gz\", encoding=\"iso-8"
},
{
"path": "Chapter03/Accessing_Databases.py",
"chars": 1968,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter03/Basic_Sequence_Processing.py",
"chars": 1044,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter03/Filtering_SNPs.py",
"chars": 6611,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter03/LCT.bed",
"chars": 2523,
"preview": "track name=gene description=\"Gene information\"\n2\t135836529\t135837180\tENSE00002202258\t0\t-\n2\t135833110\t135833190\tENSE00001"
},
{
"path": "Chapter03/Processing_BED_with_HTSeq.py",
"chars": 1439,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter03/Working_with_BAM.py",
"chars": 3556,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter03/Working_with_FASTQ.py",
"chars": 3812,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter03/Working_with_VCF.py",
"chars": 2178,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter04/2L.py",
"chars": 1269,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter04/Exploration.py",
"chars": 2620,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter04/Mendel.py",
"chars": 2594,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter04/Preparation.py",
"chars": 6862,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter04/QIIME2_Metagenomics.py",
"chars": 3873,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter04/samples.tsv",
"chars": 2414,
"preview": "id\tcross\tsex\tfunction\nAD0231-C\tcross-29-2\tF\tparent\nAD0232-C\tcross-29-2\tM\tparent\nAD0234-C\tcross-29-2\tF\tprogeny\nAD0235-C\tc"
},
{
"path": "Chapter05/.gitignore",
"chars": 22,
"preview": "*.fasta\nag.db\n*gz\n*png"
},
{
"path": "Chapter05/Annotations.py",
"chars": 1950,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter05/Gene_Ontology.py",
"chars": 3516,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter05/Getting_Gene.py",
"chars": 1782,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter05/Low_Quality.py",
"chars": 3126,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter05/Orthology.py",
"chars": 1989,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter05/Reference_Genome.py",
"chars": 3126,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter06/.gitignore",
"chars": 91,
"preview": "*.log\n*.ped\n*.map\n*.bed\n*.bim\n*.fam\nexclude*.txt\nrelationships_w_pops_041510.txt\n*.in\n*.out"
},
{
"path": "Chapter06/Admixture.py",
"chars": 2335,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter06/Data_Formats.py",
"chars": 2737,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter06/Exploratory_Analysis.py",
"chars": 1394,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter06/PCA.py",
"chars": 2936,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter06/Pop_Stats.py",
"chars": 4062,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter06/Sgkit.py",
"chars": 1193,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter07/.gitignore",
"chars": 36,
"preview": "*fasta\ntrim.fasta.reduced\n*nex\nbp_rx"
},
{
"path": "Chapter07/Alignment.py",
"chars": 1800,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter07/Comparison.py",
"chars": 4307,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter07/Exploration.py",
"chars": 6036,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter07/Reconstruction.py",
"chars": 1650,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter07/Selection.py",
"chars": 1437,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter07/Trees.py",
"chars": 2581,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter07/Visualization.py",
"chars": 1796,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter08/.gitignore",
"chars": 11,
"preview": "*ent\n*fasta"
},
{
"path": "Chapter08/Distance.py",
"chars": 3052,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter08/Intro.py",
"chars": 2241,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter08/Mass.py",
"chars": 2601,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter08/PDB.py",
"chars": 2870,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter08/Parser.py",
"chars": 4076,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter08/PyMol_Intro.py",
"chars": 769,
"preview": "import threading\ndef dump_thread():\n print\n for thr in threading.enumerate():\n print(thr)\ndump_thread()\nimp"
},
{
"path": "Chapter08/PyMol_Movie.py",
"chars": 2462,
"preview": "import pymol\nfrom pymol import cmd\n#pymol.pymol_argv = [ 'pymol', '-qc'] # Quiet / no GUI\npymol.finish_launching()\n\n#cm"
},
{
"path": "Chapter08/Stats.py",
"chars": 3154,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter08/mmCIF.py",
"chars": 1125,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter09/galaxy/.gitignore",
"chars": 25,
"preview": "galaxy.yaml.enc\ntool\nsalt"
},
{
"path": "Chapter09/galaxy/LCT.bed",
"chars": 2523,
"preview": "track name=gene description=\"Gene information\"\n2\t135836529\t135837180\tENSE00002202258\t0\t-\n2\t135833110\t135833190\tENSE00001"
},
{
"path": "Chapter09/galaxy/api.py",
"chars": 3326,
"preview": "import base64\nfrom collections import defaultdict\n#import ftplib\n\nimport getpass\nimport pprint\nimport warnings\n\nfrom rua"
},
{
"path": "Chapter09/galaxy/encrypt.py",
"chars": 1090,
"preview": "\"Encrypt an YAML file with the script configuration\"\n\nimport base64\nimport getpass\nfrom io import StringIO\nimport os\n\nfr"
},
{
"path": "Chapter09/galaxy/galaxy.yaml",
"chars": 129,
"preview": "rest_protocol: http\nserver: localhost\nrest_port: 8080\nsftp_port: 8022\nuser: admin@galaxy.org\npassword: password\napi_key:"
},
{
"path": "Chapter09/nextflow/.gitignore",
"chars": 35,
"preview": "data\npca.png\nwork\n.nextflow*\nreport"
},
{
"path": "Chapter09/nextflow/pipeline.nf",
"chars": 2139,
"preview": "nextflow.enable.dsl=2\n\ndownload_root = \"https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3\"\n\n\nprocess plink_downlo"
},
{
"path": "Chapter09/snakemake/.gitignore",
"chars": 55,
"preview": "data\nscratch\n.snakemake\npca.png\ndag.svg\nbio.png\nbio.svg"
},
{
"path": "Chapter09/snakemake/Snakefile",
"chars": 1999,
"preview": "rule all:\n input:\n \"pca.png\"\n\nrule plink_download:\n output:\n map=\"scratch/hapmap.map.gz\",\n pe"
},
{
"path": "Chapter09/snakemake/plot_pca.py",
"chars": 258,
"preview": "import pandas as pd\n\neigen_fname = snakemake.input[0] if snakemake.input[0].endswith('eigenvec') else snakemake.input[1]"
},
{
"path": "Chapter10/Clustering.py",
"chars": 3156,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter10/Decision_Tree.py",
"chars": 2161,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter10/PCA.py",
"chars": 1940,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter10/Random_Forest.py",
"chars": 2375,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter11/.gitignore",
"chars": 39,
"preview": "dask-worker-space\ndata\nmydask.png\nx.png"
},
{
"path": "Chapter11/Dask_Distributed.py",
"chars": 1153,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter11/Dask_Intro.py",
"chars": 1236,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter11/MP_intro.py",
"chars": 2217,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter11/Zarr_Intro.py",
"chars": 2257,
"preview": "# ---\n# jupyter:\n# jupytext:\n# text_representation:\n# extension: .py\n# format_name: light\n# form"
},
{
"path": "Chapter12/Builtin.py",
"chars": 477,
"preview": "import functools\n\n\n@functools.cache\ndef fibo(n):\n if n == 0:\n return 0\n if n == 1:\n return 1\n ret"
},
{
"path": "Chapter12/Lazy.py",
"chars": 1128,
"preview": "import pandas as pd\n\n\ndef load(file_name):\n df = pd.read_csv(file_name).set_index('gene')\n return dict(df['count']"
},
{
"path": "Chapter12/Mutability.py",
"chars": 776,
"preview": "import shutil\nimport pandas as pd\n\n\ndef restore_db(file_name):\n shutil.copyfile(f'{file_name}.base', file_name)\n\n\ndef"
},
{
"path": "Chapter12/Persistence1.py",
"chars": 651,
"preview": "import shutil\nimport pandas as pd\n\n\ndef restore_db(file_name):\n shutil.copyfile(f'{file_name}.base', file_name)\n\n\ndef"
},
{
"path": "Chapter12/Persistence2.py",
"chars": 815,
"preview": "import shutil\nimport pandas as pd\n\n\ndef restore_db(file_name):\n shutil.copyfile(f'{file_name}.base', file_name)\n\n\ndef"
},
{
"path": "Chapter12/Pure.py",
"chars": 924,
"preview": "import shutil\nimport pandas as pd\n\n\ndef restore_db(file_name):\n shutil.copyfile(f'{file_name}.base', file_name)\n\n\ndef"
},
{
"path": "Chapter12/Recursion.py",
"chars": 576,
"preview": "def fibo_iter(n):\n if n < 2:\n return n\n last = 1\n second_last = 0\n for _i in range(2, n + 1):\n "
},
{
"path": "Chapter12/Tools.py",
"chars": 702,
"preview": "import functools\n\n\ndef fibo_iter(n):\n if n == 0:\n return 0\n if n == 1:\n return 1\n last = 1\n se"
},
{
"path": "Chapter12/my_genes.csv",
"chars": 31,
"preview": "gene,count\nLCT,5\nLEPR,4\nMRAP2,1"
},
{
"path": "Chapter12/my_genes.csv.base",
"chars": 31,
"preview": "gene,count\nLCT,5\nLEPR,4\nMRAP2,1"
},
{
"path": "Datasets.py",
"chars": 510,
"preview": "\n# # Datasets for the book\n#\n# Here we provide links to the datasets used in the book.\n#\n# Important Notes:\n#\n# 1. Note "
},
{
"path": "LICENSE",
"chars": 1062,
"preview": "MIT License\n\nCopyright (c) 2021 Packt\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof t"
},
{
"path": "README.md",
"chars": 5750,
"preview": "\n\n# Bioinformatics-with-Python-Cookbook-third-edition\n\n<a href=\"https://www.packtpub.com/product/bioinformatics-with-pyt"
},
{
"path": "Welcome.ipynb",
"chars": 996,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"3040340b-bd0b-4266-a7a6-8b48d9a94625\",\n \"metadata\": {},\n \"so"
},
{
"path": "docker/Chapter01/Dockerfile",
"chars": 549,
"preview": "FROM tiagoantao/bio3\nMAINTAINER Tiago Antao <tiago@tiago.org>\n# RUN conda create -n bioinformatics_r --clone bioinformat"
},
{
"path": "docker/main/Dockerfile",
"chars": 1067,
"preview": "FROM continuumio/anaconda3:2021.05\nMAINTAINER Tiago Antao <tiago@tiago.org>\n#ENV DEBIAN_FRONTEND noninteractive\n\n#RUN ap"
}
]
About this extraction
This page contains the full source code of the PacktPublishing/Bioinformatics-with-Python-Cookbook-third-edition GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 93 files (188.9 KB), approximately 63.5k tokens, and a symbol index with 94 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — a free GitHub repo-to-text converter for AI. Built by Nikandr Surkov.