Repository: PacktPublishing/Bioinformatics-with-Python-Cookbook-third-edition Branch: main Commit: 9b10894b1a19 Files: 93 Total size: 188.9 KB Directory structure: gitextract_jf5fqbmn/ ├── .gitignore ├── Chapter01/ │ ├── Interfacing_R.py │ ├── R_magic.py │ ├── base_setup.sh │ └── bioinformatics_base.txt ├── Chapter02/ │ ├── .gitignore │ ├── Arrow.py │ ├── Matplotlib.py │ ├── NumPy.py │ ├── Pandas_Basic.py │ ├── Pandas_Join.py │ └── Pandas_Memory.py ├── Chapter03/ │ ├── Accessing_Databases.py │ ├── Basic_Sequence_Processing.py │ ├── Filtering_SNPs.py │ ├── LCT.bed │ ├── Processing_BED_with_HTSeq.py │ ├── Working_with_BAM.py │ ├── Working_with_FASTQ.py │ └── Working_with_VCF.py ├── Chapter04/ │ ├── 2L.py │ ├── Exploration.py │ ├── Mendel.py │ ├── Preparation.py │ ├── QIIME2_Metagenomics.py │ └── samples.tsv ├── Chapter05/ │ ├── .gitignore │ ├── Annotations.py │ ├── Gene_Ontology.py │ ├── Getting_Gene.py │ ├── Low_Quality.py │ ├── Orthology.py │ └── Reference_Genome.py ├── Chapter06/ │ ├── .gitignore │ ├── Admixture.py │ ├── Data_Formats.py │ ├── Exploratory_Analysis.py │ ├── PCA.py │ ├── Pop_Stats.py │ └── Sgkit.py ├── Chapter07/ │ ├── .gitignore │ ├── Alignment.py │ ├── Comparison.py │ ├── Exploration.py │ ├── Reconstruction.py │ ├── Selection.py │ ├── Trees.py │ └── Visualization.py ├── Chapter08/ │ ├── .gitignore │ ├── Distance.py │ ├── Intro.py │ ├── Mass.py │ ├── PDB.py │ ├── Parser.py │ ├── PyMol_Intro.py │ ├── PyMol_Movie.py │ ├── Stats.py │ └── mmCIF.py ├── Chapter09/ │ ├── galaxy/ │ │ ├── .gitignore │ │ ├── LCT.bed │ │ ├── api.py │ │ ├── encrypt.py │ │ └── galaxy.yaml │ ├── nextflow/ │ │ ├── .gitignore │ │ └── pipeline.nf │ └── snakemake/ │ ├── .gitignore │ ├── Snakefile │ └── plot_pca.py ├── Chapter10/ │ ├── Clustering.py │ ├── Decision_Tree.py │ ├── PCA.py │ └── Random_Forest.py ├── Chapter11/ │ ├── .gitignore │ ├── Dask_Distributed.py │ ├── Dask_Intro.py │ ├── MP_intro.py │ └── Zarr_Intro.py ├── Chapter12/ │ ├── Builtin.py │ ├── Lazy.py │ ├── Mutability.py │ ├── Persistence1.py │ ├── Persistence2.py │ ├── Pure.py │ ├── Recursion.py │ ├── Tools.py │ ├── my_genes.csv │ └── my_genes.csv.base ├── Datasets.py ├── LICENSE ├── README.md ├── Welcome.ipynb └── docker/ ├── Chapter01/ │ └── Dockerfile └── main/ └── Dockerfile ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .ipynb_checkpoints .Rhistory __pycache__ ================================================ FILE: Chapter01/Interfacing_R.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: percent # format_version: '1.3' # jupytext_version: 1.13.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # %% [markdown] # ## The next cell will get a ~65 MB data file 'sequence.index', you only need to run the cell once # %% # !rm sequence.index 2>/dev/null # !wget -nd http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/20130502.phase3.sequence.index -O sequence.index # %% [markdown] # # Interfacing with R # %% import os from IPython.display import Image import rpy2.robjects as robjects import rpy2.robjects.lib.ggplot2 as ggplot2 from rpy2.robjects.functions import SignatureTranslatedFunction import pandas as pd import rpy2.robjects as ro from rpy2.robjects import pandas2ri from rpy2.robjects.conversion import localconverter # %% read_delim = 
robjects.r('read.delim') seq_data = read_delim('sequence.index', header=True, stringsAsFactors=False) #In R: # seq.data <- read.delim('sequence.index', header=TRUE, stringsAsFactors=FALSE) # %% print('This data frame has %d columns and %d rows' % (seq_data.ncol, seq_data.nrow)) print(seq_data.colnames) #In R: # print(colnames(seq.data)) # print(nrow(seq.data)) # print(ncol(seq.data)) print('Columns in Python %d ' % robjects.r.ncol(seq_data)[0]) #access some functions as_integer = robjects.r('as.integer') match = robjects.r.match my_col = match('READ_COUNT', seq_data.colnames)[0] # Vector returned print('Type of read count before as.integer: %s' % seq_data[my_col - 1].rclass[0]) seq_data[my_col - 1] = as_integer(seq_data[my_col - 1]) print('Type of read count after as.integer: %s' % seq_data[my_col - 1].rclass[0]) my_col = match('BASE_COUNT', seq_data.colnames)[0] # Vector returned seq_data[my_col - 1] = as_integer(seq_data[my_col - 1]) my_col = match('CENTER_NAME', seq_data.colnames)[0] seq_data[my_col - 1] = robjects.r.toupper(seq_data[my_col - 1]) robjects.r.assign('seq.data', seq_data) robjects.r('print(c("Column names in R: ",colnames(seq.data)))') robjects.r('seq.data <- seq.data[seq.data$WITHDRAWN==0, ]') #Lets remove all withdrawn sequences robjects.r("seq.data <- seq.data[, c('STUDY_ID', 'STUDY_NAME', 'CENTER_NAME', 'SAMPLE_ID', 'SAMPLE_NAME', 'POPULATION', 'INSTRUMENT_PLATFORM', 'LIBRARY_LAYOUT', 'PAIRED_FASTQ', 'READ_COUNT', 'BASE_COUNT', 'ANALYSIS_GROUP')]") #Lets shorten the dataframe #Population as factor robjects.r('seq.data$POPULATION <- as.factor(seq.data$POPULATION)') # %% ggplot2.theme = SignatureTranslatedFunction(ggplot2.theme, init_prm_translate = {'axis_text_x': 'axis.text.x'}) bar = ggplot2.ggplot(seq_data) + ggplot2.geom_bar() + ggplot2.aes_string(x='CENTER_NAME') + ggplot2.theme(axis_text_x=ggplot2.element_text(angle=90, hjust=1, size=40), axis_text_y=ggplot2.element_text(size=40), text=ggplot2.element_text(size=40)) robjects.r.png('out.png', width=16, height=9, units="in", res=600) bar.plot() dev_off = robjects.r('dev.off') dev_off() Image(filename='out.png') # %% #Get Yoruba and CEU robjects.r('yri_ceu <- seq.data[seq.data$POPULATION %in% c("YRI", "CEU") & seq.data$BASE_COUNT < 2E9 & seq.data$READ_COUNT < 3E7, ]') yri_ceu = robjects.r('yri_ceu') # %% scatter = ggplot2.ggplot(yri_ceu) + ggplot2.aes_string(x='BASE_COUNT', y='READ_COUNT', shape='factor(POPULATION)', col='factor(ANALYSIS_GROUP)') + ggplot2.geom_point() robjects.r.png('out.png', width=16, height=9, units="in", res=600) scatter.plot() dev_off = robjects.r('dev.off') dev_off() Image(filename='out.png') # %% with localconverter(ro.default_converter + pandas2ri.converter): pd_yri_ceu = ro.conversion.rpy2py(yri_ceu) del pd_yri_ceu['PAIRED_FASTQ'] # no_paired = pandas2ri.py2ri(pd_yri_ceu) with localconverter(ro.default_converter + pandas2ri.converter): no_paired = ro.conversion.py2rpy(pd_yri_ceu) robjects.r.assign('no.paired', no_paired) robjects.r("print(colnames(no.paired))") # %% ================================================ FILE: Chapter01/R_magic.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: percent # format_version: '1.3' # jupytext_version: 1.13.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # %% [markdown] # ## The cell below will get the data file, you only need to run it once # %% [markdown] # (you do not need to do this if you have done it in the 
Interfacing_R notebook) # %% # !rm sequence.index 2>/dev/null # !wget -nd http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/20130502.phase3.sequence.index -O sequence.index # %% import rpy2.robjects as robjects import rpy2.robjects.lib.ggplot2 as ggplot2 # %load_ext rpy2.ipython # %% language="R" # seq.data <- read.delim('sequence.index', header=TRUE, stringsAsFactors=FALSE) # seq.data$READ_COUNT <- as.integer(seq.data$READ_COUNT) # seq.data$BASE_COUNT <- as.integer(seq.data$BASE_COUNT) # %% # seq_data = %R seq.data print(type(seq_data)) #pandas dataframe??? # %% my_col = list(seq_data.columns).index("CENTER_NAME") seq_data['CENTER_NAME'] = seq_data['CENTER_NAME'].apply(lambda x: x.upper()) # %% # %R -i seq_data # %R print(colnames(seq_data)) # %% language="R" # seq_data <- seq_data[seq_data$WITHDRAWN==0, ] # seq_data$POPULATION <- as.factor(seq_data$POPULATION) # %% language="R" # bar <- ggplot(seq_data) + aes(factor(CENTER_NAME)) + geom_bar() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # print(bar) # %% language="R" # seq_data$POPULATION <- as.factor(seq_data$POPULATION) # yri_ceu <- seq_data[seq_data$POPULATION %in% c("YRI", "CEU") & seq_data$BASE_COUNT < 2E9 & seq_data$READ_COUNT < 3E7, ] # %% language="R" # scatter <- ggplot(yri_ceu, aes(x=BASE_COUNT, y=READ_COUNT, col=factor(ANALYSIS_GROUP), shape=POPULATION)) + geom_point() # print(scatter) # %% language="R" # library(gridExtra) # library(grid) # g <- grid.arrange(bar, scatter, ncol=1) # g # %% language="R" # png('fig.png') # g # dev.off() ================================================ FILE: Chapter01/base_setup.sh ================================================ conda create -n bioinformatics_base python=3.9.7  conda activate bioinformatics_base conda config --add channels bioconda conda config --add channels conda-forge conda install \ biopython==1.79 \ jupyterlab==3.2.1 \ jupytext==1.13 \ matplotlib==3.4.3 \ numpy==1.21.3 \ pandas==1.3.4 \ scipy==1.7.1 conda list --explicit > bioinformatics_base.txt ================================================ FILE: Chapter01/bioinformatics_base.txt ================================================ # This file may be used to create an environment using: # $ conda create --name --file # platform: linux-64 @EXPLICIT https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2021.10.8-ha878542_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.36.1-hea4e1c9_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-11.2.0-h5c6108e_11.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-11.2.0-he4da1e4_11.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/mysql-common-8.0.27-ha770c72_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pandoc-2.15-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/tzdata-2021e-he74cb21_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-11.2.0-h69a702a_11.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libgomp-11.2.0-h1d223b6_11.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-1_gnu.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-11.2.0-h1d223b6_11.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.3-h516909a_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/expat-2.4.1-h9c3ff4c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/icu-68.2-h9c3ff4c_0.tar.bz2 
https://conda.anaconda.org/conda-forge/linux-64/jbig-2.1-h7f98852_2003.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/jpeg-9d-h36c2ea0_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/lerc-3.0-h9c3ff4c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.8-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h9c3ff4c_4.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.16-h516909a_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.4-h7f98852_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.18-pthreads_h8fe5266_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libopus-1.3.1-h7f98852_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libsodium-1.0.18-h36c2ea0_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.32.1-h7f98852_1000.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.2.1-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.11-h36c2ea0_1013.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.9.3-h9c3ff4c_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.2-h58526e2_4.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/nspr-4.32-h9c3ff4c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/openssl-1.1.1l-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pcre-8.45-h9c3ff4c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-h36c2ea0_1001.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.9-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.3-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.5-h516909a_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h516909a_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/gettext-0.19.8.1-h73d1719_1008.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-12_linux64_openblas.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.10-h9b69904_4.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libvorbis-1.3.7-h9c3ff4c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.13-h7f98852_1003.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/readline-8.1-h46c0cb4_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/zeromq-4.3.4-h9c3ff4c_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.11-h36c2ea0_1013.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-12_linux64_openblas.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libglib-2.70.0-h174f98d_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-12_linux64_openblas.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libllvm11-11.1.0-hf817b99_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.37-h21135ba_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.9.12-h72842e0_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.36.0-h9cd32fc_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.11-h27826a3_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.0-ha95c52a_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/freetype-2.10.4-h0708190_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/glib-tools-2.70.0-h780b84a_1.tar.bz2 
https://conda.anaconda.org/conda-forge/linux-64/gstreamer-1.18.5-h76c114f_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/krb5-1.19.2-hcc1bbae_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libclang-11.1.0-default_ha53f305_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.3.0-h6f004c6_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.0.3-he3ba5ed_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/mysql-libs-8.0.27-hfa10184_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/nss-3.69-hb5efdd6_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/python-3.9.7-hb7a2778_3_cpython.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/async_generator-1.10-py_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/attrs-21.2.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/backcall-0.2.0-pyh9f0ad1d_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/backports-1.0-py_2.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-2.0.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/decorator-5.1.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/defusedxml-0.7.1-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/entrypoints-0.3-pyhd8ed1ab_1003.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.13.1-hba837de_1005.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/glib-2.70.0-h780b84a_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/gst-plugins-base-1.18.5-hf529b03_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/idna-3.1-pyhd3deb0d_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/json5-0.9.5-pyh9f0ad1d_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.12-hddcbb42_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libpq-13.3-hd57d9b9_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/nest-asyncio-1.5.1-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/olefile-0.46-pyh9f0ad1d_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.4.0-hb52868f_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pandocfilters-1.5.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/parso-0.8.2-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pickleshare-0.7.5-py_1003.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/prometheus_client-0.11.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.7.0-pyhd3deb0d_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pycparser-2.20-pyh9f0ad1d_2.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.0.3-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.9-2_cp39.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pytz-2021.3-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/send2trash-1.8.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/testpath-0.5.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/traitlets-5.1.1-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/typing_extensions-3.10.0.2-pyha770c72_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-py_1.tar.bz2 
https://conda.anaconda.org/conda-forge/noarch/wheel-0.37.0-pyhd8ed1ab_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/zipp-3.6.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/babel-2.9.1-pyh44b312d_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/certifi-2021.10.8-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/cffi-1.14.6-py39h4bc2ebd_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/chardet-4.0.0-py39hf3d152e_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/cycler-0.10.0-py_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/dbus-1.13.6-h48d8840_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/debugpy-1.4.1-py39he80948d_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/importlib-metadata-4.8.1-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/jedi-0.18.0-py39hf3d152e_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/jupyter_core-4.9.1-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.3.2-py39h1a9c180_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-1.1.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/markupsafe-2.0.1-py39h3811e60_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.1.3-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/mistune-0.8.4-py39h3811e60_1004.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/numpy-1.21.3-py39hdbf815f_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/packaging-21.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pexpect-4.8.0-pyh9f0ad1d_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pillow-8.3.2-py39ha612740_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyqt5-sip-4.19.18-py39he80948d_7.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyrsistent-0.17.3-py39h3811e60_2.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pysocks-1.7.1-py39hf3d152e_3.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.2-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0-py39h3811e60_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyzmq-22.3.0-py39h37b5a0c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/setuptools-58.2.0-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/sniffio-1.2.0-py39hf3d152e_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/tornado-6.1-py39h3811e60_1.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/websocket-client-0.57.0-py39hf3d152e_4.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/anyio-3.3.4-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/argon2-cffi-21.1.0-py39h3811e60_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/backports.functools_lru_cache-1.6.4-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/biopython-1.79-py39h3811e60_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/bleach-4.1.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/brotlipy-0.7.0-py39h3811e60_1001.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/cryptography-35.0.0-py39h95dcef6_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jinja2-3.0.2-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.1.2-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jupyter_client-7.0.6-pyhd8ed1ab_0.tar.bz2 
https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.4.3-py39h2fa2bec_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.2.8-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pandas-1.3.4-py39hde0f152_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pip-21.3.1-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pygments-2.10.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/qt-5.12.9-hda022c4_4.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/scipy-1.7.1-py39hee8e79c_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/terminado-0.12.1-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jupyterlab_pygments-0.1.2-pyh9f0ad1d_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/nbformat-5.1.3-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pyopenssl-21.0.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyqt-impl-5.12.3-py39h0fcd23e_7.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.5-pyh9f0ad1d_2.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jupytext-1.13.0-pyh6002c4b_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/nbclient-0.5.4-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/prompt-toolkit-3.0.21-pyha770c72_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyqtchart-5.12-py39h0fcd23e_7.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyqtwebengine-5.12.1-py39h0fcd23e_7.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/urllib3-1.26.7-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/ipython-7.28.0-py39hef51801_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/nbconvert-6.2.0-py39hf3d152e_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/pyqt-5.12.3-py39hf3d152e_7.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/requests-2.26.0-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/ipykernel-6.4.2-py39hef51801_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.4.3-py39hf3d152e_1.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/requests-unixsocket-0.2.0-py_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jupyter_server-1.11.1-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/notebook-6.4.5-pyha770c72_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jupyterlab_server-2.8.2-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/nbclassic-0.3.4-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jupyterlab-3.2.1-pyhd8ed1ab_0.tar.bz2 ================================================ FILE: Chapter02/.gitignore ================================================ *png VAERSDataUseGuide_en_September2021.pdf ================================================ FILE: Chapter02/Arrow.py ================================================ import gzip import pandas as pd from pyarrow import csv import pyarrow.compute as pc vdata_pd = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1") columns = list(vdata_pd.columns) vdata_pd.info(memory_usage="deep") vdata_arrow = csv.read_csv("2021VAERSDATA.csv.gz") tot_bytes = sum([ vdata_arrow[name].nbytes for name in vdata_arrow.column_names]) print(f"Total {tot_bytes // (1024 ** 2)} MB") for name in vdata_arrow.column_names: arr_bytes = vdata_arrow[name].nbytes arr_type = vdata_arrow[name].type pd_bytes = vdata_pd[name].memory_usage(index=False, deep=True) pd_type = vdata_pd[name].dtype print( name, 
    arr_type, arr_bytes // (1024 ** 2),
    pd_type, pd_bytes // (1024 ** 2),)

# %timeit pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1")
# %timeit csv.read_csv("2021VAERSDATA.csv.gz")

# REMOVE SYMPTOM_TEXT
vdata_pd = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1", usecols=lambda x: x != "SYMPTOM_TEXT")
vdata_pd.info(memory_usage="deep")

columns.remove("SYMPTOM_TEXT")
vdata_arrow = csv.read_csv(
    "2021VAERSDATA.csv.gz",
    convert_options=csv.ConvertOptions(include_columns=columns))
vdata_arrow.nbytes

# %timeit pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1", usecols=lambda x: x != "SYMPTOM_TEXT")
# %timeit csv.read_csv("2021VAERSDATA.csv.gz", convert_options=csv.ConvertOptions(include_columns=columns))

vdata = vdata_arrow.to_pandas()
vdata.info(memory_usage="deep")

# There's more
vdata = vdata_arrow.to_pandas(self_destruct=True)

================================================
FILE: Chapter02/Matplotlib.py
================================================
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

vdata = pd.read_csv(
    "2021VAERSDATA.csv.gz", encoding="iso-8859-1",
    usecols=lambda name: name != "SYMPTOM_TEXT")

num_rows = len(vdata)
perc_nan = {}
for col_name in vdata.columns:
    num_nans = len(vdata[col_name][vdata[col_name].isna()])
    perc_nan[col_name] = 100 * num_nans / num_rows

labels = perc_nan.keys()
bar_values = list(perc_nan.values())
x_positions = np.arange(len(labels))

fig = plt.figure()
fig.suptitle("Fraction of empty values per column")
ax = fig.add_subplot()
ax.bar(x_positions, bar_values)
ax.set_ylabel("Percent of empty values")
ax.set_xlabel("Column")
ax.set_xticks(x_positions)
ax.set_xticklabels(labels)
ax.legend()
fig.savefig("naive_chart.png")

# OO interface vs matlab...
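# A quick hedged aside before the cleaner version below (the output file
# names here are illustrative, not part of the recipe): the MATLAB-style
# pyplot API draws on an implicit "current" figure, while the OO API
# keeps explicit Figure/Axes handles, which is what makes the
# multi-panel charts later in this script manageable.
plt.bar(x_positions, bar_values)  # implicit: acts on the current Axes
plt.savefig("implicit_style.png")

oo_fig, oo_ax = plt.subplots()  # explicit handles, one object each
oo_ax.bar(x_positions, bar_values)
oo_fig.savefig("explicit_style.png")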
fig = plt.figure(figsize=(16, 9), tight_layout=True, dpi=600)
fig.suptitle("Fraction of empty values per column", fontsize="48")
ax = fig.add_subplot()
b1 = ax.bar(x_positions, bar_values)
ax.set_ylabel("Percent of empty values", fontsize="xx-large")
ax.set_xticks(x_positions)
ax.set_xticklabels(labels, rotation=45, ha="right")
ax.set_ylim(0, 100)
ax.set_xlim(-0.5, len(labels))
for i, x in enumerate(x_positions):
    ax.text(
        x, 2, "%.1f" % bar_values[i],
        rotation=90, va="bottom", ha="center",
        backgroundcolor="white")
fig.text(0.2, 0.01, "Column", fontsize="xx-large")
fig.savefig("cleaner_chart.png")

dead = vdata[vdata.DIED == "Y"]
vax = pd.read_csv("2021VAERSVAX.csv.gz", encoding="iso-8859-1").set_index("VAERS_ID")
vax.groupby("VAX_TYPE").size().sort_values()

vax_dead = dead.join(vax, on="VAERS_ID", how="inner")  # join on id, discuss
vax_dead.iloc[0]

dead_counts = vax_dead["VAX_TYPE"].value_counts()
large_values = dead_counts[dead_counts >= 10]
other_sum = dead_counts[dead_counts < 10].sum()
large_values = large_values.append(pd.Series({"OTHER": other_sum}))

distance_df = vax_dead[vax_dead.DATEDIED.notna() & vax_dead.VAX_DATE.notna()]
distance_df["DATEDIED"] = pd.to_datetime(distance_df["DATEDIED"])
distance_df["VAX_DATE"] = pd.to_datetime(distance_df["VAX_DATE"])
distance_df = distance_df[distance_df.DATEDIED >= "2021"]
distance_df = distance_df[distance_df.VAX_DATE >= "2021"]
distance_df = distance_df[distance_df.DATEDIED >= distance_df.VAX_DATE]
time_distances = distance_df["DATEDIED"] - distance_df["VAX_DATE"]
time_distances_d = time_distances.astype(int) / (10**9 * 60 * 60 * 24)

date_died = pd.to_datetime(vax_dead[vax_dead.DATEDIED.notna()]["DATEDIED"])
date_died = date_died[date_died >= "2021"]
date_died_counts = date_died.value_counts().sort_index()
cum_deaths = date_died_counts.cumsum()

state_dead = vax_dead[vax_dead["STATE"].notna()][["STATE", "SEX"]]
top_states = sorted(state_dead["STATE"].value_counts().head(10).index)
top_state_dead = state_dead[state_dead["STATE"].isin(top_states)].groupby(["STATE", "SEX"]).size()#.reset_index()
top_state_dead.loc["MN", "U"] = 0  # fill the missing (state, sex) group so the stacked bars below align
top_state_dead = top_state_dead.sort_index().reset_index()
top_state_females = top_state_dead[top_state_dead.SEX == "F"][0]
top_state_males = top_state_dead[top_state_dead.SEX == "M"][0]
top_state_unk = top_state_dead[top_state_dead.SEX == "U"][0]

fig, ((vax_cnt, time_dist), (death_time, state_reps)) = plt.subplots(
    2, 2, figsize=(16, 9), tight_layout=True, dpi=600)

vax_cnt.set_title("Vaccines involved in deaths")
wedges, texts = vax_cnt.pie(large_values)
vax_cnt.legend(wedges, large_values.index, loc="lower left")

time_dist.hist(time_distances_d, bins=50)
time_dist.set_title("Days between vaccine administration and death")
time_dist.set_xlabel("Days")
time_dist.set_ylabel("Observations")

death_time.plot(date_died_counts.index, date_died_counts, ".")
death_time.set_title("Deaths over time")
death_time.set_ylabel("Daily deaths")
death_time.set_xlabel("Date")
tw = death_time.twinx()
tw.plot(cum_deaths.index, cum_deaths)
tw.set_ylabel("Cumulative deaths")

state_reps.set_title("Deaths per state stratified by sex")
state_reps.bar(top_states, top_state_females, label="Females")
state_reps.bar(top_states, top_state_males, label="Males", bottom=top_state_females)
state_reps.bar(top_states, top_state_unk, label="Unknown", bottom=top_state_females.values + top_state_males.values)
state_reps.legend()
state_reps.set_xlabel("State")
state_reps.set_ylabel("Deaths")

fig.savefig("summary.png")
fig
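# A hedged footnote on the nanosecond arithmetic above: a timedelta
# Series also exposes whole days directly through the .dt accessor, so
# the line computing time_distances_d has a clearer equivalent. Because
# DATEDIED and VAX_DATE carry no time-of-day component here, the two
# spellings agree (this check is an aside, not part of the recipe).
time_distances_days = time_distances.dt.days
assert (time_distances_days == time_distances_d).all()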
================================================ FILE: Chapter02/NumPy.py ================================================ import numpy as np import pandas as pd import matplotlib.pyplot as plt vdata = pd.read_csv( "2021VAERSDATA.csv.gz", encoding="iso-8859-1") vdata["STATE"] = vdata["STATE"].str.upper() top_states = pd.DataFrame({ "size": vdata.groupby("STATE").size().sort_values(ascending=False).head(5)}).reset_index() top_states["rank"] = top_states.index top_states = top_states.set_index("STATE") top_vdata = vdata[vdata["STATE"].isin(top_states.index)] top_vdata["state_code"] = top_vdata["STATE"].apply( lambda state: top_states["rank"].at[state] ).astype(np.uint8) top_vdata = top_vdata[top_vdata["AGE_YRS"].notna()] top_vdata.loc[:,"AGE_YRS"] = top_vdata["AGE_YRS"].astype(int) top_states age_state = top_vdata[["state_code", "AGE_YRS"]] age_state["state_code"] state_code_arr = age_state["state_code"].values type(state_code_arr), state_code_arr.shape, state_code_arr.dtype age_state["AGE_YRS"] age_arr = age_state["AGE_YRS"].values type(age_arr), age_arr.shape, age_arr.dtype age_arr.max() age_state_mat = np.zeros((5,6), dtype=np.uint64) for row in age_state.itertuples(): age_state_mat[row.state_code, row.AGE_YRS//20] += 1 age_state_mat cal = age_state_mat[0,:] kids = age_state_mat[:,0] def compute_frac(arr_1d): return arr_1d / arr_1d.sum() frac_age_stat_mat = np.apply_along_axis(compute_frac, 1, age_state_mat) perc_age_stat_mat = frac_age_stat_mat * 100 perc_age_stat_mat = perc_age_stat_mat.astype(np.uint8) perc_age_stat_mat perc_age_stat_mat = perc_age_stat_mat[:, :5] perc_age_stat_mat fig = plt.figure() ax = fig.add_subplot() ax.matshow(perc_age_stat_mat, cmap=plt.get_cmap("Greys")) ax.set_yticks(range(5)) ax.set_yticklabels(top_states.index) ax.set_xticks(range(6)) ax.set_xticklabels(["0-19", "20-39", "40-59", "60-79", "80-99", "100-119"]) fig.savefig("matrix.png") ================================================ FILE: Chapter02/Pandas_Basic.py ================================================ # # Using Pandas to process vaccine adverse events # # ## Data Access # # Go to https://vaers.hhs.gov/data/datasets.html and Download 2021 **zip** Data. Please do not download only the CSV File. # # Drop it on the directory where this notebook is. 
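# Aside (not part of the original recipe): the VAERS export is not UTF-8,
# which is why every read_csv call below passes encoding="iso-8859-1".
# Once the compressed files exist, you can check the encoding yourself
# with chardet (already in the Chapter01 base environment); it prints its
# best guess, typically a Latin-1/Windows-1252 family codec:
#
# ```
# import gzip
# import chardet
# with gzip.open("2021VAERSDATA.csv.gz", "rb") as f:
#     print(chardet.detect(f.read(100_000)))
# ```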
# !unzip 2021VAERSData.zip
# !gzip -9 *csv

import pandas as pd
import matplotlib.pyplot as plt

vdata = pd.read_csv(
    "2021VAERSDATA.csv.gz", encoding="iso-8859-1")
vdata.columns
vdata.dtypes
vdata.shape

vdata.iloc[0]
vdata = vdata.set_index("VAERS_ID")
vdata.loc[916600]
vdata.head(3)
vdata.iloc[:3]
vdata.iloc[:5, 2:4]

vdata["AGE_YRS"].max()
vdata.AGE_YRS.max()

vdata["AGE_YRS"].sort_values().plot(use_index=False)

fig, ax = plt.subplots(1, 2, sharey=True, dpi=300)
fig.suptitle("Age of adverse events")
vdata["AGE_YRS"].sort_values().plot(
    use_index=False, ax=ax[0],
    xlabel="Observation", ylabel="Age")
vdata["AGE_YRS"].plot.hist(bins=20, orientation="horizontal")
fig.savefig("adverse.png")

vdata["AGE_YRS"].dropna().apply(lambda x: int(x)).value_counts()  # not documented

vdata.DIED.value_counts(dropna=False)  # NA is a problem; note how it is handled below
vdata["is_dead"] = (vdata.DIED == "Y")

dead = vdata[vdata.is_dead]
vax = pd.read_csv("2021VAERSVAX.csv.gz", encoding="iso-8859-1").set_index("VAERS_ID")
print(vax.columns)
print(vax.shape)
print(vax.VAX_TYPE.unique())
vax.groupby("VAX_TYPE").size().sort_values()

vax19 = vax[vax.VAX_TYPE == "COVID19"]
vax19_dead = dead.join(vax19)  # join on id, discuss
vax19_dead.index.value_counts()

baddies = vax19_dead.groupby("VAX_LOT").size().sort_values(ascending=False)
for i, (lot, cnt) in enumerate(baddies.items()):
    print(lot, cnt, len(vax19_dead[vax19_dead.VAX_LOT == lot].groupby("STATE")))
    if i == 10:
        break

# The data above is not totally correct - at least in terms of interpretation; to see why, we need to check the next recipe

================================================
FILE: Chapter02/Pandas_Join.py
================================================
# # Pandas advanced

import numpy as np
import pandas as pd

# # Code to sample original data
#
# ```
# vdata = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1")
# vdata.sample(frac=0.9).to_csv("vdata_sample.csv.gz", index=False)
# vax = pd.read_csv("2021VAERSVAX.csv.gz", encoding="iso-8859-1")
# vax.sample(frac=0.9).to_csv("vax_sample.csv.gz", index=False)
# ```

vdata = pd.read_csv("vdata_sample.csv.gz")  # No encoding
vax = pd.read_csv("vax_sample.csv.gz")

vdata_with_vax = vdata.join(
    vax.set_index("VAERS_ID"),
    on="VAERS_ID", how="inner")
len(vdata), len(vax), len(vdata_with_vax)

lost_vdata = vdata.loc[~vdata.index.isin(vdata_with_vax.index)]
lost_vdata
lost_vax = vax[~vax["VAERS_ID"].isin(vdata_with_vax["VAERS_ID"])]
lost_vax

# Left, Right and outer caveats
vdata_with_vax_left = vdata.join(
    vax.set_index("VAERS_ID"),
    on="VAERS_ID")
vdata_with_vax_left.groupby("VAERS_ID").size().sort_values()
len(vdata_with_vax_left), len(vdata_with_vax_left.VAERS_ID.unique())

# +
#vdata_all = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1")
#vax_all = pd.read_csv("2021VAERSVAX.csv.gz", encoding="iso-8859-1")
# -

dead = vdata[vdata.DIED == "Y"]
vax19 = vax[vax.VAX_TYPE == "COVID19"]
vax19_dead = vax19.join(dead.set_index("VAERS_ID"), on="VAERS_ID", how="right")  # join on id, discuss
len(vax19), len(dead), len(vax19_dead)
len(vax19_dead[vax19_dead.VAERS_ID.duplicated()])
len(vax19_dead) - len(dead)

vax19_dead["STATE"] = vax19_dead["STATE"].str.upper()
dead_lot = vax19_dead[["VAERS_ID", "VAX_LOT", "STATE"]].set_index(["VAERS_ID", "VAX_LOT"])
dead_lot_clean = dead_lot[~dead_lot.index.duplicated()]
dead_lot_clean = dead_lot_clean.reset_index()
dead_lot_clean[dead_lot_clean.VAERS_ID.isna()]

baddies =
dead_lot_clean.groupby("VAX_LOT").size().sort_values(ascending=False) for i, (lot, cnt) in enumerate(baddies.items()): print(lot, cnt, len(dead_lot_clean[dead_lot_clean.VAX_LOT == lot].groupby("STATE"))) if i == 10: break ================================================ FILE: Chapter02/Pandas_Memory.py ================================================ # # Pandas advanced import numpy as np import pandas as pd vdata = pd.read_csv("2021VAERSDATA.csv.gz", encoding="iso-8859-1") vdata.info(memory_usage="deep") for name in vdata.columns: col_bytes = vdata[name].memory_usage(index=False, deep=True) col_type = vdata[name].dtype print( name, col_type, col_bytes // (1024 ** 2)) vdata.DIED.memory_usage(index=False, deep=True) vdata.DIED.fillna(False).astype(bool).memory_usage(index=False, deep=True) vdata.STATE.unique() vdata["STATE"] = vdata.STATE.str.upper() states = list(vdata["STATE"].unique()) states vdata["encoded_state"] = vdata.STATE.apply(lambda state: states.index(state)) vdata["encoded_state"] = vdata["encoded_state"].astype(np.uint8) vdata[["encoded_state", "STATE"]].head(10) vdata["STATE"].memory_usage(index=False, deep=True) vdata["encoded_state"].memory_usage(index=False, deep=True) vdata.index states = list(pd.read_csv( "vdata_sample.csv.gz", converters={ "STATE": lambda state: state.upper() # You need to know the states in advance }, usecols=["STATE"] )["STATE"].unique()) vdata = pd.read_csv( "vdata_sample.csv.gz", index_col="VAERS_ID", converters={ "DIED": lambda died: died == "Y", "STATE": lambda state: states.index(state.upper()) }, usecols=lambda name: name != "SYMPTOM_TEXT" ) vdata["STATE"] = vdata["STATE"].astype(np.uint8) vdata.info(memory_usage="deep") ================================================ FILE: Chapter03/Accessing_Databases.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- from Bio import Entrez, Medline, SeqIO # ### Do not forget to inform NCBI of your email address (change below) Entrez.email = "put@your_email.here" #This gives you the list of available databases handle = Entrez.einfo() rec = Entrez.read(handle) print(rec) handle = Entrez.esearch(db="nucleotide", term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]') rec_list = Entrez.read(handle) if int(rec_list['RetMax']) < int(rec_list['Count']): handle = Entrez.esearch(db="nucleotide", term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]', retmax=rec_list['Count']) rec_list = Entrez.read(handle) id_list = rec_list['IdList'] hdl = Entrez.efetch(db='nucleotide', id=id_list, rettype='gb', retmax=rec_list['Count']) recs = list(SeqIO.parse(hdl, 'gb')) for rec in recs: if rec.name == 'KM288867': break print(rec.name) print(rec.description) for feature in rec.features: if feature.type == 'gene': print(feature.qualifiers['gene']) elif feature.type == 'exon': loc = feature.location print('Exon', loc.start, loc.end, loc.strand) else: print('not processed:\n%s' % feature) for name, value in rec.annotations.items(): print('%s=%s' % (name, value)) print(len(rec.seq)) refs = rec.annotations['references'] print(refs) for ref in refs: if ref.pubmed_id != '': print(ref.pubmed_id) handle = Entrez.efetch(db="pubmed", id=[ref.pubmed_id], rettype="medline", retmode="text") records = Medline.parse(handle) for med_rec in records: for k, v in med_rec.items(): print('%s: %s' % (k, v)) 
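# A hedged aside, not part of the original recipe: instead of re-running
# esearch with a larger retmax as done above, NCBI's history server can
# keep the full result set server-side, and efetch can then page through
# it with retstart/retmax. This is the standard Biopython pattern:
search = Entrez.read(Entrez.esearch(
    db='nucleotide',
    term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]',
    usehistory='y'))
page = Entrez.efetch(
    db='nucleotide', rettype='gb', retmode='text',
    webenv=search['WebEnv'], query_key=search['QueryKey'],
    retstart=0, retmax=100)  # loop over retstart for the next pages
first_page = list(SeqIO.parse(page, 'gb'))
print(len(first_page))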
================================================ FILE: Chapter03/Basic_Sequence_Processing.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- from Bio import Entrez, Seq, SeqIO, SeqRecord Entrez.email = "put@your_email.here" hdl = Entrez.efetch(db='nucleotide', id=['NM_002299'], rettype='gb') # Lactase gene #for l in hdl: # print l gb_rec = SeqIO.read(hdl, 'gb') for feature in gb_rec.features: if feature.type == 'CDS': location = feature.location # Note translation existing cds = SeqRecord.SeqRecord(gb_rec.seq[location.start:location.end], 'NM_002299', description='LCT CDS only') w_hdl = open('example.fasta', 'w') SeqIO.write([cds], w_hdl, 'fasta') w_hdl.close() recs = SeqIO.parse('example.fasta', 'fasta') for rec in recs: seq = rec.seq print(rec.description) print(seq[:10]) print((seq[:12], seq[-12:])) rna = seq.transcribe() rna prot = seq.translate() prot ================================================ FILE: Chapter03/Filtering_SNPs.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # # Getting the necessary data # You will need to do this only once # !rm -rf centro.vcf.gz 2>/dev/null # !rm -rf standard.vcf.gz 2>/dev/null # !tabix -fh ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/preview/ag1000g.AC.phase1.AR1.vcf.gz 3L:1-200000 |bgzip -c > centro.vcf.gz # !tabix -fh ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/preview/ag1000g.AC.phase1.AR1.vcf.gz 3L:21000000-21200000 |bgzip -c > standard.vcf.gz # !tabix -p vcf centro.vcf.gz # !tabix -p vcf standard.vcf.gz # # Recipe # + from collections import defaultdict import functools import numpy as np import seaborn as sns import matplotlib.pyplot as plt from cyvcf2 import VCF # - def do_window(recs, size, fun): start = None win_res = [] for rec in recs: if not rec.is_snp or len(rec.ALT) > 1: continue if start is None: start = rec.POS my_win = 1 + (rec.POS - start) // size while len(win_res) < my_win: win_res.append([]) win_res[my_win - 1].extend(fun(rec)) return win_res def apply_win_funs(wins, funs): fun_results = [] for win in wins: my_funs = {} for name, fun in funs.items(): try: my_funs[name] = fun(win) except: my_funs[name] = None fun_results.append(my_funs) return fun_results wins = {} size = 2000 names = ['centro.vcf.gz', 'standard.vcf.gz'] for name in names: recs = VCF(name) wins[name] = do_window(recs, size, lambda x: [1]) stats = {} fig, ax = plt.subplots(figsize=(16, 9), dpi=300, tight_layout=True) for name, nwins in wins.items(): stats[name] = apply_win_funs(nwins, {'sum': sum}) x_lim = [i * size for i in range(len(stats[name]))] ax.plot(x_lim, [x['sum'] for x in stats[name]], label=name) ax.legend() ax.set_xlabel('Genomic location in the downloaded segment', fontsize='xx-large') ax.set_ylabel('Number of variant sites (bi-allelic SNPs)', fontsize='xx-large') fig.suptitle('Number of bi-allelic SNPs along the genome', fontsize='xx-large') fig.savefig('bi.png') # + mq0_wins = {} size = 5000 def get_sample(rec, annot, my_type): return [v for v in rec.format(annot) if v > np.iinfo(my_type).min] for name in names: recs = VCF(name) mq0_wins[name] = do_window(recs, size, 
        functools.partial(get_sample, annot='MQ0', my_type=np.int32))
# -

stats = {}
colors = ['b', 'g']
i = 0
fig, ax = plt.subplots(figsize=(16, 9))
for name, nwins in mq0_wins.items():
    stats[name] = apply_win_funs(nwins, {'median': np.median, '95': functools.partial(np.percentile, q=95)})
    x_lim = [j * size for j in range(len(stats[name]))]
    ax.plot(x_lim, [x['median'] for x in stats[name]], label=name, color=colors[i])
    ax.plot(x_lim, [x['95'] for x in stats[name]], '--', color=colors[i])
    i += 1
#ax.set_ylim(0, 40)
ax.legend()
ax.set_xlabel('Genomic location in the downloaded segment', fontsize='xx-large')
ax.set_ylabel('MQ0', fontsize='xx-large')
fig.suptitle('Distribution of MQ0 along the genome', fontsize='xx-large')
fig.savefig('MQ0.png')


def get_sample_relation(recs, f1, f2):
    rel = defaultdict(int)
    for rec in recs:
        if not rec.is_snp:
            continue
        for pos in range(len(rec.genotypes)):
            v1 = f1(rec, pos)
            v2 = f2(rec, pos)
            if v1 is None or v2 == np.iinfo(type(v2)).min:
                continue  # We ignore Nones
            rel[(v1, v2)] += 1  # careful with the size, floats: round?
            #break
    return rel


rels = {}
for name in names:
    recs = VCF(name)
    rels[name] = get_sample_relation(
        recs,
        lambda rec, pos: 1 if rec.genotypes[pos][0] != rec.genotypes[pos][1] else 0,
        lambda rec, pos: rec.format('DP')[pos][0])

# +
fig, ax = plt.subplots(figsize=(16, 9), dpi=300, tight_layout=True)


def plot_hz_rel(dps, ax, ax2, name, rel):
    frac_hz = []
    cnt_dp = []
    for dp in dps:
        hz = 0.0
        cnt = 0
        for khz, kdp in rel.keys():
            if kdp != dp:
                continue
            cnt += rel[(khz, dp)]
            if khz == 1:
                hz += rel[(khz, dp)]
        frac_hz.append(hz / cnt)
        cnt_dp.append(cnt)
    ax.plot(dps, frac_hz, label=name)
    ax2.plot(dps, cnt_dp, '--', label=name)


ax2 = ax.twinx()
for name, rel in rels.items():
    dps = list(set([x[1] for x in rel.keys()]))
    dps.sort()
    plot_hz_rel(dps, ax, ax2, name, rel)
ax.set_xlim(0, 75)
ax.set_ylim(0, 0.2)
ax2.set_ylabel('Quantity of calls', fontsize='xx-large')
ax.set_ylabel('Fraction of Heterozygote calls', fontsize='xx-large')
ax.set_xlabel('Sample Read Depth (DP)', fontsize='xx-large')
ax.legend()
fig.suptitle('Number of calls per depth and fraction of calls which are Hz', fontsize='xx-large')
fig.savefig('hz.png')
# -


def get_variant_relation(recs, f1, f2):
    rel = defaultdict(int)
    for rec in recs:
        if not rec.is_snp:
            continue
        try:
            v1 = f1(rec)
            v2 = f2(rec)
            if v1 is None or v2 is None:
                continue  # We ignore Nones
            rel[(v1, v2)] += 1  # careful with the size, floats: round?
except: # This is outside the domain (typically None) pass return rel # + accepted_eff = ['INTERGENIC', 'INTRON', 'NON_SYNONYMOUS_CODING', 'SYNONYMOUS_CODING'] def eff_to_int(rec): try: annot = rec.INFO['EFF'] master_type = annot.split('(')[0] return accepted_eff.index(master_type) except ValueError: return len(accepted_eff) # - eff_mq0s = {} for name in names: recs = VCF(name) eff_mq0s[name] = get_variant_relation( recs, lambda r: eff_to_int(r), lambda r: int(r.INFO['DP'])) fig, ax = plt.subplots(figsize=(16,9), dpi=300, tight_layout=True) name = 'standard.vcf.gz' bp_vals = [[] for x in range(len(accepted_eff) + 1)] for k, cnt in eff_mq0s[name].items(): my_eff, mq0 = k bp_vals[my_eff].extend([mq0] * cnt) #memory usage #print(bp_vals[-2]) sns.boxplot(data=bp_vals, sym='', ax=ax) ax.set_xticklabels(accepted_eff + ['OTHER']) ax.set_ylabel('DP (variant)', fontsize='xx-large') fig.suptitle('Distribution of variant DP per SNP type', fontsize='xx-large') fig.savefig('eff.png') ================================================ FILE: Chapter03/LCT.bed ================================================ track name=gene description="Gene information" 2 135836529 135837180 ENSE00002202258 0 - 2 135833110 135833190 ENSE00001660765 0 - 2 135829592 135829676 ENSE00001731451 0 - 2 135823900 135824003 ENSE00001659892 0 - 2 135822019 135822098 ENSE00001777620 0 - 2 135817340 135818061 ENSE00001602826 0 - 2 135812310 135812956 ENSE00000776576 0 - 2 135808442 135809993 ENSE00001008768 0 - 2 135807127 135807396 ENSE00000776573 0 - 2 135804766 135805057 ENSE00000776572 0 - 2 135803929 135804128 ENSE00000776571 0 - 2 135800606 135800809 ENSE00000776570 0 - 2 135798028 135798138 ENSE00003515081 0 - 2 135794640 135794775 ENSE00001630333 0 - 2 135790657 135790881 ENSE00001667885 0 - 2 135789570 135789798 ENSE00001728878 0 - 2 135787839 135788544 ENSE00001653704 0 - 2 135812310 135812959 ENSE00001745158 0 - 2 135808442 135809993 ENSE00001008768 0 - 2 135807127 135807396 ENSE00000776573 0 - 2 135804766 135805057 ENSE00000776572 0 - 2 135803929 135804128 ENSE00000776571 0 - 2 135798028 135798138 ENSE00003459353 0 - 2 135794336 135794775 ENSE00001635523 0 - 2 135810168 135810279 ENSE00001438557 0 - 2 135820190 135820639 ENSE00001732580 0 + 2 135821674 135823087 ENSE00001695040 0 + 2 135836529 135837180 NM_002299.2.1 0 - 2 135833110 135833190 NM_002299.2.2 0 - 2 135829592 135829676 NM_002299.2.3 0 - 2 135823900 135824003 NM_002299.2.4 0 - 2 135822019 135822098 NM_002299.2.5 0 - 2 135817340 135818061 NM_002299.2.6 0 - 2 135812310 135812956 NM_002299.2.7 0 - 2 135808442 135809993 NM_002299.2.8 0 - 2 135807127 135807396 NM_002299.2.9 0 - 2 135804766 135805057 NM_002299.2.10 0 - 2 135803929 135804128 NM_002299.2.11 0 - 2 135800606 135800809 NM_002299.2.12 0 - 2 135798028 135798138 NM_002299.2.13 0 - 2 135794640 135794775 NM_002299.2.14 0 - 2 135790657 135790881 NM_002299.2.15 0 - 2 135789570 135789798 NM_002299.2.16 0 - 2 135787844 135788544 NM_002299.2.17 0 - 2 135836529 135837169 CCDS2178.117 0 - 2 135833110 135833190 CCDS2178.116 0 - 2 135829592 135829676 CCDS2178.115 0 - 2 135823900 135824003 CCDS2178.114 0 - 2 135822019 135822098 CCDS2178.113 0 - 2 135817340 135818061 CCDS2178.112 0 - 2 135812310 135812956 CCDS2178.111 0 - 2 135808442 135809993 CCDS2178.110 0 - 2 135807127 135807396 CCDS2178.19 0 - 2 135804766 135805057 CCDS2178.18 0 - 2 135803929 135804128 CCDS2178.17 0 - 2 135800606 135800809 CCDS2178.16 0 - 2 135798028 135798138 CCDS2178.15 0 - 2 135794640 135794775 CCDS2178.14 0 - 2 135790657 135790881 CCDS2178.13 0 - 
2 135789570 135789798 CCDS2178.12 0 -
2 135788323 135788544 CCDS2178.11 0 -

================================================
FILE: Chapter03/Processing_BED_with_HTSeq.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.4
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

from collections import defaultdict
import re

import HTSeq

lct_bed = HTSeq.BED_Reader('LCT.bed')

# +
feature_types = defaultdict(int)
for rec in lct_bed:
    last_rec = rec
    feature_types[re.search('([A-Z]+)', rec.name).group(0)] += 1
print(feature_types)
#Code specific to this dataset, document
# -

print(last_rec)
print(last_rec.name)
print(type(last_rec))
interval = last_rec.iv
print(interval)
print(type(interval))

# +
print(interval.chrom, interval.start, interval.end)
print(interval.strand)
print(interval.length)
print(interval.start_d)
print(interval.start_as_pos)
print(type(interval.start_as_pos))
#talk about overlaps
# -

exon_start = None
exon_end = None
sizes = []
for rec in lct_bed:
    if not rec.name.startswith('CCDS'):
        continue
    interval = rec.iv
    exon_start = min(interval.start, exon_start or interval.start)
    exon_end = max(interval.end, exon_end or interval.end)
    sizes.append(interval.length)
sizes.sort()
print("Num exons: %d / Begin: %d / End %d" % (len(sizes), exon_start, exon_end))
print("Smallest exon: %d / Largest exon: %d / Mean size: %.1f" % (sizes[0], sizes[-1], sum(sizes)/len(sizes)))

================================================
FILE: Chapter03/Working_with_BAM.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.4
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# # Getting the necessary data
# You only need to do this once

# !rm -f NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam 2>/dev/null
# !rm -f NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai 2>/dev/null
# !wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam
# !wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/exome_alignment/NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam.bai

# # The recipe

# +
#pip install pysam
from collections import defaultdict

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pysam
# -

bam = pysam.AlignmentFile('NA18489.chrom20.ILLUMINA.bwa.YRI.exome.20121211.bam', 'rb')

headers = bam.header
for record_type, records in headers.items():
    print(record_type)
    for i, record in enumerate(records):
        if type(record) == dict:
            print('\t%d' % (i + 1))
            for field, value in record.items():
                print('\t\t%s\t%s' % (field, value))
        else:
            print('\t\t%s' % record)

#0-based
for rec in bam:
    if rec.cigarstring.find('M') > -1 and rec.cigarstring.find('S') > -1 and not rec.is_unmapped and not rec.mate_is_unmapped:
        break
print(rec.query_name, rec.reference_id, bam.getrname(rec.reference_id), rec.reference_start, rec.reference_end)
print(rec.cigarstring)
print(rec.query_alignment_start, rec.query_alignment_end, rec.query_alignment_length)
print(rec.next_reference_id, rec.next_reference_start, rec.template_length)
print(rec.is_paired, rec.is_proper_pair, rec.is_unmapped, rec.mapping_quality)
print(rec.query_qualities)
print(rec.query_alignment_qualities)
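# The CIGAR string printed above also has a pre-parsed form:
# rec.cigartuples gives (operation, length) pairs, with operation codes
# from the SAM spec (0=M, 1=I, 2=D, 4=S, ...). A small hedged aside,
# not part of the original recipe, to make the matched/soft-clipped
# structure of this read explicit:
op_names = {0: 'M', 1: 'I', 2: 'D', 3: 'N', 4: 'S', 5: 'H', 6: 'P', 7: '=', 8: 'X'}
for op, length in rec.cigartuples:
    print(op_names.get(op, '?'), length)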
print(rec.query_sequence)

counts = [0] * 76
for n, rec in enumerate(bam.fetch('20', 0, 10000000)):
    for i in range(rec.query_alignment_start, rec.query_alignment_end):
        counts[i] += 1
freqs = [100 * x / (n + 1) for x in counts]

fig, ax = plt.subplots(figsize=(16,9), dpi=300, tight_layout=True)
ax.plot(range(1, 77), freqs)
ax.set_xlabel('Read distance', fontsize='xx-large')
ax.set_ylabel('Percentage of mapped calls', fontsize='xx-large')
fig.suptitle('Percentage of mapped calls as a function of the position from the start of the sequencer read', fontsize='xx-large')
fig.savefig('map_perc.png')

phreds = defaultdict(list)
for rec in bam.fetch('20', 0, None):
    for i in range(rec.query_alignment_start, rec.query_alignment_end):
        phreds[i].append(rec.query_qualities[i])
maxs = [max(phreds[i]) for i in range(76)]
tops = [np.percentile(phreds[i], 95) for i in range(76)]
medians = [np.percentile(phreds[i], 50) for i in range(76)]
bottoms = [np.percentile(phreds[i], 5) for i in range(76)]
medians_fig = [x - y for x, y in zip(medians, bottoms)]
tops_fig = [x - y for x, y in zip(tops, medians)]
maxs_fig = [x - y for x, y in zip(maxs, tops)]

fig, ax = plt.subplots(figsize=(16,9), dpi=300, tight_layout=True)
ax.stackplot(range(1, 77), (bottoms, medians_fig, tops_fig, maxs_fig))
ax.plot(range(1, 77), maxs, 'k-')
ax.set_xlabel('Read distance', fontsize='xx-large')
ax.set_ylabel('PHRED score', fontsize='xx-large')
fig.suptitle('Distribution of PHRED scores as a function of the position in the read', fontsize='xx-large')
fig.savefig('phred2.png')

================================================
FILE: Chapter03/Working_with_FASTQ.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.4
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# # Getting the necessary data
# You only need to download this ~28 MB file once

# !rm -f SRR003265.filt.fastq.gz 2>/dev/null
# !wget -nd ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265.filt.fastq.gz

# # The recipe

# +
from collections import defaultdict
import gzip

import seaborn as sns
import matplotlib.pyplot as plt

from Bio import SeqIO
# -

recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='utf-8'), 'fastq')
rec = next(recs)
print(rec.id, rec.description, rec.seq)
print(rec.letter_annotations)

recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='utf-8'), 'fastq')
cnt = defaultdict(int)
for rec in recs:
    for letter in rec.seq:
        cnt[letter] += 1
tot = sum(cnt.values())
for letter, cnt_letter in cnt.items():
    print('%s: %.2f %d' % (letter, 100 * cnt_letter / tot, cnt_letter))

recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='UTF-8'), 'fastq')
n_cnt = defaultdict(int)
for rec in recs:
    for i, letter in enumerate(rec.seq):
        pos = i + 1
        if letter == 'N':
            n_cnt[pos] += 1
seq_len = max(n_cnt.keys())
positions = range(1, seq_len + 1)

fig, ax = plt.subplots(figsize=(16, 9), tight_layout=True, dpi=300)
fig.suptitle('Number of N calls as a function of the distance from the start of the sequencer read', fontsize='xx-large')
ax.plot(positions, [n_cnt[x] for x in positions])
ax.set_xlim(1, seq_len)
ax.set_xlabel('Read distance', fontsize='xx-large')
ax.set_ylabel('Number of N Calls', fontsize='xx-large')
fig.savefig('n_calls.png')
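# Aside (not part of the original recipe): PHRED scores map to error
# probabilities as P = 10 ** (-Q / 10), so Q=20 means a 1-in-100 error
# and Q=40 a 1-in-10000 error - worth keeping in mind when the analysis
# below drops the ubiquitous maximum score of 40.
def phred_to_error_prob(qual):
    return 10 ** (-qual / 10)

print(phred_to_error_prob(20), phred_to_error_prob(40))  # 0.01 0.0001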
recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='utf-8'), 'fastq')
cnt_qual = defaultdict(int)
for rec in recs:
    for i, qual in enumerate(rec.letter_annotations['phred_quality']):
        if i < 25:
            continue
        cnt_qual[qual] += 1
tot = sum(cnt_qual.values())
for qual, cnt_q in cnt_qual.items():
    print('%d: %.2f %d' % (qual, 100. * cnt_q / tot, cnt_q))

recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz', 'rt', encoding='utf-8'), 'fastq')
qual_pos = defaultdict(list)
for rec in recs:
    for i, qual in enumerate(rec.letter_annotations['phred_quality']):
        if i < 25 or qual == 40:
            continue
        pos = i + 1
        qual_pos[pos].append(qual)
vps = []
poses = list(qual_pos.keys())
poses.sort()
for pos in poses:
    vps.append(qual_pos[pos])

fig, ax = plt.subplots(figsize=(16,9), dpi=300, tight_layout=True)
sns.boxplot(data=vps, ax=ax)
ax.set_xticklabels([str(x) for x in range(26, max(qual_pos.keys()) + 1)])
ax.set_xlabel('Read distance', fontsize='xx-large')
ax.set_ylabel('PHRED score', fontsize='xx-large')
fig.suptitle('Distribution of PHRED scores as a function of read distance', fontsize='xx-large')
fig.savefig('phred.png')

# # There is more...
# ## Do this to download the paired end data
# Be careful as this will be 1GB of data (and fully optional)

# !rm -f SRR003265_1.filt.fastq.gz 2>/dev/null
# !rm -f SRR003265_2.filt.fastq.gz 2>/dev/null
# !wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_1.filt.fastq.gz
# !wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_2.filt.fastq.gz

# +
f1 = gzip.open('SRR003265_1.filt.fastq.gz', 'rt', encoding='utf8')
f2 = gzip.open('SRR003265_2.filt.fastq.gz', 'rt', encoding='utf8')
recs1 = SeqIO.parse(f1, 'fastq')
recs2 = SeqIO.parse(f2, 'fastq')
cnt = 0
for rec1, rec2 in zip(recs1, recs2):
    cnt += 1
print('Number of pairs: %d' % cnt)
# -

================================================
FILE: Chapter03/Working_with_VCF.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.4
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# # Getting the necessary data
# You only need to do this once

# !rm -f genotypes.vcf.gz 2>/dev/null
# !tabix -fh ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502/supporting/vcf_with_sample_level_annotation/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5_extra_anno.20130502.genotypes.vcf.gz 22:1-17000000|bgzip -c > genotypes.vcf.gz
# !tabix -p vcf genotypes.vcf.gz

# +
from collections import defaultdict

import seaborn as sns
import matplotlib.pyplot as plt
from cyvcf2 import VCF
# -

# +
v = VCF('genotypes.vcf.gz')
rec = next(v)
print('Variant Level information')
info = rec.INFO
for info in rec.INFO:
    print(info)
print('Sample Level information')
for fmt in rec.FORMAT:
    print(fmt)

# +
v = VCF('genotypes.vcf.gz')
samples = v.samples
print(len(samples))  # Order change
variant = next(v)
print(variant.CHROM, variant.POS, variant.ID, variant.REF, variant.ALT, variant.QUAL, variant.FILTER)
print(variant.INFO)
print(variant.FORMAT)
print(variant.is_snp)
#rec.format('DP')
#rec.format('GT')
str_alleles = variant.gt_bases[0]
alleles = variant.genotypes[0][0:2]
is_phased = variant.genotypes[0][2]
print(str_alleles, alleles, is_phased)
print(variant.format('DP')[0])

# +
f = VCF('genotypes.vcf.gz')
my_type = defaultdict(int)
num_alts = defaultdict(int)
for variant in f:
    my_type[variant.var_type, variant.var_subtype] += 1
    if variant.var_type == 'snp':
        num_alts[len(variant.ALT)] += 1
print(my_type)
print(num_alts)

# +
f =
VCF('genotypes.vcf.gz') sample_dp = defaultdict(int) for variant in f: if not variant.is_snp or len(variant.ALT) != 1: continue for dp in variant.format('DP'): dp = int(dp)  # convert the per-sample NumPy value to a hashable int before counting sample_dp[dp] += 1 # - dps = list(sample_dp.keys()) dps.sort() dp_dist = [sample_dp[x] for x in dps] fig, ax = plt.subplots(figsize=(16, 9)) ax.plot(dp_dist[:50], 'r') ax.axvline(dp_dist.index(max(dp_dist))) ================================================ FILE: Chapter04/2L.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + # %matplotlib inline from collections import defaultdict import gzip import numpy as np import matplotlib.pylab as plt # - num_parents = 8 dp_2L = np.load(gzip.open('DP_2L.npy.gz', 'rb')) dp_2L.shape for i in range(num_parents): print(np.median(dp_2L[:,i]), np.median(dp_2L[50000:150000,i])) window_size = 200000 parent_DP_windows = [defaultdict(list) for i in range(num_parents)] # + def insert_in_window(row): for parent in range(num_parents): parent_DP_windows[parent][row[-1] // window_size].append(row[parent]) insert_in_window_v = np.vectorize(insert_in_window, signature='(n)->()') _ = insert_in_window_v(dp_2L) # - fig, axs = plt.subplots(2, num_parents // 2, figsize=(16, 9), sharex=True, sharey=True, squeeze=True) for parent in range(num_parents): ax = axs[parent // 4][parent % 4] parent_data = parent_DP_windows[parent] ax.set_ylim(10, 40) ax.plot(*zip(*[(win*window_size, np.mean(lst)) for win, lst in parent_data.items()]), '.') ================================================ FILE: Chapter04/Exploration.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import gzip import pickle import random import numpy as np import matplotlib.pyplot as plt import pandas as pd from pandas.plotting import scatter_matrix # %matplotlib inline # - fit = np.load(gzip.open('balanced_fit.npy.gz', 'rb')) ordered_features = np.load(open('ordered_features', 'rb'), allow_pickle=True) num_features = len(ordered_features) fit_df = pd.DataFrame(fit, columns=ordered_features + ['pos', 'error']) num_samples = 80 del fit fig, ax = plt.subplots(figsize=(16,9)) _ = fit_df.hist(column=ordered_features, ax=ax) fit_df['MeanDP'] = fit_df['DP'] / 80 fig, ax = plt.subplots() _ = ax.hist(fit_df[fit_df['MeanDP']<50]['MeanDP'], bins=100) errors_df = fit_df[fit_df['error'] == 1] ok_df = fit_df[fit_df['error'] == 0] ok_qual_above_df = ok_df[ok_df['QUAL']>0.005] errors_qual_above_df = errors_df[errors_df['QUAL']>0.005] print(ok_df.size, errors_df.size, ok_qual_above_df.size, errors_qual_above_df.size) print(ok_qual_above_df.size / ok_df.size, errors_qual_above_df.size / errors_df.size) ok_qd_above_df = ok_df[ok_df['QD']>0.05] errors_qd_above_df = errors_df[errors_df['QD']>0.05] print(ok_df.size, errors_df.size, ok_qd_above_df.size, errors_qd_above_df.size) print(ok_qd_above_df.size / ok_df.size, errors_qd_above_df.size / errors_df.size) not_bad_area_errors_df = errors_df[(errors_df['QUAL']<0.005)&(errors_df['QD']<0.05)] _ = scatter_matrix(not_bad_area_errors_df[['FS', 'ReadPosRankSum', 'MQ', 'HRun']], diagonal='kde', figsize=(16, 9), alpha=0.02) not_bad_area_ok_df = 
ok_df[(ok_df['QUAL']<0.005)&(ok_df['QD']<0.05)] _ = scatter_matrix(not_bad_area_ok_df[['FS', 'ReadPosRankSum', 'MQ', 'HRun']], diagonal='kde', figsize=(16, 9), alpha=0.02) all_fit_df = pd.DataFrame(np.load(gzip.open('feature_fit.npy.gz', 'rb')), columns=ordered_features + ['pos', 'error']) potentially_good_corner_df = all_fit_df[(all_fit_df['QUAL']<0.005)&(all_fit_df['QD']<0.05)] all_errors_df=all_fit_df[all_fit_df['error'] == 1] print(len(all_fit_df), len(all_errors_df), len(all_errors_df) / len(all_fit_df)) potentially_good_corner_errors_df = potentially_good_corner_df[potentially_good_corner_df['error'] == 1] print(len(potentially_good_corner_df), len(potentially_good_corner_errors_df), len(potentially_good_corner_errors_df) / len(potentially_good_corner_df)) print(len(potentially_good_corner_df)/len(all_fit_df)) ================================================ FILE: Chapter04/Mendel.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- import random import matplotlib.pyplot as plt # # Mendelian simulations num_sims = 100000 num_ofs = 20 # + num_hets_AA_AT = [] for sim in range(num_sims): sim_hets = 0 for ofs in range(20): sim_hets += 1 if random.choice([0, 1]) == 1 else 0 num_hets_AA_AT.append(sim_hets) fig, ax = plt.subplots(1,1, figsize=(16,9)) ax.hist(num_hets_AA_AT, bins=range(20)) print(len([num_hets for num_hets in num_hets_AA_AT if num_hets==20])) # - num_AAs_AT_AT = [] num_hets_AT_AT = [] for sim in range(num_sims): sim_AAs = 0 sim_hets = 0 for ofs in range(20): derived_cnt = sum(random.choices([0, 1], k=2)) sim_AAs += 1 if derived_cnt == 0 else 0 sim_hets += 1 if derived_cnt == 1 else 0 num_AAs_AT_AT.append(sim_AAs) num_hets_AT_AT.append(sim_hets) fig, ax = plt.subplots(1,1, figsize=(16,9)) ax.hist([num_hets_AT_AT, num_AAs_AT_AT], histtype='step', fill=False, bins=range(20), label=['het', 'AA']) plt.legend() # # Balanced output # + import gzip import pickle import random import numpy as np # - mendelian_errors = pickle.load(gzip.open('mendelian_errors.pickle.gz', 'rb')) feature_fit = np.load(gzip.open('feature_fit.npy.gz', 'rb')) ordered_features = np.load(open('ordered_features', 'rb'), allow_pickle=True) num_features = len(ordered_features) len(mendelian_errors), len(list(filter(lambda x: x[0] > 0,mendelian_errors.values()))) total_observations = len(mendelian_errors) error_observations = len(list(filter(lambda x: x[0] > 0,mendelian_errors.values()))) ok_observations = total_observations - error_observations fraction_errors = error_observations/total_observations print (total_observations, ok_observations, error_observations, 100*fraction_errors) del mendelian_errors # + prob_ok_choice = error_observations / ok_observations def accept_entry(row): if row[-1] == 1: return True return random.random() <= prob_ok_choice accept_entry_v = np.vectorize(accept_entry, signature='(i)->()') accepted_entries = accept_entry_v(feature_fit) balanced_fit = feature_fit[accepted_entries] del feature_fit balanced_fit.shape len([x for x in balanced_fit if x[-1] == 1]), len([x for x in balanced_fit if x[-1] == 0]) # - np.save(gzip.open('balanced_fit.npy.gz', 'wb'), balanced_fit, allow_pickle=False, fix_imports=False) ================================================ FILE: Chapter04/Preparation.py ================================================ # --- # jupyter: # 
jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # !wget ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.3L.h5 # !wget ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.2L.h5 # + import pickle import gzip import random import numpy as np import h5py import pandas as pd # - samples = pd.read_csv('samples.tsv', sep='\t') print(len(samples)) print(samples['cross'].unique()) print(samples[samples['cross'] == 'cross-29-2'][['id', 'function']]) print(len(samples[samples['cross'] == 'cross-29-2'])) print(samples[samples['function'] == 'parent']) # # Chromosome arm 3L # + h5_3L = h5py.File('ag1000g.crosses.phase1.ar3sites.3L.h5', 'r') samples_hdf5 = list(map(lambda sample: sample.decode('utf-8'), h5_3L['/3L/samples'])) calldata_genotype = h5_3L['/3L/calldata/genotype'] MQ0 = h5_3L['/3L/variants/MQ0'] MQ = h5_3L['/3L/variants/MQ'] QD = h5_3L['/3L/variants/QD'] Coverage = h5_3L['/3L/variants/Coverage'] CoverageMQ0 = h5_3L['/3L/variants/CoverageMQ0'] HaplotypeScore = h5_3L['/3L/variants/HaplotypeScore'] QUAL = h5_3L['/3L/variants/QUAL'] FS = h5_3L['/3L/variants/FS'] DP = h5_3L['/3L/variants/DP'] HRun = h5_3L['/3L/variants/HRun'] ReadPosRankSum = h5_3L['/3L/variants/ReadPosRankSum'] my_features = { 'MQ': MQ, 'QD': QD, 'Coverage': Coverage, 'HaplotypeScore': HaplotypeScore, 'QUAL': QUAL, 'FS': FS, 'DP': DP, 'HRun': HRun, 'ReadPosRankSum': ReadPosRankSum } num_features = len(my_features) num_alleles = h5_3L['/3L/variants/num_alleles'] is_snp = h5_3L['/3L/variants/is_snp'] POS = h5_3L['/3L/variants/POS'] # - #compute mendelian errors (biallelic) def compute_mendelian_errors(mother, father, offspring): num_errors = 0 num_ofs_problems = 0 if len(mother.union(father)) == 1: # Mother and father are homo and the same for ofs in offspring: if len(ofs) == 2: # Offspring is het num_errors += 1 num_ofs_problems += 1 elif len(ofs.intersection(mother)) == 0: # Offspring is homo, but opposite from parents num_errors += 2 num_ofs_problems += 1 elif len(mother) == 1 and len(father) == 1: # Mother and father are homo and different for ofs in offspring: if len(ofs) == 1: # Homo, should be het num_errors += 1 num_ofs_problems += 1 elif len(mother) == 2 and len(father) == 2: # Both are het, individual offspring can be anything pass else: # One is het, the other is homo homo = mother if len(mother) == 1 else father for ofs in offspring: if len(ofs) == 1 and not ofs.intersection(homo): # homo, but not including the allele from parent that is homo num_errors += 1 num_ofs_problems += 1 return num_errors, num_ofs_problems # + def acceptable_position_to_genotype(): for i, genotype in enumerate(calldata_genotype): if is_snp[i] and num_alleles[i] == 2: if len(np.where(genotype == -1)[0]) > 1: # Missing data continue yield i def acumulate(fun): acumulator = {} for res in fun(): if res is not None: acumulator[res[0]] = res[1] return acumulator # + def get_family_indexes(samples_hdf5, cross_pd): offspring = [] for i, individual in cross_pd.T.iteritems(): index = samples_hdf5.index(individual.id) if individual.function == 'parent': if individual.sex == 'M': father = index else: mother = index else: offspring.append(index) return {'mother': mother, 'father': father, 'offspring': offspring} cross_pd = samples[samples['cross'] == 'cross-29-2'] 
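# The asserts below are an addition to the original recipe: a minimal sanity
# check of compute_mendelian_errors above, using toy genotypes (each
# individual is a set of integer-coded alleles). The expectations follow
# directly from Mendelian inheritance: two identical homozygous parents
# cannot produce a heterozygote (or the opposite homozygote), and two
# different homozygous parents can only produce heterozygotes.
assert compute_mendelian_errors({0}, {0}, [{0, 1}]) == (1, 1)  # AA x AA -> AT
assert compute_mendelian_errors({0}, {0}, [{1}]) == (2, 1)     # AA x AA -> TT (both alleles wrong)
assert compute_mendelian_errors({0}, {1}, [{0}]) == (1, 1)     # AA x TT -> AA
assert compute_mendelian_errors({0, 1}, {0, 1}, [{0}]) == (0, 0)  # het x het: anything is possible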
family_indexes = get_family_indexes(samples_hdf5, cross_pd) # + mother_index = family_indexes['mother'] father_index = family_indexes['father'] offspring_indexes = family_indexes['offspring'] all_errors = {} def get_mendelian_errors(): for i in acceptable_position_to_genotype(): genotype = calldata_genotype[i] mother = set(genotype[mother_index]) father = set(genotype[father_index]) offspring = [set(genotype[ofs_index]) for ofs_index in offspring_indexes] my_mendelian_errors = compute_mendelian_errors(mother, father, offspring) yield POS[i], my_mendelian_errors mendelian_errors = acumulate(get_mendelian_errors) pickle.dump(mendelian_errors, gzip.open('mendelian_errors.pickle.gz', 'wb')) # + ordered_positions = sorted(mendelian_errors.keys()) ordered_features = sorted(my_features.keys()) #XXX on code? num_features = len(ordered_features) feature_fit = np.empty((len(ordered_positions), len(my_features) + 2), dtype=float) for column, feature in enumerate(ordered_features): # 'Strange' order print(feature) current_hdf_row = 0 for row, genomic_position in enumerate(ordered_positions): while POS[current_hdf_row] < genomic_position: current_hdf_row += 1 feature_fit[row, column] = my_features[feature][current_hdf_row] for row, genomic_position in enumerate(ordered_positions): feature_fit[row, num_features] = genomic_position feature_fit[row, num_features + 1] = 1 if mendelian_errors[genomic_position][0] > 0 else 0 np.save(gzip.open('feature_fit.npy.gz', 'wb'), feature_fit, allow_pickle=False, fix_imports=False) pickle.dump(ordered_features, open('ordered_features', 'wb')) # - # # Chromosome arm 2L h5_2L = h5py.File('ag1000g.crosses.phase1.ar3sites.2L.h5', 'r') samples_hdf5 = list(map(lambda sample: sample.decode('utf-8'), h5_2L['/2L/samples'])) calldata_DP = h5_2L['/2L/calldata/DP'] POS = h5_2L['/2L/variants/POS'] # + def get_parent_indexes(samples_hdf5, parents_pd): parents = [] for i, individual in parents_pd.T.iteritems(): index = samples_hdf5.index(individual.id) parents.append(index) return parents parents_pd = samples[samples['function'] == 'parent'] parent_indexes = get_parent_indexes(samples_hdf5, parents_pd) # - all_dps = [] for i, pos in enumerate(POS): if random.random() > 0.01: continue pos_dp = calldata_DP[i] parent_pos_dp = [pos_dp[parent_index] for parent_index in parent_indexes] all_dps.append(parent_pos_dp + [pos]) all_dps = np.array(all_dps) np.save(gzip.open('DP_2L.npy.gz', 'wb'), all_dps, allow_pickle=False, fix_imports=False) ================================================ FILE: Chapter04/QIIME2_Metagenomics.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # # Important: Read this! # # This recipe does not work with the standard conda environment. # # If you are in the standard environment, do this: # # 1. Stop Jupyter # 2. Activate the QIIME2 conda environment # 3. Do `jupyter serverextension enable --py qiime2 --sys-prefix` # 4. Start Jupyter inside the QIIME2 environment (see the command sketch below) # # Note that other recipes will not work inside this environment. # # Check this out! # # This is based on the [QIIME2 Fecal Microbiota Transplant example](https://docs.qiime2.org/2018.8/tutorials/fmt/) (for the command line). You are strongly advised to read it before proceeding.
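# As a concrete version of the checklist above, the terminal session would
# look roughly like this (a sketch only; `qiime2-env` is a placeholder for
# whatever your QIIME2 conda environment is actually called):
#
#     conda activate qiime2-env
#     jupyter serverextension enable --py qiime2 --sys-prefix
#     jupyter notebook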
# # There is an [amazing example](http://nbviewer.jupyter.org/gist/tkosciol/29de5198a4be81559a075756c2490fde) of using the Artifact API with the "Moving Pictures" tutorial of QIIME 2, produced by Tomasz Kościółek. I use a more convoluted approach than Tomasz's in order to go a little deeper into the Python internals. That is more of a learning experience on the internals than a practical recommendation. **My recommendation is to use Tomasz's dialect, not mine**. # # # Getting the data # !wget https://data.qiime2.org/2018.8/tutorials/fmt/sample_metadata.tsv # !wget https://data.qiime2.org/2018.8/tutorials/fmt/fmt-tutorial-demux-1-10p.qza # !wget https://data.qiime2.org/2018.8/tutorials/fmt/fmt-tutorial-demux-2-10p.qza # # The recipe # + import pandas as pd from qiime2.metadata.metadata import Metadata from qiime2.metadata.metadata import CategoricalMetadataColumn from qiime2.sdk import Artifact from qiime2.sdk import PluginManager from qiime2.sdk import Result # - pm = PluginManager() demux_plugin = pm.plugins['demux'] #demux_emp_single = demux_plugin.actions['emp_single'] demux_summarize = demux_plugin.actions['summarize'] pm.plugins print(demux_summarize.description) demux_summarize_signature = demux_summarize.signature print(demux_summarize_signature.inputs) print(demux_summarize_signature.parameters) print(demux_summarize_signature.outputs) # + seqs1 = Result.load('fmt-tutorial-demux-1-10p.qza') sum_data1 = demux_summarize(seqs1) sum_data1.visualization # + seqs2 = Result.load('fmt-tutorial-demux-2-10p.qza') sum_data2 = demux_summarize(seqs2) print(dir(sum_data2)) print(type(sum_data2.visualization)) print(dir(sum_data2.visualization)) sum_data2.visualization # - #Quality control dada2_plugin = pm.plugins['dada2'] dada2_denoise_single = dada2_plugin.actions['denoise_single'] qual_control1 = dada2_denoise_single(demultiplexed_seqs=seqs1, trunc_len=150, trim_left=13) qual_control2 = dada2_denoise_single(demultiplexed_seqs=seqs2, trunc_len=150, trim_left=13) metadata_plugin = pm.plugins['metadata'] metadata_tabulate = metadata_plugin.actions['tabulate'] stats_meta1 = metadata_tabulate(input=qual_control1.denoising_stats.view(Metadata)) stats_meta1.visualization stats_meta2 = metadata_tabulate(input=qual_control2.denoising_stats.view(Metadata)) stats_meta2.visualization # + ft_plugin = pm.plugins['feature-table'] ft_merge = ft_plugin.actions['merge'] ft_merge_seqs = ft_plugin.actions['merge_seqs'] ft_summarize = ft_plugin.actions['summarize'] ft_tab_seqs = ft_plugin.actions['tabulate_seqs'] table_merge = ft_merge(tables=[qual_control1.table, qual_control2.table]) seqs_merge = ft_merge_seqs(data=[qual_control1.representative_sequences, qual_control2.representative_sequences]) # - ft_sum = ft_summarize(table=table_merge.merged_table) ft_sum.visualization tab_seqs = ft_tab_seqs(data=seqs_merge.merged_data) tab_seqs.visualization ================================================ FILE: Chapter04/samples.tsv ================================================ id cross sex function AD0231-C cross-29-2 F parent AD0232-C cross-29-2 M parent AD0234-C cross-29-2 F progeny AD0235-C cross-29-2 F progeny AD0236-C cross-29-2 F progeny AD0237-C cross-29-2 F progeny AD0238-C cross-29-2 F progeny AD0239-C cross-29-2 F progeny AD0240-C cross-29-2 M progeny AD0241-C cross-29-2 F progeny AD0242-C cross-29-2 M progeny AD0243-C cross-29-2 F progeny AD0244-C cross-29-2 F progeny AD0245-C cross-29-2 F progeny AD0246-C cross-29-2 F progeny AD0247-C cross-29-2 M progeny AD0248-C cross-29-2 F 
progeny AD0249-C cross-29-2 F progeny AD0250-C cross-29-2 F progeny AD0251-C cross-29-2 F progeny AD0252-C cross-29-2 F progeny AD0253-C cross-29-2 M progeny AD0254-C cross-36-9 F parent AD0255-C cross-36-9 M parent AD0259-C cross-36-9 M progeny AD0260-C cross-36-9 F progeny AD0261-C cross-36-9 F progeny AD0262-C cross-36-9 M progeny AD0263-C cross-36-9 M progeny AD0265-C cross-36-9 F progeny AD0266-C cross-36-9 M progeny AD0267-C cross-36-9 F progeny AD0268-C cross-36-9 M progeny AD0269-C cross-36-9 F progeny AD0270-C cross-36-9 M progeny AD0271-C cross-36-9 M progeny AD0272-C cross-36-9 F progeny AD0273-C cross-36-9 M progeny AD0274-C cross-36-9 F progeny AD0275-C cross-36-9 M progeny AD0276-C cross-36-9 F progeny AD0305-C cross-42-4 F parent AD0306-C cross-42-4 M parent AD0309-C cross-42-4 M progeny AD0310-C cross-42-4 M progeny AD0311-C cross-42-4 M progeny AD0312-C cross-42-4 M progeny AD0313-C cross-42-4 M progeny AD0314-C cross-42-4 M progeny AD0315-C cross-42-4 M progeny AD0316-C cross-42-4 F progeny AD0317-C cross-42-4 M progeny AD0318-C cross-42-4 M progeny AD0319-C cross-42-4 F progeny AD0320-C cross-42-4 F progeny AD0322-C cross-42-4 F progeny AD0323-C cross-42-4 F progeny AD0347-C cross-46-9 F parent AD0348-C cross-46-9 M parent AD0351-C cross-46-9 M progeny AD0352-C cross-46-9 F progeny AD0353-C cross-46-9 F progeny AD0354-C cross-46-9 F progeny AD0355-C cross-46-9 F progeny AD0356-C cross-46-9 M progeny AD0357-C cross-46-9 F progeny AD0358-C cross-46-9 F progeny AD0359-C cross-46-9 M progeny AD0360-C cross-46-9 F progeny AD0361-C cross-46-9 F progeny AD0362-C cross-46-9 M progeny AD0363-C cross-46-9 F progeny AD0364-C cross-46-9 M progeny AD0365-C cross-46-9 M progeny AD0366-C cross-46-9 F progeny AD0367-C cross-46-9 F progeny AD0368-C cross-46-9 F progeny AD0369-C cross-46-9 F progeny AD0370-C cross-46-9 F progeny AD0438-C cross-36-9 F progeny ================================================ FILE: Chapter05/.gitignore ================================================ *.fasta ag.db *gz *png ================================================ FILE: Chapter05/Annotations.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + #pip install gffutils from collections import defaultdict import gffutils import sqlite3 # - # !rm -f ag.db # !wget https://vectorbase.org/common/downloads/release-55/AgambiaePEST/gff/data/VectorBase-55_AgambiaePEST.gff -O gambiae.gff # !gzip -9 gambiae.gff try: db = gffutils.create_db('gambiae.gff.gz', 'ag.db') except sqlite3.OperationalError: db = gffutils.FeatureDB('ag.db') print(list(db.featuretypes())) for feat_type in db.featuretypes(): print(feat_type, db.count_features_of_type(feat_type)) seqids = set() for e in db.all_features(): seqids.add(e.seqid) for seqid in seqids: print(seqid) num_mRNAs = defaultdict(int) num_exons = defaultdict(int) max_exons = 0 max_span = 0 for seqid in seqids: cnt = 0 for gene in db.region(seqid=seqid, featuretype='protein_coding_gene'): cnt += 1 span = abs(gene.start - gene.end) # strand if span > max_span: max_span = span max_span_gene = gene my_mRNAs = list(db.children(gene, featuretype='mRNA')) num_mRNAs[len(my_mRNAs)] += 1 if len(my_mRNAs) == 0: exon_check = [gene] else: exon_check = my_mRNAs for check in exon_check: my_exons = list(db.children(check, featuretype='exon')) 
num_exons[len(my_exons)] += 1 if len(my_exons) > max_exons: max_exons = len(my_exons) max_exons_gene = gene print(f'seqid {seqid}, number of genes {cnt}') print('Max number of exons: %s (%d)' % (max_exons_gene.id, max_exons)) print('Max span: %s (%d)' % (max_span_gene.id, max_span)) print(num_mRNAs) print(num_exons) ================================================ FILE: Chapter05/Gene_Ontology.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- #use pip install as conda install requires a lot of downgrades at this stage import pygraphviz as pgv from IPython.core.display import Image # ## The cell below comes from the Orthology notebook # + import requests ensembl_server = 'http://rest.ensembl.org' def do_request(server, service, *args, **kwargs): params = '' for a in args: if a is not None: params += '/' + a req = requests.get('%s/%s%s' % (server, service, params), params=kwargs, headers={'Content-Type': 'application/json'}) if not req.ok: req.raise_for_status() return req.json() # - lct_id = 'ENSG00000115850' refs = do_request(ensembl_server, 'xrefs/id', lct_id, external_db='GO', all_levels='1') print(len(refs)) print(refs[0].keys()) for ref in refs: go_id = ref['primary_id'] details = do_request(ensembl_server, 'ontology/id', go_id) print('%s %s %s' % (go_id, details['namespace'], ref['description'])) print('%s\n' % details['definition']) go_id = 'GO:0000016' my_data = do_request(ensembl_server, 'ontology/id', go_id) for k, v in my_data.items(): if k == 'parents': for parent in v: print(parent) parent_id = parent['accession'] else: print('%s: %s' % (k, str(v))) print() parent_data = do_request(ensembl_server, 'ontology/id', parent_id) print(parent_id, len(parent_data['children'])) refs = do_request(ensembl_server, 'ontology/ancestors/chart', go_id) for go, entry in refs.items(): print(go) term = entry['term'] print('%s %s' % (term['name'], term['definition'])) is_a = entry.get('is_a', []) print('\t is a: %s\n' % ', '.join([x['accession'] for x in is_a])) def get_upper(go_id): parents = {} node_data = {} refs = do_request(ensembl_server, 'ontology/ancestors/chart', go_id) for ref, entry in refs.items(): my_data = do_request(ensembl_server, 'ontology/id', ref) node_data[ref] = {'name': entry['term']['name'], 'children': my_data['children']} try: parents[ref] = [x['accession'] for x in entry['is_a']] except KeyError: pass # Top of hierarchy return parents, node_data parents, node_data = get_upper(go_id) g = pgv.AGraph(directed=True) for ofs, ofs_parents in parents.items(): ofs_text = '%s\n(%s)' % (node_data[ofs]['name'].replace(', ', '\n'), ofs) for parent in ofs_parents: parent_text = '%s\n(%s)' % (node_data[parent]['name'].replace(', ', '\n'), parent) children = node_data[parent]['children'] if len(children) < 3: for child in children: if child['accession'] in node_data: continue g.add_edge(parent_text, child['accession']) else: g.add_edge(parent_text, '...%d...' 
% (len(children) - 1)) g.add_edge(parent_text, ofs_text) print(g) g.graph_attr['label']='Ontology tree for Lactase activity' g.node_attr['shape']='rectangle' g.layout(prog='dot') g.draw('graph.png') Image("graph.png") print(go_id) refs = do_request(ensembl_server, 'ontology/descendants', go_id) for go in refs: print(go['accession'], go['name'], go['definition']) ================================================ FILE: Chapter05/Getting_Gene.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- import gffutils import gzip from Bio import Seq, SeqIO # ## Retrieving data # !rm -f ag.db # !wget https://vectorbase.org/common/downloads/release-55/AgambiaePEST/gff/data/VectorBase-55_AgambiaePEST.gff -O gambiae.gff # !gzip -9 gambiae.gff db = gffutils.FeatureDB('ag.db') # # Getting a gene gene_id = 'AGAP004707' gene = db[gene_id] print(gene) print(gene.seqid, gene.strand) recs = SeqIO.parse(gzip.open('gambiae.fa.gz', 'rt', encoding='utf-8'), 'fasta') for rec in recs: print(rec.description) if rec.id == gene.seqid: my_seq = rec.seq break # + def get_sequence(chrom_seq, CDSs, strand): seq = Seq.Seq('') for CDS in CDSs: # #FRAME??? my_cds = Seq.Seq(str(chrom_seq[CDS.start - 1: CDS.end])) seq += my_cds return seq if strand == '+' else seq.reverse_complement() # + mRNAs = db.children(gene, featuretype='mRNA') for mRNA in mRNAs: print(mRNA.id) if mRNA.id.endswith('RA'): break CDSs = db.children(mRNA, featuretype='CDS', order_by='start') gene_seq = get_sequence(my_seq, CDSs, gene.strand) print(len(gene_seq), gene_seq) prot = gene_seq.translate() print(len(prot), prot) # - # # Reverse strand reverse_transcript_id = 'AGAP004708-RA' # + reverse_CDSs = db.children(reverse_transcript_id, featuretype='CDS', order_by='start') reverse_seq = get_sequence(my_seq, reverse_CDSs, '-') print(len(reverse_seq), reverse_seq) reverse_prot = reverse_seq.translate() print(len(reverse_prot), reverse_prot) # - ================================================ FILE: Chapter05/Low_Quality.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import gzip import numpy as np import matplotlib.pyplot as plt from Bio import SeqIO, SeqUtils # - # !rm -f atroparvus.fa.gz gambiae.fa.gz 2>/dev/null # !wget https://vectorbase.org/common/downloads/Current_Release/AgambiaePEST/fasta/data/VectorBase-67_AgambiaePEST_Genome.fasta -O gambiae.fa # !gzip -9 gambiae.fa # !wget https://vectorbase.org/common/downloads/Current_Release/AatroparvusEBRO/fasta/data/VectorBase-67_AatroparvusEBRO_Genome.fasta -O atroparvus.fa # !gzip -9 atroparvus.fa gambiae_name = 'gambiae.fa.gz' atroparvus_name = 'atroparvus.fa.gz' recs = SeqIO.parse(gzip.open(gambiae_name, 'rt', encoding='utf-8'), 'fasta') for rec in recs: print(rec.description) #Do not do this with atroparvus recs = SeqIO.parse(gzip.open(gambiae_name, 'rt', encoding='utf-8'), 'fasta') chrom_Ns = {} chrom_sizes = {} for rec in recs: if rec.description.find('supercontig') > -1: continue print(rec.description, rec.id, rec) chrom = rec.id.split('_')[1] if chrom in ['UNKN']:#, 'Y_unplaced']: continue chrom_Ns[chrom] = [] on_N = False curr_size = 0 for 
pos, nuc in enumerate(rec.seq): if nuc in ['N', 'n']: curr_size += 1 on_N = True else: if on_N: chrom_Ns[chrom].append(curr_size) curr_size = 0 on_N = False if on_N: chrom_Ns[chrom].append(curr_size) chrom_sizes[chrom] = len(rec.seq) for chrom, Ns in chrom_Ns.items(): size = chrom_sizes[chrom] if len(Ns) > 0: max_Ns = max(Ns) else: max_Ns = 'NA' print(f'{chrom} ({size}): %Ns ({round(100 * sum(Ns) / size, 1)}), num Ns: {len(Ns)}, max N: {max_Ns}') # ## Atroparvus super-contigs recs = SeqIO.parse(gzip.open(atroparvus_name, 'rt', encoding='utf-8'), 'fasta') sizes = [] size_N = [] for rec in recs: size = len(rec.seq) sizes.append(size) count_N = 0 for nuc in rec.seq: if nuc in ['n', 'N']: count_N += 1 size_N.append((size, count_N / size)) print(len(sizes), np.median(sizes), np.mean(sizes), max(sizes), min(sizes), np.percentile(sizes, 10), np.percentile(sizes, 90)) small_split = 4800 large_split = 540000 fig, axs = plt.subplots(1, 3, figsize=(16, 9), dpi=300, squeeze=False, sharey=True) xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x <= small_split]) axs[0, 0].plot(xs, ys, '.') xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > small_split and x <= large_split]) axs[0, 1].plot(xs, ys, '.') axs[0, 1].set_xlim(small_split, large_split) xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > large_split]) axs[0, 2].plot(xs, ys, '.') axs[0, 0].set_ylabel('Fraction of Ns', fontsize=12) axs[0, 1].set_xlabel('Contig size', fontsize=12) fig.suptitle('Fraction of Ns per contig size', fontsize=26) fig.savefig('frac.png') ================================================ FILE: Chapter05/Orthology.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import requests ensembl_server = 'http://rest.ensembl.org' def do_request(server, service, *args, **kwargs): url_params = '' for a in args: if a is not None: url_params += '/' + a req = requests.get('%s/%s%s' % (server, service, url_params), params=kwargs, headers={'Content-Type': 'application/json'}) if not req.ok: req.raise_for_status() return req.json() # - answer = do_request(ensembl_server, 'info/species') for i, sp in enumerate(answer['species']): print(i, sp['name']) ext_dbs = do_request(ensembl_server, 'info/external_dbs', 'homo_sapiens', filter='HGNC%') print(ext_dbs) answer = do_request(ensembl_server, 'lookup/symbol', 'homo_sapiens', 'LCT') print(answer) lct_id = answer['id'] lct_seq = do_request(ensembl_server, 'sequence/id', lct_id) print(lct_seq) lct_xrefs = do_request(ensembl_server, 'xrefs/id', lct_id) for xref in lct_xrefs: print(xref['db_display_name']) print(xref) refs = do_request(ensembl_server, 'xrefs/id', lct_id, external_db='GO', all_levels='1') print(lct_id, refs) hom_response = do_request(ensembl_server, 'homology/id', lct_id, type='orthologues', sequence='none') #print(hom_response['data'][0]['homologies']) homologies = hom_response['data'][0]['homologies'] for homology in homologies: print(homology['target']['species']) if homology['target']['species'] != 'equus_caballus': continue print(homology) print(homology['taxonomy_level']) horse_id = homology['target']['id'] horse_req = do_request(ensembl_server, 'lookup/id', horse_id) print(horse_req) # + #maybe synteny of MCM6 and LCT with caballus and gorilla ================================================ FILE: Chapter05/Reference_Genome.py 
================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + from IPython.core.display import Image from reportlab.lib import colors from reportlab.lib.units import cm from Bio import SeqIO from Bio.Graphics import BasicChromosome # - # !rm -f PlasmoDB-9.3_Pfalciparum3D7_Genome.fasta 2>/dev/null # vvvv 13.0 # !wget http://plasmodb.org/common/downloads/release-13.0/Pfalciparum3D7/fasta/data/PlasmoDB-13.0_Pfalciparum3D7_Genome.fasta genome_name = 'PlasmoDB-13.0_Pfalciparum3D7_Genome.fasta' recs = SeqIO.parse(genome_name, 'fasta') chroms = {} for rec in recs: print(rec.description) # + from Bio import SeqUtils chrom_sizes = {} chrom_GC = {} recs = SeqIO.parse(genome_name, 'fasta') block_size = 50000 min_GC = 100.0 max_GC = 0.0 for rec in recs: if rec.description.find('SO=chromosome') == -1: continue chrom = int(rec.description.split('_')[1]) chrom_GC[chrom] = [] size = len(rec.seq) chrom_sizes[chrom] = size num_blocks = size // block_size + 1 for block in range(num_blocks): start = block_size * block if block == num_blocks - 1: end = size else: end = block_size + start + 1 block_seq = rec.seq[start:end] block_GC = SeqUtils.GC(block_seq) if block_GC < min_GC: min_GC = block_GC if block_GC > max_GC: max_GC = block_GC chrom_GC[chrom].append(block_GC) print(min_GC, max_GC) # + chroms = list(chrom_sizes.keys()) chroms.sort() biggest_chrom = max(chrom_sizes.values()) my_genome = BasicChromosome.Organism(output_format="png") my_genome.page_size = (29.7*cm, 21*cm) # check telomere_length = 10 bottom_GC = 17.5 top_GC = 22.0 for chrom in chroms: chrom_size = chrom_sizes[chrom] chrom_representation = BasicChromosome.Chromosome('Cr %d' % chrom) chrom_representation.scale_num = biggest_chrom tel = BasicChromosome.TelomereSegment() tel.scale = telomere_length chrom_representation.add(tel) num_blocks = len(chrom_GC[chrom]) for block, gc in enumerate(chrom_GC[chrom]): my_GC = chrom_GC[chrom][block] body = BasicChromosome.ChromosomeSegment() if my_GC > top_GC: body.fill_color = colors.Color(1, 0, 0) elif my_GC < bottom_GC: body.fill_color = colors.Color(1, 1, 0) else: my_color = (my_GC - bottom_GC) / (top_GC - bottom_GC) body.fill_color = colors.Color(my_color, my_color, 1) if block < num_blocks - 1: body.scale = block_size else: body.scale = chrom_size % block_size chrom_representation.add(body) tel = BasicChromosome.TelomereSegment(inverted=True) tel.scale = telomere_length chrom_representation.add(tel) my_genome.add(chrom_representation) my_genome.draw("falciparum.png", "Plasmodium falciparum") Image("falciparum.png") # - ================================================ FILE: Chapter06/.gitignore ================================================ *.log *.ped *.map *.bed *.bim *.fam exclude*.txt relationships_w_pops_041510.txt *.in *.out ================================================ FILE: Chapter06/Admixture.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.3 # kernelspec: # display_name: Python 3 # language: python # name: python3 # --- # + from collections import defaultdict import os import matplotlib.pyplot as plt from genomics.popgen.admix import cluster, plot # %matplotlib notebook # - k_range = range(2, 10) # 2..9 # ### The next cell is 
very slow. Example outputs are provided (so you can avoid running it) # + #for k in k_range: # os.system('admixture --cv=10 hapmap10_auto_noofs_ld.bed %d > admix.%d' % (k, k)) # - # ## Individual order f = open('hapmap10_auto_noofs_ld.fam') ind_order = [] for l in f: toks = l.rstrip().replace(' ', '\t').split('\t') fam_id = toks[0] ind_id = toks[1] ind_order.append((fam_id, ind_id)) f.close() # ## CV-plot CVs = [] for k in k_range: f = open('admix.%d' % k) for l in f: if l.find('CV error') > -1: CVs.append(float(l.rstrip().split(' ')[-1])) break f.close() fig = plt.figure(figsize=(16, 9)) ax = fig.add_subplot(111) ax.plot(k_range, CVs) ax.set_title('Cross-Validation error') ax.set_xlabel('K') # ## Load meta-data f = open('relationships_w_pops_121708.txt') pop_ind = defaultdict(list) f.readline() # header for l in f: toks = l.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] if (fam_id, ind_id) not in ind_order: continue mom = toks[2] dad = toks[3] if mom != '0' or dad != '0': continue pop = toks[-1] pop_ind[pop].append((fam_id, ind_id)) #ind_pop[('2469', 'NA20281')] = ind_pop[('2805', 'NA20281')] f.close() def load_Q(fname, ind_order): ind_comps = {} f = open(fname) for i, l in enumerate(f): comps = [float(x) for x in l.rstrip().split(' ')] ind_comps[ind_order[i]] = comps f.close() return ind_comps comps = {} for k in k_range: comps[k] = load_Q('hapmap10_auto_noofs_ld.%d.Q' % k, ind_order) ordering = {} for k in k_range: ordering[k] = cluster(comps[k], pop_ind) fig = plt.figure(figsize=(9, 9)) plot.single(comps[4], ordering[4], fig) None fig = plt.figure(figsize=(16, 9)) plot.stacked(comps, ordering[7], fig) # ## Q files? # ## Log-likelihood ================================================ FILE: Chapter06/Data_Formats.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # ## Data download # + # !wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz # !wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz # !wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/relationships_w_pops_041510.txt # - # !gzip -d hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz # !gzip -d hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz # # Preparation import os from collections import defaultdict # ## Loading HapMap meta-data f = open('relationships_w_pops_041510.txt') pop_ind = defaultdict(list) f.readline() # header offspring = [] for l in f: toks = l.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] mom = toks[2] dad = toks[3] if mom != '0' or dad != '0': offspring.append((fam_id, ind_id)) pop = toks[-1] pop_ind[pop].append((fam_id, ind_id)) f.close() # ## Sub-sampling os.system('plink2 --pedmap hapmap3_r3_b36_fwd.consensus.qc.poly --out hapmap10 --thin 0.1 --geno 0.1 --export ped') os.system('plink2 --pedmap hapmap3_r3_b36_fwd.consensus.qc.poly --out hapmap1 --thin 0.01 --geno 0.1 --export ped') # ## Getting only autosomal data def get_non_auto_SNPs(map_file, exclude_file): f = open(map_file) w = open(exclude_file, 'w') for l in f: toks = l.rstrip().split('\t') try: chrom = int(toks[0]) except ValueError: rs = toks[1] w.write('%s\n' % rs) w.close() get_non_auto_SNPs('hapmap10.map', 'exclude10.txt') get_non_auto_SNPs('hapmap1.map', 
'exclude1.txt') # !plink2 --pedmap hapmap10 --out hapmap10_auto --exclude exclude10.txt --export ped # !plink2 --pedmap hapmap1 --out hapmap1_auto --exclude exclude1.txt --export ped # ## Removing offspring # !plink2 --pedmap hapmap10_auto --filter-founders --out hapmap10_auto_noofs --export ped # ## LD-pruning # !plink2 --pedmap hapmap10_auto_noofs --indep-pairwise 50 10 0.1 --out keep --export ped # !plink2 --pedmap hapmap10_auto_noofs --extract keep.prune.in --out hapmap10_auto_noofs_ld --export ped # ## Different encoding # !plink2 --pedmap hapmap10_auto_noofs_ld --out hapmap10_auto_noofs_ld_12 --export ped 12 # !plink2 --make-bed --pedmap hapmap10_auto_noofs_ld --out hapmap10_auto_noofs_ld # ## Single chromosome # !plink2 --pedmap hapmap10_auto_noofs --chr 2 --out hapmap10_auto_noofs_2 --export ped ================================================ FILE: Chapter06/Exploratory_Analysis.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # ## Loading HapMap data # + import numpy as np import xarray as xr import sgkit as sg from sgkit.io import plink data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\t') # - data print(data.dims) variant_stats = sg.variant_stats(data) variant_stats variant_stats.variant_call_rate.to_series().describe() print(type(variant_stats.variant_call_rate.to_series())) sample_stats = sg.sample_stats(data) sample_stats sample_stats.sample_call_rate.to_series().hist() data['sample_cohort'] = xr.DataArray( np.zeros(data.dims['samples'], dtype=np.int64), dims='samples') # data["sample_cohort"] = xr.DataArray(np.repeat([0, 1], data.dims["samples"] // 2), dims="samples") sg.cohort_allele_frequencies(data)['cohort_allele_frequency'][:,:,0].values sg.cohort_allele_frequencies(data)['cohort_allele_frequency'][:,:,0].to_series().hist() # # MAF cohort_allele_frequency = sg.cohort_allele_frequencies(data)['cohort_allele_frequency'].values min_freqs = map( lambda x: x if x < 0.5 else 1 - x, filter( lambda x: x not in [0, 1], cohort_allele_frequency[:, 0, 0])) ================================================ FILE: Chapter06/PCA.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.3 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + jupyter={"outputs_hidden": false} import os from genomics.popgen.plink.convert import to_eigen from genomics.popgen.pca import plot, smart # %matplotlib inline # - # ## Meta-data load # + jupyter={"outputs_hidden": false} f = open('relationships_w_pops_121708.txt') ind_pop = {} f.readline() # header for l in f: toks = l.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] pop = toks[-1] ind_pop['/'.join([fam_id, ind_id])] = pop f.close() ind_pop['2469/NA20281'] = ind_pop['2805/NA20281'] # - # ## Requires plink from data preparation # + jupyter={"outputs_hidden": false} to_eigen('hapmap10_auto_noofs_ld_12', 'hapmap10_auto_noofs_ld_12') # - # ## Running smartpca # + jupyter={"outputs_hidden": false} ctrl = smart.SmartPCAController('hapmap10_auto_noofs_ld_12') ctrl.run() # + jupyter={"outputs_hidden": false} wei, wei_perc, ind_comp = smart.parse_evec('hapmap10_auto_noofs_ld_12.evec', 'hapmap10_auto_noofs_ld_12.eval') # + 
jupyter={"outputs_hidden": false} plot.render_pca(ind_comp, 1, 2, cluster=ind_pop) #put weights # + jupyter={"outputs_hidden": false} plot.render_pca_eight(ind_comp, cluster=ind_pop) # + jupyter={"outputs_hidden": false} markers = { 'CHB': '*', 'CHD': '*', 'JPT': '*', 'GIH': '*', 'CEU': 'v', 'TSI': 'v', 'MEX': 'v', 'ASW': 'o', 'LWK': 'o', 'YRI': 'o', 'MKK': 'o' } # - # ## With scikit-learn # + jupyter={"outputs_hidden": false} from sklearn.decomposition import PCA import numpy as np # + jupyter={"outputs_hidden": false} f = open('hapmap10_auto_noofs_ld_12.ped') ninds = 0 ind_order = [] for line in f: ninds += 1 toks = line[:100].replace(' ', '\t').split('\t') # for speed fam_id = toks[0] ind_id = toks[1] ind_order.append('%s/%s' % (fam_id, ind_id)) nsnps = (len(line.replace(' ', '\t').split('\t')) - 6) // 2 print(nsnps) f.close() # + jupyter={"outputs_hidden": false} pca_array = np.empty((ninds, nsnps), dtype=int) print(pca_array.shape) f = open('hapmap10_auto_noofs_ld_12.ped') for ind, line in enumerate(f): snps = line.replace(' ', '\t').split('\t')[6:] for pos in range(len(snps) // 2): a1 = int(snps[2 * pos]) a2 = int(snps[2 * pos + 1]) my_code = a1 + a2 - 2 pca_array[ind, pos] = my_code f.close() #slow # + jupyter={"outputs_hidden": false} my_pca = PCA(n_components=8) my_pca.fit(pca_array) trans = my_pca.transform(pca_array) #Memory required # + jupyter={"outputs_hidden": false} sc_ind_comp = {} for i, ind_pca in enumerate(trans): sc_ind_comp[ind_order[i]] = ind_pca plot.render_pca_eight(sc_ind_comp, cluster=ind_pop) # + jupyter={"outputs_hidden": false} ================================================ FILE: Chapter06/Pop_Stats.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # ## Loading HapMap meta-data # + from collections import defaultdict from pprint import pprint import numpy as np import matplotlib.pyplot as plt import seaborn as sns import pandas as pd import xarray as xr import sgkit as sg from sgkit.io import plink data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\t') # - data f = open('relationships_w_pops_041510.txt') pop_ind = defaultdict(list) f.readline() # header for line in f: toks = line.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] pop = toks[-1] pop_ind[pop].append((fam_id, ind_id)) pops = list(pop_ind.keys()) def assign_cohort(pops, pop_ind, sample_family_id, sample_id): cohort = [] for fid, sid in zip(sample_family_id, sample_id): processed = False for i, pop in enumerate(pops): if (fid, sid) in pop_ind[pop]: processed = True cohort.append(i) break if not processed: raise Exception(f'Not processed {fid}, {sid}') return cohort cohort = assign_cohort(pops, pop_ind, data.sample_family_id.values, data.sample_id.values) data['sample_cohort'] = xr.DataArray( cohort, dims='samples') # # monomorphic positions per pop cohort_allele_frequency = sg.cohort_allele_frequencies(data)['cohort_allele_frequency'].values monom = {} for i, pop in enumerate(pops): monom[pop] = len(list(filter(lambda x: x, np.isin(cohort_allele_frequency[:, i, 0], [0, 1])))) pprint(monom) # # MAF mafs = {} for i, pop in enumerate(pops): min_freqs = map( lambda x: x if x < 0.5 else 1 - x, filter( lambda x: x not in [0, 1], cohort_allele_frequency[:, i, 0])) mafs[pop] = pd.Series(min_freqs) maf_plot, maf_ax = plt.subplots(nrows=2, sharey=True) 
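# Aside (not part of the original recipe): the map/filter pipeline above can
# also be written with NumPy; demo_freqs is a made-up array of per-variant
# allele frequencies used only for illustration.
demo_freqs = np.array([0.0, 0.1, 0.8, 1.0])
demo_mafs = np.minimum(demo_freqs, 1 - demo_freqs)  # fold frequencies around 0.5
demo_mafs = demo_mafs[(demo_freqs != 0) & (demo_freqs != 1)]  # drop monomorphic sites
print(demo_mafs)  # [0.1 0.2]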
mafs['YRI'].hist(ax=maf_ax[0], bins=50) maf_ax[0].set_title('*YRI*') mafs['JPT'].hist(ax=maf_ax[1], bins=50) maf_ax[1].set_title('*JPT*') maf_ax[1].set_xlabel('MAF') # # Fst fst = sg.Fst(data) fst = fst.assign_coords({"cohorts_0": pops, "cohorts_1": pops}) remove_nan = lambda data: filter(lambda x: not np.isnan(x), data) ceu_chb = pd.Series(remove_nan(fst.stat_Fst.sel(cohorts_0='CEU', cohorts_1='CHB').values)) chb_chd = pd.Series(remove_nan(fst.stat_Fst.sel(cohorts_0='CHB', cohorts_1='CHD').values)) ceu_chb.describe() chb_chd.describe() mean_fst = {} for i, pop_i in enumerate(pops): for j, pop_j in enumerate(pops): if j <= i: continue pair_fst = pd.Series(remove_nan(fst.stat_Fst.sel(cohorts_0=pop_i, cohorts_1=pop_j).values)) mean = pair_fst.mean() mean_fst[(pop_i, pop_j)] = mean min_pair = min(mean_fst.values()) max_pair = max(mean_fst.values()) sns.set_style("white") num_pops = len(pops) arr = np.ones((num_pops - 1, num_pops - 1, 3), dtype=float) fig = plt.figure(figsize=(16, 9)) ax = fig.add_subplot(111) for row in range(num_pops - 1): pop_i = pops[row] for col in range(row + 1, num_pops): pop_j = pops[col] val = mean_fst[(pop_i, pop_j)] norm_val = (val - min_pair) / (max_pair - min_pair) ax.text(col - 1, row, '%.3f' % val, ha='center') if norm_val == 0.0: arr[row, col - 1, 0] = 1 arr[row, col - 1, 1] = 1 arr[row, col - 1, 2] = 0 elif norm_val == 1.0: arr[row, col - 1, 0] = 1 arr[row, col - 1, 1] = 0 arr[row, col - 1, 2] = 1 else: arr[row, col - 1, 0] = 1 - norm_val arr[row, col - 1, 1] = 1 arr[row, col - 1, 2] = 1 ax.imshow(arr, interpolation='none') ax.set_title('Multilocus Pairwise FST') ax.set_xticks(range(num_pops - 1)) ax.set_xticklabels(pops[1:]) ax.set_yticks(range(num_pops - 1)) ax.set_yticklabels(pops[:-1]) ================================================ FILE: Chapter06/Sgkit.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- import os from collections import defaultdict # ## Loading HapMap data # + import numpy as np from sgkit.io import plink data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\t') # - data print(data.dims) print(len(data.sample_id.values)) print(data.sample_id.values) print(data.sample_family_id.values) print(data.sample_sex.values) print(data.contigs) print(len(data.variant_contig.values)) print(data.variant_contig.values) print(data.variant_position.values) print(data.variant_allele.values) print(data.variant_id.values) data.call_genotype call_genotype = data.call_genotype.values print(call_genotype.shape) first_individual = call_genotype[:,0,:] first_variant = call_genotype[0,:,:] first_variant_of_first_individual = call_genotype[0,0,:] print(first_variant_of_first_individual) print(data.sample_family_id.values[0], data.sample_id.values[0]) print(data.variant_allele.values[0]) ================================================ FILE: Chapter07/.gitignore ================================================ *fasta trim.fasta.reduced *nex bp_rx ================================================ FILE: Chapter07/Alignment.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import os import 
dendropy # - # ## Genome alignment from Bio.Align.Applications import MafftCommandline mafft_cline = MafftCommandline(input='sample.fasta', ep=0.123, reorder=True, maxiterate=1000, localpair=True) print(mafft_cline) stdout, stderr = mafft_cline() with open('align.fasta', 'w') as w: w.write(stdout) os.system('trimal -automated1 -in align.fasta -out trim.fasta -fasta') # ## Protein alignment # + from Bio.Align.Applications import MuscleCommandline my_genes = ['NP', 'L', 'VP35', 'VP40'] for gene in my_genes: muscle_cline = MuscleCommandline(input='%s_P.fasta' % gene) print(muscle_cline) stdout, stderr = muscle_cline() with open('%s_P_align.fasta' % gene, 'w') as w: w.write(stdout) # + from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord # XXX vvv # from Bio.Alphabet import generic_protein for gene in my_genes: gene_seqs = {} unal_gene = SeqIO.parse('%s.fasta' % gene, 'fasta') for rec in unal_gene: gene_seqs[rec.id] = rec.seq al_prot = SeqIO.parse('%s_P_align.fasta' % gene, 'fasta') al_genes = [] for protein in al_prot: my_id = protein.id seq = '' pos = 0 for c in protein.seq: if c == '-': seq += '---' else: seq += str(gene_seqs[my_id][pos:pos + 3]) pos += 3 al_genes.append(SeqRecord(Seq(seq), id=my_id)) SeqIO.write(al_genes, '%s_align.fasta' % gene, 'fasta') # - ================================================ FILE: Chapter07/Comparison.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.6 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import os from collections import OrderedDict import numpy as np import pandas as pd import dendropy from dendropy.calculate import popgenstat # - # ## Genes # + genes_species = OrderedDict() my_species = ['RESTV', 'SUDV'] my_genes = ['NP', 'L', 'VP35', 'VP40'] for name in my_genes: gene_name = name.split('.')[0] char_mat = dendropy.DnaCharacterMatrix.get_from_path('%s_align.fasta' % name, 'fasta') genes_species[gene_name] = {} for species in my_species: genes_species[gene_name][species] = dendropy.DnaCharacterMatrix() for taxon, char_map in char_mat.items(): species = taxon.label.split('_')[0] if species in my_species: genes_species[gene_name][species].taxon_namespace.add_taxon(taxon) genes_species[gene_name][species][taxon] = char_map # - summary = np.ndarray(shape=(len(genes_species), 4 * len(my_species))) stats = ['seg_sites', 'nuc_div', 'taj_d', 'wat_theta'] for row, (gene, species_data) in enumerate(genes_species.items()): for col_base, species in enumerate(my_species): summary[row, col_base * 4] = popgenstat.num_segregating_sites(species_data[species]) summary[row, col_base * 4 + 1] = popgenstat.nucleotide_diversity(species_data[species]) summary[row, col_base * 4 + 2] = popgenstat.tajimas_d(species_data[species]) summary[row, col_base * 4 + 3] = popgenstat.wattersons_theta(species_data[species]) columns = [] for species in my_species: columns.extend(['%s (%s)' % (stat, species) for stat in stats]) df = pd.DataFrame(summary, index=genes_species.keys(), columns=columns) df # vs print(df) # ## Genomes def do_basic_popgen(seqs): num_seg_sites = popgenstat.num_segregating_sites(seqs) avg_pair = popgenstat.average_number_of_pairwise_differences(seqs) nuc_div = popgenstat.nucleotide_diversity(seqs) print('Segregating sites: %d, Avg pairwise diffs: %.2f, Nucleotide diversity %.6f' % (num_seg_sites, avg_pair, nuc_div)) print("Watterson's theta: %s" 
% popgenstat.wattersons_theta(seqs)) print("Tajima's D: %s" % popgenstat.tajimas_d(seqs)) #XXX change ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path( 'trim.fasta', schema='fasta', data_type='dna') sl_2014 = [] drc_2007 = [] ebov2007_set = dendropy.DnaCharacterMatrix() ebov2014_set = dendropy.DnaCharacterMatrix() for taxon, char_map in ebov_seqs.items(): print(taxon.label) if taxon.label.startswith('EBOV_2014') and len(sl_2014) < 8: sl_2014.append(char_map) ebov2014_set.taxon_namespace.add_taxon(taxon) ebov2014_set[taxon] = char_map elif taxon.label.startswith('EBOV_2007'): drc_2007.append(char_map) ebov2007_set.taxon_namespace.add_taxon(taxon) ebov2007_set[taxon] = char_map #ebov2007_set.extend_map({taxon: char_map}) del ebov_seqs # + print('2007 outbreak:') print('Number of individuals: %s' % len(ebov2007_set.taxon_namespace)) do_basic_popgen(ebov2007_set) print('\n2014 outbreak:') print('Number of individuals: %s' % len(ebov2014_set.taxon_namespace)) do_basic_popgen(ebov2014_set) # - print(len(sl_2014)) print(len(drc_2007)) pair_stats = popgenstat.PopulationPairSummaryStatistics(sl_2014, drc_2007) print('Average number of pairwise differences irrespective of population: %.2f' % pair_stats.average_number_of_pairwise_differences) print('Average number of pairwise differences between populations: %.2f' % pair_stats.average_number_of_pairwise_differences_between) print('Average number of pairwise differences within populations: %.2f' % pair_stats.average_number_of_pairwise_differences_within) print('Average number of net pairwise differences : %.2f' % pair_stats.average_number_of_pairwise_differences_net) print('Number of segregating sites: %d' % pair_stats.num_segregating_sites) print("Watterson's theta: %.2f" % pair_stats.wattersons_theta) print("Wakeley's Psi: %.3f" % pair_stats.wakeleys_psi) print("Tajima's D: %.2f" % pair_stats.tajimas_d) ================================================ FILE: Chapter07/Exploration.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.6 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- import dendropy from dendropy.interop import genbank # ## Getting the data # + def get_ebov_2014_sources(): #EBOV_2014 #yield 'EBOV_2014', genbank.GenBankDna(id_range=(233036, 233118), prefix='KM') yield 'EBOV_2014', genbank.GenBankDna(id_range=(34549, 34563), prefix='KM0') def get_other_ebov_sources(): #EBOV other yield 'EBOV_1976', genbank.GenBankDna(ids=['AF272001', 'KC242801']) yield 'EBOV_1995', genbank.GenBankDna(ids=['KC242796', 'KC242799']) yield 'EBOV_2007', genbank.GenBankDna(id_range=(84, 90), prefix='KC2427') def get_other_ebolavirus_sources(): #BDBV yield 'BDBV', genbank.GenBankDna(id_range=(3, 6), prefix='KC54539') yield 'BDBV', genbank.GenBankDna(ids=['FJ217161']) #RESTV yield 'RESTV', genbank.GenBankDna(ids=['AB050936', 'JX477165', 'JX477166', 'FJ621583', 'FJ621584', 'FJ621585']) #SUDV yield 'SUDV', genbank.GenBankDna(ids=['KC242783', 'AY729654', 'EU338380', 'JN638998', 'FJ968794', 'KC589025', 'JN638998']) #yield 'SUDV', genbank.GenBankDna(id_range=(89, 92), prefix='KC5453') #TAFV yield 'TAFV', genbank.GenBankDna(ids=['FJ217162']) # + other = open('other.fasta', 'w') sampled = open('sample.fasta', 'w') for species, recs in get_other_ebolavirus_sources(): tn = dendropy.TaxonNamespace() char_mat = recs.generate_char_matrix(taxon_namespace=tn, gb_to_taxon_fn=lambda gb: 
tn.require_taxon(label='%s_%s' % (species, gb.accession))) char_mat.write_to_stream(other, 'fasta') char_mat.write_to_stream(sampled, 'fasta') other.close() ebov_2014 = open('ebov_2014.fasta', 'w') ebov = open('ebov.fasta', 'w') for species, recs in get_ebov_2014_sources(): tn = dendropy.TaxonNamespace() char_mat = recs.generate_char_matrix(taxon_namespace=tn, gb_to_taxon_fn=lambda gb: tn.require_taxon(label='EBOV_2014_%s' % gb.accession)) char_mat.write_to_stream(ebov_2014, 'fasta') char_mat.write_to_stream(sampled, 'fasta') char_mat.write_to_stream(ebov, 'fasta') ebov_2014.close() ebov_2007 = open('ebov_2007.fasta', 'w') for species, recs in get_other_ebov_sources(): tn = dendropy.TaxonNamespace() char_mat = recs.generate_char_matrix(taxon_namespace=tn, gb_to_taxon_fn=lambda gb: tn.require_taxon(label='%s_%s' % (species, gb.accession))) char_mat.write_to_stream(ebov, 'fasta') char_mat.write_to_stream(sampled, 'fasta') if species == 'EBOV_2007': char_mat.write_to_stream(ebov_2007, 'fasta') ebov.close() ebov_2007.close() sampled.close() # - # ## Genes # + my_genes = ['NP', 'L', 'VP35', 'VP40'] def dump_genes(species, recs, g_hdls, p_hdls): for rec in recs: for feature in rec.feature_table: if feature.key == 'CDS': gene_name = None for qual in feature.qualifiers: if qual.name == 'gene': if qual.value in my_genes: gene_name = qual.value elif qual.name == 'translation': protein_translation = qual.value if gene_name is not None: locs = feature.location.split('.') start, end = int(locs[0]), int(locs[-1]) g_hdls[gene_name].write('>%s_%s\n' % (species, rec.accession)) p_hdls[gene_name].write('>%s_%s\n' % (species, rec.accession)) g_hdls[gene_name].write('%s\n' % rec.sequence_text[start - 1 : end]) p_hdls[gene_name].write('%s\n' % protein_translation) g_hdls = {} p_hdls = {} for gene in my_genes: g_hdls[gene] = open('%s.fasta' % gene, 'w') p_hdls[gene] = open('%s_P.fasta' % gene, 'w') for species, recs in get_other_ebolavirus_sources(): if species in ['RESTV', 'SUDV']: dump_genes(species, recs, g_hdls, p_hdls) for gene in my_genes: g_hdls[gene].close() p_hdls[gene].close() # - # ## Genome exploration def describe_seqs(seqs): print('Number of sequences: %d' % len(seqs.taxon_namespace)) print('First 10 taxon sets: %s' % ' '.join([taxon.label for taxon in seqs.taxon_namespace[:10]])) lens = [] for tax, seq in seqs.items(): lens.append(len([x for x in seq.symbols_as_list() if x != '-'])) print('Genome length: min %d, mean %.1f, max %d' % (min(lens), sum(lens) / len(lens), max(lens))) ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path('ebov.fasta', schema='fasta', data_type='dna') print('EBOV') describe_seqs(ebov_seqs) del ebov_seqs print('ebolavirus sequences') ebolav_seqs = dendropy.DnaCharacterMatrix.get_from_path('other.fasta', schema='fasta', data_type='dna') describe_seqs(ebolav_seqs) from collections import defaultdict species = defaultdict(int) for taxon in ebolav_seqs.taxon_namespace: toks = taxon.label.split('_') my_species = toks[0] if my_species == 'EBOV': ident = '%s (%s)' % (my_species, toks[1]) else: ident = my_species species[ident] += 1 for my_species, cnt in species.items(): print("%20s: %d" % (my_species, cnt)) del ebolav_seqs # ## Genes # + import os gene_length = {} my_genes = ['NP', 'L', 'VP35', 'VP40'] for name in my_genes: gene_name = name.split('.')[0] seqs = dendropy.DnaCharacterMatrix.get_from_path('%s.fasta' % name, schema='fasta', data_type='dna') gene_length[gene_name] = [] for tax, seq in seqs.items(): gene_length[gene_name].append(len([x for x in seq.symbols_as_list() 
# ## Genome exploration

def describe_seqs(seqs):
    print('Number of sequences: %d' % len(seqs.taxon_namespace))
    print('First 10 taxon sets: %s' % ' '.join([taxon.label for taxon in seqs.taxon_namespace[:10]]))
    lens = []
    for tax, seq in seqs.items():
        lens.append(len([x for x in seq.symbols_as_list() if x != '-']))
    print('Genome length: min %d, mean %.1f, max %d' % (min(lens), sum(lens) / len(lens), max(lens)))

ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path('ebov.fasta', schema='fasta', data_type='dna')
print('EBOV')
describe_seqs(ebov_seqs)
del ebov_seqs

print('ebolavirus sequences')
ebolav_seqs = dendropy.DnaCharacterMatrix.get_from_path('other.fasta', schema='fasta', data_type='dna')
describe_seqs(ebolav_seqs)

from collections import defaultdict
species = defaultdict(int)
for taxon in ebolav_seqs.taxon_namespace:
    toks = taxon.label.split('_')
    my_species = toks[0]
    if my_species == 'EBOV':
        ident = '%s (%s)' % (my_species, toks[1])
    else:
        ident = my_species
    species[ident] += 1
for my_species, cnt in species.items():
    print("%20s: %d" % (my_species, cnt))
del ebolav_seqs

# ## Genes

# +
import os

gene_length = {}
my_genes = ['NP', 'L', 'VP35', 'VP40']
for name in my_genes:
    gene_name = name.split('.')[0]
    seqs = dendropy.DnaCharacterMatrix.get_from_path('%s.fasta' % name, schema='fasta', data_type='dna')
    gene_length[gene_name] = []
    for tax, seq in seqs.items():
        gene_length[gene_name].append(len([x for x in seq.symbols_as_list() if x != '-']))
for gene, lens in gene_length.items():
    print('%6s: %.1f' % (gene, sum(lens) / len(lens)))  # mean length is a float
# -

================================================
FILE: Chapter07/Reconstruction.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.6
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# +
import os
import random
import shutil
import sys

import dendropy
from dendropy.interop import raxml
# -

ebola_data = dendropy.DnaCharacterMatrix.get_from_path('trim.fasta', 'fasta')
rx = raxml.RaxmlRunner()
ebola_tree = rx.estimate_tree(ebola_data, ['-m', 'GTRGAMMA', '-N', '10'])
print('RAxML temporary directory: %s' % rx.working_dir_path)
del ebola_data

ebola_tree.write_to_path('my_ebola.nex', 'nexus')

# +
import matplotlib.pyplot as plt
from Bio import Phylo

# # %matplotlib inline

my_ebola_tree = Phylo.read('my_ebola.nex', 'nexus')
my_ebola_tree.name = 'Our Ebolavirus tree'

fig = plt.figure(figsize=(16, 18))
ax = fig.add_subplot(1, 1, 1)
Phylo.draw(my_ebola_tree, axes=ax)
# -

# ## RAxML with Biopython

# XXX change
from Bio.Phylo.Applications import RaxmlCommandline
raxml_cline = RaxmlCommandline(sequences='trim.fasta', model='GTRGAMMA', name='biopython',
                               num_replicates='10',
                               parsimony_seed=random.randint(0, sys.maxsize),
                               working_dir=os.getcwd() + os.sep + 'bp_rx')
print(raxml_cline)
try:
    os.mkdir('bp_rx')
except OSError:
    shutil.rmtree('bp_rx')
    os.mkdir('bp_rx')
out, err = raxml_cline()

from Bio import Phylo
biopython_tree = Phylo.read('bp_rx/RAxML_bestTree.biopython', 'newick')
print(biopython_tree)
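# +
# Bio.Phylo.Applications (used above) is deprecated in recent Biopython releases.
# A minimal sketch of the same run driven by subprocess instead; the raxmlHPC
# binary name and its presence on the PATH are assumptions, not something this
# recipe installs:
import subprocess

raxml_args = [
    'raxmlHPC',
    '-m', 'GTRGAMMA',
    '-N', '10',
    '-p', str(random.randint(0, sys.maxsize)),
    '-s', 'trim.fasta',
    '-n', 'subprocess_run',   # hypothetical run name
    '-w', os.getcwd() + os.sep + 'bp_rx',
]
# subprocess.run(raxml_args, check=True)  # uncomment to actually execute
# -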
================================================
FILE: Chapter07/Selection.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# +
### XXX This is probably to remove
# -

import dendropy
from dendropy.calculate import popgenstat

# The code below assumed ebola_seqs was already in memory; loading it here from the
# FASTA written by the Exploration recipe (an assumption about the intended input)
ebola_seqs = dendropy.DnaCharacterMatrix.get_from_path('ebov.fasta', schema='fasta', data_type='dna')

sl_2014 = []
drc_2007 = []
for seq in ebola_seqs.taxon_namespace:
    if seq.label.startswith('EBOV_2014') and len(sl_2014) < 8:
        sl_2014.append(ebola_seqs[seq])
    elif seq.label.startswith('EBOV_2007'):
        drc_2007.append(ebola_seqs[seq])
print(len(sl_2014))
print(len(drc_2007))

pair_stats = popgenstat.PopulationPairSummaryStatistics(sl_2014, drc_2007)

print('Average number of pairwise differences (total): %s' % pair_stats.average_number_of_pairwise_differences)
print('Average number of pairwise differences between populations: %s' % pair_stats.average_number_of_pairwise_differences_between)
print('Average number of pairwise differences within populations: %s' % pair_stats.average_number_of_pairwise_differences_within)
print('Average number of net pairwise differences: %s' % pair_stats.average_number_of_pairwise_differences_net)
print('Number of segregating sites: %s' % pair_stats.num_segregating_sites)
print("Watterson's theta: %s" % pair_stats.wattersons_theta)
print("Wakeley's Psi: %s" % pair_stats.wakeleys_psi)
print("Tajima's D: %s" % pair_stats.tajimas_d)

================================================
FILE: Chapter07/Trees.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.6
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

import dendropy

ebola_raxml = dendropy.Tree.get_from_path('my_ebola.nex', 'nexus')

# +
def compute_level(node, level=0):
    for child in node.child_nodes():
        compute_level(child, level + 1)
    if node.taxon is not None:
        print("%s: %d %d" % (node.taxon, node.level(), level))

compute_level(ebola_raxml.seed_node)

# +
def compute_height(node):
    children = node.child_nodes()
    if len(children) == 0:
        height = 0
    else:
        height = 1 + max(map(lambda x: compute_height(x), children))
    desc = node.taxon or 'Internal'
    print("%s: %d %d" % (desc, height, node.level()))
    return height

compute_height(ebola_raxml.seed_node)

# +
def compute_nofs(node):
    children = node.child_nodes()
    nofs = len(children)
    for child in children:  # map is lazy in Python 3, so recurse explicitly
        compute_nofs(child)
    desc = node.taxon or 'Internal'
    print("%s: %d %d" % (desc, nofs, node.level()))

compute_nofs(ebola_raxml.seed_node)

# +
def print_nodes(node):
    for child in node.child_nodes():
        print_nodes(child)
    if node.taxon is not None:
        print('%s (%d)' % (node.taxon, node.level()))

print_nodes(ebola_raxml.seed_node)

# +
from collections import deque

def print_breadth(tree):
    queue = deque()
    queue.append(tree.seed_node)
    while len(queue) > 0:
        process_node = queue.popleft()
        if process_node.taxon is not None:
            print('%s (%d)' % (process_node.taxon, process_node.level()))
        else:
            for child in process_node.child_nodes():
                queue.append(child)

print_breadth(ebola_raxml)

# +
from copy import deepcopy

simple_ebola = deepcopy(ebola_raxml)

def simplify_tree(node):
    prefs = set()
    for leaf in node.leaf_nodes():
        my_toks = leaf.taxon.label.split(' ')[0].split('_')
        if my_toks[0] == 'EBOV':
            prefs.add('EBOV' + my_toks[1])
        else:
            prefs.add(my_toks[0])
    if len(prefs) == 1:
        print(prefs, len(node.leaf_nodes()))
        node.taxon = dendropy.Taxon(label=list(prefs)[0])
        #node.collapse_clade()
        node.set_child_nodes([])
    else:
        for child in node.child_nodes():
            simplify_tree(child)

simplify_tree(simple_ebola.seed_node)
simple_ebola.ladderize()
simple_ebola.write_to_path('ebola_simple.nex', 'nexus')
# -

================================================
FILE: Chapter07/Visualization.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.6
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

from copy import deepcopy
import matplotlib.pyplot as plt
from Bio import Phylo

ebola_tree = Phylo.read('my_ebola.nex', 'nexus')
ebola_tree.name = 'Ebolavirus tree'

ebola_simple_tree = Phylo.read('ebola_simple.nex', 'nexus')
ebola_simple_tree.name = 'Ebolavirus simplified tree'

Phylo.draw_ascii(ebola_simple_tree)

Phylo.draw_ascii(ebola_tree)

fig = plt.figure(figsize=(16, 22))
ax = fig.add_subplot(111)
Phylo.draw(ebola_simple_tree, axes=ax,
           branch_labels=lambda c: c.branch_length if c.branch_length > 0.02 else None)

# +
fig = plt.figure(figsize=(16, 22))
ax = fig.add_subplot(111)

from collections import OrderedDict
my_colors = OrderedDict({
    'EBOV_2014': 'red',
    'EBOV': 'magenta',
    'BDBV': 'cyan',
    'SUDV': 'blue',
    'RESTV': 'green',
    'TAFV': 'yellow'
})

def get_color(name):
    for pref, color in my_colors.items():
        if name.find(pref) > -1:
            return color
    return 'grey'

def color_tree(node, fun_color=get_color):
    if node.is_terminal():
        node.color = fun_color(node.name)
    else:
        my_children = set()
        for child in node.clades:
            color_tree(child, fun_color)
            my_children.add(child.color.to_hex())
        if len(my_children) == 1:
            node.color = child.color
        else:
            node.color = 'grey'

ebola_color_tree = deepcopy(ebola_tree)
color_tree(ebola_color_tree.root) Phylo.draw(ebola_color_tree, axes=ax, label_func= lambda x: x.name.split(' ')[0][1:] if x.name is not None else None) # - ================================================ FILE: Chapter08/.gitignore ================================================ *ent *fasta ================================================ FILE: Chapter08/Distance.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import math import timeit from Bio import PDB # - repository = PDB.PDBList() parser = PDB.PDBParser() repository.retrieve_pdb_file('1TUP', file_format='pdb', pdir='.') # XXX p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent') zns = [] for atom in p53_1tup.get_atoms(): if atom.element == 'ZN': #print(atom, dir(atom), atom.mass, atom.element, atom.coord[0]) zns.append(atom) for zn in zns: print(zn, zn.coord) # + #Suggest a pymol viewing # - #Try this in numba? def get_closest_atoms(pdb_struct, ref_atom, distance): atoms = {} rx, ry, rz = ref_atom.coord for atom in pdb_struct.get_atoms(): if atom == ref_atom: continue x, y, z = atom.coord my_dist = math.sqrt((x - rx)**2 + (y - ry)**2 + (z - rz)**2) if my_dist < distance: atoms[atom] = my_dist return atoms for zn in zns: print() print(zn.coord) atoms = get_closest_atoms(p53_1tup, zn, 4) for atom, distance in atoms.items(): print(atom.element, distance, atom.coord) for distance in [1, 2, 4, 8, 16, 32, 64, 128]: my_atoms = [] for zn in zns: atoms = get_closest_atoms(p53_1tup, zn, distance) my_atoms.append(len(atoms)) print(distance, my_atoms) nexecs = 10 print(timeit.timeit('get_closest_atoms(p53_1tup, zns[0], 4.0)', 'from __main__ import get_closest_atoms, p53_1tup, zns', number=nexecs) / nexecs * 1000) def get_closest_alternative(pdb_struct, ref_atom, distance): atoms = {} rx, ry, rz = ref_atom.coord for atom in pdb_struct.get_atoms(): if atom == ref_atom: continue x, y, z = atom.coord if abs(x - rx) > distance or abs(y - ry) > distance or abs(z - rz) > distance: continue my_dist = math.sqrt((x - rx)**2 + (y - ry)**2 + (z - rz)**2) if my_dist < distance: atoms[atom] = my_dist return atoms print(timeit.timeit('get_closest_alternative(p53_1tup, zns[0], 4.0)', 'from __main__ import get_closest_alternative, p53_1tup, zns', number=nexecs) / nexecs * 1000) print('Standard') for distance in [1, 4, 16, 64, 128]: print(timeit.timeit('get_closest_atoms(p53_1tup, zns[0], distance)', 'from __main__ import get_closest_atoms, p53_1tup, zns, distance', number=nexecs) / nexecs * 1000) print('Optimized') for distance in [1, 4, 16, 64, 128]: print(timeit.timeit('get_closest_alternative(p53_1tup, zns[0], distance)', 'from __main__ import get_closest_alternative, p53_1tup, zns, distance', number=nexecs) / nexecs * 1000) # + #for interesting distances ================================================ FILE: Chapter08/Intro.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + from collections import defaultdict import requests from Bio import ExPASy, SwissProt # - #explain why not biopython server = 'https://rest.uniprot.org/uniprotkb/search' def do_request(server, **kwargs): params = '' 
req = requests.get(server, params=kwargs) if not req.ok: req.raise_for_status() return req req = do_request(server, # 1. Filtering human p53, reviewed entries query='gene:p53 AND reviewed:true AND organism_id:9606', format='tsv', # 2. Specifying output columns with REST API field names fields='accession,id,protein_name,gene_names,organism_name,length', size=50 ) print(req.text) #We might revisit this for KEGG # + #XXX - stringio import pandas as pd import io uniprot_list = pd.read_table(io.StringIO(req.text)) uniprot_list.rename(columns={'Organism ID': 'ID'}, inplace=True) print(uniprot_list) # - p53_human = uniprot_list[ (uniprot_list.Entry == 'P04637') & (uniprot_list['Entry Name'].str.contains('P53_HUMAN'))]['Entry'].iloc[0] handle = ExPASy.get_sprot_raw(p53_human) sp_rec = SwissProt.read(handle) print(sp_rec.entry_name, sp_rec.sequence_length, sp_rec.gene_name) print(sp_rec.description) print(sp_rec.organism, sp_rec.seqinfo) print(sp_rec.sequence) print(sp_rec.comments) print(sp_rec.keywords) help(sp_rec) done_features = set() print('Total features:', len(sp_rec.features)) for feature in sp_rec.features: if feature in done_features: continue else: done_features.add(feature) print(feature) print('Cross references: ',len(sp_rec.cross_references)) per_source = defaultdict(list) for xref in sp_rec.cross_references: source = xref[0] per_source[source].append(xref[1:]) print(per_source.keys()) done_GOs = set() print('Annotation SOURCES:', len(per_source['GO'])) for annot in per_source['GO']: if annot[1][0] in done_GOs: continue else: done_GOs.add(annot[1][0]) print(annot) ================================================ FILE: Chapter08/Mass.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + import numpy as np import pandas as pd from Bio import PDB # + # #!rm -f 1tup.cif 2>/dev/null # #!wget "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=cif&compression=NO&structureId=1TUP" -O 1tup.cif #parser = PDB.MMCIFParser() #p53_1tup = parser.get_structure('P53', '1tup.cif') # - repository = PDB.PDBList() parser = PDB.PDBParser() repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent') my_residues = set() for residue in p53_1tup.get_residues(): my_residues.add(residue.id[0]) print(my_residues) # + def get_mass(atoms, accept_fun=lambda atom: atom.parent.id[0] != 'W'): return sum([atom.mass for atom in atoms if accept_fun(atom)]) chain_names = [chain.id for chain in p53_1tup.get_chains()] my_mass = np.ndarray((len(chain_names), 3)) for i, chain in enumerate(p53_1tup.get_chains()): my_mass[i, 0] = get_mass(chain.get_atoms()) my_mass[i, 1] = get_mass(chain.get_atoms(), accept_fun=lambda atom: atom.parent.id[0] not in [' ', 'W']) my_mass[i, 2] = get_mass(chain.get_atoms(), accept_fun=lambda atom: atom.parent.id[0] == 'W') masses = pd.DataFrame(my_mass, index=chain_names, columns=['No Water', 'Zincs', 'Water']) masses # - def get_center(atoms, weight_fun=lambda atom: 1 if atom.parent.id[0] != 'W' else 0): xsum = ysum = zsum = 0.0 acum = 0.0 for atom in atoms: x, y, z = atom.coord weight = weight_fun(atom) acum += weight xsum += weight * x ysum += weight * y zsum += weight * z return xsum / acum, ysum / acum, zsum / acum print(get_center(p53_1tup.get_atoms())) print(get_center(p53_1tup.get_atoms(), 
weight_fun=lambda atom: atom.mass if atom.parent.id[0] != 'W' else 0)) my_center = np.ndarray((len(chain_names), 6)) for i, chain in enumerate(p53_1tup.get_chains()): x, y, z = get_center(chain.get_atoms()) my_center[i, 0] = x my_center[i, 1] = y my_center[i, 2] = z x, y, z = get_center(chain.get_atoms(), weight_fun=lambda atom: atom.mass if atom.parent.id[0] != 'W' else 0) my_center[i, 3] = x my_center[i, 4] = y my_center[i, 5] = z weights = pd.DataFrame(my_center, index=chain_names, columns=['X', 'Y', 'Z', 'X (Mass)', 'Y (Mass)', 'Z (Mass)']) weights # + #Pymol viz ================================================ FILE: Chapter08/PDB.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- from Bio import PDB repository = PDB.PDBList() repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') repository.retrieve_pdb_file('1OLG', pdir='.', file_format='pdb') repository.retrieve_pdb_file('1YCQ', pdir='.', file_format='pdb') parser = PDB.PDBParser() p53_1tup = parser.get_structure('P 53 - DNA Binding', 'pdb1tup.ent') p53_1olg = parser.get_structure('P 53 - Tetramerization', 'pdb1olg.ent') p53_1ycq = parser.get_structure('P 53 - Transactivation', 'pdb1ycq.ent') # + def print_pdb_headers(headers, indent=0): ind_text = ' ' * indent for header, content in headers.items(): if type(content) == dict: print('\n%s%20s:' % (ind_text, header)) print_pdb_headers(content, indent + 4) print() elif type(content) == list: print('%s%20s:' % (ind_text, header)) for elem in content: print('%s%21s %s' % (ind_text, '->', elem)) else: print('%s%20s: %s' % (ind_text, header, content)) print_pdb_headers(p53_1tup.header) # - print(p53_1tup.header['compound']) print(p53_1olg.header['compound']) print(p53_1ycq.header['compound']) def describe_model(name, pdb): print() for model in pdb: for chain in model: print('%s - Chain: %s. Number of residues: %d. Number of atoms: %d.' 
% (name, chain.id, len(chain), len(list(chain.get_atoms())))) describe_model('1TUP', p53_1tup) describe_model('1OLG', p53_1olg) describe_model('1YCQ', p53_1ycq) #will go deep in a next recipe (bottom up) for residue in p53_1tup.get_residues(): if residue.id[0] in [' ', 'W']: continue print(residue.id) res = next(p53_1tup[0]['A'].get_residues()) print(res) for atom in res: print(atom, atom.serial_number, atom.element) print(p53_1tup[0]['A'][94]['CA']) # + from Bio.SeqIO import PdbIO, FastaIO from Bio import SeqIO def get_fasta(pdb_file, fasta_file, transfer_ids=None): records = list(PdbIO.PdbSeqresIterator(pdb_file)) if transfer_ids is not None: records = [rec for rec in records if rec.id in transfer_ids and len(rec.seq) > 0] else: records = [rec for rec in records if len(rec.seq) > 0] with open(fasta_file, 'w') as out_handle: SeqIO.write(records, out_handle, 'fasta') for rec in records: print(rec.id, rec.seq, len(rec.seq)) get_fasta('pdb1tup.ent', '1tup.fasta', transfer_ids=['1TUP:B']) get_fasta('pdb1olg.ent', '1olg.fasta', transfer_ids=['1OLG:B']) get_fasta('pdb1ycq.ent', '1ycq.fasta', transfer_ids=['1YCQ:B']) # - ================================================ FILE: Chapter08/Parser.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.3 # kernelspec: # display_name: Python 3 # language: python # name: python3 # --- from Bio import PDB #XXX repository = PDB.PDBList() repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') # + rec_types = { #single line 'HEADER': [(str, 11, 49), (str, 50, 58), (str, 62, 65)], #multi_line 'SOURCE': [(int, 7, 9), (str, 10, 78)], #multi_rec 'LINK' : [(str, 12, 15), (str, 16, 16), (str, 17, 19), (str, 21, 21), (int, 22, 25), (str, 26, 26), (str, 42, 45), (str, 46, 46), (str, 47, 49), (str, 51, 51), (int, 52, 55), (str, 56, 56), (str, 59, 64), (str, 66, 71), (float, 73, 77)], 'HELIX': [(int, 7, 9), (str, 11, 13), (str, 15, 17), (str, 19, 19), (int, 21, 24), (str, 25, 25), (str, 27, 29), (str, 31, 31), (int, 33, 36), (str, 37 ,37), (int, 38, 39), (str, 40, 69), (int, 71, 75)], 'SHEET': [(int, 7, 9), (str, 11, 13), (int, 14, 15), (str, 17, 19), (str, 21, 21), (int, 22, 24), (str, 26, 26), (str, 28, 30), (str, 32, 32), (int, 33, 36), (str, 37, 37), (int, 38, 39), (str, 41, 44), (str, 45, 47), (str, 49, 49), (int, 50, 53), (str, 54, 54), (str, 56, 59), (str, 60, 62), (str, 64, 64), (int, 65, 68), (str, 69, 69)], } def parse_pdb(hdl): for line in hdl: line = line[:-1] # remove \n but not other whitespace toks = [] for section, elements in rec_types.items(): if line.startswith(section): for fun, start, end in elements: try: toks.append(fun(line[start: end + 1])) except ValueError: toks.append(None) # eg continuation yield (section, toks) if len(toks) == 0: yield ('UNKNOWN', line) # - hdl = open('pdb1tup.ent') done_rec = set() for rec in parse_pdb(hdl): if rec[0] == 'UNKNOWN' or rec[0] in done_rec: continue print(rec) done_rec.add(rec[0]) # + multi_lines = ['SOURCE'] #assume multi is just a string def process_multi_lines(hdl): current_multi = '' current_multi_name = None for rec_type, toks in parse_pdb(hdl): if current_multi_name is not None and current_multi_name != rec_type: yield current_multi_name, [current_multi] current_multi = '' current_multi_name = None if rec_type in multi_lines: current_multi += toks[1].strip().rstrip() + ' ' current_multi_name = rec_type else: if len(current_multi) != 0: yield 
current_multi_name, [current_multi]
                current_multi = ''
                current_multi_name = None
            yield rec_type, toks
    if len(current_multi) != 0:
        yield current_multi_name, [current_multi]
# -

hdl = open('pdb1tup.ent')
done_rec = set()
for rec in process_multi_lines(hdl):
    if rec[0] == 'UNKNOWN' or rec[0] in done_rec:
        continue
    print(rec)
    done_rec.add(rec[0])

# +
def get_spec_list(my_str):
    #ignoring escape characters
    spec_list = {}
    elems = my_str.strip().split(';')
    for elem in elems:
        toks = elem.split(':')
        spec_list[toks[0].strip()] = toks[1].strip()
    return spec_list

struct_types = {
    'SOURCE': [get_spec_list]
}

def process_struct_types(hdl):
    for rec_type, toks in process_multi_lines(hdl):
        if rec_type in struct_types.keys():
            funs = struct_types[rec_type]
            struct_toks = []
            for tok, fun in zip(toks, funs):
                struct_toks.append(fun(tok))
            yield rec_type, struct_toks
        else:
            yield rec_type, toks
# -

hdl = open('pdb1tup.ent')
for rec in process_struct_types(hdl):
    if rec[0] != 'SOURCE':
        continue
    print(rec)

================================================
FILE: Chapter08/PyMol_Intro.py
================================================
import threading

def dump_thread():
    print()
    for thr in threading.enumerate():
        print(thr)

dump_thread()

import pymol
pymol.pymol_launch = 4
pymol.pymol_argv = ['pymol', '-qc']  # Quiet / no GUI
from pymol import cmd
pymol.finish_launching()

dump_thread()

#cmd.fetch('1TUP', async=False)
cmd.fetch('1TUP')
cmd.disable('all')
cmd.enable('1TUP')
cmd.bg_color('white')
cmd.hide('all')
cmd.show('cartoon')
#cmd.hide('cartoon', 'chain E+F')
#cmd.show('ribbon', 'chain E+F')
cmd.select('zinc', 'name zn')
cmd.show('sphere', 'zinc')

cmd.set('ray_trace_mode', 3)
cmd.png('1TUP.png', width=1980, height=1080, quiet=0, ray=1, prior=False)

dump_thread()

cmd.set('ray_trace_mode', 1)
cmd.png('TUP.png', width=1980, height=1080, quiet=0, ray=1, prior=False)

cmd.quit()

================================================
FILE: Chapter08/PyMol_Movie.py
================================================
import pymol
from pymol import cmd

#pymol.pymol_argv = [ 'pymol', '-qc'] # Quiet / no GUI
pymol.finish_launching()

#cmd.fetch('1TUP', async=False)
cmd.fetch('1TUP')
cmd.disable('all')
cmd.enable('1TUP')
cmd.hide('all')
cmd.show('sphere', 'name zn')
cmd.show('surface', 'chain A+B+C')
cmd.show('cartoon', 'chain E+F')
cmd.scene('S0', action='store', view=0, frame=0, animate=-1)
cmd.show('cartoon')
cmd.hide('surface')
cmd.scene('S1', action='store', view=0, frame=0, animate=-1)
cmd.hide('cartoon', 'chain A+B+C')
cmd.show('mesh', 'chain A')
cmd.show('sticks', 'chain A+B+C')
cmd.scene('S2', action='store', view=0, frame=0, animate=-1)
cmd.set('ray_trace_mode', 0)
cmd.mset(1, 500)
cmd.frame(0)
cmd.scene('S0')
cmd.mview()
cmd.frame(60)
cmd.set_view((-0.175534308, -0.331560850, -0.926960170,
              0.541812420, 0.753615797, -0.372158051,
              0.821965039, -0.567564785, 0.047358301,
              0.000000000, 0.000000000, -249.619018555,
              58.625568390, 15.602619171, 77.781631470,
              196.801528931, 302.436492920, -20.000000000))
cmd.mview()
cmd.frame(90)
cmd.set_view((-0.175534308, -0.331560850, -0.926960170,
              0.541812420, 0.753615797, -0.372158051,
              0.821965039, -0.567564785, 0.047358301,
              -0.000067875, 0.000017881, -249.615447998,
              54.029174805, 26.956727982, 77.124832153,
              196.801528931, 302.436492920, -20.000000000))
cmd.mview()
cmd.frame(150)
cmd.set_view((-0.175534308, -0.331560850, -0.926960170,
              0.541812420, 0.753615797, -0.372158051,
              0.821965039, -0.567564785, 0.047358301,
              -0.000067875, 0.000017881, -55.406421661,
              54.029174805, 26.956727982, 77.124832153,
2.592475891, 108.227416992, -20.000000000)) cmd.mview() cmd.frame(200) cmd.scene('S1') cmd.mview() cmd.frame(350) cmd.scene('S1') cmd.set_view((0.395763457, -0.173441306, 0.901825786, 0.915456235, 0.152441502, -0.372427106, -0.072881661, 0.972972929, 0.219108686, 0.000070953, 0.000013039, -37.689743042, 57.748500824, 14.325904846, 77.241867065, -15.123448372, 90.511535645, -20.000000000)) cmd.mview() cmd.frame(351) cmd.scene('S2') cmd.mview() cmd.frame(500) cmd.scene('S2') cmd.mview() cmd.mplay() cmd.mpng('p53_1tup') cmd.quit() ================================================ FILE: Chapter08/Stats.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.8 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + from collections import defaultdict import sys import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D # #%matplotlib inline from Bio import PDB # - repository = PDB.PDBList() parser = PDB.PDBParser() repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') #XXX p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent') # + atom_cnt = defaultdict(int) atom_chain = defaultdict(int) atom_res_types = defaultdict(int) for atom in p53_1tup.get_atoms(): my_residue = atom.parent my_chain = my_residue.parent atom_chain[my_chain.id] += 1 if my_residue.resname != 'HOH': atom_cnt[atom.element] += 1 atom_res_types[my_residue.resname] += 1 print(dict(atom_res_types)) print(dict(atom_chain)) print(dict(atom_cnt)) # - res_types = defaultdict(int) res_per_chain = defaultdict(int) for residue in p53_1tup.get_residues(): res_types[residue.resname] += 1 res_per_chain[residue.parent.id] +=1 print(dict(res_types)) print(dict(res_per_chain)) def get_bounds(my_atoms): my_min = [sys.maxsize] * 3 my_max = [-sys.maxsize] * 3 for atom in my_atoms: for i, coord in enumerate(atom.coord): if coord < my_min[i]: my_min[i] = coord if coord > my_max[i]: my_max[i] = coord return my_min, my_max chain_bounds = {} for chain in p53_1tup.get_chains(): print(chain.id, get_bounds(chain.get_atoms())) chain_bounds[chain.id] = get_bounds(chain.get_atoms()) print(get_bounds(p53_1tup.get_atoms())) #matplotlib 3d plot fig = plt.figure(figsize=(16, 9)) ax3d = fig.add_subplot(111, projection='3d') ax_xy = fig.add_subplot(331) ax_xy.set_title('X/Y') ax_xz = fig.add_subplot(334) ax_xz.set_title('X/Z') ax_zy = fig.add_subplot(337) ax_zy.set_title('Z/Y') color = {'A': 'r', 'B': 'g', 'C': 'b', 'E': '0.5', 'F': '0.75'} zx, zy, zz = [], [], [] for chain in p53_1tup.get_chains(): xs, ys, zs = [], [], [] for residue in chain.get_residues(): ref_atom = next(residue.get_iterator()) x, y, z = ref_atom.coord if ref_atom.element == 'ZN': zx.append(x) zy.append(y) zz.append(z) continue xs.append(x) ys.append(y) zs.append(z) ax3d.scatter(xs, ys, zs, color=color[chain.id]) ax_xy.scatter(xs, ys, marker='.', color=color[chain.id]) ax_xz.scatter(xs, zs, marker='.', color=color[chain.id]) ax_zy.scatter(zs, ys, marker='.', color=color[chain.id]) ax3d.set_xlabel('X') ax3d.set_ylabel('Y') ax3d.set_zlabel('Z') ax3d.scatter(zx, zy, zz, color='k', marker='v', s=300) ax_xy.scatter(zx, zy, color='k', marker='v', s=80) ax_xz.scatter(zx, zz, color='k', marker='v', s=80) ax_zy.scatter(zz, zy, color='k', marker='v', s=80) for ax in [ax_xy, ax_xz, ax_zy]: ax.get_yaxis().set_visible(False) ax.get_xaxis().set_visible(False) ================================================ 
FILE: Chapter08/mmCIF.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

from Bio import PDB

# !rm -f 1tup.cif 2>/dev/null
# !wget "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=cif&compression=NO&structureId=1TUP" -O 1tup.cif

parser = PDB.MMCIFParser()
p53_1tup = parser.get_structure('P53_HUMAN', '1tup.cif')

def describe_model(name, pdb):
    print()
    for model in pdb:
        for chain in model:
            print('%s - Chain: %s. Number of residues: %d. Number of atoms: %d.' %
                  (name, chain.id, len(chain), len(list(chain.get_atoms()))))

describe_model('1TUP', p53_1tup)

done_chain = set()
for residue in p53_1tup.get_residues():
    chain = residue.parent
    if chain.id in done_chain:
        continue
    done_chain.add(chain.id)
    print(chain.id, residue.id)

mmcif_dict = PDB.MMCIF2Dict.MMCIF2Dict('1tup.cif')
for k, v in mmcif_dict.items():
    print(k, v)
    print()
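# MMCIF2Dict is a flat dictionary keyed by mmCIF category.item names, so single
# fields can be pulled out directly instead of dumping everything. A short sketch;
# the keys below are standard mmCIF names, but whether each one is present depends
# on the entry:
for key in ['_struct.title', '_exptl.method', '_refine.ls_d_res_high']:
    print(key, mmcif_dict.get(key, 'not present'))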
================================================
FILE: Chapter09/galaxy/.gitignore
================================================
galaxy.yaml.enc
tool
salt

================================================
FILE: Chapter09/galaxy/LCT.bed
================================================
track name=gene description="Gene information"
2 135836529 135837180 ENSE00002202258 0 -
2 135833110 135833190 ENSE00001660765 0 -
2 135829592 135829676 ENSE00001731451 0 -
2 135823900 135824003 ENSE00001659892 0 -
2 135822019 135822098 ENSE00001777620 0 -
2 135817340 135818061 ENSE00001602826 0 -
2 135812310 135812956 ENSE00000776576 0 -
2 135808442 135809993 ENSE00001008768 0 -
2 135807127 135807396 ENSE00000776573 0 -
2 135804766 135805057 ENSE00000776572 0 -
2 135803929 135804128 ENSE00000776571 0 -
2 135800606 135800809 ENSE00000776570 0 -
2 135798028 135798138 ENSE00003515081 0 -
2 135794640 135794775 ENSE00001630333 0 -
2 135790657 135790881 ENSE00001667885 0 -
2 135789570 135789798 ENSE00001728878 0 -
2 135787839 135788544 ENSE00001653704 0 -
2 135812310 135812959 ENSE00001745158 0 -
2 135808442 135809993 ENSE00001008768 0 -
2 135807127 135807396 ENSE00000776573 0 -
2 135804766 135805057 ENSE00000776572 0 -
2 135803929 135804128 ENSE00000776571 0 -
2 135798028 135798138 ENSE00003459353 0 -
2 135794336 135794775 ENSE00001635523 0 -
2 135810168 135810279 ENSE00001438557 0 -
2 135820190 135820639 ENSE00001732580 0 +
2 135821674 135823087 ENSE00001695040 0 +
2 135836529 135837180 NM_002299.2.1 0 -
2 135833110 135833190 NM_002299.2.2 0 -
2 135829592 135829676 NM_002299.2.3 0 -
2 135823900 135824003 NM_002299.2.4 0 -
2 135822019 135822098 NM_002299.2.5 0 -
2 135817340 135818061 NM_002299.2.6 0 -
2 135812310 135812956 NM_002299.2.7 0 -
2 135808442 135809993 NM_002299.2.8 0 -
2 135807127 135807396 NM_002299.2.9 0 -
2 135804766 135805057 NM_002299.2.10 0 -
2 135803929 135804128 NM_002299.2.11 0 -
2 135800606 135800809 NM_002299.2.12 0 -
2 135798028 135798138 NM_002299.2.13 0 -
2 135794640 135794775 NM_002299.2.14 0 -
2 135790657 135790881 NM_002299.2.15 0 -
2 135789570 135789798 NM_002299.2.16 0 -
2 135787844 135788544 NM_002299.2.17 0 -
2 135836529 135837169 CCDS2178.117 0 -
2 135833110 135833190 CCDS2178.116 0 -
2 135829592 135829676 CCDS2178.115 0 -
2 135823900 135824003 CCDS2178.114 0 -
2 135822019 135822098 CCDS2178.113 0 -
2 135817340 135818061 CCDS2178.112 0 -
2 135812310 135812956 CCDS2178.111 0 -
2 135808442 135809993 CCDS2178.110 0 -
2 135807127 135807396 CCDS2178.19 0 -
2 135804766 135805057 CCDS2178.18 0 -
2 135803929 135804128 CCDS2178.17 0 -
2 135800606 135800809 CCDS2178.16 0 -
2 135798028 135798138 CCDS2178.15 0 -
2 135794640 135794775 CCDS2178.14 0 -
2 135790657 135790881 CCDS2178.13 0 -
2 135789570 135789798 CCDS2178.12 0 -
2 135788323 135788544 CCDS2178.11 0 -

================================================
FILE: Chapter09/galaxy/api.py
================================================
import base64
from collections import defaultdict
#import ftplib
import getpass
import pprint
import warnings

from ruamel.yaml import YAML

from cryptography.fernet import Fernet
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC

import pandas as pd

from bioblend.galaxy import GalaxyInstance
import paramiko

pp = pprint.PrettyPrinter()
warnings.filterwarnings('ignore')
# explain above, and warn

with open('galaxy.yaml.enc', 'rb') as f:
    enc_conf = f.read()
password = getpass.getpass('Please enter the password:').encode()
with open('salt', 'rb') as f:
    salt = f.read()
kdf = PBKDF2HMAC(algorithm=hashes.SHA256(), length=32, salt=salt, iterations=100000, backend=default_backend())
key = base64.urlsafe_b64encode(kdf.derive(password))
fernet = Fernet(key)
yaml = YAML()
conf = yaml.load(fernet.decrypt(enc_conf).decode())

server = conf['server']
rest_protocol = conf['rest_protocol']
rest_port = conf['rest_port']
user = conf['user']
password = conf['password']
sftp_port = int(conf['sftp_port'])
api_key = conf['api_key']

rest_url = '%s://%s:%d' % (rest_protocol, server, rest_port)

history_name = 'bioinf_example'

gi = GalaxyInstance(url=rest_url, key=api_key)
gi.verify = False

histories = gi.histories
print('Existing histories:')
for history in histories.get_histories():
    if history['name'] == history_name:
        histories.delete_history(history['id'])
    print(' - ' + history['name'])
print()
ds_history = histories.create_history(history_name)

print('Uploading file')
transport = paramiko.Transport((server, sftp_port))
transport.connect(None, user, password)
sftp = paramiko.SFTPClient.from_transport(transport)
sftp.put('LCT.bed', 'LCT.bed')
sftp.close()
transport.close()

#ftp = ftplib.FTP()
#ftp.connect(host=server, port=ftp_port)
#ftp.login(user=user, passwd=password)
#f = open('LCT.bed', 'rb')
#ftp.set_pasv(True) # explain
##ftp.storbinary('STOR LCT.bed', f)
#s = ftp.transfercmd('STOR LCT.bed')
#s.send(f.read())
#s.close()
#f.close()
#ftp.close()

gi.tools.upload_from_ftp('LCT.bed', ds_history['id'])
print()

contents = gi.histories.show_history(ds_history['id'], contents=True)

def summarize_contents(contents):
    summary = defaultdict(list)
    for item in contents:
        summary['id'].append(item['id'])
        summary['hid'].append(item['hid'])
        summary['name'].append(item['name'])
        summary['type'].append(item['type'])
        summary['extension'].append(item['extension'])
    return pd.DataFrame.from_dict(summary)

print('History contents:')
pd_contents = summarize_contents(contents)
print(pd_contents)
print()

print('Metadata for LCT.bed')
bed_ds = contents[0]
pp.pprint(bed_ds)
print()

print('Metadata about all tools')
all_tools = gi.tools.get_tools()
pp.pprint(all_tools)
print()

bed2gff = gi.tools.get_tools(name='Convert BED to GFF')[0]
print("Convert BED to GFF metadata:")
pp.pprint(gi.tools.show_tool(bed2gff['id'], io_details=True, link_details=True))
print()

def dataset_to_param(dataset):
    return dict(src='hda', id=dataset['id'])

tool_inputs = {
    'input1': dataset_to_param(bed_ds)  #hid!
}
gi.tools.run_tool(ds_history['id'], bed2gff['id'], tool_inputs=tool_inputs)
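# run_tool returns immediately; the conversion job runs asynchronously on the
# Galaxy server. A short, hedged sketch of how one might wait for the history to
# settle and download the converted dataset with bioblend (the output extension
# and the 'LCT.gff' file name are assumptions, not part of the original script):
import time

while gi.histories.show_history(ds_history['id'])['state'] not in ('ok', 'error'):
    time.sleep(5)
for item in gi.histories.show_history(ds_history['id'], contents=True):
    if item['extension'] == 'gff':
        gi.datasets.download_dataset(item['id'], file_path='LCT.gff', use_default_filename=False)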
================================================
FILE: Chapter09/galaxy/encrypt.py
================================================
"Encrypt a YAML file with the script configuration"
import base64
import getpass
from io import StringIO
import os

from ruamel.yaml import YAML

from cryptography.fernet import Fernet
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC

password = getpass.getpass('Please enter the password:').encode()
salt = os.urandom(16)
kdf = PBKDF2HMAC(algorithm=hashes.SHA256(), length=32, salt=salt, iterations=100000, backend=default_backend())
key = base64.urlsafe_b64encode(kdf.derive(password))
fernet = Fernet(key)
with open('salt', 'wb') as w:
    w.write(salt)

yaml = YAML()
content = yaml.load(open('galaxy.yaml', 'rt', encoding='utf-8'))
print(type(content), content)
output = StringIO()
yaml.dump(content, output)
print('Encrypting:\n%s' % output.getvalue())
enc_output = fernet.encrypt(output.getvalue().encode())
with open('galaxy.yaml.enc', 'wb') as w:
    w.write(enc_output)
print("Complete, the clear version should be deleted now")

================================================
FILE: Chapter09/galaxy/galaxy.yaml
================================================
rest_protocol: http
server: localhost
rest_port: 8080
sftp_port: 8022
user: admin@galaxy.org
password: password
api_key: fakekey

================================================
FILE: Chapter09/nextflow/.gitignore
================================================
data
pca.png
work
.nextflow*
report

================================================
FILE: Chapter09/nextflow/pipeline.nf
================================================
nextflow.enable.dsl=2

download_root = "https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3"

process plink_download {
    output:
    path 'hapmap.map.gz'//, emit: mapgz
    path 'hapmap.ped.gz'//, emit: pedgz

    script:
    """
    wget $download_root/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz -O hapmap.map.gz
    wget $download_root/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz -O hapmap.ped.gz
    """
}

process uncompress_plink {
    publishDir 'data', glob: '*', mode: 'copy'

    input:
    path mapgz
    path pedgz

    output:
    path 'hapmap.map'
    path 'hapmap.ped'

    script:
    """
    gzip -dc $mapgz > hapmap.map
    gzip -dc $pedgz > hapmap.ped
    """
}

//DSL 2 and docs
//conda
process subsample_1p {
    input:
    path 'hapmap.map'
    path 'hapmap.ped'

    output:
    path 'hapmap1.map'
    path 'hapmap1.ped'

    script:
    """
    plink2 --pedmap hapmap --out hapmap1 --thin 0.01 --geno 0.1 --export ped
    """
}

process plink_pca {
    input:
    path 'hapmap.map'
    path 'hapmap.ped'

    output:
    path 'hapmap.eigenvec'
    path 'hapmap.eigenval'

    script:
    """
    plink2 --pca --pedmap hapmap -out hapmap
    """
}

process plot_pca {
    publishDir '.', glob: '*', mode: 'copy'

    input:
    path 'hapmap.eigenvec'
    path 'hapmap.eigenval'

    output:
    path 'pca.png'

    script:
    """
    #!/usr/bin/env python
    import pandas as pd

    pca_df = pd.read_csv('hapmap.eigenvec', sep='\t')
    ax = pca_df.plot.scatter(x=2, y=3, figsize=(16, 9))
    ax.figure.savefig('pca.png')
    """
}

/*
workflow {
    plink_download | uncompress_plink
}
*/

/*
workflow {
    ped_file = file('data/hapmap.ped')
    map_file = file('data/hapmap.map')
    if (!ped_file.exists() | !map_file.exists()) {
        plink_download | uncompress_plink
    }
}
*/

workflow {
    ped_file = file('data/hapmap.ped')
    map_file = file('data/hapmap.map')
    if (!ped_file.exists() |
!map_file.exists()) { plink_download | uncompress_plink | subsample_1p | plink_pca | plot_pca } else { subsample_1p( Channel.fromPath('data/hapmap.map'), Channel.fromPath('data/hapmap.ped')) | plink_pca | plot_pca } } ================================================ FILE: Chapter09/snakemake/.gitignore ================================================ data scratch .snakemake pca.png dag.svg bio.png bio.svg ================================================ FILE: Chapter09/snakemake/Snakefile ================================================ rule all: input: "pca.png" rule plink_download: output: map="scratch/hapmap.map.gz", ped="scratch/hapmap.ped.gz", rel="data/relationships.txt" shell: """ python -c "import urllib.request; urllib.request.urlretrieve( 'https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz', '{output.map}')" python -c "import urllib.request; urllib.request.urlretrieve( 'https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz', '{output.ped}')" python -c "import urllib.request; urllib.request.urlretrieve( 'https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/relationships_w_pops_041510.txt', '{output.rel}')" """ PLINKEXTS = ['ped', 'map'] rule uncompress_plink: input: "scratch/hapmap.{plinkext}.gz" output: "data/hapmap.{plinkext}" shell: "gzip -dc {input} > {output}" rule subsample_1p: input: "data/hapmap.ped", "data/hapmap.map" output: "data/hapmap1.ped", "data/hapmap1.map" run: shell(f"plink2 --pedmap {input[0][:-4]} --out {output[0][:-4]} --thin 0.01 --geno 0.1 --export ped") # snakemake and software requirements # https://snakemake.readthedocs.io/en/stable/tutorial/additional_features.html#automatic-deployment-of-software-dependencies #plink2 --pedmap data/hapmap --out data/hapmap10 --thin 0.1 --geno 0.1 --export ped rule plink_pca: input: "data/hapmap1.ped", "data/hapmap1.map" output: "data/hapmap1.eigenvec", "data/hapmap1.eigenval" shell: "plink2 --pca --pedmap data/hapmap1 -out data/hapmap1" rule plot_pca: input: "data/hapmap1.eigenvec", "data/hapmap1.eigenval" output: "pca.png" script: "./plot_pca.py" ================================================ FILE: Chapter09/snakemake/plot_pca.py ================================================ import pandas as pd eigen_fname = snakemake.input[0] if snakemake.input[0].endswith('eigenvec') else snakemake.input[1] pca_df = pd.read_csv(eigen_fname, sep='\t') ax = pca_df.plot.scatter(x=2, y=3, figsize=(16, 9)) ax.figure.savefig(snakemake.output[0]) ================================================ FILE: Chapter10/Clustering.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.14.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # + jupyter={"outputs_hidden": false} import os import matplotlib.pyplot as plt from sklearn.cluster import KMeans from sklearn.decomposition import PCA import numpy as np from genomics.popgen.pca import plot # - # ## Meta-data load # + jupyter={"outputs_hidden": false} f = open('../Chapter06/relationships_w_pops_041510.txt') ind_pop = {} f.readline() # header for l in f: toks = l.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] pop = toks[-1] ind_pop['/'.join([fam_id, ind_id])] = pop f.close() # - # ## With scikit-learn # + jupyter={"outputs_hidden": false} f = 
open('../Chapter06/hapmap10_auto_noofs_ld_12.ped')
ninds = 0
ind_order = []
for line in f:
    ninds += 1
    toks = line[:100].replace(' ', '\t').split('\t')  # for speed
    fam_id = toks[0]
    ind_id = toks[1]
    ind_order.append('%s/%s' % (fam_id, ind_id))
    nsnps = (len(line.replace(' ', '\t').split('\t')) - 6) // 2
print(nsnps)
f.close()

# + jupyter={"outputs_hidden": false}
all_array = np.empty((ninds, nsnps), dtype=int)
f = open('../Chapter06/hapmap10_auto_noofs_ld_12.ped')
for ind, line in enumerate(f):
    snps = line.replace(' ', '\t').split('\t')[6:]
    for pos in range(len(snps) // 2):
        a1 = int(snps[2 * pos])
        a2 = int(snps[2 * pos + 1])  # second allele of the genotype
        my_code = a1 + a2 - 2
        all_array[ind, pos] = my_code
f.close()
#slow
# -

predict_case = all_array[-1, :]
pca_array = all_array[:-1,:]

last_ind = ind_order[-1]
last_ind, ind_pop[last_ind]

my_pca = PCA(n_components=2)
my_pca.fit(pca_array)
trans = my_pca.transform(pca_array)

sc_ind_comp = {}
for i, ind_pca in enumerate(trans):
    sc_ind_comp[ind_order[i]] = ind_pca
plot.render_pca(sc_ind_comp, cluster=ind_pop)

# + jupyter={"outputs_hidden": false}
def plot_kmeans_pca(trans, kmeans):
    x_min, x_max = trans[:, 0].min() - 1, trans[:, 0].max() + 1
    y_min, y_max = trans[:, 1].min() - 1, trans[:, 1].max() + 1
    mesh_x, mesh_y = np.meshgrid(np.arange(x_min, x_max, 0.5), np.arange(y_min, y_max, 0.5))
    k_surface = kmeans.predict(np.c_[mesh_x.ravel(), mesh_y.ravel()]).reshape(mesh_x.shape)
    fig, ax = plt.subplots(1, 1, dpi=300)
    ax.imshow(
        k_surface, origin="lower", cmap=plt.cm.Pastel1,
        extent=(mesh_x.min(), mesh_x.max(), mesh_y.min(), mesh_y.max()),
    )
    ax.plot(trans[:, 0], trans[:, 1], "k.", markersize=2)
    ax.set_title("KMeans clustering of PCA data")
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    return ax

# + jupyter={"outputs_hidden": false}
kmeans11 = KMeans(n_clusters=11).fit(trans)
plot_kmeans_pca(trans, kmeans11)
# -

kmeans4 = KMeans(n_clusters=4).fit(trans)
plot_kmeans_pca(trans, kmeans4)

pca_predict = my_pca.transform([predict_case])
kmeans4.predict(pca_predict)

last_train = ind_order[-2]
last_train, ind_pop[last_train]

kmeans4.predict(trans)[0]
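# + jupyter={"outputs_hidden": false}
# The choice between 4 and 11 clusters above is eyeballed; a hedged sketch of a
# more principled check with the silhouette score (higher is better), using the
# same scikit-learn stack as the rest of this recipe:
from sklearn.metrics import silhouette_score

for k in [2, 4, 8, 11]:
    labels = KMeans(n_clusters=k).fit_predict(trans)
    print(k, silhouette_score(trans, labels))
# -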
================================================
FILE: Chapter10/Decision_Tree.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.14.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# + jupyter={"outputs_hidden": false}
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import tree

# + [markdown] jupyter={"outputs_hidden": false}
# http://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28diagnostic%29

# + jupyter={"outputs_hidden": false}
# !wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
# !wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names
# -

# ## With scikit-learn

# + jupyter={"outputs_hidden": false}
f = open('breast-cancer-wisconsin.data')
w = open('clean.data', 'w')
for line in f:
    if line.find('?') > -1:
        continue
    w.write(line)
f.close()
w.close()

# + jupyter={"outputs_hidden": false}
column_names = [
    'sample_id', 'clump_thickness', 'uniformity_cell_size',
    'uniformity_cell_shape', 'marginal_adhesion',
    'single_epithelial_cell_size', 'bare_nuclei', 'bland_chromatin',
    'normal_nucleoli', 'mitoses', 'class'
]
samples = pd.read_csv('clean.data', header=None, names=column_names, index_col=0)
samples

# + jupyter={"outputs_hidden": false}
training_input = samples.iloc[:,:-1]
target = samples.iloc[:,-1].apply(lambda x: 0 if x == 2 else 1)

# + jupyter={"outputs_hidden": false}
clf = tree.DecisionTreeClassifier(max_depth=3)

# + jupyter={"outputs_hidden": false}
clf.fit(training_input, target)

# + jupyter={"outputs_hidden": false}
importances = pd.Series(
    clf.feature_importances_ * 100,
    index=training_input.columns).sort_values(ascending=False)
importances

# + jupyter={"outputs_hidden": false}
100 * clf.score(training_input, target)

# + jupyter={"outputs_hidden": false}
fig, ax = plt.subplots(1, dpi=300)
tree.plot_tree(clf, ax=ax, feature_names=training_input.columns, class_names=['Benign', 'Malignant'])
# -

================================================
FILE: Chapter10/PCA.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.14.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# + jupyter={"outputs_hidden": false}
import os

from sklearn.decomposition import PCA
import numpy as np

from genomics.popgen.pca import plot
# -

# ## Meta-data load

# + jupyter={"outputs_hidden": false}
f = open('../Chapter06/relationships_w_pops_041510.txt')
ind_pop = {}
f.readline()  # header
for l in f:
    toks = l.rstrip().split('\t')
    fam_id = toks[0]
    ind_id = toks[1]
    pop = toks[-1]
    ind_pop['/'.join([fam_id, ind_id])] = pop
f.close()
# -

# ## With scikit-learn

# + jupyter={"outputs_hidden": false}
f = open('../Chapter06/hapmap10_auto_noofs_ld_12.ped')
ninds = 0
ind_order = []
for line in f:
    ninds += 1
    toks = line[:100].replace(' ', '\t').split('\t')  # for speed
    fam_id = toks[0]
    ind_id = toks[1]
    ind_order.append('%s/%s' % (fam_id, ind_id))
    nsnps = (len(line.replace(' ', '\t').split('\t')) - 6) // 2
f.close()

# + jupyter={"outputs_hidden": false}
pca_array = np.empty((ninds, nsnps), dtype=int)
print(pca_array.shape)
f = open('../Chapter06/hapmap10_auto_noofs_ld_12.ped')
for ind, line in enumerate(f):
    snps = line.replace(' ', '\t').split('\t')[6:]
    for pos in range(len(snps) // 2):
        a1 = int(snps[2 * pos])
        a2 = int(snps[2 * pos + 1])  # second allele of the genotype
        my_code = a1 + a2 - 2
        pca_array[ind, pos] = my_code
f.close()

# + jupyter={"outputs_hidden": false}
my_pca = PCA(n_components=8)
my_pca.fit(pca_array)
trans = my_pca.transform(pca_array)
#Memory required

# + jupyter={"outputs_hidden": false}
sc_ind_comp = {}
for i, ind_pca in enumerate(trans):
    sc_ind_comp[ind_order[i]] = ind_pca
plot.render_pca_eight(sc_ind_comp, cluster=ind_pop)

# + jupyter={"outputs_hidden": false}
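# A quick, hedged look at how much variance the eight components actually
# capture, using scikit-learn's standard explained_variance_ratio_ attribute
# (not part of the original recipe):
print(my_pca.explained_variance_ratio_)
print('First two components: %.1f%%' % (100 * my_pca.explained_variance_ratio_[:2].sum()))
# -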
================================================
FILE: Chapter10/Random_Forest.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.14.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# + jupyter={"outputs_hidden": false}
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz

# + [markdown] jupyter={"outputs_hidden": false}
# http://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28diagnostic%29

# + jupyter={"outputs_hidden": false}
# !wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
# !wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names
# -

# ## With scikit-learn

# + jupyter={"outputs_hidden": false}
f = open('breast-cancer-wisconsin.data')
w = open('clean.data', 'w')
for line in f:
    if line.find('?') > -1:
        continue
    w.write(line)
f.close()
w.close()

# + jupyter={"outputs_hidden": false}
column_names = [
    'sample_id', 'clump_thickness', 'uniformity_cell_size',
    'uniformity_cell_shape', 'marginal_adhesion',
    'single_epithelial_cell_size', 'bare_nuclei', 'bland_chromatin',
    'normal_nucleoli', 'mitoses', 'class'
]
samples = pd.read_csv('clean.data', header=None, names=column_names, index_col=0)
samples

# + jupyter={"outputs_hidden": false}
training_input = samples.iloc[:,:-1]
target = samples.iloc[:,-1]

# + jupyter={"outputs_hidden": false}
clf = RandomForestClassifier(max_depth=3, n_estimators=200)

# + jupyter={"outputs_hidden": false}
clf.fit(training_input, target)

# + jupyter={"outputs_hidden": false}
importances = pd.Series(
    clf.feature_importances_ * 100,
    index=training_input.columns).sort_values(ascending=False)
importances
# -

100 * clf.score(training_input, target)

for test_size in [0.01, 0.1, 0.2, 0.5, 0.8, 0.9, 0.99]:
    X_train, X_test, y_train, y_test = train_test_split(
        training_input, target, test_size=test_size)
    tclf = RandomForestClassifier(max_depth=3)
    tclf.fit(X_train, y_train)
    score = tclf.score(X_test, y_test)
    print(f'{1 - test_size:.1%} {score:.2%}')
# Random number generator

================================================
FILE: Chapter11/.gitignore
================================================
dask-worker-space
data
mydask.png
x.png

================================================
FILE: Chapter11/Dask_Distributed.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# +
#import dask
#from dask.base import get_scheduler
#import dask.array as da
#
#mosquito = da.from_zarr('data/AG1000G-AO/2L/calldata/GT')
#print(get_scheduler(collections=[mosquito]).__module__)

# +
import zarr

import dask.dataframe as dd
from dask.distributed import Client

#client = Client('127.0.0.1:8786')
client = Client()
client

# +
import numpy as np
import dask.array as da

mosquito = da.from_zarr('data/AG1000G-AO/2L/calldata/GT')
# -

mosquito

mosquito.shape[0]

mosquito = mosquito.rechunk((mosquito.shape[0]//8, 81, 2))

mosquito = mosquito.persist()

mosquito.visualize()

mosquito

mosquito.chunks

def calc_stats(my_chunk):
    num_miss = np.sum(np.equal(my_chunk[0][0][:,:,0], -1), axis=1)
    return num_miss

stats = da.blockwise(calc_stats, 'i', mosquito, 'ijk', dtype=np.uint8)

stats.visualize()

stat_results = stats.compute()

stat_results

================================================
FILE: Chapter11/Dask_Intro.py
================================================
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

import zarr

mosquito = zarr.open('data/AG1000G-AO/2L/calldata/GT')
mosquito

zarr.array(mosquito, chunks=(1 + 48525747 // 4, 81, 2), store='data/rechunk')

mosquito = zarr.open('data/rechunk')
mosquito.chunks

# +
import numpy as np
import dask.array as da

mosquito = da.from_zarr('data/rechunk')
#mosquito =
da.from_zarr('data/AG1000G-AO/2L/calldata/GT') # ^^^ load array # - mosquito print(mosquito[0]) mosquito[0].compute() mosquito.visualize(rankdir='TB') def calc_stats(variant): variant = variant.reshape(variant.shape[0] // 2, 2) num_misses = np.sum(np.equal(variant, -1)) // 2 return num_misses mosquito_2d = mosquito.reshape(mosquito.shape[0], mosquito.shape[1] * mosquito.shape[2]) mosquito_2d.visualize(rankdir='TB') mosquito_2d max_pos = 10000000 stats = da.apply_along_axis( calc_stats, 1, mosquito_2d[:max_pos,:], shape=(max_pos,), dtype=np.int64) stats.visualize('x.png',rankdir='TB') a = stats.compute() a ================================================ FILE: Chapter11/MP_intro.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # # Downloading data # https://malariagen.github.io/vector-data/ag3/download.html # !mkdir -p data/AG1000G-AO/ # !gsutil -m rsync -r \ # -x '.*/calldata/(AD|GQ|MQ)/.*' \ # gs://vo_agam_release/v3/snp_genotypes/all/AG1000G-AO/ \ # data/AG1000G-AO/ > /dev/null # !mkdir -p data/metadata/ # !gsutil -m rsync -r gs://vo_agam_release/v3/metadata/ data/metadata/ # # BLA # + import numpy as np import zarr mosquito = zarr.open('data/AG1000G-AO') print(mosquito.tree()) gt_2l = mosquito['/2L/calldata/GT'] gt_2l.info dir(gt_2l) gt_2l.shape[0] # + from math import ceil from multiprocessing import Pool def calc_stats(my_chunk): num_miss = np.sum(np.equal(my_chunk[:,:,0], -1), axis=1) num_anc_hom = np.sum( np.all([ np.equal(my_chunk[:,:,0], 0), np.equal(my_chunk[:,:,0], my_chunk[:,:,1])], axis=0), axis=1) num_het = np.sum( np.not_equal( my_chunk[:,:,0], my_chunk[:,:,1]), axis=1) return num_miss, num_anc_hom, num_het chunk_pos_size = gt_2l.chunks[0] max_pos = gt_2l.shape[0] intervals = [] for chunk_pos in range(ceil(max_pos / chunk_pos_size)): start_pos = chunk_pos * chunk_pos_size end_pos = min(max_pos + 1, (chunk_pos + 1) * chunk_pos_size) intervals.append((start_pos, end_pos)) def compute_interval(interval): start_pos, end_pos = interval my_chunk = gt_2l[start_pos:end_pos, :, :] num_samples = my_chunk.shape[1] num_miss, num_anc_hom, num_het = calc_stats(my_chunk) chunk_complete_data = np.sum(np.equal(num_miss, 0)) chunk_more_anc_hom = np.sum(num_anc_hom > num_het) return chunk_complete_data, chunk_more_anc_hom with Pool() as p: print(p) chunk_returns = p.map(compute_interval, intervals) complete_data = sum(map(lambda x: x[0], chunk_returns)) more_anc_hom = sum(map(lambda x: x[1], chunk_returns)) print(complete_data, more_anc_hom) # - ================================================ FILE: Chapter11/Zarr_Intro.py ================================================ # --- # jupyter: # jupytext: # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.13.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- # # Downloading data # https://malariagen.github.io/vector-data/ag3/download.html # !mkdir -p data/AG1000G-AO/ # !gsutil -m rsync -r \ # -x '.*/calldata/(AD|GQ|MQ)/.*' \ # gs://vo_agam_release/v3/snp_genotypes/all/AG1000G-AO/ \ # data/AG1000G-AO/ > /dev/null # !mkdir -p data/metadata/ # !gsutil -m rsync -r gs://vo_agam_release/v3/metadata/ data/metadata/ # # BLA # + import numpy as np import zarr mosquito = zarr.open('data/AG1000G-AO') print(mosquito.tree()) # - 
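# The genotype arrays come compressed on disk; a quick, hedged look at how much
# space the codec saves, using standard zarr array attributes (nbytes is the
# uncompressed size, nbytes_stored what actually sits on disk; gt_check is a
# throwaway name, not part of the original recipe):
gt_check = mosquito['/2L/calldata/GT']
print('compression ratio: %.1fx' % (gt_check.nbytes / gt_check.nbytes_stored))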
mosquito['samples']

np.array(mosquito['samples'])

gt_2l = mosquito['/2L/calldata/GT']
gt_2l

gt_2l.info

gt_2l[400000,:,:]

# +
# Do not do np.array(gt_2l)
# -

dir(gt_2l)

gt_2l.shape[0]

# +
from math import ceil

chunk_pos_size = gt_2l.chunks[0]
max_pos = gt_2l.shape[0]

def calc_stats(my_chunk):
    num_miss = np.sum(np.equal(my_chunk[:,:,0], -1), axis=1)
    num_anc_hom = np.sum(
        np.all([
            np.equal(my_chunk[:,:,0], 0),
            np.equal(my_chunk[:,:,0], my_chunk[:,:,1])], axis=0), axis=1)
    num_het = np.sum(
        np.not_equal(
            my_chunk[:,:,0],
            my_chunk[:,:,1]), axis=1)
    return num_miss, num_anc_hom, num_het

complete_data = 0
more_anc_hom = 0
total_pos = 0
for chunk_pos in range(ceil(max_pos / chunk_pos_size)):
    start_pos = chunk_pos * chunk_pos_size
    end_pos = min(max_pos + 1, (chunk_pos + 1) * chunk_pos_size)
    my_chunk = gt_2l[start_pos:end_pos, :, :]
    #print(start_pos, end_pos, my_chunk.shape)
    num_samples = my_chunk.shape[1]
    num_miss, num_anc_hom, num_het = calc_stats(my_chunk)
    chunk_complete_data = np.sum(np.equal(num_miss, 0))
    #print(end_pos - start_pos, my_chunk.shape, num_anc_hom.shape, num_het.shape)
    chunk_more_anc_hom = np.sum(num_anc_hom > num_het)
    print(np.sum(num_anc_hom > num_het))
    complete_data += chunk_complete_data
    more_anc_hom += chunk_more_anc_hom
    total_pos += (end_pos - start_pos)
print(complete_data, more_anc_hom, total_pos)
# -

================================================
FILE: Chapter12/Builtin.py
================================================
import functools

@functools.cache
def fibo(n):
    if n == 0:
        return 0
    if n == 1:
        return 1
    return fibo(n - 1) + fibo(n - 2)

fibo(1000)

def gene_min_reads(source, min_reads):
    return map(
        lambda x: x[0],
        filter(
            lambda x: x[1] >= min_reads,
            source.items()))

list(gene_min_reads({'LCT': 10, 'MRAP2': 1}, 2))

multiplication = lambda x, y: x * y
double = functools.partial(multiplication, 2)
double(3)
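# functools also provides reduce, which folds an iterable into a single value; a
# tiny, hedged sketch in the same spirit as the examples above (the read counts
# are made up for illustration):
from functools import reduce

total_reads = reduce(lambda acc, count: acc + count, {'LCT': 10, 'MRAP2': 1}.values(), 0)
print(total_reads)  # 11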
================================================
FILE: Chapter12/Mutability.py
================================================
import shutil
import pandas as pd


def restore_db(file_name):
    shutil.copyfile(f'{file_name}.base', file_name)


def load(file_name):
    df = pd.read_csv(file_name).set_index('gene')
    return dict(df['count'])


def save(dict_db, file_name):
    pd.Series(dict_db).to_csv(
        file_name, index_label='gene', header=['count'])


def add_sample_dict(dict_db, gene_list):
    for gene in gene_list:
        dict_db[gene] = dict_db.get(gene, 0) + 1


def add_sample_new_dict(dict_db, gene_list):
    my_dict_db = dict(dict_db)  # next recipe
    for gene in gene_list:
        my_dict_db[gene] = my_dict_db.get(gene, 0) + 1
    return my_dict_db


gene_count = load('my_genes.csv')
add_sample_dict(gene_count, ['DEPP'])
new_gene_count = add_sample_new_dict(gene_count, ['DEPP'])


================================================
FILE: Chapter12/Persistence1.py
================================================
import shutil
import pandas as pd


def restore_db(file_name):
    shutil.copyfile(f'{file_name}.base', file_name)


def load(file_name):
    df = pd.read_csv(file_name).set_index('gene')
    return dict(df['count'])


def save(dict_db, file_name):
    pd.Series(dict_db).to_csv(
        file_name, index_label='gene', header=['count'])


def add_sample_csv(gene_list):
    gene_count = load('my_genes.csv')
    for gene in gene_list:
        gene_count[gene] = gene_count.get(gene, 0) + 1
    save(gene_count, 'my_genes.csv')


restore_db('my_genes.csv')
add_sample_csv(['MC4R', 'TYR'])
add_sample_csv(['LCT', 'HLA-A'])
add_sample_csv(['HLA-B', 'HLA-C'])


================================================
FILE: Chapter12/Persistence2.py
================================================
import shutil
import pandas as pd


def restore_db(file_name):
    shutil.copyfile(f'{file_name}.base', file_name)


def load(file_name):
    df = pd.read_csv(file_name).set_index('gene')
    return dict(df['count'])


def save(dict_db, file_name):
    pd.Series(dict_db).to_csv(
        file_name, index_label='gene', header=['count'])


def add_sample_new_dict(dict_db, gene_list):
    my_dict_db = dict(dict_db)  # next recipe
    for gene in gene_list:
        my_dict_db[gene] = my_dict_db.get(gene, 0) + 1
    return my_dict_db


restore_db('my_genes.csv')
gene_count = load('my_genes.csv')
gene_count = add_sample_new_dict(gene_count, ['MC4R', 'TYR'])
gene_count = add_sample_new_dict(gene_count, ['LCT', 'HLA-A'])
gene_count = add_sample_new_dict(gene_count, ['HLA-B', 'HLA-C'])
save(gene_count, 'my_genes.csv')


================================================
FILE: Chapter12/Pure.py
================================================
import shutil
import pandas as pd


def restore_db(file_name):
    shutil.copyfile(f'{file_name}.base', file_name)


def load(file_name):
    df = pd.read_csv(file_name).set_index('gene')
    return dict(df['count'])


def save(dict_db, file_name):
    pd.Series(dict_db).to_csv(
        file_name, index_label='gene', header=['count'])


def add_sample_csv(gene_list):
    gene_count = load('my_genes.csv')
    for gene in gene_list:
        gene_count[gene] = gene_count.get(gene, 0) + 1
    save(gene_count, 'my_genes.csv')


def add_sample_global_dict(gene_list):
    global gene_count
    for gene in gene_list:
        gene_count[gene] = gene_count.get(gene, 0) + 1


def add_sample_dict(dict_db, gene_list):
    for gene in gene_list:
        dict_db[gene] = dict_db.get(gene, 0) + 1


gene_count = load('my_genes.csv')
add_sample_csv(['MC4R', 'TYR'])
add_sample_dict(gene_count, ['MC4R', 'TYR'])
save(gene_count, 'my_genes.csv')


================================================
FILE: Chapter12/Recursion.py
================================================
def fibo_iter(n):
    if n < 2:
        return n
    last = 1
    second_last = 0
    for _i in range(2, n + 1):
        result = second_last + last
        second_last = last
        last = result
    return result


def fibo_naive(n):
    if n == 0:
        return 0
    if n == 1:
        return 1
    return fibo_naive(n - 1) + fibo_naive(n - 2)


fibo_iter(0)
fibo_iter(1)
fibo_iter(2)
fibo_iter(3)
fibo_iter(4)
fibo_iter(5)
fibo_iter(6)

# fibo_naive(1000)  # do not run: the naive recursion is exponential and would never finish


def factorial(n):
    if n == 1:
        return 1
    return n * factorial(n - 1)


factorial(5)

# factorial(20000)  # raises RecursionError: 20,000 nested calls, far above CPython's default limit of 1000
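# Editor's sketch (not part of the original recipe): the RecursionError noted
# above comes from CPython's recursion limit, not from the algorithm itself.
# Raising the limit lets the same definition finish, although a very deep
# Python stack can still exhaust the underlying C stack on some platforms.
# sys.setrecursionlimit is standard library; the limit value below is an
# arbitrary illustrative choice.
import sys

sys.setrecursionlimit(30000)
print(len(str(factorial(20000))))  # print the digit count rather than the tens-of-thousands-digit number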
================================================
FILE: Chapter12/Tools.py
================================================
import functools


def fibo_iter(n):
    if n < 2:
        return n
    last = 1
    second_last = 0
    for _i in range(2, n + 1):
        result = second_last + last
        second_last = last
        last = result
    return result


def fibo_naive(n):
    if n == 0:
        return 0
    if n == 1:
        return 1
    return fibo_naive(n - 1) + fibo_naive(n - 2)


@functools.lru_cache
def fibo(n):
    if n == 0:
        return 0
    if n == 1:
        return 1
    return fibo(n - 1) + fibo(n - 2)


# `%time` is an IPython magic, so run these in IPython/Jupyter:
# %time fibo_iter(100)
# %time fibo_naive(1000)  # left commented out: it would never finish
# %time fibo(1000)


def factorial(n):
    if n == 1:
        return 1
    return n * factorial(n - 1)


# factorial(20000)  # raises RecursionError: CPython's default recursion limit is far lower than 20,000
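# Editor's sketch (not part of the original recipe): lru_cache-decorated
# functions expose their memoization counters through cache_info(), which is
# part of functools and makes the saving over fibo_naive measurable.
fibo(100)
print(fibo.cache_info())  # cold cache: 101 misses (n = 0..100), plus hits for every reused subproblem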
================================================
FILE: Chapter12/my_genes.csv
================================================
gene,count
LCT,5
LEPR,4
MRAP2,1


================================================
FILE: Chapter12/my_genes.csv.base
================================================
gene,count
LCT,5
LEPR,4
MRAP2,1


================================================
FILE: Datasets.py
================================================
# # Datasets for the book
#
# Here we provide links to the datasets used in the book.
#
# Important Notes:
#
# 1. These datasets are provided on external servers by third parties
#
# # Python and the Surrounding Software Ecology
#
# ## R sections
#
# http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/20130502.phase3.sequence.index
#
# # PDB
#
# ## Parsing mmCIF files with Biopython
#
# [1TUP.cif](http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=cif&compression=NO&structureId=1TUP)


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2021 Packt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# Bioinformatics-with-Python-Cookbook-third-edition

Bioinformatics with Python Cookbook - Third Edition

This is the code repository for [Bioinformatics with Python Cookbook - Third Edition](https://www.packtpub.com/product/bioinformatics-with-python-cookbook-third-edition/9781803236421), published by Packt.

**Use modern Python libraries and applications to solve real-world computational biology problems**

## What is this book about?

Bioinformatics is an active research field that uses a range of simple-to-advanced computations to extract valuable information from biological data, and this book will show you how to manage these tasks using Python.

This updated third edition of the Bioinformatics with Python Cookbook begins with a quick overview of the various tools and libraries in the Python ecosystem that will help you convert, analyze, and visualize biological datasets. Next, you'll cover key techniques for next-generation sequencing, single-cell analysis, genomics, metagenomics, population genetics, phylogenetics, and proteomics with the help of real-world examples. You'll learn how to work with important pipeline systems, such as Galaxy servers and Snakemake, and understand the various modules in Python for functional and asynchronous programming. This book will also help you explore topics such as SNP discovery using statistical approaches under high-performance computing frameworks, including Dask and Spark. In addition, you'll explore the application of machine learning algorithms in bioinformatics.

By the end of this bioinformatics Python book, you'll be equipped with the knowledge you need to implement the latest programming techniques and frameworks, empowering you to deal with bioinformatics data at every scale.

This book covers the following exciting features:
* Become well-versed with data processing libraries such as NumPy, pandas, Arrow, and Zarr in the context of bioinformatic analysis
* Interact with genomic databases
* Solve real-world problems in the fields of population genetics, phylogenetics, and proteomics
* Build bioinformatics pipelines using a Galaxy server and Snakemake
* Work with functools and itertools for functional programming
* Perform parallel processing with Dask on biological data
* Explore principal component analysis (PCA) techniques with scikit-learn

If you feel this book is for you, get your [copy](https://www.amazon.in/Bioinformatics-Python-Cookbook-bioinformatics-computational/dp/1789344697/ref=sr_1_2?keywords=Bioinformatics+with+Python+Cookbook+-+Third+Edition&qid=1665382032&sr=8-2) today!

https://www.packtpub.com/

## Instructions and Navigations

All of the code is organized into folders. The code will look like the following:
```
from Bio import SeqIO
genome_name = 'PlasmoDB-9.3_Pfalciparum3D7_Genome.fasta'
recs = SeqIO.parse(genome_name, 'fasta')
for rec in recs:
    print(rec.description)
```

**Following is what you need for this book:**
This book is for bioinformatics analysts, data scientists, computational biologists, researchers, and Python developers who want to address intermediate-to-advanced biological and bioinformatics problems. Working knowledge of the Python programming language is expected. Basic knowledge of biology will also be helpful. With the following software and hardware list you can run all code files present in the book (Chapters 1-12).

### Software and Hardware List

| Chapter | Software required | OS required |
| ------- | ----------------------------- | ----------- |
| 1-12 | Python 3.9 | Any OS |
| 1-12 | NumPy, pandas, and Matplotlib | Any OS |
| 1-12 | Biopython | Any OS |
| 1-12 | Dask, Zarr, scikit-learn | Any OS |

We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://packt.link/3KQQO).

## Get to Know the Author

**Tiago Antao** is a bioinformatician who is currently working in the field of genomics.
A former computer scientist, Tiago moved into computational biology with an MSc in bioinformatics from the Faculty of Sciences at the University of Porto, Portugal, and a PhD on the spread of drug-resistant malaria from the Liverpool School of Tropical Medicine, UK. After his doctorate, Tiago worked with human datasets at the University of Cambridge, UK, and with mosquito whole-genome sequencing data at the University of Oxford, UK, before helping to set up the bioinformatics infrastructure at the University of Montana, USA. He currently works as a data engineer in the biotechnology field in Boston, MA. He is one of the co-authors of Biopython, a major bioinformatics package written in Python.

### Download a free PDF

If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.

https://packt.link/free-ebook/9781803236421

================================================
FILE: Welcome.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3040340b-bd0b-4266-a7a6-8b48d9a94625",
   "metadata": {},
   "source": [
    "# Python for Bioinformatics\n",
    "\n",
    "## Datasets\n",
    "\n",
    "[Click here](Datasets.py) for the datasets used in the book. You only need this if you do not use the notebooks (as the notebooks will take care of the data)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee3697db-cdfe-41c2-ae06-8dc1633b5701",
   "metadata": {},
   "source": [
    "## Python and the surrounding software ecology\n",
    "\n",
    "- [Interfacing with R](Chapter01/Interfacing_R.py)\n",
    "- [R Magic](Chapter01/R_magic.py)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b2663bc-8efe-4bb0-9ac5-f9e2eb09cc5e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "",
   "name": ""
  },
  "language_info": {
   "name": ""
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: docker/Chapter01/Dockerfile
================================================
FROM tiagoantao/bio3
MAINTAINER Tiago Antao

# RUN conda create -n bioinformatics_r --clone bioinformatics_base
#RUN conda install -n bioinformatics_r r-base=4.1.3 r-ggplot2=3.3.5 r-lazyeval=0.2.2 r-gridextra=2.3 rpy2
RUN conda create -n bioinformatics_r jupyterlab jupytext pandas
RUN conda install -n bioinformatics_r r-base r-ggplot2 r-lazyeval r-gridextra rpy2

CMD conda run --no-capture-output -n bioinformatics_r jupyter-lab --ip=0.0.0.0 --no-browser --allow-root --port=9875 --NotebookApp.token='' --NotebookApp.password=''


================================================
FILE: docker/main/Dockerfile
================================================
FROM continuumio/anaconda3:2021.05
MAINTAINER Tiago Antao

#ENV DEBIAN_FRONTEND noninteractive
#RUN apt-get update && apt-get upgrade -y && apt-get install -y git wget build-essential unzip graphviz libgraphviz-dev pkg-config swig libx11-dev libgsl0-dev libopenblas-dev liblapacke-dev
#RUN apt-get install -y samtools mafft muscle raxml tabix

RUN git clone https://github.com/PacktPublishing/Bioinformatics-with-Python-Cookbook-third-Edition.git

#RUN conda upgrade -n base conda
RUN conda config --add channels conda-forge
RUN conda config --add channels bioconda
RUN conda create -n bioinformatics_base --file /Bioinformatics-with-Python-Cookbook-third-Edition/Chapter01/bioinformatics_base.txt
RUN pip install pyarrow==8.0.0
RUN conda init bash

EXPOSE 9875

WORKDIR /Bioinformatics-with-Python-Cookbook-third-Edition
RUN echo setterm -foreground magenta >> /etc/bash.bashrc

CMD conda run --no-capture-output -n bioinformatics_base jupyter-lab --ip=0.0.0.0 --no-browser --allow-root --port=9875 --NotebookApp.token='' --NotebookApp.password=''
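# Editor's note - typical usage, as a sketch (the image tag "bio3e" is an
# arbitrary illustrative choice, not from the book):
#   docker build -t bio3e -f docker/main/Dockerfile .
#   docker run -p 9875:9875 bio3e
# JupyterLab then answers on http://localhost:9875, matching the EXPOSE and
# CMD lines above.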