Repository: franciscozorrilla/metaGEM Branch: master Commit: 8609ad6bcf04 Files: 40 Total size: 285.4 KB Directory structure: gitextract_s0fltzsu/ ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug-.md │ │ └── question-.md │ └── PULL_REQUEST_TEMPLATE/ │ └── PR.md ├── .snakemake-workflow-catalog.yml ├── .travis.yml ├── CITATION.bib ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── config/ │ ├── README.md │ ├── cluster_config.json │ └── config.yaml └── workflow/ ├── Snakefile ├── envs/ │ ├── metaGEM_env.yml │ ├── metaGEM_env_long.yml │ ├── metaWRAP_env.yml │ └── prokkaroary_env.yml ├── metaGEM.sh ├── rules/ │ ├── Snakefile_experimental.smk.py │ ├── Snakefile_single_end.smk.py │ ├── kallisto2concoctTable.smk │ ├── maxbin_single.smk │ └── metabat_single.smk └── scripts/ ├── assemblyVis.R ├── assemblyVis_alternative.R ├── binFilter.py ├── binningVis.R ├── binningVis_perSample.R ├── compositionVis.R ├── compositionVis_old.R ├── download_toydata.txt ├── env_setup.sh ├── kallisto2concoct.py ├── media_db.tsv ├── modelVis.R ├── prepareRoaryInput.R ├── prepareRoaryInputGTDBtk.R ├── prepareRoaryInput_old.R ├── qfilterVis.R └── taxonomyVis.R ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug-.md ================================================ --- name: Bug about: Have you used metaGEM and found some strange behavior or unexpected output? Please post your bug reports here! title: "[Bug]: _______________ " labels: bug assignees: franciscozorrilla --- Please include any and all relevant information here, for example: * What tool and/or Snakefile rule are causing problems? * What steps have you taken so far? * Provide any relevant error messages: ``` ``` Before posting, please search for key terms in the issues section as it is likely that your problem may have already been addressed to a certain extent. 
If not, please continue and post a new issue. ================================================ FILE: .github/ISSUE_TEMPLATE/question-.md ================================================ --- name: Question? about: Are you wondering if metaGEM is appropriate for your particular case? Or maybe you have analyzed some data using metaGEM and need help understanding the output? Post your questions here! title: "[Question]: _______________ ?" labels: question assignees: franciscozorrilla --- Please include any and all relevant information here, for example: * What are you trying to achieve? i.e. What question are you trying to answer? * What microbial community are you studying? e.g. Human gut, free living soil, synthetic lab culture? * What steps have you taken so far? e.g. Assembled reads with MEGAHIT but cannot get binner X to work. * Provide any relevant error messages: ``` ``` Before posting, please search for key terms in the issues section as it is likely that your problem may have already been addressed to a certain extent. If not, please continue and post a new issue. ================================================ FILE: .github/PULL_REQUEST_TEMPLATE/PR.md ================================================ # ✏️ Description Please include a summary of the changes and the related issue, as well as relevant motivation and context. List any dependencies that are required for this change. Fixes # (issue) ## Type of change Please delete options that are not relevant. - [ ] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) - [ ] This change requires a documentation update # 📐 How Has This Been Tested? Please describe the tests that you ran to verify your changes. Also provide instructions so we can reproduce, including any relevant details for your test configuration. 
Delete options that are not relevant or add new options as necessary. - [ ] Test A - [ ] Test B **Test Configuration**: * Tool X version: * Tool Y version: * Snakemake version: * OS: # 🐍 Snakefile chores Please explain how Snakefile rules were created, modified, or expanded to provide new functionalities or bugfixes. Delete options that are not relevant or add new options as necessary. - [ ] create Snakefile rule for ... - [ ] modified Snakefile rule for ... - [ ] create helper rule ... # 🔨 Additional chores Please ensure that the appropriate config files + wrapper script have been modified to support new functionalies or bugfixes. Delete options that are not relevant or add new options as necessary. - [ ] metaGEM.sh: add options + support for new tasks - [ ] config.yaml: add params used in new tasks - [ ] config readme file: update - [ ] conda recipe: update - [ ] main readme file: text/figure # 📝 Final checklist Delete options that are not relevant or add new options as necessary. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have made corresponding changes to the documentation - [ ] My changes generate no new warnings - [ ] I have added tests that prove my fix is effective or that my feature works ================================================ FILE: .snakemake-workflow-catalog.yml ================================================ usage: mandatory-flags: # optional definition of additional flags desc: # describe your flags here in a few sentences (they will be inserted below the example commands) flags: # put your flags here software-stack-deployment: # definition of software deployment method (at least one of conda, singularity, or singularity+conda) conda: false # whether pipeline works with --use-conda singularity: false # whether pipeline works with --use-singularity singularity+conda: false # whether pipeline works 
with --use-singularity --use-conda report: false # add this to confirm that the workflow allows to use 'snakemake --report report.zip' to generate a report containing all results and explanations ================================================ FILE: .travis.yml ================================================ language: python python: # We don't actually use the Travis Python, but this keeps it organized. - "3.8.6" install: # set up conda - sudo apt-get update - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; - bash miniconda.sh -b -p $HOME/miniconda - export PATH="$HOME/miniconda/bin:$PATH" - hash -r - conda config --set always_yes yes --set changeps1 no - conda update -q conda # clone metagem repo - git clone https://github.com/franciscozorrilla/metaGEM.git && cd metaGEM # set up mamba - travis_wait 30 conda create --quiet --prefix ./envs/mamba mamba -c conda-forge && source activate envs/mamba # set up metagem - travis_wait 30 mamba env create --quiet --prefix ./envs/metagem -f envs/metaGEM_env.yml - source activate envs/metagem # This causes TRAVIS CI specific error, something to do with the setuptools version - pip install --quiet --user memote carveme smetana - source deactivate && source activate envs/mamba # set up metawrap - travis_wait 30 mamba env create --quiet --prefix ./envs/metawrap -f envs/metaWRAP_env.yml # set up prokka-roary - travis_wait 30 mamba env create --quiet --prefix ./envs/prokkaroary -f envs/prokkaroary_env.yml # set root dir - echo -e "Setting current directory to root in config.yaml file ... \n" && root=$(pwd) && sed -i "2s~/.*$~$root~" config.yaml # set scratch dir - mkdir -p tmp - echo -e "Setting tmp directory in config.yaml file ... 
\n" && scratch=$(pwd|sed 's|$|/tmp|g') && sed -i "3s~/.*$~$scratch~" config.yaml script: - source activate envs/metagem # run createFolders - snakemake createFolders -j1 # run downloadToy - snakemake downloadToy -j1 # run fastp - snakemake all -j1 ================================================ FILE: CITATION.bib ================================================ @article{10.1093/nar/gkab815, author = {Zorrilla, Francisco and Buric, Filip and Patil, Kiran R and Zelezniak, Aleksej}, title = "{metaGEM: reconstruction of genome scale metabolic models directly from metagenomes}", journal = {Nucleic Acids Research}, volume = {49}, number = {21}, pages = {e126-e126}, year = {2021}, month = {10}, abstract = "{Metagenomic analyses of microbial communities have revealed a large degree of interspecies and intraspecies genetic diversity through the reconstruction of metagenome assembled genomes (MAGs). Yet, metabolic modeling efforts mainly rely on reference genomes as the starting point for reconstruction and simulation of genome scale metabolic models (GEMs), neglecting the immense intra- and inter-species diversity present in microbial communities. Here, we present metaGEM (https://github.com/franciscozorrilla/metaGEM), an end-to-end pipeline enabling metabolic modeling of multi-species communities directly from metagenomes. The pipeline automates all steps from the extraction of context-specific prokaryotic GEMs from MAGs to community level flux balance analysis (FBA) simulations. To demonstrate the capabilities of metaGEM, we analyzed 483 samples spanning lab culture, human gut, plant-associated, soil, and ocean metagenomes, reconstructing over 14,000 GEMs. We show that GEMs reconstructed from metagenomes have fully represented metabolism comparable to isolated genomes. We demonstrate that metagenomic GEMs capture intraspecies metabolic diversity and identify potential differences in the progression of type 2 diabetes at the level of gut bacterial metabolic exchanges. 
Overall, metaGEM enables FBA-ready metabolic model reconstruction directly from metagenomes, provides a resource of metabolic models, and showcases community-level modeling of microbiomes associated with disease conditions allowing generation of mechanistic hypotheses.}", issn = {0305-1048}, doi = {10.1093/nar/gkab815}, url = {https://doi.org/10.1093/nar/gkab815}, eprint = {https://academic.oup.com/nar/article-pdf/49/21/e126/41503923/gkab815.pdf}, } ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 
## Our Standards Examples of behavior that contributes to a positive environment for our community include: * Demonstrating empathy and kindness toward other people * Being respectful of differing opinions, viewpoints, and experiences * Giving and gracefully accepting constructive feedback * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience * Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: * The use of sexualized language or imagery, and sexual attention or advances of any kind * Trolling, insulting or derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or email address, without their explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. ## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 
## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at fz274@cam.ac.uk. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact**: A violation through a single incident or series of actions. **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. 
Permanent Ban **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. **Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2021 Francisco Zorrilla Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # 💎 `metaGEM` > **Note** > An easy-to-use workflow for generating context specific genome-scale metabolic models and predicting metabolic interactions within microbial communities directly from metagenomic data. [![Nucleic Acids Research](https://img.shields.io/badge/Nucleic%20Acids%20Research-10.1093%2Fnar%2Fgkab815-critical)](https://academic.oup.com/nar/advance-article/doi/10.1093/nar/gkab815/6382386) [![bioRxiv](https://img.shields.io/badge/bioRxiv-10.1101%2F2020.12.31.424982%20-B31B1B)](https://www.biorxiv.org/content/10.1101/2020.12.31.424982v2.full) [![Build Status](https://app.travis-ci.com/franciscozorrilla/metaGEM.svg?branch=master)](https://app.travis-ci.com/github/franciscozorrilla/metaGEM) [![GitHub license](https://img.shields.io/github/license/franciscozorrilla/metaGEM)](https://github.com/franciscozorrilla/metaGEM/blob/master/LICENSE) [![Snakemake](https://img.shields.io/badge/Snakemake->=5.10.0,<5.31.1-green)](https://snakemake.readthedocs.io/en/stable/project_info/history.html#id407) [![Anaconda-Server Badge](https://anaconda.org/bioconda/metagem/badges/downloads.svg)](https://anaconda.org/bioconda/metagem) [![Gitter chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://gitter.im/metaGEM/community) [![DOI](https://img.shields.io/badge/Zenodo-10.5281%2F4707723-blue)](https://zenodo.org/badge/latestdoi/137376259) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1I1S8AoGuJ9Oc2292vqAGTDmZcbnolbuj#scrollTo=awiAaVwSF5Fz) [![Anaconda-Server 
Badge](https://anaconda.org/bioconda/metagem/badges/version.svg)](https://anaconda.org/bioconda/metagem) [![Anaconda-Server Badge](https://anaconda.org/bioconda/metagem/badges/latest_release_date.svg)](https://anaconda.org/bioconda/metagem) ![metawrapfigs_final4 001](https://user-images.githubusercontent.com/35606471/116543667-0d0f8f00-a8e6-11eb-835c-bc1fe935f43e.png) `metaGEM` is a Snakemake workflow that integrates an array of existing bioinformatics and metabolic modeling tools, for the purpose of predicting metabolic interactions within bacterial communities of microbiomes. From whole metagenome shotgun datasets, metagenome assembled genomes (MAGs) are reconstructed, which are then converted into genome-scale metabolic models (GEMs) for *in silico* simulations. Additional outputs include abundance estimates, taxonomic assignment, growth rate estimation, pangenome analysis, and eukaryotic MAG identification. ## ⚙️ Installation You can start using `metaGEM` on your cluster with just one line of code with the [mamba package manager](https://github.com/mamba-org/mamba) ``` mamba create -n metagem -c bioconda metagem ``` This will create an environment called `metagem` and start installing dependencies. Please consult the `config/README.md` page for more detailed setup instructions. 
[![installation](https://img.shields.io/badge/metaGEM-config-%2331a354)](https://github.com/franciscozorrilla/metaGEM/tree/master/config) ## 🔧 Usage Clone this repo ``` git clone https://github.com/franciscozorrilla/metaGEM.git && cd metaGEM/workflow ``` Run `metaGEM` without any arguments to see usage instructions: ``` bash metaGEM.sh ``` ``` Usage: bash metaGEM.sh [-t|--task TASK] [-j|--nJobs NUMBER OF JOBS] [-c|--cores NUMBER OF CORES] [-m|--mem GB RAM] [-h|--hours MAX RUNTIME] [-l|--local] Options: -t, --task Specify task to complete: SETUP createFolders downloadToy organizeData check CORE WORKFLOW fastp megahit crossMapSeries kallistoIndex crossMapParallel kallisto2concoct concoct metabat maxbin binRefine binReassemble extractProteinBins carveme memote organizeGEMs smetana extractDnaBins gtdbtk abundance BONUS grid prokka roary eukrep eukcc VISUALIZATION (in development) stats qfilterVis assemblyVis binningVis taxonomyVis modelVis interactionVis growthVis -j, --nJobs Specify number of jobs to run in parallel -c, --nCores Specify number of cores per job -m, --mem Specify memory in GB required for job -h, --hours Specify number of hours to allocated to job runtime -l, --local Run jobs on local machine for non-cluster usage ``` ## 🧉 Try it now You can set up and use `metaGEM` on the cloud by following along the google colab notebook. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1I1S8AoGuJ9Oc2292vqAGTDmZcbnolbuj#scrollTo=awiAaVwSF5Fz) Please note that google colab does not provide the computational resources necessary to fully run `metaGEM` on a real dataset. This notebook demonstrates how to set up and use `metaGEM` by perfoming the first steps in the workflow on a toy dataset. ## 💩 Tutorials `metaGEM` can be used to explore your own gut microbiome sequencing data from at-home-test-kit services such as [unseen bio](https://unseenbio.com/). 
The following tutorial showcases the `metaGEM` workflow on two unseenbio samples. [![Tutorial](https://img.shields.io/badge/metaGEM-Tutorial-%23d8b365)](https://github.com/franciscozorrilla/unseenbio_metaGEM) For an introductory metabolic modeling tutorial, refer to the resources compiled for the [EMBOMicroCom: Metabolite and species dynamics in microbial communities](https://www.embl.org/about/info/course-and-conference-office/events/mcd22-01/) workshop in 2022. [![Tutorial3](https://img.shields.io/badge/MicroCom-Tutorial-green)](https://github.com/franciscozorrilla/EMBOMicroCom) For a more advanced tutorial, check out the resources we put together for the [SymbNET: from metagenomics to metabolic interactions](https://www.ebi.ac.uk/training/events/symbnet-2022/) course in 2022. [![Tutorial2](https://img.shields.io/badge/SymbNET-Tutorial-red)](https://github.com/franciscozorrilla/SymbNET) ## 🏛️ Wiki Refer to the wiki for additional usage tips, frequently asked questions, and implementation details. [![wiki](https://img.shields.io/badge/metaGEM-Wiki-blue)](https://github.com/franciscozorrilla/metaGEM/wiki) ## 📦 Datasets * You can access the metaGEM-generated results for the publication [here](https://github.com/franciscozorrilla/metaGEM_paper). ``` 🧪 Small communities of gut microbes from lab cultures 💩 Real gut microbiome samples from Swedish diabetes paper 🪴 Plant-associated soil samples from Chinese rhizobiome study 🌏 Bulk-soil samples from Australian biodiversity analysis 🌊 Ocean water samples from global TARA Oceans expeditions ``` * Additionally, you can access metaGEM-generated results from a reanalysis of recently published ancient metagenomes [here](https://zenodo.org/record/7414438#.Y5HSFYLP3bs). ## 🐍 Workflow ### Core 1. Quality filter reads with [fastp](https://github.com/OpenGene/fastp) 2. Assembly with [megahit](https://github.com/voutcn/megahit) 3. 
Draft bin sets with [CONCOCT](https://github.com/BinPro/CONCOCT), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and [MetaBAT2](https://bitbucket.org/berkeleylab/metabat) 4. Refine & reassemble bins with [metaWRAP](https://github.com/bxlab/metaWRAP) 5. Taxonomic assignment with [GTDB-tk](https://github.com/Ecogenomics/GTDBTk) 6. Relative abundances with [bwa](https://github.com/lh3/bwa) and [samtools](https://github.com/samtools/samtools) 7. Reconstruct & evaluate genome-scale metabolic models with [CarveMe](https://github.com/cdanielmachado/carveme) and [memote](https://github.com/opencobra/memote) 8. Species metabolic coupling analysis with [SMETANA](https://github.com/cdanielmachado/smetana) ### Bonus 9. Growth rate estimation with [GRiD](https://github.com/ohlab/GRiD), [SMEG](https://github.com/ohlab/SMEG) or [CoPTR](https://github.com/tyjo/coptr) 10. Pangenome analysis with [roary](https://github.com/sanger-pathogens/Roary) 11. Eukaryotic draft bins with [EukRep](https://github.com/patrickwest/EukRep) and [EukCC](https://github.com/Finn-Lab/EukCC) ## 🏗️ Active Development If you want to see any new or alternative tools incorporated into the `metaGEM` workflow please raise an issue or create a pull request. Snakemake allows workflows to be very flexible, so adding new rules is as easy as filling out the following template and adding it to the Snakefile: ``` rule package-name: input: rules.rulename.output output: f'{config["path"]["root"]}/{config["folder"]["X"]}/{{IDs}}/output.file' message: """ Helpful and descriptive message detailing goal of this rule/package. 
""" shell: """ # Well documented command line instructions go here # Load conda environment set +u;source activate {config[envs][package]};set -u; # Run tool package-name -i {input} -o {output} """ ``` ## 🖇️ Publications The `metaGEM` workflow has been used in multiple studies, including the following non-exhaustive list: ``` Plastic-degrading potential across the global microbiome correlates with recent pollution trends J Zrimec, M Kokina, S Jonasson, F Zorrilla, A Zelezniak MBio, 2021 ``` ``` Competition-cooperation in the chemoautotrophic ecosystem of Movile Cave: first metagenomic approach on sediments Chiciudean, I., Russo, G., Bogdan, D.F. et al. Environmental Microbiome, 2022 ``` ``` The National Ecological Observatory Network’s soil metagenomes: assembly and basic analysis Werbin ZR, Hackos B, Lopez-Nava J et al. F1000Research, 2022 ``` ``` Microbial interactions shape cheese flavour formation Melkonian, C., Zorrilla, F., Kjærbølling, I. et al. Nature Communications, 2023 ``` ## 🍾 Please cite ``` metaGEM: reconstruction of genome scale metabolic models directly from metagenomes Francisco Zorrilla, Filip Buric, Kiran R Patil, Aleksej Zelezniak Nucleic Acids Research, 2021; gkab815, https://doi.org/10.1093/nar/gkab815 ``` [![Nucleic Acids Research](https://img.shields.io/badge/Nucleic%20Acids%20Research-10.1093%2Fnar%2Fgkab815-critical)](https://academic.oup.com/nar/advance-article/doi/10.1093/nar/gkab815/6382386) ## ⭐ Star History Star History Chart ## 📲 Contact Please reach out with any comments, concerns, or discussions regarding `metaGEM`. 
[![Gitter chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://gitter.im/metaGEM/community) [![Twitter](https://img.shields.io/badge/Twitter-%40metagenomez-lightblue)](https://twitter.com/metagenomez) [![LinkedIn](https://img.shields.io/badge/LinkedIn-fzorrilla94-blue)](https://www.linkedin.com/in/fzorrilla94/) [![email](https://img.shields.io/badge/email-fz274%40cam.ac.uk-%23a6bddb)](fz274@cam.ac.uk) ================================================ FILE: config/README.md ================================================ # 💎 Setup guide ## 🔩 Config files Make sure to inspect and set up the two config files in this folder. ### Snakemake configuration `config.yaml`: handles all the tunable parameters, subfolder names, paths, and more. The `root` path is automatically set by the `metaGEM.sh` parser to be the current working directory. Most importantly, you should make sure that the `scratch` path is properly configured. Most clusters have a location for temporary or high I/O operations such as `$TMPDIR` or `$SCRATCH`, e.g. [see here](https://www.c3se.chalmers.se/documentation/filesystem/#using-node-local-disk-tmpdir). Please refer to the config.yaml [wiki page](https://github.com/franciscozorrilla/metaGEM/wiki/Snakefile-config) for a more in depth look at this config file. ### Cluster configuration `cluster_config.json`: handles parameters for submitting jobs to the cluster workload manager. Most importantly, you should make sure that the `account` is properly defined to be able to submit jobs to your cluster. Please refer to the cluster_config.json wiki page for a more in depth look at this config file. ## 🛢️ Environments Conda can take *ages* to solve environment dependencies when installing many tools at once, we can use [mamba](https://github.com/mamba-org/mamba) instead for faster installation. Set up two environments: 1. `metagem`: Contains most `metaGEM` core workflow tools, Python 3 & Snakemake>=5.10.0,<5.31.1 2. 
`metawrap`: Contains `metaWRAP` and its dependencies, Python 2 ### 1. metaGEM Clone metaGEM repo ``` git clone https://github.com/franciscozorrilla/metaGEM.git ``` Move into metaGEM/workflow folder ``` cd metaGEM/workflow ``` Clean up ~250 Mb of unnecessary git history files ``` rm -r ../.git ``` Press `y` and `Enter` when prompted to remove write-protected files, these are not necessary and just eat your precious space. ``` rm: remove write-protected regular file ‘.git/objects/pack/pack-f4a65f7b63c09419a9b30e64b0e4405c524a5b35.pack’? y rm: remove write-protected regular file ‘.git/objects/pack/pack-f4a65f7b63c09419a9b30e64b0e4405c524a5b35.idx’? y ``` Recommended method: install from bioconda ``` conda config --add channels conda-forge && mamba create --prefix envs/metagem -c bioconda metagem ``` Alternative method: create metaGEM env using recipe .yml file ``` mamba env create --prefix ./envs/metagem -f envs/metaGEM_env.yml ``` Activate metaGEM env ``` source activate envs/metagem ``` Install pip tools ``` pip install --user memote carveme smetana ``` ### 2. metaWRAP It is best to set up `metaWRAP` in its own isolated environment to prevent version conflicts with `metaGEM`. Note that `metaWRAP v1.3.2` has not migrated from python 2 to python 3 yet. ``` conda create -n metawrap source activate metawrap conda install -c ursky metawrap-mg=1.3.2 ``` Or using the conda recipe file: ``` mamba env create --prefix ./envs/metawrap -f envs/metaWRAP_env.yml ``` ## 🔮 Check installation To make sure that the basics have been properly configured, run the `check` task using the `metaGEM.sh` parser: ``` bash metaGEM.sh -t check ``` This will check if conda is installed/available and verify that the environments were properly set up. Additionally, this `check` function will prompt you to create results folders if they are not already present. 
Finally, this task will check if any sequencing files are present in the dataset folder, prompting the user to either organize already existing files into sample-specific subfolders or to download a small [toy dataset](https://zenodo.org/record/3534949/). ## Tools requiring additional configuration > **Warning** Please note that you will need to set up the following tools/databases to run the complete core metaGEM workflow: ### 1. CheckM `CheckM` is used extensively within the `metaWRAP` modules to evaluate the output of various intermediate steps. Although the `CheckM` package is installed in the `metawrap` environment, the user is required to download the `CheckM` database and run `checkm data setRoot ` as outlined in the [`CheckM` installation guide](https://github.com/Ecogenomics/CheckM/wiki/Installation#how-to-install-checkm). ### 2. GTDB-Tk `GTDB-Tk` is used for taxonomic assignment of MAGs, and requires a database to be downloaded and configured. Please refer to the [installation documentation](https://ecogenomics.github.io/GTDBTk/installing/index.html) for detailed instructions. ### 3. CPLEX Unfortunately `CPLEX` cannot be automatically installed in the `env_setup.sh` script; you must install this dependency manually within the metagem conda environment. GEM reconstruction and GEM community simulations require the `IBM CPLEX solver`, which is [free to download with an academic license](https://www.ibm.com/academic/home). Refer to the [`CarveMe`](https://carveme.readthedocs.io/en/latest/installation.html) and [`SMETANA`](https://smetana.readthedocs.io/en/latest/installation.html) installation instructions for further information or troubleshooting. Note: `CPLEX v.12.8` is recommended. 
================================================ FILE: config/cluster_config.json ================================================ { "__default__" : { "account" : "your-account-name", "time" : "0-06:00:00", "n" : 48, "tasks" : 1, "mem" : "180G", "name" : "DL.{rule}", "output" : "logs/{wildcards}.%N.{rule}.out.log" } } ================================================ FILE: config/config.yaml ================================================ path: root: /path/to/project/folder/on/your/cluster scratch: /path/to/temporary/or/scratch/directory/for/intermediate/files folder: data: dataset logs: logs assemblies: assemblies scripts: scripts crossMap: crossMap concoct: concoct maxbin: maxbin metabat: metabat refined: refined_bins reassembled: reassembled_bins classification: GTDBTk abundance: abundance GRiD: GRiD GEMs: GEMs SMETANA: SMETANA memote: memote qfiltered: qfiltered stats: stats proteinBins: protein_bins dnaBins: dna_bins pangenome: pangenome kallisto: kallisto kallistoIndex: kallistoIndex benchmarks: benchmarks prodigal: prodigal blastp: blastp blastp_db: blastp_db scripts: kallisto2concoct: kallisto2concoct.py prepRoary: prepareRoaryInput.R binFilter: binFilter.py qfilterVis: qfilterVis.R assemblyVis: assemblyVis.R binningVis: binningVis.R modelVis: modelVis.R compositionVis: compositionVis.R taxonomyVis: taxonomyVis.R carveme: media_db.tsv toy: download_toydata.txt GTDBtkVis: cores: fastp: 4 megahit: 48 crossMap: 48 concoct: 48 metabat: 48 maxbin: 48 refine: 48 reassemble: 48 classify: 2 gtdbtk: 48 abundance: 16 carveme: 4 smetana: 12 memote: 4 grid: 24 prokka: 2 roary: 12 diamond: 12 params: cutfasta: 10000 assemblyPreset: meta-sensitive assemblyMin: 1000 concoct: 800 metabatMin: 50000 seed: 420 minBin: 1500 refineMem: 1600 refineComp: 50 refineCont: 10 reassembleMem: 1600 reassembleComp: 50 reassembleCont: 10 carveMedia: M8 smetanaMedia: M1,M2,M3,M4,M5,M7,M8,M9,M10,M11,M13,M14,M15A,M15B,M16 smetanaSolver: CPLEX roaryI: 90 roaryCD: 90 envs: metagem: 
envs/metagem metawrap: envs/metawrap prokkaroary: envs/prokkaroary ================================================ FILE: workflow/Snakefile ================================================ configfile: "../config/config.yaml" import os import glob def get_ids_from_path_pattern(path_pattern): ids = sorted([os.path.basename(os.path.splitext(val)[0]) for val in (glob.glob(path_pattern))]) return ids gemIDs = get_ids_from_path_pattern('GEMs/*.xml') binIDs = get_ids_from_path_pattern('protein_bins/*.faa') IDs = get_ids_from_path_pattern('dataset/*') speciesIDs = get_ids_from_path_pattern('pangenome/speciesBinIDs/*.txt') DATA_READS_1 = f'{config["path"]["root"]}/{config["folder"]["data"]}/{{IDs}}/{{IDs}}_R1.fastq.gz' DATA_READS_2 = f'{config["path"]["root"]}/{config["folder"]["data"]}/{{IDs}}/{{IDs}}_R2.fastq.gz' focal = get_ids_from_path_pattern('dataset/*') rule all: input: expand(config["path"]["root"]+"/"+config["folder"]["qfiltered"]+"/{IDs}/{IDs}_R1.fastq.gz", IDs = IDs) message: """ WARNING: Be very careful when adding/removing any lines above this message. The metaGEM.sh parser is presently hardcoded to edit line 22 of this Snakefile to expand target rules accordingly, therefore adding/removing any lines before this message will likely result in parser malfunction. """ shell: """ echo "Gathering {input} ... " """ rule createFolders: input: config["path"]["root"] message: """ Very simple rule to check that the metaGEM.sh parser, Snakefile, and config.yaml file are set up correctly. Generates folders from config.yaml config file, not strictly necessary to run this rule. """ shell: """ cd {input} echo -e "Setting up result folders in the following work directory: $(echo {input}) \n" # Generate folders.txt by extracting folder names from config.yaml file paste config.yaml |cut -d':' -f2|tail -n +4|head -n 25|sed '/^$/d' > folders.txt # NOTE: hardcoded numbers (tail 4, head 25) for folder names, increase number as new folders are introduced. 
while read line;do echo "Creating $line folder ... " mkdir -p $line; done < folders.txt echo -e "\nDone creating folders. \n" rm folders.txt """ rule downloadToy: input: f'{config["path"]["root"]}/{config["folder"]["scripts"]}/{config["scripts"]["toy"]}' message: """ Downloads toy samples into dataset folder and organizes into sample-specific sub-folders. Download a real dataset by replacing the links in the download_toydata.txt file with links to files from your dataset of intertest. Note: Make sure that the only underscores (e.g. _) that appear in the filenames are between the sample ID and R1/R2 identifier. """ shell: """ cd {config[path][root]}/{config[folder][data]} # Download each link in download_toydata.txt echo -e "\nBegin downloading toy dataset ... " while read line;do wget $line; done < {input} echo -e "\nDone donwloading dataset." # Rename downloaded files, this is only necessary for toy dataset (will cause error if used for real dataset) echo -ne "\nRenaming downloaded files ... " for file in *;do mv $file ./$(echo $file|sed 's/?download=1//g'|sed 's/_/_R/g'); done echo -e " done. \n" # Organize data into sample specific sub-folders echo -ne "Generating list of unique sample IDs ... " for file in *.gz; do echo $file; done | sed 's/_.*$//g' | sed 's/.fastq.gz//g' | uniq > ID_samples.txt echo -e " done.\n $(less ID_samples.txt|wc -l) samples identified." echo -ne "\nOrganizing downloaded files into sample specific sub-folders ... " while read line; do mkdir -p $line; mv $line*.gz $line; done < ID_samples.txt echo " done." rm ID_samples.txt """ rule organizeData: input: f'{config["path"]["root"]}/{config["folder"]["data"]}' message: """ Sorts paired end raw reads into sample specific sub folders within the dataset folder specified in the config.yaml file. Assumes all samples are present in dataset folder. Note: This rule is meant to be run on real datasets. downloadToy rule above sorts the downloaded data already. 
Assumes file names have format: SAMPLEID_R1|R2.fastq.gz, e.g. ERR599026_R2.fastq.gz """ shell: """ cd {input} echo -ne "\nGenerating list of unique sample IDs ... " # Create list of unique sample IDs for file in *.gz; do echo $file; done | sed 's/_[^_]*$//g' | sed 's/.fastq.gz//g' | uniq > ID_samples.txt echo -e " done.\n $(less ID_samples.txt|wc -l) samples identified.\n" # Create folder and move corresponding files for each sample echo -ne "\nOrganizing dataset into sample specific sub-folders ... " while read line; do mkdir -p $line; mv $line*.gz $line; done < ID_samples.txt echo -e " done. \n" rm ID_samples.txt """ rule qfilter: input: R1 = DATA_READS_1, R2 = DATA_READS_2 output: R1 = f'{config["path"]["root"]}/{config["folder"]["qfiltered"]}/{{IDs}}/{{IDs}}_R1.fastq.gz', R2 = f'{config["path"]["root"]}/{config["folder"]["qfiltered"]}/{{IDs}}/{{IDs}}_R2.fastq.gz' shell: """ # Activate metagem environment echo -e "Activating {config[envs][metagem]} conda environment ... " set +u;source activate {config[envs][metagem]};set -u; # This is just to make sure that output folder exists mkdir -p $(dirname {output.R1}) # Make job specific scratch dir idvar=$(echo $(basename $(dirname {output.R1}))|sed 's/_R1.fastq.gz//g') echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][qfiltered]}/${{idvar}} ... " mkdir -p {config[path][scratch]}/{config[folder][qfiltered]}/${{idvar}} # Move into scratch dir cd {config[path][scratch]}/{config[folder][qfiltered]}/${{idvar}} # Copy files echo -e "Copying {input.R1} and {input.R2} to {config[path][scratch]}/{config[folder][qfiltered]}/${{idvar}} ... " cp {input.R1} {input.R2} . echo -e "Appending .raw to temporary input files to avoid name conflict ... " for file in *.gz; do mv -- "$file" "${{file}}.raw.gz"; done # Run fastp echo -n "Running fastp ... 
" fastp --thread {config[cores][fastp]} \ -i *R1*raw.gz \ -I *R2*raw.gz \ -o $(basename {output.R1}) \ -O $(basename {output.R2}) \ -j $(dirname {output.R1})/$(echo $(basename $(dirname {output.R1}))).json \ -h $(dirname {output.R1})/$(echo $(basename $(dirname {output.R1}))).html # Move output files to root dir echo -e "Moving output files $(basename {output.R1}) and $(basename {output.R2}) to $(dirname {output.R1})" mv $(basename {output.R1}) $(basename {output.R2}) $(dirname {output.R1}) # Warning echo -e "Note that you must manually clean up these temporary directories if your scratch directory points to a static location instead of variable with a job specific location ... " # Done message echo -e "Done quality filtering sample ${{idvar}}" """ rule qfilterVis: input: f'{config["path"]["root"]}/{config["folder"]["qfiltered"]}' output: text = f'{config["path"]["root"]}/{config["folder"]["stats"]}/qfilter.stats', plot = f'{config["path"]["root"]}/{config["folder"]["stats"]}/qfilterVis.pdf' shell: """ # Activate metagem env set +u;source activate {config[envs][metagem]};set -u; # Make sure stats folder exists mkdir -p $(dirname {output.text}) # Move into qfiltered folder cd {input} # Read and summarize files echo -e "\nGenerating quality filtering results file qfilter.stats: ... 
" for folder in */;do for file in $folder*json;do # Define sample ID=$(echo $file|sed 's|/.*$||g') # Reads before filtering readsBF=$(head -n 25 $file|grep total_reads|cut -d ':' -f2|sed 's/,//g'|head -n 1) # Reads after filtering readsAF=$(head -n 25 $file|grep total_reads|cut -d ':' -f2|sed 's/,//g'|tail -n 1) # Bases before filtering basesBF=$(head -n 25 $file|grep total_bases|cut -d ':' -f2|sed 's/,//g'|head -n 1) # Bases after filtering basesAF=$(head -n 25 $file|grep total_bases|cut -d ':' -f2|sed 's/,//g'|tail -n 1) # Q20 bases before filtering q20BF=$(head -n 25 $file|grep q20_rate|cut -d ':' -f2|sed 's/,//g'|head -n 1) # Q20 bases after filtering q20AF=$(head -n 25 $file|grep q20_rate|cut -d ':' -f2|sed 's/,//g'|tail -n 1) # Q30 bases before filtering q30BF=$(head -n 25 $file|grep q30_rate|cut -d ':' -f2|sed 's/,//g'|head -n 1) # Q30 bases after filtering q30AF=$(head -n 25 $file|grep q30_rate|cut -d ':' -f2|sed 's/,//g'|tail -n 1) # Percentage of reads kept after filtering percent=$(awk -v RBF="$readsBF" -v RAF="$readsAF" 'BEGIN{{print RAF/RBF}}' ) # Write values to qfilter.stats file echo "$ID $readsBF $readsAF $basesBF $basesAF $percent $q20BF $q20AF $q30BF $q30AF" >> qfilter.stats # Print values echo "Sample $ID retained $percent * 100 % of reads ... " done done echo "Done summarizing quality filtering results ... \nMoving to /stats/ folder and running plotting script ... " mv qfilter.stats {config[path][root]}/{config[folder][stats]} # Move to stats folder cd {config[path][root]}/{config[folder][stats]} # Run script for quality filter visualization Rscript {config[path][root]}/{config[folder][scripts]}/{config[scripts][qfilterVis]} echo "Done. 
" # Remove duplicate/extra plot rm Rplots.pdf """ rule megahit: input: R1 = rules.qfilter.output.R1, R2 = rules.qfilter.output.R2 output: f'{config["path"]["root"]}/{config["folder"]["assemblies"]}/{{IDs}}/contigs.fasta.gz' benchmark: f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.megahit.benchmark.txt' shell: """ # Activate metagem environment set +u;source activate {config[envs][metagem]};set -u; # Make sure that output folder exists mkdir -p $(dirname {output}) # Make job specific scratch dir idvar=$(echo $(basename $(dirname {output}))) echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][assemblies]}/${{idvar}} ... " mkdir -p {config[path][scratch]}/{config[folder][assemblies]}/${{idvar}} # Move into scratch dir cd {config[path][scratch]}/{config[folder][assemblies]}/${{idvar}} # Copy files echo -n "Copying qfiltered reads to {config[path][scratch]}/${{idvar}} ... " cp {input.R1} {input.R2} . echo "done. " # Run megahit echo -n "Running MEGAHIT ... " megahit -t {config[cores][megahit]} \ --presets {config[params][assemblyPreset]} \ --verbose \ --min-contig-len {config[params][assemblyMin]} \ -1 $(basename {input.R1}) \ -2 $(basename {input.R2}) \ -o tmp; echo "done. " # Rename assembly echo "Renaming assembly ... " mv tmp/final.contigs.fa contigs.fasta # Remove spaces from contig headers and replace with hyphens echo "Fixing contig header names: replacing spaces with hyphens ... " sed -i 's/ /-/g' contigs.fasta # Zip and move assembly to output folder echo "Zipping and moving assembly ... " gzip contigs.fasta mv contigs.fasta.gz $(dirname {output}) # Done message echo -e "Done assembling quality filtered reads for sample ${{idvar}}" """ rule assemblyVis: input: f'{config["path"]["root"]}/{config["folder"]["assemblies"]}' message: """ Note that this rule is designed to read megahit assemblies with hyphens instead of spaces in contig headers as generated by the megahit rule above. 
""" output: text = f'{config["path"]["root"]}/{config["folder"]["stats"]}/assembly.stats', plot = f'{config["path"]["root"]}/{config["folder"]["stats"]}/assemblyVis.pdf', shell: """ # Activate metagem env set +uo pipefail;source activate {config[envs][metagem]};set -u; # Make sure stats folder exists mkdir -p $(dirname {output.text}) # Move into assembly folder cd {input} echo -e "\nGenerating assembly results file assembly.stats: ... " while read assembly;do # Define sample ID ID=$(echo $(basename $(dirname $assembly))) # Check if assembly file is empty check=$(zcat $assembly | head | wc -l) if [ $check -eq 0 ] then N=0 L=0 else N=$(zcat $assembly | grep -c ">"); L=$(zcat $assembly | grep ">"|cut -d '-' -f4|sed 's/len=//'|awk '{{sum+=$1}}END{{print sum}}'); fi # Write values to stats file echo $ID $N $L >> assembly.stats; # Print values to terminal echo -e "Sample $ID has a total of $L bp across $N contigs ... " done< <(find {input} -name "*.gz") echo "Done summarizing assembly results ... \nMoving to /stats/ folder and running plotting script ... " mv assembly.stats {config[path][root]}/{config[folder][stats]} # Move to stats folder cd {config[path][root]}/{config[folder][stats]} # Running assembly Vis R script Rscript {config[path][root]}/{config[folder][scripts]}/{config[scripts][assemblyVis]} echo "Done. 
" # Remove unnecessary file rm Rplots.pdf """ rule crossMapSeries: input: contigs = rules.megahit.output, reads = f'{config["path"]["root"]}/{config["folder"]["qfiltered"]}' output: concoct = directory(f'{config["path"]["root"]}/{config["folder"]["concoct"]}/{{IDs}}/cov'), metabat = directory(f'{config["path"]["root"]}/{config["folder"]["metabat"]}/{{IDs}}/cov'), maxbin = directory(f'{config["path"]["root"]}/{config["folder"]["maxbin"]}/{{IDs}}/cov') benchmark: f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.crossMapSeries.benchmark.txt' message: """ Cross map in seies: Use this approach to provide all 3 binning tools with cross-sample coverage information. Will likely provide superior binning results, but may no be feasible for datasets with many large samples such as the tara oceans dataset. """ shell: """ # Activate metagem environment set +u;source activate {config[envs][metagem]};set -u; # Create output folders mkdir -p {output.concoct} mkdir -p {output.metabat} mkdir -p {output.maxbin} # Make job specific scratch dir idvar=$(echo $(basename $(dirname {output.concoct}))) echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][crossMap]}/${{idvar}} ... " mkdir -p {config[path][scratch]}/{config[folder][crossMap]}/${{idvar}} # Move into scratch dir cd {config[path][scratch]}/{config[folder][crossMap]}/${{idvar}} # Copy files cp {input.contigs} . # Define the focal sample ID, fsample: # The one sample's assembly that all other samples' read will be mapped against in a for loop fsampleID=$(echo $(basename $(dirname {input.contigs}))) echo -e "\nFocal sample: $fsampleID ... " echo "Renaming and unzipping assembly ... " mv $(basename {input.contigs}) $(echo $fsampleID|sed 's/$/.fa.gz/g') gunzip $(echo $fsampleID|sed 's/$/.fa.gz/g') echo -e "\nIndexing assembly ... 
" bwa index $fsampleID.fa for folder in {input.reads}/*;do id=$(basename $folder) echo -e "\nCopying sample $id to be mapped against the focal sample $fsampleID ..." cp $folder/*.gz . # Maybe I should be piping the lines below to reduce I/O ? echo -e "\nMapping sample to assembly ... " bwa mem -t {config[cores][crossMap]} $fsampleID.fa *.fastq.gz > $id.sam echo -e "\nConverting SAM to BAM with samtools view ... " samtools view -@ {config[cores][crossMap]} -Sb $id.sam > $id.bam echo -e "\nSorting BAM file with samtools sort ... " samtools sort -@ {config[cores][crossMap]} -o $id.sort $id.bam echo -e "\nRunning jgi_summarize_bam_contig_depths script to generate contig abundance/depth file for maxbin2 input ... " jgi_summarize_bam_contig_depths --outputDepth $id.depth $id.sort echo -e "\nMoving depth file to sample $fsampleID maxbin2 folder ... " mv $id.depth {output.maxbin} echo -e "\nIndexing sorted BAM file with samtools index for CONCOCT input table generation ... " samtools index $id.sort echo -e "\nRemoving temporary files ... " rm *.fastq.gz *.sam *.bam done nSamples=$(ls {input.reads}|wc -l) echo -e "\nDone mapping focal sample $fsampleID agains $nSamples samples in dataset folder." echo -e "\nRunning jgi_summarize_bam_contig_depths for all sorted bam files to generate metabat2 input ... " jgi_summarize_bam_contig_depths --outputDepth $id.all.depth *.sort echo -e "\nMoving input file $id.all.depth to $fsampleID metabat2 folder... " mv $id.all.depth {output.metabat} echo -e "Done. \nCutting up contigs to 10kbp chunks (default), not to be used for mapping!" cut_up_fasta.py -c {config[params][cutfasta]} -o 0 -m $fsampleID.fa -b assembly_c10k.bed > assembly_c10k.fa echo -e "\nSummarizing sorted and indexed BAM files with concoct_coverage_table.py to generate CONCOCT input table ... 
" concoct_coverage_table.py assembly_c10k.bed *.sort > coverage_table.tsv echo -e "\nMoving CONCOCT input table to $fsampleID concoct folder" mv coverage_table.tsv {output.concoct} echo -e "\nRemoving intermediate sorted bam files ... " rm *.sort """ rule kallistoIndex: input: f'{config["path"]["root"]}/{config["folder"]["assemblies"]}/{{focal}}/contigs.fasta.gz' output: f'{config["path"]["root"]}/{config["folder"]["kallistoIndex"]}/{{focal}}/index.kaix' benchmark: f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{focal}}.kallistoIndex.benchmark.txt' message: """ Needed for the crossMapParallel implementation, which uses kalliso for fast mapping instead of bwa. Saves a lot of computational power/time to only create once and re-use for each job. """ shell: """ # Activate metagem environment set +u;source activate {config[envs][metagem]};set -u; # Create output folder mkdir -p $(dirname {output}) # Make job specific scratch dir sampleID=$(echo $(basename $(dirname {input}))) echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][kallistoIndex]}/${{sampleID}} ... " mkdir -p {config[path][scratch]}/{config[folder][kallistoIndex]}/${{sampleID}} # Move into scratch dir cd {config[path][scratch]}/{config[folder][kallistoIndex]}/${{sampleID}} # Copy files echo -e "\nCopying and unzipping sample $sampleID assembly ... " cp {input} . # Rename files mv $(basename {input}) $(echo $sampleID|sed 's/$/.fa.gz/g') gunzip $(echo $sampleID|sed 's/$/.fa.gz/g') echo -e "\nCutting up assembly contigs >= 20kbp into 10kbp chunks ... " cut_up_fasta.py $sampleID.fa -c 10000 -o 0 --merge_last > contigs_10K.fa echo -e "\nCreating kallisto index ... 
" kallisto index contigs_10K.fa -i index.kaix mv index.kaix $(dirname {output}) """ rule crossMapParallel: input: index = f'{config["path"]["root"]}/{config["folder"]["kallistoIndex"]}/{{focal}}/index.kaix', R1 = f'{config["path"]["root"]}/{config["folder"]["qfiltered"]}/{{IDs}}/{{IDs}}_R1.fastq.gz', R2 = f'{config["path"]["root"]}/{config["folder"]["qfiltered"]}/{{IDs}}/{{IDs}}_R2.fastq.gz' output: directory(f'{config["path"]["root"]}/{config["folder"]["kallisto"]}/{{focal}}/{{IDs}}') benchmark: f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{focal}}.{{IDs}}.crossMapParallel.benchmark.txt' message: """ This rule is an alternative implementation of crossMapSeries, using kallisto instead of bwa for mapping operations. This implementation is recommended for large datasets. """ shell: """ # Activate metagem environment set +u;source activate {config[envs][metagem]};set -u; # Create output folder mkdir -p {output} # Make job specific scratch dir focal=$(echo $(basename $(dirname {input.index}))) mapping=$(echo $(basename $(dirname {input.R1}))) echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][kallisto]}/${{focal}}_${{mapping}} ... " mkdir -p {config[path][scratch]}/{config[folder][kallisto]}/${{focal}}_${{mapping}} # Move into tmp dir cd {config[path][scratch]}/{config[folder][kallisto]}/${{focal}}_${{mapping}} # Copy files echo -e "\nCopying assembly index {input.index} and reads {input.R1} {input.R2} to $(pwd) ... " cp {input.index} {input.R1} {input.R2} . # Run kallisto echo -e "\nRunning kallisto ... " kallisto quant --threads {config[cores][crossMap]} --plaintext -i index.kaix -o . $(basename {input.R1}) $(basename {input.R2}) # Zip file echo -e "\nZipping abundance file ... " gzip abundance.tsv # Move mapping file out output folder mv abundance.tsv.gz {output} # Cleanup temp folder echo -e "\nRemoving temporary directory {config[path][scratch]}/{config[folder][kallisto]}/${{focal}}_${{mapping}} ... 
" cd - rm -r {config[path][scratch]}/{config[folder][kallisto]}/${{focal}}_${{mapping}} """ rule gatherCrossMapParallel: input: expand(f'{config["path"]["root"]}/{config["folder"]["kallisto"]}/{{focal}}/{{IDs}}', focal = focal , IDs = IDs) shell: """ echo "Gathering cross map jobs ..." """ rule concoct: input: table = f'{config["path"]["root"]}/{config["folder"]["concoct"]}/{{IDs}}/cov/coverage_table.tsv', contigs = rules.megahit.output output: directory(f'{config["path"]["root"]}/{config["folder"]["concoct"]}/{{IDs}}/{{IDs}}.concoct-bins') benchmark: f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.concoct.benchmark.txt' shell: """ # Activate metagem environment set +u;source activate {config[envs][metagem]};set -u; # Create output folder mkdir -p $(dirname {output}) # Make job specific scratch dir sampleID=$(echo $(basename $(dirname {input.contigs}))) echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][concoct]}/${{sampleID}} ... " mkdir -p {config[path][scratch]}/{config[folder][concoct]}/${{sampleID}} # Move into scratch dir cd {config[path][scratch]}/{config[folder][concoct]}/${{sampleID}} # Copy files cp {input.contigs} {input.table} . echo "Unzipping assembly ... " gunzip $(basename {input.contigs}) echo -e "Done. \nCutting up contigs (default 10kbp chunks) ... " cut_up_fasta.py -c {config[params][cutfasta]} -o 0 -m $(echo $(basename {input.contigs})|sed 's/.gz//') > assembly_c10k.fa echo -e "\nRunning CONCOCT ... " concoct --coverage_file $(basename {input.table}) \ --composition_file assembly_c10k.fa \ -b $(basename $(dirname {output})) \ -t {config[cores][concoct]} \ -c {config[params][concoct]} echo -e "\nMerging clustering results into original contigs ... " merge_cutup_clustering.py $(basename $(dirname {output}))_clustering_gt1000.csv > $(basename $(dirname {output}))_clustering_merged.csv echo -e "\nExtracting bins ... 
" mkdir -p $(basename {output}) extract_fasta_bins.py $(echo $(basename {input.contigs})|sed 's/.gz//') $(basename $(dirname {output}))_clustering_merged.csv --output_path $(basename {output}) # Move final result files to output folder mv $(basename {output}) *.txt *.csv $(dirname {output}) """ rule metabatCross: input: assembly = rules.megahit.output, depth = f'{config["path"]["root"]}/{config["folder"]["metabat"]}/{{IDs}}/cov' output: directory(f'{config["path"]["root"]}/{config["folder"]["metabat"]}/{{IDs}}/{{IDs}}.metabat-bins') benchmark: f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.metabat.benchmark.txt' shell: """ # Activate metagem environment set +u;source activate {config[envs][metagem]};set -u; # Create output folder mkdir -p {output} # Make job specific scratch dir fsampleID=$(echo $(basename $(dirname {input.assembly}))) echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][metabat]}/${{fsampleID}} ... " mkdir -p {config[path][scratch]}/{config[folder][metabat]}/${{fsampleID}} # Move into scratch dir cd {config[path][scratch]}/{config[folder][metabat]}/${{fsampleID}} # Copy files to tmp cp {input.assembly} {input.depth}/*.all.depth . # Unzip assembly gunzip $(basename {input.assembly}) # Run metabat2 echo -e "\nRunning metabat2 ... 
" metabat2 -i contigs.fasta -a *.all.depth -s {config[params][metabatMin]} -v --seed {config[params][seed]} -t 0 -m {config[params][minBin]} -o $(basename $(dirname {output})) # Move result files to output dir mv *.fa {output} """ rule maxbinCross: input: assembly = rules.megahit.output, depth = f'{config["path"]["root"]}/{config["folder"]["maxbin"]}/{{IDs}}/cov' output: directory(f'{config["path"]["root"]}/{config["folder"]["maxbin"]}/{{IDs}}/{{IDs}}.maxbin-bins') benchmark: f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.maxbin.benchmark.txt' shell: """ # Activate metagem environment set +u;source activate {config[envs][metagem]};set -u; # Create output folder mkdir -p $(dirname {output}) # Make job specific scratch dir fsampleID=$(echo $(basename $(dirname {input.assembly}))) echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][maxbin]}/${{fsampleID}} ... " mkdir -p {config[path][scratch]}/{config[folder][maxbin]}/${{fsampleID}} # Move into scratch dir cd {config[path][scratch]}/{config[folder][maxbin]}/${{fsampleID}} # Copy files to tmp cp -r {input.assembly} {input.depth}/*.depth . echo -e "\nUnzipping assembly ... " gunzip $(basename {input.assembly}) echo -e "\nGenerating list of depth files based on crossMapSeries rule output ... " find . -name "*.depth" > abund.list echo -e "\nRunning maxbin2 ... 
" run_MaxBin.pl -thread {config[cores][maxbin]} -contig contigs.fasta -out $(basename $(dirname {output})) -abund_list abund.list # Clean up un-needed files rm *.depth abund.list contigs.fasta # Move files into output dir mkdir -p $(basename {output}) while read bin;do mv $bin $(basename {output});done< <(ls|grep fasta) mv * $(dirname {output}) """ rule binning: input: concoct = expand(config["path"]["root"]+"/"+config["folder"]["concoct"]+"/{IDs}/{IDs}.concoct-bins", IDs = IDs), maxbin = expand(config["path"]["root"]+"/"+config["folder"]["maxbin"]+"/{IDs}/{IDs}.maxbin-bins", IDs = IDs), metabat = expand(config["path"]["root"]+"/"+config["folder"]["metabat"]+"/{IDs}/{IDs}.metabat-bins", IDs = IDs) rule binRefine: input: concoct = f'{config["path"]["root"]}/{config["folder"]["concoct"]}/{{IDs}}/{{IDs}}.concoct-bins', metabat = f'{config["path"]["root"]}/{config["folder"]["metabat"]}/{{IDs}}/{{IDs}}.metabat-bins', maxbin = f'{config["path"]["root"]}/{config["folder"]["maxbin"]}/{{IDs}}/{{IDs}}.maxbin-bins' output: directory(f'{config["path"]["root"]}/{config["folder"]["refined"]}/{{IDs}}') benchmark: f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.binRefine.benchmark.txt' shell: """ # Activate metawrap environment set +u;source activate {config[envs][metawrap]};set -u; # Create output folder mkdir -p {output} # Make job specific scratch dir fsampleID=$(echo $(basename $(dirname {input.concoct}))) echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][refined]}/${{fsampleID}} ... " mkdir -p {config[path][scratch]}/{config[folder][refined]}/${{fsampleID}} # Move into scratch dir cd {config[path][scratch]}/{config[folder][refined]}/${{fsampleID}} # Copy files to tmp echo "Copying bins from CONCOCT, metabat2, and maxbin2 to {config[path][scratch]} ... " cp -r {input.concoct} {input.metabat} {input.maxbin} . echo "Renaming bin folders to avoid errors with metaWRAP ... 
" mv $(basename {input.concoct}) $(echo $(basename {input.concoct})|sed 's/-bins//g') mv $(basename {input.metabat}) $(echo $(basename {input.metabat})|sed 's/-bins//g') mv $(basename {input.maxbin}) $(echo $(basename {input.maxbin})|sed 's/-bins//g') echo "Running metaWRAP bin refinement module ... " metaWRAP bin_refinement -o . \ -A $(echo $(basename {input.concoct})|sed 's/-bins//g') \ -B $(echo $(basename {input.metabat})|sed 's/-bins//g') \ -C $(echo $(basename {input.maxbin})|sed 's/-bins//g') \ -t {config[cores][refine]} \ -m {config[params][refineMem]} \ -c {config[params][refineComp]} \ -x {config[params][refineCont]} rm -r $(echo $(basename {input.concoct})|sed 's/-bins//g') $(echo $(basename {input.metabat})|sed 's/-bins//g') $(echo $(basename {input.maxbin})|sed 's/-bins//g') work_files mv * {output} """ rule binReassemble: input: R1 = rules.qfilter.output.R1, R2 = rules.qfilter.output.R2, refinedBins = rules.binRefine.output output: directory(f'{config["path"]["root"]}/{config["folder"]["reassembled"]}/{{IDs}}') benchmark: f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.binReassemble.benchmark.txt' shell: """ # Activate metawrap environment set +u;source activate {config[envs][metawrap]};set -u; # Prevents spades from using just one thread export OMP_NUM_THREADS={config[cores][reassemble]} # Create output folder mkdir -p {output} # Make job specific scratch dir fsampleID=$(echo $(basename $(dirname {input.R1}))) echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][reassembled]}/${{fsampleID}} ... " mkdir -p {config[path][scratch]}/{config[folder][reassembled]}/${{fsampleID}} # Move into scratch dir cd {config[path][scratch]}/{config[folder][reassembled]}/${{fsampleID}} # Copy files to tmp cp -r {input.refinedBins}/metawrap_*_bins {input.R1} {input.R2} . echo "Running metaWRAP bin reassembly ... 
"
        metaWRAP reassemble_bins --parallel -o $(basename {output}) \
            -b metawrap_*_bins \
            -1 $(basename {input.R1}) \
            -2 $(basename {input.R2}) \
            -t {config[cores][reassemble]} \
            -m {config[params][reassembleMem]} \
            -c {config[params][reassembleComp]} \
            -x {config[params][reassembleCont]}

        # Cleaning up files
        rm -r metawrap_*_bins
        rm -r $(basename {output})/work_files
        rm *.fastq.gz

        # Move results to output folder
        mv * $(dirname {output})
        """


# Target rule: forces refinement + reassembly of bins for all samples.
rule binEvaluation:
    input:
        refined = expand(config["path"]["root"]+"/"+config["folder"]["refined"]+"/{IDs}", IDs = IDs),
        reassembled = expand(config["path"]["root"]+"/"+config["folder"]["reassembled"]+"/{IDs}", IDs = IDs)


# Summarize bin statistics (contig count, length, CheckM completeness/contamination)
# for every binning tool and plot them with the binningVis.R script.
rule binningVis:
    input:
        f'{config["path"]["root"]}'
    output:
        text = f'{config["path"]["root"]}/{config["folder"]["stats"]}/reassembled_bins.stats',
        plot = f'{config["path"]["root"]}/{config["folder"]["stats"]}/binningVis.pdf'
    message:
        """
        Generate bar plot with number of bins and density plot of bin contigs, total length, completeness, and contamination across different tools.
        """
    shell:
        """
        # Activate metagem env
        set +u;source activate {config[envs][metagem]};set -u;

        # Read CONCOCT bins
        echo "Generating concoct_bins.stats file containing bin ID, number of contigs, and length ... "
        cd {input}/{config[folder][concoct]}
        for folder in */;do
            # Define sample name
            var=$(echo $folder|sed 's|/||g');
            for bin in $folder*concoct-bins/*.fa;do
                # Define bin name
                name=$(echo $bin | sed "s|^.*/|$var.bin.|g" | sed 's/.fa//g');
                # Count contigs
                N=$(less $bin | grep -c ">");
                # Sum length (megahit-style headers carry len= in the 4th dash-delimited field)
                L=$(less $bin |grep ">"|cut -d '-' -f4|sed 's/len=//g'|awk '{{sum+=$1}}END{{print sum}}')
                # Print values to terminal and write to stats file
                echo "Reading bin $bin ... Contigs: $N , Length: $L "
                echo $name $N $L >> concoct_bins.stats;
            done;
        done
        mv *.stats {input}/{config[folder][reassembled]}

        # Read MetaBAT2 bins
        echo "Generating metabat_bins.stats file containing bin ID, number of contigs, and length ... "
        cd {input}/{config[folder][metabat]}
        for folder in */;do
            # Define sample name
            var=$(echo $folder | sed 's|/||');
            for bin in $folder*metabat-bins/*.fa;do
                # Define bin name
                name=$(echo $bin|sed 's/.fa//g'|sed 's|^.*/||g'|sed "s/^/$var./g");
                # Count contigs
                N=$(less $bin | grep -c ">");
                # Sum length
                L=$(less $bin |grep ">"|cut -d '-' -f4|sed 's/len=//g'|awk '{{sum+=$1}}END{{print sum}}')
                # Print values to terminal and write to stats file
                echo "Reading bin $bin ... Contigs: $N , Length: $L "
                echo $name $N $L >> metabat_bins.stats;
            done;
        done
        mv *.stats {input}/{config[folder][reassembled]}

        # Read MaxBin2 bins
        echo "Generating maxbin_bins.stats file containing bin ID, number of contigs, and length ... "
        cd {input}/{config[folder][maxbin]}
        for folder in */;do
            for bin in $folder*maxbin-bins/*.fasta;do
                # Define bin name
                name=$(echo $bin | sed 's/.fasta//g' | sed 's|^.*/||g');
                # Count contigs
                N=$(less $bin | grep -c ">");
                # Sum length
                L=$(less $bin |grep ">"|cut -d '-' -f4|sed 's/len=//g'|awk '{{sum+=$1}}END{{print sum}}')
                # Print values to terminal and write to stats file
                echo "Reading bin $bin ... Contigs: $N , Length: $L "
                echo $name $N $L >> maxbin_bins.stats;
            done;
        done
        mv *.stats {input}/{config[folder][reassembled]}

        # Read metaWRAP refined bins
        echo "Generating refined_bins.stats file containing bin ID, number of contigs, and length ... "
        cd {input}/{config[folder][refined]}
        for folder in */;do
            # Define sample name
            samp=$(echo $folder | sed 's|/||');
            for bin in $folder*metawrap_*_bins/*.fa;do
                # Define bin name
                name=$(echo $bin | sed 's/.fa//g'|sed 's|^.*/||g'|sed "s/^/$samp./g");
                # Count contigs
                N=$(less $bin | grep -c ">");
                # Sum length (NOTE(review): uses 's/len_//g' here while the sections above use
                # 's/len=//g' — presumably matches the refined-bin header format; confirm)
                L=$(less $bin |grep ">"|cut -d '-' -f4|sed 's/len_//g'|awk '{{sum+=$1}}END{{print sum}}')
                # Print values to terminal and write to stats file
                echo "Reading bin $bin ... Contigs: $N , Length: $L "
                echo $name $N $L >> refined_bins.stats;
            done;
        done

        # Compile CONCOCT, MetaBAT2, MaxBin2, and metaWRAP checkM files
        echo "Generating CheckM summary files across samples: concoct.checkm, metabat.checkm, maxbin.checkm, and refined.checkm ... "
        for folder in */;do
            # Define sample name
            var=$(echo $folder|sed 's|/||g');
            # Write values to checkm files
            paste $folder*concoct.stats|tail -n +2 | sed "s/^/$var.bin./g" >> concoct.checkm
            paste $folder*metabat.stats|tail -n +2 | sed "s/^/$var./g" >> metabat.checkm
            paste $folder*maxbin.stats|tail -n +2 >> maxbin.checkm
            paste $folder*metawrap_*_bins.stats|tail -n +2|sed "s/^/$var./g" >> refined.checkm
        done
        mv *.stats *.checkm {input}/{config[folder][reassembled]}

        # Read metaWRAP reassembled bins
        echo "Generating reassembled_bins.stats file containing bin ID, number of contigs, and length ... "
        cd {input}/{config[folder][reassembled]}
        for folder in */;do
            # Define sample name
            samp=$(echo $folder | sed 's|/||');
            for bin in $folder*reassembled_bins/*.fa;do
                # Define bin name
                name=$(echo $bin | sed 's/.fa//g' | sed 's|^.*/||g' | sed "s/^/$samp./g");
                N=$(less $bin | grep -c ">");
                # Check if bins are original (megahit-assembled) or strict/permissive (metaspades-assembled)
                if [[ $name == *.strict ]] || [[ $name == *.permissive ]];then
                    L=$(less $bin |grep ">"|cut -d '_' -f4|awk '{{sum+=$1}}END{{print sum}}')
                else
                    L=$(less $bin |grep ">"|cut -d '-' -f4|sed 's/len_//g'|awk '{{sum+=$1}}END{{print sum}}')
                fi
                # Print values to terminal and write to stats file
                echo "Reading bin $bin ... Contigs: $N , Length: $L "
                echo $name $N $L >> reassembled_bins.stats;
            done;
        done
        echo "Done reading metawrap reassembled bins ... "

        # Read metaWRAP reassembled checkM file
        echo "Generating CheckM summary file reassembled.checkm across samples for reassembled bins ... "
        for folder in */;do
            # Define sample name
            var=$(echo $folder|sed 's|/||g');
            # Write values to checkM file
            paste $folder*reassembled_bins.stats|tail -n +2|sed "s/^/$var./g";
        done >> reassembled.checkm
        echo "Done generating all statistics files for binning results ... running plotting script ... "

        # Move files and cd to stats folder
        mv *.stats *.checkm {config[path][root]}/{config[folder][stats]}
        cd {config[path][root]}/{config[folder][stats]}

        # Run Rscript
        Rscript {config[path][root]}/{config[folder][scripts]}/{config[scripts][binningVis]}

        # Delete redundant pdf file
        rm Rplots.pdf
        """


# Map quality-filtered reads back to the reassembled MAGs of one sample and
# compute a normalized per-bin abundance fraction (see message below).
rule abundance:
    input:
        bins = f'{config["path"]["root"]}/{config["folder"]["reassembled"]}/{{IDs}}/reassembled_bins',
        R1 = rules.qfilter.output.R1,
        R2 = rules.qfilter.output.R2
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["abundance"]}/{{IDs}}')
    benchmark:
        f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.abundance.benchmark.txt'
    message:
        """
        Calculate bin abundance fraction using the following:
        binAbundanceFraction = ( X / Y / Z) * 1000000
        X = # of reads mapped to bin_i from sample_k
        Y = length of bin_i (bp)
        Z = # of reads mapped to all bins in sample_k
        Note: 1000000 scaling factor converts length in bp to Mbp
        """
    shell:
        """
        # Activate metagem environment
        set +u;source activate {config[envs][metagem]};set -u;

        # Make sure output folder exists
        mkdir -p {output}

        # Make job specific scratch dir
        sampleID=$(echo $(basename $(dirname {input.R1})))
        echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][abundance]}/${{sampleID}} ... "
        mkdir -p {config[path][scratch]}/{config[folder][abundance]}/${{sampleID}}

        # Move into scratch dir
        cd {config[path][scratch]}/{config[folder][abundance]}/${{sampleID}}

        # Copy files
        echo -e "\nCopying quality filtered paired end reads and generated MAGs to {config[path][scratch]} ... "
        cp {input.R1} {input.R2} {input.bins}/* .

        echo -e "\nConcatenating all bins into one FASTA file ... "
        cat *.fa > $(basename {output}).fa

        echo -e "\nCreating bwa index for concatenated FASTA file ... "
        bwa index $(basename {output}).fa

        echo -e "\nMapping quality filtered paired end reads to concatenated FASTA file with bwa mem ... "
        bwa mem -t {config[cores][abundance]} $(basename {output}).fa \
            $(basename {input.R1}) $(basename {input.R2}) > $(basename {output}).sam

        echo -e "\nConverting SAM to BAM with samtools view ... "
        samtools view -@ {config[cores][abundance]} -Sb $(basename {output}).sam > $(basename {output}).bam

        echo -e "\nSorting BAM file with samtools sort ... "
        samtools sort -@ {config[cores][abundance]} -o $(basename {output}).sort.bam $(basename {output}).bam

        echo -e "\nExtracting stats from sorted BAM file with samtools flagstat ... "
        samtools flagstat $(basename {output}).sort.bam > map.stats

        echo -e "\nCopying sample_map.stats file to root/abundance/sample for bin concatenation and deleting temporary FASTA file ... "
        cp map.stats {output}/$(basename {output})_map.stats
        rm $(basename {output}).fa

        echo -e "\nRepeat procedure for each bin ... "
        for bin in *.fa;do
            echo -e "\nSetting up temporary sub-directory to map against bin $bin ... "
            mkdir -p $(echo "$bin"| sed "s/.fa//")

            # Move bin into subirectory
            mv $bin $(echo "$bin"| sed "s/.fa//")
            cd $(echo "$bin"| sed "s/.fa//")

            echo -e "\nCreating bwa index for bin $bin ... "
            bwa index $bin

            echo -e "\nMapping quality filtered paired end reads to bin $bin with bwa mem ... "
            bwa mem -t {config[cores][abundance]} $bin \
                ../$(basename {input.R1}) ../$(basename {input.R2}) > $(echo "$bin"|sed "s/.fa/.sam/")

            echo -e "\nConverting SAM to BAM with samtools view ... "
            samtools view -@ {config[cores][abundance]} -Sb $(echo "$bin"|sed "s/.fa/.sam/") > $(echo "$bin"|sed "s/.fa/.bam/")

            echo -e "\nSorting BAM file with samtools sort ... "
            samtools sort -@ {config[cores][abundance]} -o $(echo "$bin"|sed "s/.fa/.sort.bam/") $(echo "$bin"|sed "s/.fa/.bam/")

            echo -e "\nExtracting stats from sorted BAM file with samtools flagstat ... "
            samtools flagstat $(echo "$bin"|sed "s/.fa/.sort.bam/") > $(echo "$bin"|sed "s/.fa/.map/")

            echo -e "\nAppending bin length to bin.map stats file ... "
            echo -n "Bin Length = " >> $(echo "$bin"|sed "s/.fa/.map/")

            # Check if bins are original (megahit-assembled) or strict/permissive (metaspades-assembled)
            if [[ $bin == *.strict.fa ]] || [[ $bin == *.permissive.fa ]] || [[ $bin == *.s.fa ]] || [[ $bin == *.p.fa ]];then
                less $bin |grep ">"|cut -d '_' -f4|awk '{{sum+=$1}}END{{print sum}}' >> $(echo "$bin"|sed "s/.fa/.map/")
            else
                less $bin |grep ">"|cut -d '-' -f4|sed 's/len_//g'|awk '{{sum+=$1}}END{{print sum}}' >> $(echo "$bin"|sed "s/.fa/.map/")
            fi
            paste $(echo "$bin"|sed "s/.fa/.map/")

            echo -e "\nCalculating abundance for bin $bin ... "
            echo -n "$bin"|sed "s/.fa//" >> $(echo "$bin"|sed "s/.fa/.abund/")
            echo -n $'\t' >> $(echo "$bin"|sed "s/.fa/.abund/")
            # X = reads mapped to this bin, Y = bin length, Z = reads mapped to all bins
            X=$(less $(echo "$bin"|sed "s/.fa/.map/")|grep "mapped ("|awk -F' ' '{{print $1}}')
            Y=$(less $(echo "$bin"|sed "s/.fa/.map/")|tail -n 1|awk -F' ' '{{print $4}}')
            Z=$(less "../map.stats"|grep "mapped ("|awk -F' ' '{{print $1}}')
            awk -v x="$X" -v y="$Y" -v z="$Z" 'BEGIN{{print (x/y/z) * 1000000}}' >> $(echo "$bin"|sed "s/.fa/.abund/")
            paste $(echo "$bin"|sed "s/.fa/.abund/")

            echo -e "\nRemoving temporary files for bin $bin ... "
            rm $bin
            cp $(echo "$bin"|sed "s/.fa/.map/") {output}
            mv $(echo "$bin"|sed "s/.fa/.abund/") ../
            cd ..
            rm -r $(echo "$bin"| sed "s/.fa//")
        done

        echo -e "\nDone processing all bins, summarizing results into sample.abund file ... "
        cat *.abund > $(basename {output}).abund

        echo -ne "\nSumming calculated abundances to obtain normalization value ... "
        norm=$(less $(basename {output}).abund |awk '{{sum+=$2}}END{{print sum}}');
        echo $norm

        echo -e "\nGenerating column with abundances normalized between 0 and 1 ... "
        awk -v NORM="$norm" '{{printf $1"\t"$2"\t"$2/NORM"\\n"}}' $(basename {output}).abund > abundance.txt
        rm $(basename {output}).abund
        mv abundance.txt $(basename {output}).abund
        mv $(basename {output}).abund {output}
        """


# Taxonomically classify the reassembled MAGs of one sample with GTDB-Tk.
rule GTDBTk:
    input:
        f'{config["path"]["root"]}/{config["folder"]["reassembled"]}/{{IDs}}/reassembled_bins'
    output:
        directory(f'{config["path"]["root"]}/GTDBTk/{{IDs}}')
    benchmark:
        f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.GTDBTk.benchmark.txt'
    message:
        """
        Please make sure that the GTDB-Tk database was downloaded and configured.
        """
    shell:
        """
        # Activate metagem environment
        set +u;source activate {config[envs][metagem]};set -u;

        # Make sure output folder exists
        mkdir -p {output}

        # Make job specific scratch dir
        sampleID=$(echo $(basename $(dirname {input})))
        echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][classification]}/${{sampleID}} ... "
        mkdir -p {config[path][scratch]}/{config[folder][classification]}/${{sampleID}}

        # Move into scratch dir
        cd {config[path][scratch]}/{config[folder][classification]}/${{sampleID}}

        # Copy files
        echo -e "\nCopying files to tmp dir ... "
        cp -r {input} .
# In case you GTDBTk is not properly configured you may need to export the GTDBTK_DATA_PATH variable,
        # Simply uncomment the following line and fill in the path to your GTDBTk database:
        # export GTDBTK_DATA_PATH=/path/to/the/gtdbtk/database/you/downloaded

        # Run GTDBTk
        gtdbtk classify_wf --genome_dir $(basename {input}) --out_dir GTDBTk -x fa --cpus {config[cores][gtdbtk]}
        mv GTDBTk/* {output}
        """


# Combine GTDB-Tk taxonomy with per-bin abundances and plot community
# composition across samples. Run only after the gtdbtk + abundance rules.
rule compositionVis:
    input:
        taxonomy = f'{config["path"]["root"]}/{config["folder"]["classification"]}' ,
        abundance = f'{config["path"]["root"]}/{config["folder"]["abundance"]}'
    output:
        #file = f'{config["path"]["root"]}/{config["folder"]["stats"]}/composition.tsv',
        plot = f'{config["path"]["root"]}/{config["folder"]["stats"]}/compositionVis.pdf'
    message:
        """
        Summarize and visualize abundance + taxonomy of MAGs across samples.
        Note: compositionVis should only be run after the gtdbtk and abundance rules.
        """
    shell:
        """
        set +u;source activate {config[envs][metagem]};set -u

        # Generate summary abundance file
        cd {input.abundance}
        for folder in */;do
            # Define sample ID
            sample=$(echo $folder|sed 's|/||g')
            # Same as in taxonomyVis rule, modify bin names by adding sample ID and shortening metaWRAP naming scheme (orig/permissive/strict)
            paste $sample/$sample.abund | sed 's/orig/o/g' | sed 's/permissive/p/g' | sed 's/strict/s/g' | sed "s/^/$sample./g" >> abundance.stats
        done
        mv abundance.stats {config[path][root]}/{config[folder][stats]}

        # Generate summary taxonomy file
        cd {input.taxonomy}

        # Summarize GTDBTk output across samples
        for folder in */;do
            samp=$(echo $folder|sed 's|^.*/||');
            cat $folder/classify/*summary.tsv| sed 's/orig/o/g' | sed 's/permissive/p/g' | sed 's/strict/s/g' | sed "s/^/$samp./g";
        done > GTDBTk.stats

        # Clean up stats file: strip duplicated per-sample headers, re-insert one header at the top
        header=$(head -n 1 GTDBTk.stats | sed 's/^.*\.//g')
        sed -i '/other_related_references(genome_id,species_name,radius,ANI,AF)/d' GTDBTk.stats
        sed -i "1i$header" GTDBTk.stats
        mv GTDBTk.stats {config[path][root]}/{config[folder][stats]}

        cd {config[path][root]}/{config[folder][stats]}
        Rscript {config[path][root]}/{config[folder][scripts]}/{config[scripts][compositionVis]}
        """


# Flatten the ORF-annotated protein fasta files produced by CheckM during
# reassembly into the protein_bins/ folder with sample-prefixed names.
rule extractProteinBins:
    message:
        """
        Extract ORF annotated protein fasta files for each bin from reassembly checkm files, place into sample specific subdirectories within protein_bins folder.
        """
    shell:
        """
        # Move to root directory
        cd {config[path][root]}

        # Make sure protein bins folder exists
        mkdir -p {config[folder][proteinBins]}

        echo -e "Begin moving and renaming ORF annotated protein fasta bins from reassembled_bins/ to protein_bins/ ... \n"
        for folder in reassembled_bins/*/;do #Loop through each sample
            echo "Copying bins from sample $(echo $(basename $folder)) ... "
            for bin in $folder*reassembled_bins.checkm/bins/*;do # Loop through each bin
                # Build flat file name: sample_bin.faa, shortening orig/permissive/strict to o/p/s
                var=$(echo $bin/genes.faa | sed 's|reassembled_bins/||g'|sed 's|/reassembled_bins.checkm/bins||'|sed 's|/genes||g'|sed 's|/|_|g'|sed 's/permissive/p/g'|sed 's/orig/o/g'|sed 's/strict/s/g');
                cp $bin/*.faa {config[path][root]}/{config[folder][proteinBins]}/$var;
            done;
        done
        """


# Reconstruct a genome-scale metabolic model (GEM) for one protein bin with CarveMe.
rule carveme:
    input:
        bin = f'{config["path"]["root"]}/{config["folder"]["proteinBins"]}/{{binIDs}}.faa',
        media = f'{config["path"]["root"]}/{config["folder"]["scripts"]}/{config["scripts"]["carveme"]}'
    output:
        f'{config["path"]["root"]}/{config["folder"]["GEMs"]}/{{binIDs}}.xml'
    benchmark:
        f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{binIDs}}.carveme.benchmark.txt'
    message:
        """
        Make sure that the input files are ORF annotated and preferably protein fasta.
        If given raw fasta files, Carveme will run without errors but each contig will be treated as one gene.
        """
    shell:
        """
        # Activate metagem environment
        set +u;source activate {config[envs][metagem]};set -u;

        # Make sure output folder exists
        mkdir -p $(dirname {output})

        # Make job specific scratch dir
        binID=$(echo $(basename {input})|sed 's/.faa//g')
        echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][GEMs]}/${{binID}} ... "
        mkdir -p {config[path][scratch]}/{config[folder][GEMs]}/${{binID}}

        # Move into tmp dir
        cd {config[path][scratch]}/{config[folder][GEMs]}/${{binID}}

        # Copy files
        cp {input.bin} {input.media} .

        echo "Begin carving GEM ... "
        carve -g {config[params][carveMedia]} \
            -v \
            --mediadb $(basename {input.media}) \
            --fbc2 \
            -o $(echo $(basename {input.bin}) | sed 's/.faa/.xml/g') $(basename {input.bin})

        echo "Done carving GEM. "
        # Only move the model if carve actually produced one
        [ -f *.xml ] && mv *.xml $(dirname {output})
        """


# Summarize metabolite/reaction/gene counts across all GEMs and plot with modelVis.R.
rule modelVis:
    input:
        f'{config["path"]["root"]}/{config["folder"]["GEMs"]}'
    output:
        text = f'{config["path"]["root"]}/{config["folder"]["stats"]}/GEMs.stats',
        plot = f'{config["path"]["root"]}/{config["folder"]["stats"]}/modelVis.pdf'
    message:
        """
        Generate bar plot with GEMs generated across samples and density plots showing number of unique metabolites, reactions, and genes across GEMs.
        """
    shell:
        """
        set +u;source activate {config[envs][metagem]};set -u;
        cd {input}

        echo -e "\nBegin reading models ... \n"
        while read model;do
            id=$(echo $(basename $model)|sed 's/.xml//g');
            # Count unique metabolites, reactions, and non-spontaneous genes in the SBML
            mets=$(less $model| grep "species id="|cut -d ' ' -f 8|sed 's/..$//g'|sort|uniq|wc -l);
            rxns=$(less $model|grep -c 'reaction id=');
            genes=$(less $model|grep 'fbc:geneProduct fbc:id='|grep -vic spontaneous);
            echo "Model: $id has $mets mets, $rxns reactions, and $genes genes ... "
            echo "$id $mets $rxns $genes" >> GEMs.stats;
        done< <(find . -name "*.xml")

        echo -e "\nDone generating GEMs.stats summary file, moving to stats/ folder and running modelVis.R script ... "
        mv GEMs.stats {config[path][root]}/{config[folder][stats]}
        cd {config[path][root]}/{config[folder][stats]}
        Rscript {config[path][root]}/{config[folder][scripts]}/{config[scripts][modelVis]}
        rm Rplots.pdf # Delete redundant pdf file
        echo "Done. "
        """


# Extract EC number annotations from each GEM into per-model .ec files plus a summary.
rule ECvis:
    input:
        f'{config["path"]["root"]}/{config["folder"]["GEMs"]}'
    output:
        directory(f'{config["path"]["root"]}/ecfiles')
    message:
        """
        Get EC information from GEMs. Switch the input folder and grep|sed expressions to match the ec numbers in you model sets. Currently configured for UHGG GEM set.
        """
    shell:
        """
        echo -e "\nCopying GEMs from specified input directory to {config[path][scratch]} ... "
        cp -r {input} {config[path][scratch]}
        cd {config[path][scratch]}
        mkdir ecfiles

        while read model; do
            # Read E.C. numbers from each sbml file and write to a unique file, note that grep expression is hardcoded for specific GEM batches
            less $(basename {input})/$model| grep 'EC Number'| \
                sed 's/^.*: //g'| \
                sed 's/<.*$//g'| \
                sed '/-/d'|sed '/N\/A/d' | \
                sort|uniq -c \
                > ecfiles/$model.ec
            echo -ne "Reading E.C. numbers in model $model, unique E.C. : "
            ECNUM=$(less ecfiles/$model.ec|wc -l)
            echo $ECNUM
        done< <(ls $(basename {input}))

        echo -e "\nMoving ecfiles folder back to {config[path][root]}"
        mv ecfiles {config[path][root]}
        cd {config[path][root]}

        echo -e "\nCreating sorted unique file EC.summary for easy EC inspection ... "
        cat ecfiles/*.ec|awk '{{print $NF}}'|sort|uniq -c > EC.summary
        paste EC.summary
        """


# Move each sample's GEMs into a sample-specific subfolder so that smetana
# can be run per sample via the IDs wildcard.
rule organizeGEMs:
    input:
        f'{config["path"]["root"]}/{config["folder"]["refined"]}'
    message:
        """
        Organizes GEMs into sample specific subfolders, assumes that the refined_bins folder has sample-specific subfolders.
        Necessary to run smetana per sample using the IDs wildcard.
        """
    shell:
        """
        cd {input}
        for folder in */;do
            echo -n "Creating GEM subfolder for sample $folder ... "
            mkdir -p ../{config[folder][GEMs]}/$folder;
            echo -n "moving GEMs ... "
            mv ../{config[folder][GEMs]}/$(echo $folder|sed 's|/||')_*.xml ../{config[folder][GEMs]}/$folder;
            echo "done.
"
        done
        """


# Run SMETANA community metabolic-interaction analysis on one sample's GEMs.
rule smetana:
    input:
        f'{config["path"]["root"]}/{config["folder"]["GEMs"]}/{{IDs}}'
    output:
        f'{config["path"]["root"]}/{config["folder"]["SMETANA"]}/{{IDs}}_detailed.tsv'
    benchmark:
        f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.smetana.benchmark.txt'
    shell:
        """
        # Activate metagem env
        set +u;source activate {config[envs][metagem]};set -u

        # Make sure output folder exists
        mkdir -p $(dirname {output})

        # Make job specific scratch dir
        sampleID=$(echo $(basename {input}))
        echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][SMETANA]}/${{sampleID}} ... "
        mkdir -p {config[path][scratch]}/{config[folder][SMETANA]}/${{sampleID}}

        # Move to tmp dir
        cd {config[path][scratch]}/{config[folder][SMETANA]}/${{sampleID}}

        # Copy media db and GEMs
        cp {config[path][root]}/{config[folder][scripts]}/{config[scripts][carveme]} {input}/*.xml .

        # Run SMETANA
        smetana -o $(basename {input}) --flavor fbc2 \
            --mediadb media_db.tsv -m {config[params][smetanaMedia]} \
            --detailed \
            --solver {config[params][smetanaSolver]} -v *.xml

        # Copy results to output folder
        cp *.tsv $(dirname {output})
        """


# Tabulate, per sample and per medium, how many SMETANA interactions were predicted.
rule interactionVis:
    input:
        f'{config["path"]["root"]}/{config["folder"]["SMETANA"]}'
    shell:
        """
        cd {input}
        mv media_db.tsv ../scripts/
        cat *.tsv|sed '/community/d' > smetana.all
        less smetana.all |cut -f2|sort|uniq > media.txt
        # FIX: 'll' is an interactive alias unavailable in non-interactive shells -> use 'ls -l';
        # awk braces must be doubled so Snakemake's formatter does not treat them as wildcards.
        ls -l|grep tsv|awk '{{print $NF}}'|sed 's/_.*$//g'>samples.txt
        while read sample;do
            echo -n "$sample ";
            while read media;do
                var=$(less smetana.all|grep $sample|grep -c $media);
                echo -n "$var " ;
            done < media.txt;
            echo "";
        done < samples.txt > sampleMedia.stats
        """


# Run memote quality reports (HTML snapshot + JSON) on one GEM.
rule memote:
    input:
        f'{config["path"]["root"]}/{config["folder"]["GEMs"]}/{{gemIDs}}.xml'
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["memote"]}/{{gemIDs}}')
    benchmark:
        f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{gemIDs}}.memote.benchmark.txt'
    shell:
        """
        # Activate metagem env
        set +u;source activate {config[envs][metagem]};set -u

        # Make sure output folder exists
        mkdir -p {output}

        # Make job specific scratch dir
        gemID=$(echo $(basename {input})|sed 's/.xml//g')
        echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][memote]}/${{gemID}} ... "
        mkdir -p {config[path][scratch]}/{config[folder][memote]}/${{gemID}}

        # Move to tmp dir
        cd {config[path][scratch]}/{config[folder][memote]}/${{gemID}}

        # Copy GEM to tmp
        cp {input} .

        # Uncomment the following line in case errors are raised about missing git module,
        # also ensure that module name matches that of your cluster
        # module load git

        # Run memote
        memote report snapshot --skip test_find_metabolites_produced_with_closed_bounds --skip test_find_metabolites_consumed_with_closed_bounds --skip test_find_metabolites_not_produced_with_open_bounds --skip test_find_metabolites_not_consumed_with_open_bounds --skip test_find_incorrect_thermodynamic_reversibility --filename $(echo $(basename {input})|sed 's/.xml/.html/') *.xml
        memote run --skip test_find_metabolites_produced_with_closed_bounds --skip test_find_metabolites_consumed_with_closed_bounds --skip test_find_metabolites_not_produced_with_open_bounds --skip test_find_metabolites_not_consumed_with_open_bounds --skip test_find_incorrect_thermodynamic_reversibility *.xml

        # Rename output file with sample ID
        mv result.json.gz $(echo $(basename {input})|sed 's/.xml/.json.gz/')

        # Move results to output folder
        mv *.gz *.html {output}
        """


# Estimate in-situ growth rates of one sample's MAGs with GRiD.
rule grid:
    input:
        bins = f'{config["path"]["root"]}/{config["folder"]["reassembled"]}/{{IDs}}/reassembled_bins',
        R1 = rules.qfilter.output.R1,
        R2 = rules.qfilter.output.R2
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["GRiD"]}/{{IDs}}')
    benchmark:
        f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.grid.benchmark.txt'
    shell:
        """
        set +u;source activate {config[envs][metagem]};set -u

        cp -r {input.bins} {input.R1} {input.R2} {config[path][scratch]}
        cd {config[path][scratch]}

        # Interleave reads into a single fastq.gz named after the sample
        cat *.gz > $(basename $(dirname {input.bins})).fastq.gz
        rm $(basename {input.R1}) $(basename {input.R2})

        mkdir MAGdb out
        update_database -d MAGdb -g $(basename {input.bins}) -p MAGdb
        rm -r $(basename {input.bins})

        grid multiplex -r . -e fastq.gz -d MAGdb -p -c 0.2 -o out -n {config[cores][grid]}

        rm $(basename $(dirname {input.bins})).fastq.gz
        mkdir {output}
        mv out/* {output}
        """


# Flatten the reassembled dna fasta bins into dna_bins/<sample>/ with renamed files.
rule extractDnaBins:
    message:
        """
        Extract dna fasta files for each bin from reassembly output, place into sample specific subdirectories within the dna_bins folder
        """
    shell:
        """
        # Move into root dir
        cd {config[path][root]}

        # Make sure dnaBins folder exists
        mkdir -p {config[folder][dnaBins]}

        # Copy files
        echo -e "Begin copying and renaming dna fasta bins from reassembled_bins/ to dna_bins/ ... \n"
        for folder in reassembled_bins/*/;do # Loop through each sample
            sample=$(echo $(basename $folder));
            mkdir -p {config[path][root]}/{config[folder][dnaBins]}/$sample
            echo "Copying bins from sample $sample ... "
            for bin in $folder*reassembled_bins/*;do # Loop through each bin
                # Build flat file name, shortening orig/permissive/strict to o/p/s
                var=$(echo $bin| sed 's|reassembled_bins/||g'|sed 's|/|_|g'|sed 's/permissive/p/g'|sed 's/orig/o/g'|sed 's/strict/s/g');
                cp $bin {config[path][root]}/{config[folder][dnaBins]}/$sample/$var;
            done;
        done
        """


# Annotate one dna bin with prokka for downstream pangenome analysis.
rule prokka:
    input:
        bins = f'{config["path"]["root"]}/{config["folder"]["dnaBins"]}/{{binIDs}}.fa'
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["pangenome"]}/prokka/unorganized/{{binIDs}}')
    benchmark:
        f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{binIDs}}.prokka.benchmark.txt'
    shell:
        """
        set +u;source activate {config[envs][prokkaroary]};set -u
        mkdir -p $(dirname $(dirname {output}))
        mkdir -p $(dirname {output})
        cp {input} {config[path][scratch]}
        cd {config[path][scratch]}
        id=$(echo $(basename {input})|sed "s/.fa//g")
        prokka -locustag $id --cpus {config[cores][prokka]} --centre MAG --compliant -outdir prokka/$id -prefix $id $(basename {input})
        mv prokka/$id $(dirname {output})
        """


# Build a pangenome for one species cluster from its prokka GFFs with roary.
rule roary:
    input:
        f'{config["path"]["root"]}/{config["folder"]["pangenome"]}/prokka/organized/{{speciesIDs}}/'
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["pangenome"]}/roary/{{speciesIDs}}/')
    benchmark:
        f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{speciesIDs}}.roary.benchmark.txt'
    shell:
        """
        set +u;source activate {config[envs][prokkaroary]};set -u
        mkdir -p $(dirname {output})
        cd {config[path][scratch]}
        cp -r {input} .
        roary -s -p {config[cores][roary]} -i {config[params][roaryI]} -cd {config[params][roaryCD]} -f yes_al -e -v $(basename {input})/*.gff
        cd yes_al
        create_pan_genome_plots.R
        cd ..
        mkdir -p {output}
        mv yes_al/* {output}
        """


rule run_prodigal:
    """Use Prodigal for coding genes predictions in contigs."""
    input:
        f'{config["path"]["root"]}/{config["folder"]["assemblies"]}/{{IDs}}/contigs.fasta.gz'
    output:
        gff = f'{config["path"]["root"]}/{config["folder"]["prodigal"]}/{{IDs}}/{{IDs}}_genes.gff',
        faa = f'{config["path"]["root"]}/{config["folder"]["prodigal"]}/{{IDs}}/{{IDs}}_genes_prot.fa',
        fna = f'{config["path"]["root"]}/{config["folder"]["prodigal"]}/{{IDs}}/{{IDs}}_genes_nucl.fa',
        log = f'{config["path"]["root"]}/{config["folder"]["prodigal"]}/{{IDs}}/{{IDs}}_log.out'
    shell:
        """
        mkdir -p $(dirname {output.gff})
        prodigal -i <(gunzip -c {input}) -o {output.gff} -a {output.faa} -d {output.fna} -p meta &> {output.log}
        """


rule run_blastp:
    """Use Diamond blastp for searching against coding genes predictions from contigs.
Uses snakemake wrapper: https://snakemake-wrappers.readthedocs.io/en/stable/wrappers/diamond/blastp.html""" input: fname_fasta=f'{config["path"]["root"]}/{config["folder"]["prodigal"]}/{{IDs}}/{{IDs}}_genes_prot.fa', fname_db=f'{config["path"]["root"]}/{config["folder"]["blastp_db"]}' output: fname=f'{config["path"]["root"]}/{config["folder"]["blastp"]}/{{IDs}}.xml' threads: 8 wrapper: "https://github.com/snakemake/snakemake-wrappers/raw/0.80.1/bio/diamond/blastp" ================================================ FILE: workflow/envs/metaGEM_env.yml ================================================ name: metagem channels: - conda-forge - bioconda - defaults dependencies: - bedtools>=2.29.2 - bwa>=0.7.17 - concoct>=1.1.0 - diamond>=2.0.6 - fastp>=0.20.1 - gtdbtk>=1.4.0 - maxbin2>=2.2.7 - megahit>=1.2.9 - metabat2>=2.15 - r-base>=3.5.1 - r-gridextra>=2.2.1 - r-tidyverse - r-tidytext - samtools>=1.9 - snakemake>=5.10.0,<5.31.1 ================================================ FILE: workflow/envs/metaGEM_env_long.yml ================================================ name: metagem channels: - conda-forge - bioconda - defaults dependencies: - _libgcc_mutex=0.1=conda_forge - _openmp_mutex=4.5=1_gnu - _r-mutex=1.0.1=anacondar_1 - aioeasywebdav=2.4.0=py38h32f6830_1001 - aiohttp=3.7.3=py38h497a2fe_0 - amply=0.1.4=py_0 - appdirs=1.4.4=pyh9f0ad1d_0 - aragorn=1.2.38=h516909a_3 - async-timeout=3.0.1=py_1000 - attrs=20.3.0=pyhd3deb0d_0 - backports=1.0=py_2 - backports.functools_lru_cache=1.6.1=py_0 - bamtools=2.5.1=he513fc3_6 - barrnap=0.9=2 - bcrypt=3.2.0=py38h1e0a361_1 - bedtools=2.29.2=hc088bd4_0 - binutils_impl_linux-64=2.35.1=h193b22a_1 - binutils_linux-64=2.35=hc3fd857_29 - biopython=1.78=py38h25fe258_1 - blas=1.0=mkl - blast=2.10.1=pl526he19e7b1_3 - boost=1.70.0=py38h9de70de_1 - boost-cpp=1.70.0=h8e57a91_2 - boto3=1.16.50=pyhd8ed1ab_0 - botocore=1.19.50=pyhd8ed1ab_0 - bowtie2=2.4.2=py38h1c8e9b9_1 - brotlipy=0.7.0=py38h8df0ef7_1001 - bwa=0.7.17=hed695b0_7 - bwidget=1.9.14=0 - 
bzip2=1.0.8=h7f98852_4 - c-ares=1.17.1=h36c2ea0_0 - ca-certificates=2020.12.8=h06a4308_0 - cachetools=4.2.0=pyhd3eb1b0_0 - cairo=1.16.0=hcf35c78_1003 - capnproto=0.6.1=hfc679d8_1 - cd-hit=4.8.1=h8b12597_3 - certifi=2020.12.5=py38h578d9bd_1 - cffi=1.14.4=py38ha312104_0 - chardet=3.0.4=py38h924ce5b_1008 - coincbc=2.10.5=hab63836_1 - concoct=1.1.0=py38h7be5676_2 - conda=4.9.2=py38h578d9bd_0 - conda-package-handling=1.7.2=py38h8df0ef7_0 - configargparse=1.2.3=pyh9f0ad1d_0 - cryptography=3.3.1=py38h2b97feb_0 - curl=7.71.1=he644dc0_8 - cython=0.29.21=py38h348cfbe_1 - datrie=0.8.2=py38h1e0a361_1 - dbus=1.13.18=hb2f20db_0 - decorator=4.4.2=py_0 - dendropy=4.5.1=pyh3252c3a_0 - diamond=2.0.6=h56fc30b_0 - docutils=0.16=py38h924ce5b_2 - dropbox=10.10.0=py38h06a4308_0 - entrez-direct=13.9=pl526h375a9b1_0 - ete3=3.1.2=pyh9f0ad1d_0 - eukcc=0.2=py_0 - eukrep=0.6.7=pyh864c0ab_1 - expat=2.2.10=he6710b0_2 - fastani=1.32=he1c1bb9_0 - fastp=0.20.1=h8b12597_0 - fasttree=2.1.10=h516909a_4 - fftw=3.3.9=h27cfd23_1 - filechunkio=1.8=py_2 - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 - font-ttf-inconsolata=2.001=hab24e00_0 - font-ttf-source-code-pro=2.030=hab24e00_0 - font-ttf-ubuntu=0.83=hab24e00_0 - fontconfig=2.13.1=h86ecdb6_1001 - fonts-conda-forge=1=0 - fraggenescan=1.31=h516909a_2 - freetype=2.10.4=he06d7ca_0 - fribidi=1.0.10=h516909a_0 - ftputil=4.0.0=py_0 - future=0.18.2=py38h578d9bd_2 - gcc_impl_linux-64=7.5.0=hd9e1a51_17 - gcc_linux-64=7.5.0=he2a3fca_29 - gdk-pixbuf=2.42.2=h19a9c64_1 - gettext=0.19.8.1=hf34092f_1004 - gfortran_impl_linux-64=7.5.0=hfca37b7_17 - gfortran_linux-64=7.5.0=ha081f1e_29 - ghostscript=9.53.3=h58526e2_2 - giflib=5.2.1=h516909a_2 - gitdb=4.0.5=py_0 - gitpython=3.1.12=pyhd8ed1ab_0 - glib=2.66.3=h58526e2_0 - google-api-core=1.24.1=pyhd3deb0d_0 - google-api-python-client=1.12.8=pyhd3deb0d_0 - google-auth=1.24.0=pyhd3deb0d_0 - google-auth-httplib2=0.0.4=pyh9f0ad1d_0 - google-cloud-core=1.5.0=pyhd3deb0d_0 - google-cloud-storage=1.35.0=pyhd3deb0d_0 - 
google-crc32c=1.1.0=py38h8838a9a_0 - google-resumable-media=1.2.0=pyhd3deb0d_0 - googleapis-common-protos=1.52.0=py38h578d9bd_1 - graphite2=1.3.14=h23475e2_0 - graphviz=2.42.3=h0511662_0 - grpcio=1.34.0=py38hdd6454d_0 - gsl=2.6=he838d99_1 - gst-plugins-base=1.14.5=h0935bb2_2 - gstreamer=1.14.5=h36ae1b5_2 - gtdbtk=1.4.0=py_0 - gxx_impl_linux-64=7.5.0=h7ea4de1_17 - gxx_linux-64=7.5.0=h547f3ba_29 - h5py=2.10.0=nompi_py38h513d04c_102 - harfbuzz=2.4.0=h9f30f68_3 - hdf5=1.10.5=nompi_h7c3c948_1111 - hmmer=3.1b2=3 - htslib=1.9=h244ad75_9 - httplib2=0.18.1=pyh9f0ad1d_0 - icu=64.2=he1b5a44_1 - idba=1.1.3=1 - idna=2.10=pyh9f0ad1d_0 - imagemagick=7.0.10_6=pl526ha9fe49d_0 - importlib-metadata=3.3.0=py38h578d9bd_2 - importlib_metadata=3.3.0=hd8ed1ab_2 - infernal=1.1.3=h516909a_0 - ipython_genutils=0.2.0=py_1 - jbig=2.1=h516909a_2002 - jinja2=2.11.2=pyh9f0ad1d_0 - jmespath=0.10.0=pyh9f0ad1d_0 - joblib=1.0.0=pyhd8ed1ab_0 - jpeg=9d=h516909a_0 - jsonschema=3.2.0=py38h32f6830_1 - jupyter_core=4.7.0=py38h578d9bd_0 - kallisto=0.46.2=h4f7b962_1 - kernel-headers_linux-64=2.6.32=h77966d4_13 - kpal=2.1.1=pyh864c0ab_3 - krb5=1.17.2=h926e7f8_0 - ld_impl_linux-64=2.35.1=hea4e1c9_1 - libarchive=3.3.3=hddc7a2b_1008 - libblas=3.9.0=1_h6e990d7_netlib - libcblas=3.9.0=3_h893e4fe_netlib - libclang=9.0.1=default_hde54327_0 - libcrc32c=1.1.1=he1b5a44_2 - libcurl=7.71.1=hcdd3856_8 - libdeflate=1.3=h516909a_0 - libedit=3.1.20191231=h46ee950_2 - libev=4.33=h516909a_1 - libffi=3.2.1=he1b5a44_1007 - libgcc=7.2.0=h69d50b8_2 - libgcc-devel_linux-64=7.5.0=h42c25f5_17 - libgcc-ng=9.3.0=h5dbcf3e_17 - libgd=2.2.5=h307a58e_1007 - libgenome=1.3.1=hc9558a2_2 - libgfortran-ng=7.5.0=hae1eefd_17 - libgfortran4=7.5.0=hae1eefd_17 - libglib=2.66.3=hbe7bbb4_0 - libgomp=9.3.0=h5dbcf3e_17 - libiconv=1.16=h516909a_0 - libidn11=1.34=h1cef754_0 - liblapack=3.9.0=3_h893e4fe_netlib - libllvm9=9.0.1=hf817b99_2 - libmems=1.6.0=h78a066a_2 - libmuscle=3.7=h470a237_1 - libnghttp2=1.41.0=h8cfc5f6_2 - libpng=1.6.37=hed695b0_2 - 
libprotobuf=3.14.0=h780b84a_0 - librsvg=2.50.2=h3442318_1 - libsodium=1.0.18=h516909a_1 - libsolv=0.7.16=h8b12597_0 - libssh2=1.9.0=hab1572f_5 - libstdcxx-devel_linux-64=7.5.0=h4084dd6_17 - libstdcxx-ng=9.3.0=h2ae2ef3_17 - libtiff=4.1.0=hc3755c2_3 - libtool=2.4.6=h58526e2_1007 - libuuid=2.32.1=h14c3975_1000 - libwebp=1.0.2=h56121f0_5 - libxcb=1.14=h7b6447c_0 - libxkbcommon=0.10.0=he1b5a44_0 - libxml2=2.9.10=hee79883_0 - libxslt=1.1.33=h31b3aaa_0 - llvm-openmp=8.0.1=hc9558a2_0 - lxml=4.6.2=py38hf1fe3a4_0 - lz4-c=1.9.2=he1b5a44_3 - lzo=2.10=h516909a_1000 - mafft=7.475=h516909a_0 - make=4.3=hd18ef5c_1 - mamba=0.7.6=py38h4c9354d_0 - markupsafe=1.1.1=py38h8df0ef7_2 - mash=2.2.2=ha61e061_2 - mauve=2.4.0.snapshot_2015_02_13=h2688d6d_2 - mauvealigner=1.2.0=h8b68381_1 - maxbin2=2.2.7=he1b5a44_2 - mcl=14.137=pl526h516909a_5 - megahit=1.2.9=h8b12597_0 - metabat2=2.15=h137b6e9_0 - metasnv=1.0.3=h230ddbb_2 - minced=0.4.2=0 - motus=2.5.1=py_0 - multidict=5.1.0=py38h497a2fe_0 - nbformat=5.0.8=py_0 - ncurses=6.1=hf484d3e_1002 - networkx=2.5=py_0 - nose=1.3.7=py38h32f6830_1004 - nspr=4.29=he1b5a44_1 - nss=3.55=he751ad9_0 - numpy=1.19.5=py38h18fd61f_0 - oauth2client=4.1.3=py_0 - openjdk=8.0.192=h516909a_1005 - openjpeg=2.3.1=hf7af979_3 - openmp=8.0.1=0 - openssl=1.1.1i=h7f98852_0 - pandas=1.2.0=py38h51da96c_0 - pango=1.42.4=h7062337_4 - parallel=20201122=ha770c72_0 - paramiko=2.7.2=pyh9f0ad1d_0 - pcre=8.44=he1b5a44_0 - pcre2=10.35=h032f7d1_2 - perl=5.26.2=h36c2ea0_1008 - perl-aceperl=1.92=pl526_2 - perl-algorithm-diff=1.1903=pl526_2 - perl-algorithm-munkres=0.08=pl526_1 - perl-apache-test=1.40=pl526_1 - perl-app-cpanminus=1.7044=pl526_1 - perl-appconfig=1.71=pl526_1 - perl-archive-tar=2.32=pl526_0 - perl-array-compare=3.0.1=pl526_1 - perl-array-utils=0.5=pl526_2 - perl-autoloader=5.74=pl526_2 - perl-base=2.23=pl526_1 - perl-bio-asn1-entrezgene=1.73=pl526_0 - perl-bio-featureio=1.6.905=pl526_1 - perl-bio-phylo=0.58=pl526_1 - perl-bio-samtools=1.43=pl526h1341992_1 - 
perl-bioperl=1.6.924=6 - perl-bioperl-core=1.6.924=1 - perl-bioperl-run=1.007002=pl526_3 - perl-business-isbn=3.004=pl526_0 - perl-business-isbn-data=20140910.003=pl526_0 - perl-cache-cache=1.08=pl526_0 - perl-capture-tiny=0.48=pl526_0 - perl-carp=1.38=pl526_3 - perl-cgi=4.44=pl526h14c3975_1 - perl-class-data-inheritable=0.08=pl526_1 - perl-class-inspector=1.34=pl526_0 - perl-class-load=0.25=pl526_0 - perl-class-load-xs=0.10=pl526h6bb024c_2 - perl-class-method-modifiers=2.12=pl526_0 - perl-clone=0.42=pl526h516909a_0 - perl-common-sense=3.74=pl526_2 - perl-compress-raw-bzip2=2.087=pl526he1b5a44_0 - perl-compress-raw-zlib=2.087=pl526hc9558a2_0 - perl-constant=1.33=pl526_1 - perl-convert-binary-c=0.78=pl526h6bb024c_3 - perl-convert-binhex=1.125=pl526_1 - perl-crypt-rc4=2.02=pl526_1 - perl-data-dumper=2.173=pl526_0 - perl-data-optlist=0.110=pl526_2 - perl-data-stag=0.14=pl526_1 - perl-date-format=2.30=pl526_2 - perl-dbd-sqlite=1.64=pl526h516909a_0 - perl-dbi=1.642=pl526_0 - perl-devel-globaldestruction=0.14=pl526_0 - perl-devel-overloadinfo=0.005=pl526_0 - perl-devel-stacktrace=2.04=pl526_0 - perl-digest-hmac=1.03=pl526_3 - perl-digest-md5=2.55=pl526_0 - perl-digest-perl-md5=1.9=pl526_1 - perl-digest-sha1=2.13=pl526h6bb024c_1 - perl-dist-checkconflicts=0.11=pl526_2 - perl-dynaloader=1.25=pl526_1 - perl-email-date-format=1.005=pl526_2 - perl-encode=2.88=pl526_1 - perl-encode-locale=1.05=pl526_6 - perl-env-path=0.19=pl526_2 - perl-error=0.17027=pl526_1 - perl-eval-closure=0.14=pl526h6bb024c_4 - perl-exception-class=1.44=pl526_0 - perl-exporter=5.72=pl526_1 - perl-exporter-tiny=1.002001=pl526_0 - perl-extutils-makemaker=7.36=pl526_1 - perl-file-find-rule=0.34=pl526_5 - perl-file-grep=0.02=pl526_3 - perl-file-listing=6.04=pl526_1 - perl-file-path=2.16=pl526_0 - perl-file-slurp-tiny=0.004=pl526_1 - perl-file-slurper=0.012=pl526_0 - perl-file-sort=1.01=pl526_2 - perl-file-temp=0.2304=pl526_2 - perl-file-which=1.23=pl526_0 - perl-font-afm=1.20=pl526_2 - 
perl-font-ttf=1.06=pl526_0 - perl-gd=2.71=pl526he860b03_0 - perl-getopt-long=2.50=pl526_1 - perl-graph=0.9704=pl526_1 - perl-graph-readwrite=2.09=pl526_2 - perl-graphviz=2.24=pl526h734ff71_0 - perl-html-element-extended=1.18=pl526_1 - perl-html-entities-numbered=0.04=pl526_1 - perl-html-formatter=2.16=pl526_0 - perl-html-parser=3.72=pl526h6bb024c_5 - perl-html-tableextract=2.13=pl526_2 - perl-html-tagset=3.20=pl526_3 - perl-html-tidy=1.60=pl526_0 - perl-html-tree=5.07=pl526_1 - perl-html-treebuilder-xpath=0.14=pl526_1 - perl-http-cookies=6.04=pl526_0 - perl-http-daemon=6.01=pl526_1 - perl-http-date=6.02=pl526_3 - perl-http-message=6.18=pl526_0 - perl-http-negotiate=6.01=pl526_3 - perl-image-info=1.38=pl526_1 - perl-image-size=3.300=pl526_2 - perl-io-compress=2.087=pl526he1b5a44_0 - perl-io-html=1.001=pl526_2 - perl-io-sessiondata=1.03=pl526_1 - perl-io-socket-ssl=2.066=pl526_0 - perl-io-string=1.08=pl526_3 - perl-io-stringy=2.111=pl526_1 - perl-io-tty=1.12=pl526_1 - perl-io-zlib=1.10=pl526_2 - perl-ipc-run=20180523.0=pl526_0 - perl-ipc-sharelite=0.17=pl526h6bb024c_1 - perl-jcode=2.07=pl526_2 - perl-json=4.02=pl526_0 - perl-json-xs=2.34=pl526h6bb024c_3 - perl-lib=0.63=pl526_1 - perl-libwww-perl=6.39=pl526_0 - perl-libxml-perl=0.08=pl526_2 - perl-list-moreutils=0.428=pl526_1 - perl-list-moreutils-xs=0.428=pl526_0 - perl-log-log4perl=1.49=pl526_0 - perl-lwp-mediatypes=6.04=pl526_0 - perl-lwp-protocol-https=6.07=pl526_4 - perl-lwp-simple=6.15=pl526h470a237_4 - perl-mailtools=2.21=pl526_0 - perl-math-cdf=0.1=pl526h14c3975_5 - perl-math-derivative=1.01=pl526_0 - perl-math-random=0.72=pl526h14c3975_2 - perl-math-spline=0.02=pl526_2 - perl-mime-base64=3.15=pl526_1 - perl-mime-lite=3.030=pl526_1 - perl-mime-tools=5.508=pl526_1 - perl-mime-types=2.17=pl526_0 - perl-mldbm=2.05=pl526_1 - perl-module-implementation=0.09=pl526_2 - perl-module-runtime=0.016=pl526_1 - perl-module-runtime-conflicts=0.003=pl526_0 - perl-moo=2.003004=pl526_0 - perl-moose=2.2011=pl526hf484d3e_1 - 
perl-mozilla-ca=20180117=pl526_1 - perl-mro-compat=0.13=pl526_0 - perl-net-http=6.19=pl526_0 - perl-net-ssleay=1.88=pl526h90d6eec_0 - perl-ntlm=1.09=pl526_4 - perl-number-compare=0.03=pl526_2 - perl-ole-storage_lite=0.19=pl526_3 - perl-package-deprecationmanager=0.17=pl526_0 - perl-package-stash=0.38=pl526hf484d3e_1 - perl-package-stash-xs=0.28=pl526hf484d3e_1 - perl-params-util=1.07=pl526h6bb024c_4 - perl-parent=0.236=pl526_1 - perl-parse-recdescent=1.967015=pl526_0 - perl-parse-yapp=1.21=pl526_0 - perl-pathtools=3.75=pl526h14c3975_1 - perl-pdf-api2=2.035=pl526_0 - perl-perlio-utf8_strict=0.007=pl526h6bb024c_1 - perl-pod-escapes=1.07=pl526_1 - perl-pod-usage=1.69=pl526_1 - perl-postscript=0.06=pl526_2 - perl-role-tiny=2.000008=pl526_0 - perl-scalar-list-utils=1.52=pl526h516909a_0 - perl-set-scalar=1.29=pl526_2 - perl-soap-lite=1.19=pl526_1 - perl-socket=2.027=pl526_1 - perl-sort-naturally=1.03=pl526_2 - perl-spreadsheet-parseexcel=0.65=pl526_2 - perl-spreadsheet-writeexcel=2.40=pl526_2 - perl-statistics-descriptive=3.0702=pl526_0 - perl-storable=3.15=pl526h14c3975_0 - perl-sub-exporter=0.987=pl526_2 - perl-sub-exporter-progressive=0.001013=pl526_0 - perl-sub-identify=0.14=pl526h14c3975_0 - perl-sub-install=0.928=pl526_2 - perl-sub-name=0.21=pl526_1 - perl-sub-quote=2.006003=pl526_1 - perl-sub-uplevel=0.2800=pl526h14c3975_2 - perl-svg=2.84=pl526_0 - perl-svg-graph=0.02=pl526_3 - perl-task-weaken=1.06=pl526_0 - perl-template-toolkit=2.26=pl526_1 - perl-test=1.26=pl526_1 - perl-test-builder-tester=1.23_002=pl526_1 - perl-test-deep=1.128=pl526_1 - perl-test-differences=0.67=pl526_0 - perl-test-exception=0.43=pl526_2 - perl-test-files=0.14=pl526_2 - perl-test-harness=3.42=pl526_0 - perl-test-leaktrace=0.16=pl526h14c3975_2 - perl-test-most=0.35=pl526_0 - perl-test-output=1.031=pl526_0 - perl-test-requiresinternet=0.05=pl526_0 - perl-test-warn=0.36=pl526_1 - perl-text-csv=2.00=pl526_0 - perl-text-diff=1.45=pl526_0 - perl-text-glob=0.11=pl526_1 - perl-threaded=5.26.0=0 - 
perl-tie-ixhash=1.23=pl526_2 - perl-time-hires=1.9760=pl526h14c3975_1 - perl-time-local=1.28=pl526_1 - perl-timedate=2.30=pl526_1 - perl-tree-dag_node=1.31=pl526_0 - perl-try-tiny=0.30=pl526_1 - perl-type-tiny=1.004004=pl526_0 - perl-types-serialiser=1.0=pl526_2 - perl-unicode-map=0.112=pl526h6bb024c_3 - perl-uri=1.76=pl526_0 - perl-www-robotrules=6.02=pl526_3 - perl-xml-dom=1.46=pl526_0 - perl-xml-dom-xpath=0.14=pl526_1 - perl-xml-filter-buffertext=1.01=pl526_2 - perl-xml-libxml=2.0132=pl526h7ec2d77_1 - perl-xml-libxslt=1.94=pl526_1 - perl-xml-namespacesupport=1.12=pl526_0 - perl-xml-parser=2.44_01=pl526ha1d75be_1002 - perl-xml-regexp=0.04=pl526_2 - perl-xml-sax=1.02=pl526_0 - perl-xml-sax-base=1.09=pl526_0 - perl-xml-sax-expat=0.51=pl526_3 - perl-xml-sax-writer=0.57=pl526_0 - perl-xml-simple=2.25=pl526_1 - perl-xml-twig=3.52=pl526_2 - perl-xml-writer=0.625=pl526_2 - perl-xml-xpath=1.44=pl526_0 - perl-xml-xpathengine=0.14=pl526_2 - perl-xsloader=0.24=pl526_0 - perl-yaml=1.29=pl526_0 - pip=20.3.3=pyhd8ed1ab_0 - pixman=0.38.0=h516909a_1003 - pkg-config=0.29.2=h516909a_1008 - pplacer=1.1.alpha19=1 - prank=v.170427=hc9558a2_3 - prettytable=2.0.0=pyhd8ed1ab_0 - prodigal=2.6.3=h516909a_2 - prokka=1.13=2 - protobuf=3.14.0=py38h709712a_0 - psutil=5.8.0=py38h497a2fe_0 - pulp=2.3.1=py38h32f6830_0 - pyasn1=0.4.8=py_0 - pyasn1-modules=0.2.8=py_0 - pycosat=0.6.3=py38h8df0ef7_1005 - pycparser=2.20=pyh9f0ad1d_2 - pyfaidx=0.5.9.2=pyh3252c3a_0 - pygments=2.7.3=pyhd8ed1ab_0 - pygmes=0.1.7=py_0 - pygraphviz=1.6=py38h25c7686_1 - pynacl=1.4.0=py38h1e0a361_2 - pyopenssl=20.0.1=pyhd8ed1ab_0 - pyparsing=2.4.7=pyh9f0ad1d_0 - pyqt=5.12.3=py38ha8c2ead_3 - pyrsistent=0.17.3=py38h25fe258_1 - pysftp=0.2.9=py_1 - pysocks=1.7.1=py38h924ce5b_2 - python=3.8.5=h425cb1d_2_cpython - python-dateutil=2.8.1=py_0 - python-irodsclient=0.8.2=py_0 - python_abi=3.8=1_cp38 - pytz=2020.5=pyhd8ed1ab_0 - pyyaml=5.3.1=py38h8df0ef7_1 - qt=5.12.5=hd8c4c69_1 - r-ade4=1.7_16=r40h30ea16f_1 - r-ape=5.4_1=r40h51c796c_0 
- r-assertthat=0.2.1=r40h6115d3f_2 - r-backports=1.2.1=r40hcfec24a_0 - r-base=4.0.2=h95c6c4b_0 - r-bitops=1.0_6=r40hcdcec82_1004 - r-brio=1.1.0=r40h9e2df91_1 - r-callr=3.5.1=r40h142f84f_0 - r-catools=1.18.0=r40h0357c0b_1 - r-cli=2.2.0=r40hc72bb7e_0 - r-colorspace=2.0_0=r40h9e2df91_0 - r-crayon=1.3.4=r40h6115d3f_1003 - r-data.table=1.13.6=r40hcfec24a_0 - r-desc=1.2.0=r40h6115d3f_1003 - r-diffobj=0.3.3=r40hcfec24a_0 - r-digest=0.6.27=r40h1b71b39_0 - r-dplyr=1.0.2=r40h0357c0b_0 - r-dynamictreecut=1.63_1=r40h6115d3f_1003 - r-ellipsis=0.3.1=r40hcdcec82_0 - r-evaluate=0.14=r40h6115d3f_2 - r-fansi=0.4.1=r40hcdcec82_1 - r-farver=2.0.3=r40h0357c0b_1 - r-generics=0.1.0=r40hc72bb7e_0 - r-getopt=1.20.3=r40_2 - r-ggplot2=3.3.3=r40hc72bb7e_0 - r-glue=1.4.2=r40hcdcec82_0 - r-gplots=3.1.1=r40hc72bb7e_0 - r-gsubfn=0.7=r40h6115d3f_1002 - r-gtable=0.3.0=r40h6115d3f_3 - r-gtools=3.8.2=r40hcdcec82_1 - r-hms=0.5.3=r40h6115d3f_1 - r-isoband=0.2.3=r40h03ef668_0 - r-jsonlite=1.7.2=r40hcfec24a_0 - r-kernsmooth=2.23_18=r40h7679c2e_0 - r-labeling=0.4.2=r40h142f84f_0 - r-lattice=0.20_41=r40hcdcec82_2 - r-lifecycle=0.2.0=r40h6115d3f_1 - r-magrittr=2.0.1=r40h9e2df91_1 - r-mass=7.3_53=r40hcdcec82_0 - r-matrix=1.3_2=r40he454529_0 - r-mgcv=1.8_33=r40h7fa42b6_0 - r-munsell=0.5.0=r40h6115d3f_1003 - r-nlme=3.1_150=r40h31ca83e_0 - r-pillar=1.4.7=r40hc72bb7e_0 - r-pixmap=0.4_11=r40h6115d3f_1003 - r-pkgbuild=1.2.0=r40hc72bb7e_0 - r-pkgconfig=2.0.3=r40h6115d3f_1 - r-pkgload=1.1.0=r40h0357c0b_0 - r-praise=1.0.0=r40h6115d3f_1004 - r-prettyunits=1.1.1=r40h6115d3f_1 - r-processx=3.4.5=r40hcfec24a_0 - r-progress=1.2.2=r40h6115d3f_2 - r-proto=1.0.0=r40_2003 - r-ps=1.5.0=r40hcfec24a_0 - r-purrr=0.3.4=r40hcdcec82_1 - r-r6=2.5.0=r40hc72bb7e_0 - r-rcolorbrewer=1.1_2=r40h6115d3f_1003 - r-rcpp=1.0.5=r40he524a50_0 - r-rematch2=2.1.2=r40h6115d3f_1 - r-rlang=0.4.10=r40hcfec24a_0 - r-rprojroot=2.0.2=r40hc72bb7e_0 - r-rstudioapi=0.13=r40hc72bb7e_0 - r-scales=1.1.1=r40h6115d3f_0 - r-segmented=1.3_1=r40hc72bb7e_0 - 
r-seqinr=4.2_5=r40hcfec24a_0 - r-sp=1.4_2=r40hcdcec82_0 - r-testthat=3.0.1=r40h03ef668_0 - r-tibble=3.0.4=r40h0eb13af_0 - r-tidyselect=1.1.0=r40h6115d3f_0 - r-utf8=1.1.4=r40hcdcec82_1003 - r-vctrs=0.3.6=r40hcfec24a_0 - r-viridislite=0.3.0=r40h6115d3f_1003 - r-waldo=0.2.3=r40hc72bb7e_0 - r-withr=2.3.0=r40h6115d3f_0 - r-zeallot=0.1.0=r40h6115d3f_1002 - ratelimiter=1.2.0=py38h32f6830_1001 - readline=8.0=h46ee950_1 - reproc=14.2.1=h36c2ea0_0 - reproc-cpp=14.2.1=h58526e2_0 - requests=2.25.1=pyhd3deb0d_0 - roary=3.7.0=0 - rsa=4.6=pyh9f0ad1d_0 - ruamel_yaml=0.15.87=py38h7b6447c_1 - s3transfer=0.3.3=py38h32f6830_2 - samtools=1.9=h10a08f8_12 - scikit-learn=0.24.0=py38h658cfdd_0 - scipy=1.5.3=py38h828c644_0 - sed=4.8=he412f7d_0 - semantic_version=2.8.5=pyh9f0ad1d_0 - setuptools=51.0.0=py38h06a4308_2 - simplejson=3.17.2=py38h497a2fe_1 - six=1.15.0=pyh9f0ad1d_0 - slacker=0.14.0=py_0 - smeg=1.1.1=0 - smmap=3.0.4=pyh9f0ad1d_0 - snakemake=5.31.1=0 - snakemake-minimal=5.31.1=py_0 - sqlite=3.32.3=hcee41ef_1 - sysroot_linux-64=2.12=h77966d4_13 - tar=1.32=hd4ba37b_0 - tbb=2020.3=hfd86e86_0 - tbl2asn=25.7=0 - threadpoolctl=2.1.0=pyh5ca1d4c_0 - tidyp=1.04=h516909a_2 - tk=8.6.10=h21135ba_1 - tktable=2.10=hb7b940f_3 - toposort=1.6=pyhd8ed1ab_0 - tqdm=4.55.1=pyhd8ed1ab_0 - traitlets=5.0.5=py_0 - typing-extensions=3.7.4.3=0 - typing_extensions=3.7.4.3=py_0 - tzdata=2020f=he74cb21_0 - uritemplate=3.0.1=py_0 - urllib3=1.26.2=pyhd8ed1ab_0 - wcwidth=0.2.5=pyh9f0ad1d_2 - wheel=0.36.2=pyhd3deb0d_0 - wrapt=1.12.1=py38h25fe258_2 - xmlrunner=1.7.7=py_0 - xorg-kbproto=1.0.7=h14c3975_1002 - xorg-libice=1.0.10=h516909a_0 - xorg-libsm=1.2.3=h84519dc_1000 - xorg-libx11=1.6.12=h516909a_0 - xorg-libxau=1.0.9=h14c3975_0 - xorg-libxdmcp=1.1.3=h516909a_0 - xorg-libxext=1.3.4=h516909a_0 - xorg-libxpm=3.5.13=h516909a_0 - xorg-libxrender=0.9.10=h516909a_1002 - xorg-libxt=1.1.5=h516909a_1003 - xorg-renderproto=0.11.1=h14c3975_1002 - xorg-xextproto=7.3.0=h14c3975_1002 - xorg-xproto=7.0.31=h14c3975_1007 - 
xz=5.2.5=h516909a_1 - yaml=0.2.5=h516909a_0 - yarl=1.5.1=py38h1e0a361_0 - zipp=3.4.0=py_0 - zlib=1.2.11=h516909a_1010 - zstd=1.4.8=hdf46e1d_0 ================================================ FILE: workflow/envs/metaWRAP_env.yml ================================================ name: metawrap channels: - ursky - bioconda - conda-forge - defaults dependencies: - metawrap-mg>=1.2.3 - python=2.7 - biopython - bowtie2 - bwa - checkm-genome - matplotlib - megahit - pandas - quast - r-ggplot2 - r-recommended - salmon - samtools - seaborn - spades ================================================ FILE: workflow/envs/prokkaroary_env.yml ================================================ name: prokkaroary channels: - conda-forge - bioconda - defaults dependencies: - _libgcc_mutex=0.1=conda_forge - _openmp_mutex=4.5=1_gnu - alsa-lib=1.2.3=h516909a_0 - aragorn=1.2.38=h516909a_3 - barrnap=0.9=3 - bedtools=2.29.2=hc088bd4_0 - blast=2.10.1=pl526he19e7b1_3 - bzip2=1.0.8=h7f98852_4 - c-ares=1.17.1=h36c2ea0_0 - ca-certificates=2020.12.5=ha878542_0 - cairo=1.16.0=h7979940_1007 - cd-hit=4.8.1=hdbcaa40_0 - certifi=2020.12.5=py37h89c1867_1 - clustalw=2.1=hc9558a2_5 - curl=7.71.1=he644dc0_8 - entrez-direct=13.9=pl526h375a9b1_0 - expat=2.2.9=he1b5a44_2 - fasttree=2.1.10=h516909a_4 - fontconfig=2.13.1=h736d332_1003 - freetype=2.10.4=h7ca028e_0 - fribidi=1.0.10=h36c2ea0_0 - gettext=0.19.8.1=h0b5b191_1005 - giflib=5.2.1=h36c2ea0_2 - graphite2=1.3.13=h58526e2_1001 - graphviz=2.42.3=h0511662_0 - harfbuzz=2.7.4=h5cf4720_0 - hmmer=3.3.1=he1b5a44_0 - icu=68.1=h58526e2_0 - infernal=1.1.3=h516909a_0 - jpeg=9d=h36c2ea0_0 - krb5=1.17.2=h926e7f8_0 - lcms2=2.11=hcbb858e_1 - ld_impl_linux-64=2.35.1=hea4e1c9_1 - libcurl=7.71.1=hcdd3856_8 - libdb=6.2.32=h9c3ff4c_0 - libedit=3.1.20191231=he28a2e2_2 - libev=4.33=h516909a_1 - libffi=3.3=h58526e2_2 - libgcc-ng=9.3.0=h5dbcf3e_17 - libgd=2.3.0=h47910db_1 - libglib=2.66.4=h164308a_1 - libgomp=9.3.0=h5dbcf3e_17 - libiconv=1.16=h516909a_0 - libidn11=1.34=h1cef754_0 - 
libnghttp2=1.41.0=h8cfc5f6_2 - libpng=1.6.37=h21135ba_2 - libssh2=1.9.0=hab1572f_5 - libstdcxx-ng=9.3.0=h2ae2ef3_17 - libtiff=4.2.0=hdc55705_0 - libtool=2.4.6=h58526e2_1007 - libuuid=2.32.1=h7f98852_1000 - libwebp=1.1.0=h76fa15c_4 - libwebp-base=1.1.0=h36c2ea0_3 - libxcb=1.13=h14c3975_1002 - libxml2=2.9.10=h72842e0_3 - libxslt=1.1.33=h15afd5d_2 - lz4-c=1.9.3=h9c3ff4c_0 - mafft=7.475=h516909a_0 - mcl=14.137=pl526h516909a_5 - minced=0.4.2=0 - ncurses=6.2=h58526e2_4 - openjdk=11.0.8=hacce0ff_0 - openssl=1.1.1i=h7f98852_0 - paml=4.9=h516909a_5 - pango=1.42.4=h69149e4_5 - parallel=20201122=ha770c72_0 - pcre=8.44=he1b5a44_0 - perl=5.26.2=h36c2ea0_1008 - perl-aceperl=1.92=pl526_2 - perl-algorithm-diff=1.1903=pl526_2 - perl-algorithm-munkres=0.08=pl526_1 - perl-apache-test=1.40=pl526_1 - perl-app-cpanminus=1.7044=pl526_1 - perl-appconfig=1.71=pl526_1 - perl-archive-tar=2.32=pl526_0 - perl-array-compare=3.0.1=pl526_1 - perl-array-utils=0.5=pl526_2 - perl-autoloader=5.74=pl526_2 - perl-base=2.23=pl526_1 - perl-bio-asn1-entrezgene=1.73=pl526_1 - perl-bio-coordinate=1.007001=pl526_1 - perl-bio-featureio=1.6.905=pl526_2 - perl-bio-phylo=0.58=pl526_2 - perl-bio-samtools=1.43=pl526h1341992_1 - perl-bio-tools-phylo-paml=1.7.3=pl526_1 - perl-bio-tools-run-alignment-clustalw=1.7.4=pl526_1 - perl-bio-tools-run-alignment-tcoffee=1.7.4=pl526_2 - perl-bioperl=1.7.2=pl526_11 - perl-bioperl-core=1.007002=pl526_2 - perl-bioperl-run=1.007002=pl526_4 - perl-business-isbn=3.004=pl526_0 - perl-business-isbn-data=20140910.003=pl526_0 - perl-cache-cache=1.08=pl526_0 - perl-capture-tiny=0.48=pl526_0 - perl-carp=1.38=pl526_3 - perl-cgi=4.44=pl526h14c3975_1 - perl-class-data-inheritable=0.08=pl526_1 - perl-class-inspector=1.34=pl526_0 - perl-class-load=0.25=pl526_0 - perl-class-load-xs=0.10=pl526h6bb024c_2 - perl-class-method-modifiers=2.12=pl526_0 - perl-clone=0.42=pl526h516909a_0 - perl-common-sense=3.74=pl526_2 - perl-compress-raw-bzip2=2.087=pl526he1b5a44_0 - 
perl-compress-raw-zlib=2.087=pl526hc9558a2_0 - perl-constant=1.33=pl526_1 - perl-convert-binary-c=0.78=pl526h6bb024c_3 - perl-convert-binhex=1.125=pl526_1 - perl-crypt-rc4=2.02=pl526_1 - perl-data-dumper=2.173=pl526_0 - perl-data-optlist=0.110=pl526_2 - perl-data-stag=0.14=pl526_1 - perl-date-format=2.30=pl526_2 - perl-db-file=1.855=pl526h516909a_0 - perl-dbd-sqlite=1.64=pl526h516909a_0 - perl-dbi=1.642=pl526_0 - perl-devel-globaldestruction=0.14=pl526_0 - perl-devel-overloadinfo=0.005=pl526_0 - perl-devel-stacktrace=2.04=pl526_0 - perl-digest-hmac=1.03=pl526_3 - perl-digest-md5=2.55=pl526_0 - perl-digest-md5-file=0.08=pl526_2 - perl-digest-perl-md5=1.9=pl526_1 - perl-digest-sha1=2.13=pl526h6bb024c_1 - perl-dist-checkconflicts=0.11=pl526_2 - perl-dynaloader=1.25=pl526_1 - perl-email-date-format=1.005=pl526_2 - perl-encode=2.88=pl526_1 - perl-encode-locale=1.05=pl526_6 - perl-error=0.17027=pl526_1 - perl-eval-closure=0.14=pl526h6bb024c_4 - perl-exception-class=1.44=pl526_0 - perl-exporter=5.72=pl526_1 - perl-exporter-tiny=1.002001=pl526_0 - perl-extutils-makemaker=7.36=pl526_1 - perl-file-find-rule=0.34=pl526_5 - perl-file-grep=0.02=pl526_3 - perl-file-listing=6.04=pl526_1 - perl-file-path=2.16=pl526_0 - perl-file-slurp-tiny=0.004=pl526_1 - perl-file-slurper=0.012=pl526_0 - perl-file-sort=1.01=pl526_2 - perl-file-temp=0.2304=pl526_2 - perl-file-which=1.23=pl526_0 - perl-font-afm=1.20=pl526_2 - perl-font-ttf=1.06=pl526_0 - perl-gd=2.68=pl526he941832_0 - perl-getopt-long=2.50=pl526_1 - perl-graph=0.9704=pl526_1 - perl-graph-readwrite=2.09=pl526_2 - perl-graphviz=2.24=pl526h734ff71_0 - perl-html-element-extended=1.18=pl526_1 - perl-html-entities-numbered=0.04=pl526_1 - perl-html-formatter=2.16=pl526_0 - perl-html-parser=3.72=pl526h6bb024c_5 - perl-html-tableextract=2.13=pl526_2 - perl-html-tagset=3.20=pl526_3 - perl-html-tidy=1.60=pl526_0 - perl-html-tree=5.07=pl526_1 - perl-html-treebuilder-xpath=0.14=pl526_1 - perl-http-cookies=6.04=pl526_0 - 
perl-http-daemon=6.01=pl526_1 - perl-http-date=6.02=pl526_3 - perl-http-message=6.18=pl526_0 - perl-http-negotiate=6.01=pl526_3 - perl-image-info=1.38=pl526_1 - perl-image-size=3.300=pl526_2 - perl-io-compress=2.087=pl526he1b5a44_0 - perl-io-html=1.001=pl526_2 - perl-io-sessiondata=1.03=pl526_1 - perl-io-socket-ssl=2.066=pl526_0 - perl-io-string=1.08=pl526_3 - perl-io-stringy=2.111=pl526_1 - perl-io-tty=1.12=pl526_1 - perl-io-zlib=1.10=pl526_2 - perl-ipc-run=20180523.0=pl526_0 - perl-ipc-sharelite=0.17=pl526h6bb024c_1 - perl-jcode=2.07=pl526_2 - perl-json=4.02=pl526_0 - perl-json-xs=2.34=pl526h6bb024c_3 - perl-lib=0.63=pl526_1 - perl-libwww-perl=6.39=pl526_0 - perl-libxml-perl=0.08=pl526_2 - perl-list-moreutils=0.428=pl526_1 - perl-list-moreutils-xs=0.428=pl526_0 - perl-log-log4perl=1.49=pl526_0 - perl-lwp-mediatypes=6.04=pl526_0 - perl-lwp-protocol-https=6.07=pl526_4 - perl-lwp-simple=6.15=pl526h470a237_4 - perl-mailtools=2.21=pl526_0 - perl-math-cdf=0.1=pl526h14c3975_5 - perl-math-derivative=1.01=pl526_0 - perl-math-random=0.72=pl526h14c3975_2 - perl-math-spline=0.02=pl526_2 - perl-mime-base64=3.15=pl526_1 - perl-mime-lite=3.030=pl526_1 - perl-mime-tools=5.508=pl526_1 - perl-mime-types=2.17=pl526_0 - perl-mldbm=2.05=pl526_1 - perl-module-implementation=0.09=pl526_2 - perl-module-runtime=0.016=pl526_1 - perl-module-runtime-conflicts=0.003=pl526_0 - perl-moo=2.003004=pl526_0 - perl-moose=2.2011=pl526hf484d3e_1 - perl-mozilla-ca=20180117=pl526_1 - perl-mro-compat=0.13=pl526_0 - perl-net-http=6.19=pl526_0 - perl-net-ssleay=1.88=pl526h90d6eec_0 - perl-ntlm=1.09=pl526_4 - perl-number-compare=0.03=pl526_2 - perl-ole-storage_lite=0.19=pl526_3 - perl-package-deprecationmanager=0.17=pl526_0 - perl-package-stash=0.38=pl526hf484d3e_1 - perl-package-stash-xs=0.28=pl526hf484d3e_1 - perl-params-util=1.07=pl526h6bb024c_4 - perl-parent=0.236=pl526_1 - perl-parse-recdescent=1.967015=pl526_0 - perl-parse-yapp=1.21=pl526_0 - perl-pathtools=3.75=pl526h14c3975_1 - 
perl-pdf-api2=2.035=pl526_0 - perl-perlio-utf8_strict=0.007=pl526h6bb024c_1 - perl-pod-escapes=1.07=pl526_1 - perl-pod-usage=1.69=pl526_1 - perl-postscript=0.06=pl526_2 - perl-role-tiny=2.000008=pl526_0 - perl-scalar-list-utils=1.52=pl526h516909a_0 - perl-set-scalar=1.29=pl526_2 - perl-soap-lite=1.19=pl526_1 - perl-socket=2.027=pl526_1 - perl-sort-naturally=1.03=pl526_2 - perl-spreadsheet-parseexcel=0.65=pl526_2 - perl-spreadsheet-writeexcel=2.40=pl526_2 - perl-statistics-descriptive=3.0702=pl526_0 - perl-storable=3.15=pl526h14c3975_0 - perl-sub-exporter=0.987=pl526_2 - perl-sub-exporter-progressive=0.001013=pl526_0 - perl-sub-identify=0.14=pl526h14c3975_0 - perl-sub-install=0.928=pl526_2 - perl-sub-name=0.21=pl526_1 - perl-sub-quote=2.006003=pl526_1 - perl-sub-uplevel=0.2800=pl526h14c3975_2 - perl-svg=2.84=pl526_0 - perl-svg-graph=0.02=pl526_3 - perl-task-weaken=1.06=pl526_0 - perl-template-toolkit=2.26=pl526_1 - perl-test=1.26=pl526_1 - perl-test-deep=1.128=pl526_1 - perl-test-differences=0.67=pl526_0 - perl-test-exception=0.43=pl526_2 - perl-test-harness=3.42=pl526_0 - perl-test-leaktrace=0.16=pl526h14c3975_2 - perl-test-most=0.35=pl526_0 - perl-test-requiresinternet=0.05=pl526_0 - perl-test-warn=0.36=pl526_1 - perl-text-csv=2.00=pl526_0 - perl-text-diff=1.45=pl526_0 - perl-text-glob=0.11=pl526_1 - perl-tie-ixhash=1.23=pl526_2 - perl-time-hires=1.9760=pl526h14c3975_1 - perl-time-local=1.28=pl526_1 - perl-timedate=2.30=pl526_1 - perl-tree-dag_node=1.31=pl526_0 - perl-try-tiny=0.30=pl526_1 - perl-type-tiny=1.004004=pl526_0 - perl-types-serialiser=1.0=pl526_2 - perl-unicode-map=0.112=pl526h6bb024c_3 - perl-uri=1.76=pl526_0 - perl-www-robotrules=6.02=pl526_3 - perl-xml-dom=1.46=pl526_0 - perl-xml-dom-xpath=0.14=pl526_1 - perl-xml-filter-buffertext=1.01=pl526_2 - perl-xml-libxml=2.0132=pl526h7ec2d77_1 - perl-xml-libxslt=1.94=pl526_1 - perl-xml-namespacesupport=1.12=pl526_0 - perl-xml-parser=2.44_01=pl526ha1d75be_1002 - perl-xml-regexp=0.04=pl526_2 - 
perl-xml-sax=1.02=pl526_0 - perl-xml-sax-base=1.09=pl526_0 - perl-xml-sax-expat=0.51=pl526_3 - perl-xml-sax-writer=0.57=pl526_0 - perl-xml-simple=2.25=pl526_1 - perl-xml-twig=3.52=pl526_2 - perl-xml-writer=0.625=pl526_2 - perl-xml-xpath=1.44=pl526_0 - perl-xml-xpathengine=0.14=pl526_2 - perl-xsloader=0.24=pl526_0 - perl-yaml=1.29=pl526_0 - pip=20.3.3=pyhd8ed1ab_0 - pixman=0.40.0=h36c2ea0_0 - prank=v.170427=hc9558a2_3 - prodigal=2.6.3=h516909a_2 - prokka=1.14.6=pl526_0 - pthread-stubs=0.4=h36c2ea0_1001 - python=3.7.9=hffdb5ce_0_cpython - python_abi=3.7=1_cp37m - readline=8.0=he28a2e2_2 - roary=3.13.0=pl526h516909a_0 - setuptools=49.6.0=py37he5f6b98_2 - sqlite=3.34.0=h74cdb3f_0 - t_coffee=11.0.8=py37hea885bf_8 - tbl2asn-forever=25.7.2f=h516909a_0 - tidyp=1.04=h516909a_2 - tk=8.6.10=h21135ba_1 - wheel=0.36.2=pyhd3deb0d_0 - xorg-fixesproto=5.0=h14c3975_1002 - xorg-inputproto=2.3.2=h7f98852_1002 - xorg-kbproto=1.0.7=h7f98852_1002 - xorg-libice=1.0.10=h516909a_0 - xorg-libsm=1.2.3=h84519dc_1000 - xorg-libx11=1.6.12=h516909a_0 - xorg-libxau=1.0.9=h14c3975_0 - xorg-libxdmcp=1.1.3=h516909a_0 - xorg-libxext=1.3.4=h516909a_0 - xorg-libxfixes=5.0.3=h516909a_1004 - xorg-libxi=1.7.10=h516909a_0 - xorg-libxpm=3.5.13=h516909a_0 - xorg-libxrender=0.9.10=h516909a_1002 - xorg-libxt=1.1.5=h516909a_1003 - xorg-libxtst=1.2.3=h516909a_1002 - xorg-recordproto=1.14.2=h516909a_1002 - xorg-renderproto=0.11.1=h14c3975_1002 - xorg-xextproto=7.3.0=h7f98852_1002 - xorg-xproto=7.0.31=h7f98852_1007 - xz=5.2.5=h516909a_1 - zlib=1.2.11=h516909a_1010 - zstd=1.4.8=ha95c52a_1 ================================================ FILE: workflow/metaGEM.sh ================================================ #!/bin/bash # Version VERSION="1.0.5" # Logo printLogo() { echo " ================================================================================================================================= Developed by: Francisco Zorrilla, Kiran R. 
Patil, and Aleksej Zelezniak___________________________________________________________ Publication: doi.org/10.1101/2020.12.31.424982___________________________/\\\\\\\\\\\\___/\\\\\\\\\\\\\\\___/\\\\____________/\\\\_ ________________________________________________________________________/\\\//////////___\/\\\///////////___\/\\\\\\________/\\\\\\_ ____________________________________________/\\\________________________/\\\______________\/\\\______________\/\\\//\\\____/\\\//\\\_ _______/\\\\\__/\\\\\________/\\\\\\\\____/\\\\\\\\\\\___/\\\\\\\\\_____\/\\\____/\\\\\\\__\/\\\\\\\\\\\______\/\\\\///\\\/\\\/_\/\\\_ ______/\\\///\\\\\///\\\____/\\\/////\\\__\////\\\////___\////////\\\____\/\\\___\/////\\\__\/\\\///////_______\/\\\__\///\\\/___\/\\\_ ______\/\\\_\//\\\__\/\\\___/\\\\\\\\\\\______\/\\\_________/\\\\\\\\\\___\/\\\_______\/\\\__\/\\\______________\/\\\____\///_____\/\\\_ _______\/\\\__\/\\\__\/\\\__\//\\///////_______\/\\\_/\\____/\\\/////\\\___\/\\\_______\/\\\__\/\\\______________\/\\\_____________\/\\\_ ________\/\\\__\/\\\__\/\\\___\//\\\\\\\\\\_____\//\\\\\____\//\\\\\\\\/\\__\//\\\\\\\\\\\\/___\/\\\\\\\\\\\\\\\__\/\\\_____________\/\\\_ _________\///___\///___\///_____\//////////_______\/////______\////////\//____\////////////_____\///////////////___\///______________\///__ ============================================================================================================================================= A Snakemake-based pipeline desinged to predict metabolic interactions directly from metagenomics data using high performance computer clusters =============================================================================================================================================== Version: ${VERSION} " } # Helpfile function usage() { printLogo echo -n "Usage: bash metaGEM.sh [-t|--task TASK] [-j|--nJobs NUMBER OF JOBS] [-c|--cores NUMBER OF CORES] [-m|--mem GB RAM] [-h|--hours MAX RUNTIME] [-l|--local] Snakefile wrapper/parser for 
metaGEM, for more details visit https://github.com/franciscozorrilla/metaGEM. Options: -t, --task Specify task to complete: SETUP createFolders downloadToy organizeData check CORE WORKFLOW fastp megahit crossMapSeries kallistoIndex crossMapParallel kallisto2concoct concoct metabat maxbin binRefine binReassemble extractProteinBins carveme memote organizeGEMs smetana extractDnaBins gtdbtk abundance BONUS grid prokka roary eukrep eukcc VISUALIZATION (in development) stats qfilterVis assemblyVis binningVis compositionVis modelVis interactionVis growthVis -j, --nJobs Specify number of jobs to run in parallel -c, --nCores Specify number of cores per job -m, --mem Specify memory in GB required for job -h, --hours Specify number of hours to allocated to job runtime -l, --local Run jobs on local machine for non-cluster usage " } # Run check task run_check() { #check if conda is installed/available echo -ne "Checking if conda is available ... " condatest=$(conda list|wc -l) if [[ "$condatest" -eq 0 ]]; then echo -e "WARNING: Conda is not available! Please load your cluster's conda module or install locally.\n" && exit elif [[ "$condatest" -gt 0 ]]; then condav=$(conda --version|cut -d ' ' -f2) echo -e "detected version $condav!" fi # check if conda environments are present echo -ne "Searching for metaGEM conda environment ... " envcheck1=$(conda info --envs|grep -w metagem|wc -l) if [[ "$envcheck1" -ge 1 ]]; then echo "detected! Activating metagem env ... " conda activate metagem else echo "not detected, please run the env_setup.sh script!" fi echo -ne "Searching for metaWRAP conda environment ... " envcheck2=$(conda info --envs|grep -w metawrap|wc -l) if [[ "$envcheck2" -ge 1 ]]; then echo "detected!" else echo "not detected, please run the env_setup.sh script!" fi echo -ne "Searching for prokka-roary conda environment ... 
" envcheck3=$(conda info --envs|grep -w prokkaroary|wc -l) if [[ "$envcheck3" -ge 1 ]]; then echo -e "detected!\n" else echo -e "not detected, please run the env_setup.sh script!\n" fi # run createFolders rule to create folders in case any of them are missing echo -e "Checking folders in workspace $pwd ... " nFolders=$(ls -d */|wc -l) if [[ "$nFolders" -le 20 ]]; then while true; do read -p "Some folders appear to be missing, do you wish to run the createFolders Snakefile rule? (y/n)" yn case $yn in [Yy]* ) echo "Running the createFolders snakefile rule ... " && snakemake createFolders -j1; break;; [Nn]* ) echo "Skipping folder creation ... "; break;; * ) echo "Please answer yes or no.";; esac done fi # search for folders and files with .gz extension within dataset folder count_files=$(find dataset -name "*.gz"|wc -l) count_samp=$(ls dataset|grep -v gz|wc -l) if [[ "$count_files" -eq 0 ]]; then echo -e "\nThere are no sequencing files (*.gz) in the dataset folder!" echo -e "Please download or move your paired end files into sample specific subfolders within the dataset folder.\n" while true; do read -p "Do you wish to download a 3 sample dataset using the downloadToy Snakefile rule? ~1.8 GB of storage required (y/n)" yn case $yn in [Yy]* ) echo "Running the downloadToy snakefile rule ... " && snakemake downloadToy -j1; break;; [Nn]* ) echo "Skipping toy dataset download ... "; break;; * ) echo "Please answer yes or no.";; esac done elif [[ "$count_samp" -eq 0 && "$count_files" -ne 0 ]]; then echo -e "\nDetected $count_files unorganized files (*.gz) in dataset folder, running organizeData rule ... " while true; do read -p "Do you wish to organize your samples using the organizeData Snakefile rule? (y/n)" yn case $yn in [Yy]* ) echo "Running the organizeData snakefile rule ... " && snakemake organizeData -j1; break;; [Nn]* ) echo "Skipping toy dataset download ... 
"; break;; * ) echo "Please answer yes or no.";; esac done elif [[ "$count_samp" -ne 0 && "$count_files" -ne 0 ]]; then echo -e "\nFiles appear to be organized into sample specific subdirectories within the dataset folder." echo -e "\nPrinting sample IDs for user verification: " ls dataset|grep -v gz echo "" fi # scratch dir echo -e "\nPlease remember to set the scratch/ path in the config.yaml file" echo 'Ideally this path should be set to a job-specific variable that points to a location on your cluster for high I/O operations (e.g. $SCRATCH or $TMPDIR)' echo "However, it can also be a static directory and metaGEM will create job specific subdirectories automatically." } # Run stats task run_stats() { echo -e "Checking status of current metaGEM analysis ... \n" #dataset: count subfolders to determine total number of samples nsamp=$(ls -d dataset/*/|wc -l) echo "Raw data: $nsamp samples were identified in the dataset folder ... " #qfilter: count .json report files nqfilt=$(find qfiltered -name "*.json"|wc -l) echo "Quality filtering: $nqfilt / $nsamp samples processed ... " #assembly: count .gz fasta files nassm=$(find assemblies -name "*.gz"|wc -l) echo "Assembly: $nassm / $nsamp samples processed ... " #concoct: count *concoct-bins subfolders nconc=$(find concoct -name "*.concoct-bins"|wc -l) echo "Binning (CONCOCT): $nconc / $nsamp samples processed ... " #maxbin2: count *maxbin-bins subfolders nmaxb=$(find maxbin -name "*.maxbin-bins"|wc -l) echo "Binning (MaxBin2): $nmaxb / $nsamp samples processed ... " #metabat2: count *metabat-bins subfolders nmetab=$(find metabat -name "*.metabat-bins"|wc -l) echo "Binning (MetaBAT2): $nmetab / $nsamp samples processed ... " #metawrap_refine: count subfolders nmwref=$(ls -d refined_bins/*|wc -l) echo "Bin refinement: $nmwref / $nsamp samples processed ... 
" #metawrap_reassemble: count subfolders, also determine total number of final MAGs across samples nmwrea=$(ls -d reassembled_bins/*|wc -l) echo "Bin reassembly: $nmwrea / $nsamp samples processed ... " #taxonomy: count subfolders ntax=$(ls -d GTDBTk/*|wc -l) echo "Taxonomy: $ntax / $nsamp samples processed ... " #abundances: count subfolders nabund=$(ls -d abundance/*|wc -l) echo "Abundance: $nabund / $nsamp samples processed ... " #models: count subfolders for sample progress and count .xml GEM files for total models generated ngems=$(find GEMs -name "*xml"|wc -l) ngemsamp=$(ls -d GEMs/*|wc -l) echo "GEMs: $ngems models generated from $ngemsamp samples ... " #model reports: count subfolders nmemo=$(find memote -name "*.gz"|wc -l) echo "GEM Reports: $nmemo / $ngems models samples ... " #simulations: count .tsv files nsmet=$(find memote -name "*.gz"|wc -l) echo -e "GEM Reports: $nsmet / $ngemsamp communities simulated ... \n" } # Prompt user to confirm input parameters/options checkParams() { echo " " while true; do read -p "Do you wish to continue with these parameters? (y/n)" yn case $yn in [Yy]* ) echo "Proceeding with $task job(s) ... " ; break;; [Nn]* ) exit;; * ) echo "Please answer yes or no.";; esac done } # Display config.yaml function for user inspection snakeConfig() { # Show config.yaml params echo -e "\nPlease verify parameters set in the config.yaml file: \n" paste ../config/config.yaml echo -e "\nPlease pay close attention to make sure that your paths are properly configured!" while true; do read -p "Do you wish to proceed with this config.yaml file? 
(y/n)" yn case $yn in [Yy]* ) echo " "; break;; [Nn]* ) exit;; * ) echo "Please answer yes or no.";; esac done } # Display cluster_config.json function for user inspection clusterConfig() { # Show cluster_config.json params echo -e "Please verify parameters set in the cluster_config.json file: \n" paste ../config/cluster_config.json echo " " while true; do read -p "Do you wish to proceed with this cluster_config.json file? (y/n)" yn case $yn in [Yy]* ) echo " "; break;; [Nn]* ) exit;; * ) echo "Please answer yes or no.";; esac done } # Prepare to submit cluster jobs function: display config files, unlock, and dry run snakePrep() { snakeConfig clusterConfig echo "Unlocking snakemake ... " snakemake --unlock -j 1 echo -e "\nDry-running snakemake jobs ... " snakemake all -j $njobs -n -k --cluster-config ../config/cluster_config.json -c "sbatch -A {cluster.account} -t {cluster.time} -n {cluster.n} --ntasks {cluster.tasks} --cpus-per-task {cluster.n} --output {cluster.output}" } # Submit login node function, note that is only works for rules with no wildcard expansion submitLogin() { echo "No need to parse Snakefile for target rule: $task ... " checkParams snakeConfig echo "Unlocking snakemake ... " snakemake --unlock -j 1 echo " " while true; do read -p "Do you wish to submit this $task job? (y/n)" yn case $yn in [Yy]* ) snakemake $task -j 1; break;; [Nn]* ) exit;; * ) echo "Please answer yes or no.";; esac done } # Submit local function, similar to submitLogin() but can handle wildcard expanded rules for non-cluster usage submitLocal() { # Parse Snakefile rule all (line 22 of Snakefile) input to match output of desired target rule stored in "$string". Note: Hardcoded line number. echo "Parsing Snakefile to target rule: $task ... " sed -i "22s~^.*$~ $string~" Snakefile checkParams snakeConfig echo "Unlocking snakemake ... " snakemake --unlock -j 1 echo -e "\nDry-running snakemake jobs ... 
"
    snakemake all -n
    while true; do
        read -p "Do you wish to submit this batch of jobs on your local machine? (y/n)" yn
        case $yn in
            [Yy]* ) echo "snakemake all -j $njobs -k"|bash; break;;
            [Nn]* ) exit;;
            * ) echo "Please answer yes or no.";;
        esac
    done
}

# Submit cluster function
submitCluster() {
    # Parse Snakefile rule all (line 22 of Snakefile) input to match output of desired target rule stored in "$string". Note: Hardcoded line number.
    echo "Parsing Snakefile to target rule: $task ... "
    sed -i "22s~^.*$~ $string~" Snakefile

    # Check if the number of jobs flag is specified by user for cluster job
    if [[ -z "$njobs" ]]; then
        # No number of jobs provided.
        echo "WARNING: User is requesting to submit cluster job without specifying the number of jobs parameter (-j) ... "
    else
        # Number of jobs provided.
        echo "Number of jobs to be sumitted to cluster: $njobs ... "
    fi

    # Check if the number of cores flag is specified by user for cluster job
    if [[ -z "$ncores" ]]; then
        # No number of cores provided.
        echo "WARNING: User is requesting to submit cluster job without specifying the number of cores parameter (-c) ... "
    else
        # Parse cluster_config.json cores (line 5) to match number requested cores stored in "$ncores". Note: Hardcoded line number.
        echo "Parsing cluster_config.json to match requested number of cores: $ncores ... "
        sed -i "5s/:.*$/: $ncores,/" ../config/cluster_config.json
    fi

    # Check if the hours flag is specified by user for cluster job
    # NOTE(review): -h is used for hours here, so there is no conventional help flag.
    if [[ -z "$hours" ]]; then
        # No number of hours provided.
        echo "WARNING: User is requesting to submit cluster job without specifying the number of hours parameter (-h) ... "
    else
        # Parse cluster_config.json time (line 4) to match number requested hours stored in "$hours". Note: Hardcoded line number.
        echo "Parsing cluster_config.json to match requested time (hours): $hours ... "
        sed -i "4s/:.*$/: \"0-$hours:00:00\",/" ../config/cluster_config.json
    fi

    # Check if memory input argument was provided by user. If so, parse cluster_config.json memory (line 7) to match requested memory stored in "$mem". Note: Hardcoded line number.
    if [[ -z "$mem" ]]; then
        # No memory flag provided.
        echo "WARNING: User is requesting to submit cluster job without specifying the memory flag (-m) ... "
        checkParams
        snakePrep
        while true; do
            read -p "Do you wish to submit this batch of $task jobs? (y/n)" yn
            case $yn in
                [Yy]* ) echo "nohup snakemake all -j $njobs -k --cluster-config ../config/cluster_config.json -c 'sbatch -A {cluster.account} -t {cluster.time} -n {cluster.n} --ntasks {cluster.tasks} --cpus-per-task {cluster.n} --output {cluster.output}' &"|bash; break;;
                [Nn]* ) exit;;
                * ) echo "Please answer yes or no.";;
            esac
        done
    else
        # Memory flag was provided, parse cluster_config.json memory (line 7) to match number requested memory stored in "$mem". Note: Hardcoded line number.
        echo "Parsing cluster_config.json to match requested memory: $mem ... "
        sed -i "7s/:.*$/: $(echo $mem)G,/" ../config/cluster_config.json
        checkParams
        snakePrep
        while true; do
            read -p "Do you wish to submit this batch of jobs? (y/n)" yn
            case $yn in
                [Yy]* ) echo "nohup snakemake all -j $njobs -k --cluster-config ../config/cluster_config.json -c 'sbatch -A {cluster.account} -t {cluster.time} --mem {cluster.mem} -n {cluster.n} --ntasks {cluster.tasks} --cpus-per-task {cluster.n} --output {cluster.output}' &"|bash; break;;
                [Nn]* ) exit;;
                * ) echo "Please answer yes or no.";;
            esac
        done
    fi
}

# Parse function: maps the requested -t task to a snakemake target expansion and
# dispatches to submitLogin / submitLocal / submitCluster.
parse() {

    printLogo

    # Set root folder
    echo -e "Setting current directory to root in config.yaml file ... 
\n"
    root=$(pwd)
    sed -i "2s~/.*$~$root~" config.yaml # hardcoded line for root, change the number 2 if any new lines are added to the start of config.yaml

    # No need to parse snakefile for login node jobs, submit the following locally
    if [ $task == "createFolders" ] || [ $task == "downloadToy" ] || [ $task == "organizeData" ] || [ $task == "qfilterVis" ] || [ $task == "assemblyVis" ] || [ $task == "binningVis" ] || [ $task == "compositionVis" ] || [ $task == "abundanceVis" ] || [ $task == "extractProteinBins" ] || [ $task == "extractDnaBins" ] || [ $task == "organizeGEMs" ] || [ $task == "modelVis" ] || [ $task == "interactionVis" ] || [ $task == "growthVis" ] || [ $task == "binning" ] || [ $task == "binEvaluation" ] || [ $task == "prepareRoary" ]; then
        submitLogin

    elif [ $task == "check" ]; then
        run_check

    elif [ $task == "stats" ]; then
        run_stats

    # Parse snakefile for cluster/local jobs
    elif [ $task == "fastp" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["qfiltered"]+"/{IDs}/{IDs}_R1.fastq.gz", IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    elif [ $task == "megahit" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["assemblies"]+"/{IDs}/contigs.fasta.gz", IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    elif [ $task == "crossMapSeries" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["concoct"]+"/{IDs}/cov", IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    elif [ $task == "kallistoIndex" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["kallistoIndex"]+"/{focal}/index.kaix", focal = focal)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    # FIX: removed a second, byte-identical "kallistoIndex" elif branch that sat
    # here — in an if/elif chain only the first match runs, so it was dead code.

    elif [ $task == "crossMapParallel" ]; then
string='expand(config["path"]["root"]+"/"+config["folder"]["kallisto"]+"/{focal}/{IDs}", focal = focal , IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    elif [ $task == "run_prodigal" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["prodigal"]+"/{IDs}/{IDs}_genes.gff", IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    elif [ $task == "run_blastp" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["blastp"]+"/{IDs}.xml", IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    elif [ $task == "concoct" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["concoct"]+"/{IDs}/{IDs}.concoct-bins", IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    elif [ $task == "metabat" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["metabat"]+"/{IDs}/{IDs}.metabat-bins", IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    elif [ $task == "maxbin" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["maxbin"]+"/{IDs}/{IDs}.maxbin-bins", IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    # NOTE(review): this branch is unreachable — "binning" is already caught by the
    # submitLogin task list at the top of parse(), so it never reaches this elif.
    # Confirm which behavior (login submission vs cluster expansion) is intended.
    elif [ $task == "binning" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["benchmarks"]+"/{IDs}/{IDs}.binning.benchmark.txt", IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    elif [ $task == "binRefine" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["refined"]+"/{IDs}", IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    elif [ $task == "binReassemble" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["reassembled"]+"/{IDs}", IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
        else
            submitCluster
        fi

    elif [ $task == "gtdbtk" ]; then
        string='expand(config["path"]["root"]+"/"+config["folder"]["classification"]+"/{IDs}", IDs = IDs)'
        if [ $local == "true" ]; then
            submitLocal
else submitCluster fi elif [ $task == "abundance" ]; then string='expand(config["path"]["root"]+"/"+config["folder"]["abundance"]+"/{IDs}", IDs = IDs)' if [ $local == "true" ]; then submitLocal else submitCluster fi elif [ $task == "carveme" ]; then string='expand(config["path"]["root"]+"/"+config["folder"]["GEMs"]+"/{binIDs}.xml", binIDs = binIDs)' if [ $local == "true" ]; then submitLocal else submitCluster fi elif [ $task == "smetana" ]; then string='expand(config["path"]["root"]+"/"+config["folder"]["SMETANA"]+"/{IDs}_detailed.tsv", IDs = IDs)' if [ $local == "true" ]; then submitLocal else submitCluster fi elif [ $task == "memote" ]; then string='expand(config["path"]["root"]+"/"+config["folder"]["memote"]+"/{gemIDs}", gemIDs = gemIDs)' if [ $local == "true" ]; then submitLocal else submitCluster fi elif [ $task == "grid" ]; then string='expand(config["path"]["root"]+"/"+config["folder"]["GRiD"]+"/{IDs}", IDs = IDs)' if [ $local == "true" ]; then submitLocal else submitCluster fi elif [ $task == "prokka" ]; then string='expand(config["path"]["root"]+"/"+config["folder"]["pangenome"]+"/prokka/unorganized/{binIDs}", binIDs = binIDs)' if [ $local == "true" ]; then submitLocal else submitCluster fi elif [ $task == "roary" ]; then string='expand(config["path"]["root"]+"/"+config["folder"]["pangenome"]+"/roary/{speciesIDs}/", speciesIDs = speciesIDs)' if [ $local == "true" ]; then submitLocal else submitCluster fi else echo "Task not recognized." usage fi } # Read input arguments if [ $# -eq 0 ]; then echo "No arguments provided ... " usage else local=false; # Read in options while [[ $1 = -?* ]]; do case $1 in -t|--task) shift; task=${1} ;; -j|--nJobs) shift; njobs=${1} ;; -c|--nCores) shift; ncores=${1} ;; -m|--mem) shift; mem=${1} ;; -h|--hours) shift; hours=${1} ;; -l|--local) shift; local=true;; --endopts) shift; break ;; * ) echo "Unknown option(s) provided, please read helpfile ... 
" && usage && exit 1;;
        esac
        shift
    done
    # All options consumed: dispatch on the requested task.
    parse
fi

================================================ FILE: workflow/rules/Snakefile_experimental.smk.py ================================================

# Alternative assembler: metaSPAdes instead of megahit, per-sample.
rule metaspades:
    input:
        R1 = rules.qfilter.output.R1,
        R2 = rules.qfilter.output.R2
    output:
        config["path"]["root"]+"/"+config["folder"]["assemblies"]+"/{IDs}/contigs.fasta.gz"
    benchmark:
        config["path"]["root"]+"/"+"benchmarks/{IDs}.metaspades.benchmark.txt"
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u;
        cp {input.R1} {input.R2} $TMPDIR
        cd $TMPDIR
        metaspades.py --only-assembler -1 $(basename {input.R1}) -2 $(basename {input.R2}) -t {config[cores][metaspades]} -o .
        gzip contigs.fasta
        mkdir -p $(dirname {output})
        rm $(basename {input.R1}) $(basename {input.R2})
        mv -v contigs.fasta.gz spades.log $(dirname {output})
        """

# Co-assembly of pooled reads with megahit.
# NOTE(review): input paths are hardcoded to a specific scratch area (experimental rule).
rule megahitCoassembly:
    input:
        R1 = f'/scratch/zorrilla/soil/coassembly/data/{{borkSoil}}/R1',
        R2 = f'/scratch/zorrilla/soil/coassembly/data/{{borkSoil}}/R2'
    output:
        f'{config["path"]["root"]}/coassembly/coassemblies/{{borkSoil}}/contigs.fasta.gz'
    benchmark:
        f'{config["path"]["root"]}/benchmarks/coassembly.{{borkSoil}}.benchmark.txt'
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u;
        cd $SCRATCHDIR
        echo -n "Copying qfiltered reads to $SCRATCHDIR ... "
        cp -r {input.R1} {input.R2} $SCRATCHDIR
        echo "done. "
        R1=$(ls R1/|tr '\n' ','|sed 's/,$//g')
        R2=$(ls R2/|tr '\n' ','|sed 's/,$//g')
        mv R1/* .
        mv R2/* .
        echo -n "Running megahit ... "
        megahit -t {config[cores][megahit]} \
            --presets {config[params][assemblyPreset]} \
            --min-contig-len {config[params][assemblyMin]}\
            --verbose \
            -1 $R1 \
            -2 $R2 \
            -o tmp;
        echo "done. "
        echo "Renaming assembly ... "
        mv tmp/final.contigs.fa contigs.fasta
        echo "Fixing contig header names: replacing spaces with hyphens ... "
        sed -i 's/ /-/g' contigs.fasta
        echo "Zipping and moving assembly ... "
        gzip contigs.fasta
        mkdir -p $(dirname {output})
        mv contigs.fasta.gz $(dirname {output})
        echo "Done. 
" """

# Multi-sample metabat2 binning: map every sample's reads against one assembly,
# then run metabat with all sorted BAMs.
rule metabatMultiSample:
    input:
        assembly=config["path"]["root"]+"/"+config["folder"]["assemblies"]+"/{IDs}/contigs.fasta.gz",
        reads=config["path"]["root"]+"/"+config["folder"]["qfiltered"]
    output:
        directory(config["path"]["root"]+"/"+config["folder"]["metabat"]+"/{IDs}/{IDs}.metabat-bins")
    benchmark:
        config["path"]["root"]+"/"+"benchmarks/{IDs}.metabat.benchmark.txt"
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u;
        mkdir -p $(dirname $(dirname {output}))
        mkdir -p $(dirname {output})
        cp {input.assembly} $TMPDIR
        cd $TMPDIR
        mv $(basename {input.assembly}) $(basename $(dirname {input.assembly})).gz
        gunzip $(basename $(dirname {input.assembly})).gz
        bwa index $(basename $(dirname {input.assembly}))
        for sample in {input.reads}/*;do
            echo "Mapping sample $sample ... "
            ID=$(basename $sample);
            bwa mem -t {config[cores][metabat]} $(basename $(dirname {input.assembly})) $sample/*_1.fastq.gz $sample/*_2.fastq.gz > $ID.sam
            samtools view -@ {config[cores][metabat]} -Sb $ID.sam > $ID.bam
            # FIX: use modern samtools sort syntax (-o out.bam); the legacy
            # "samtools sort in.bam out_prefix" form was removed in samtools 1.x.
            samtools sort -@ {config[cores][metabat]} -o $ID.sort.bam $ID.bam
            rm $ID.bam $ID.sam
            echo "Done mapping sample $sample !"
            echo "Creating depth file for sample $sample ... "
            jgi_summarize_bam_contig_depths --outputDepth depth.txt $ID.sort.bam
            echo "Done creating depth file for sample $sample !"
            # FIX: keep $ID.sort.bam — runMetaBat.sh below needs all *.sort.bam
            # files, but the original deleted each one inside the loop. Also
            # removed the stray "paste $sample.depth.txt": that file never
            # exists, which aborts the rule under snakemake's strict shell mode.
        done
        runMetaBat.sh $(basename $(dirname {input.assembly})) *.sort.bam
        mv *.txt *.tab $(basename {output}) $(dirname {output})
        """

rule crossMap2:
    input:
        contigs = f'{config["path"]["root"]}/{config["folder"]["assemblies"]}/{{focal}}/contigs.fasta.gz',
        R1 = rules.qfilter.output.R1,
        R2 = rules.qfilter.output.R2
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["crossMap"]}/{{focal}}/{{IDs}}')
    benchmark:
        f'{config["path"]["root"]}/benchmarks/{{focal}}.{{IDs}}.crossMap.benchmark.txt'
    message:
        """
        This rule is an alternative implementation of the rule crossMap. 
Instead of taking each focal sample as a job and cross mapping in series using a for loop,
        here the cross mapping is done completely in parallel.
        This implementation is not recommended, as it wastefully recreates a bwa index for each mapping operation.
        Use crossMap for smaller datasets or crossMap3 for larger datasets.
        """
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u;
        cd $SCRATCHDIR
        echo -e "\nCopying assembly {input.contigs} and reads {input.R1} {input.R2} to $SCRATCHDIR"
        cp {input.contigs} {input.R1} {input.R2} .
        mkdir -p {output}

        # Define the focal sample ID, fsample:
        # The one sample's assembly that reads will be mapped against
        fsampleID=$(echo $(basename $(dirname {input.contigs})))
        echo -e "\nFocal sample: $fsampleID ... "

        echo "Renaming and unzipping assembly ... "
        mv $(basename {input.contigs}) $(echo $fsampleID|sed 's/$/.fa.gz/g')
        gunzip $(echo $fsampleID|sed 's/$/.fa.gz/g')

        echo -e "\nIndexing assembly ... "
        bwa index $fsampleID.fa

        id=$(basename {output})
        echo -e "\nMapping reads from sample $id against assembly of focal sample $fsampleID ..."
        bwa mem -t {config[cores][crossMap]} $fsampleID.fa *.fastq.gz > $id.sam

        echo -e "\nDeleting no-longer-needed fastq files ... "
        rm *.gz

        echo -e "\nConverting SAM to BAM with samtools view ... "
        samtools view -@ {config[cores][crossMap]} -Sb $id.sam > $id.bam

        echo -e "\nDeleting no-longer-needed sam file ... "
        rm $id.sam

        echo -e "\nSorting BAM file with samtools sort ... "
        samtools sort -@ {config[cores][crossMap]} -o $id.sort $id.bam

        echo -e "\nDeleting no-longer-needed bam file ... "
        rm $id.bam

        echo -e "\nIndexing sorted BAM file with samtools index for CONCOCT input table generation ... "
        samtools index $id.sort

        echo -e "\nCutting up assembly contigs >= 20kbp into 10kbp chunks and creating bedfile ... "
        cut_up_fasta.py $fsampleID.fa -c 10000 -o 0 --merge_last -b contigs_10K.bed > contigs_10K.fa

        echo -e "\nGenerating CONCOCT individual/intermediate coverage table ... "
        concoct_coverage_table.py contigs_10K.bed *.sort > ${{fsampleID}}_${{id}}_individual.tsv

        echo -e "\nCompressing CONCOCT coverage table ... "
        gzip ${{fsampleID}}_${{id}}_individual.tsv

        echo -e "\nRunning jgi_summarize_bam_contig_depths script to generate contig abundance/depth file for maxbin2 input ... "
        jgi_summarize_bam_contig_depths --outputDepth ${{fsampleID}}_${{id}}_individual.depth $id.sort

        echo -e "\nCompressing maxbin2/metabat2 depth file ... "
        gzip ${{fsampleID}}_${{id}}_individual.depth

        echo -e "\nMoving relevant files to {output}"
        mv *.gz {output}
        """

# Placeholder aggregation target for crossMap2 outputs (no real work yet).
rule gatherCrossMap2:
    input:
        expand(f'{config["path"]["root"]}/{config["folder"]["crossMap"]}/{{focal}}/{{IDs}}', focal = focal , IDs = IDs)
    shell:
        """
        echo idk
        """

# Multi-sample maxbin2 binning using kallisto pseudo-alignment for coverage.
# NOTE(review): the kallisto index build is commented out and a pre-built index is
# copied from a hardcoded path (experimental rule).
rule maxbinMultiSample:
    input:
        assembly=config["path"]["root"]+"/"+config["folder"]["assemblies"]+"/{IDs}/contigs.fasta.gz",
        reads=config["path"]["root"]+"/"+config["folder"]["qfiltered"]
    output:
        directory(config["path"]["root"]+"/"+config["folder"]["maxbin"]+"/{IDs}/{IDs}.maxbin-bins")
    benchmark:
        config["path"]["root"]+"/"+"benchmarks/{IDs}.maxbin.benchmark.txt"
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u;
        mkdir -p $(dirname $(dirname {output}))
        mkdir -p $(dirname {output})
        cp {input.assembly} $TMPDIR
        cd $TMPDIR
        focal=$(basename $(dirname {input.assembly}))
        gunzip contigs.fasta.gz
        echo "Creating kallisto index for focal sample $focal ... "
        #kallisto index contigs.fasta -i $focal.kaix
        cp /home/zorrilla/workspace/straya/test/*.kaix .
        echo "Done creating kallisto index!"
        echo "Begin cross mapping samples ... "
        for folder in {input.reads}/*;do
            var=$(basename $folder)
            echo "Mapping sample $var to focal sample $focal using kallisto quant ... "
            kallisto quant --threads {config[cores][kallisto]} --plaintext -i $focal.kaix -o . $folder/*_1.fastq.gz $folder/*_2.fastq.gz;
            #tail -n +2 abundance.tsv > $(basename $folder)_abundance.tsv
            #rm abundance.tsv
            echo "Done mapping sample $var to focal sample!"
done echo "Done cross mapping all samples! " find . -name "*_abundance.tsv" > abund_list.txt echo "Begin running maxbin2 algorithm ... " run_MaxBin.pl -contig contigs.fasta -out $focal -abund_list abund_list.txt -thread {config[cores][maxbin]} echo "Done running maxbin2!" rm contigs.fasta mkdir $(basename {output}) mv *.fasta $(basename {output}) mv $(basename {output}) $(dirname {output}) """ rule mOTUs2classifyGenomes: input: bins = f'{config["path"]["root"]}/{config["folder"]["reassembled"]}/{{IDs}}/reassembled_bins', script = f'{config["path"]["root"]}/{config["folder"]["scripts"]}/classify-genomes' output: #directory(f'{config["path"]["root"]}/{config["folder"]["classification"]}/{{IDs}}') benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.classify-genomes.benchmark.txt' shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; mkdir -p {output} cd $SCRATCHDIR cp -r {input.script}/* {input.bins}/* . echo "Begin classifying bins ... " for bin in *.fa; do echo -e "\nClassifying $bin ... " $PWD/classify-genomes $bin -t {config[cores][classify]} -o $(echo $bin|sed 's/.fa/.taxonomy/') cp *.taxonomy {output} rm *.taxonomy rm $bin done echo "Done classifying bins. " """ rule taxonomyVis: input: f'{config["path"]["root"]}/{config["folder"]["classification"]}' output: text = f'{config["path"]["root"]}/{config["folder"]["stats"]}/classification.stats', plot = f'{config["path"]["root"]}/{config["folder"]["stats"]}/taxonomyVis.pdf' message: """ mOTUs2 taxonomy visualization. Generate bar plot with most common taxa (n>15) and density plots with mapping statistics. """ shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; cd {input} echo -e "\nBegin reading classification result files ... 
\n"
        for folder in */;do
            for file in $folder*.taxonomy;do

                # Define sample ID to append to start of each bin name in summary file
                sample=$(echo $folder|sed 's|/||')

                # Define bin name with sample ID, shorten metaWRAP naming scheme (orig/permissive/strict)
                fasta=$(echo $file | sed 's|^.*/||' | sed 's/.taxonomy//g' | sed 's/orig/o/g' | sed 's/permissive/p/g' | sed 's/strict/s/g' | sed "s/^/$sample./g");

                # Extract NCBI ID
                NCBI=$(less $file | grep NCBI | cut -d ' ' -f4);

                # Extract consensus taxonomy
                tax=$(less $file | grep tax | sed 's/Consensus taxonomy: //g');

                # Extract consensus motus
                motu=$(less $file | grep mOTUs | sed 's/Consensus mOTUs: //g');

                # Extract number of detected genes
                detect=$(less $file | grep detected | sed 's/Number of detected genes: //g');

                # Extract percentage of agreeing genes
                percent=$(less $file | grep agreeing | sed 's/Percentage of agreeing genes: //g' | sed 's/%//g');

                # Extract number of mapped genes
                map=$(less $file | grep mapped | sed 's/Number of mapped genes: //g');

                # Extract COG IDs, need to use set +e;...;set -e to avoid erroring out when reading .taxonomy result file for bin with no taxonomic annotation
                set +e
                cog=$(less $file | grep COG | cut -d$'\t' -f1 | tr '\n' ',' | sed 's/,$//g');
                set -e

                # Display and store extracted results
                echo -e "$fasta \t $NCBI \t $tax \t $motu \t $detect \t $map \t $percent \t $cog"
                echo -e "$fasta \t $NCBI \t $tax \t $motu \t $detect \t $map \t $percent \t $cog" >> classification.stats;
            done;
        done

        echo -e "\nDone generating classification.stats summary file, moving to stats/ directory and running taxonomyVis.R script ... "
        mv classification.stats {config[path][root]}/{config[folder][stats]}
        cd {config[path][root]}/{config[folder][stats]}
        Rscript {config[path][root]}/{config[folder][scripts]}/{config[scripts][taxonomyVis]}
        rm Rplots.pdf # Delete redundant pdf file
        echo "Done. "
        """

# Join per-MAG taxonomy with per-sample abundances into one table.
rule parseTaxAb:
    input:
        taxonomy = rules.taxonomyVis.output.text ,
        abundance = f'{config["path"]["root"]}/{config["folder"]["abundance"]}'
    output:
        directory(f'{config["path"]["root"]}/MAG.table')
    message:
        """
        Parses an abundance table with MAG taxonomy for rows and samples for columns.
        Note: parseTaxAb should only be run after the classifyGenomes, taxonomyVis, and abundance rules.
        """
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u
        cd {input.abundance}

        for folder in */;do

            # Define sample ID
            sample=$(echo $folder|sed 's|/||g')

            # Same as in taxonomyVis rule, modify bin names by adding sample ID and shortening metaWRAP naming scheme (orig/permissive/strict)
            paste $sample/$sample.abund | sed 's/orig/o/g' | sed 's/permissive/p/g' | sed 's/strict/s/g' | sed "s/^/$sample./g" >> abundance.stats

        done

        mv abundance.stats {config[path][root]}/{config[folder][stats]}
        cd {config[path][root]}/{config[folder][stats]}
        """

rule prepareRoary:
    input:
        taxonomy = rules.GTDBtkVis.output.text,
        binning = rules.binningVis.output.text,
        script = f'{config["path"]["root"]}/{config["folder"]["scripts"]}/{config["scripts"]["prepRoary"]}'
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["pangenome"]}/speciesBinIDs')
    benchmark:
        f'{config["path"]["root"]}/benchmarks/prepareRoary.benchmark.txt'
    message:
        """
        This rule matches the results from classifyGenomes->taxonomyVis with the completeness & contamination CheckM results
        from the metaWRAP reassembly->binningVis results, identifies speceies represented by at least 10 high quality MAGs
        (completeness >= 90 & contamination <= 10), and outputs text files with bin IDs for each such species.
        Also organizes the prokka output folders based on taxonomy.
        Note: Do not run this before finishing all prokka jobs! 
"""
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u
        cd $(dirname {input.taxonomy})

        echo -e "\nCreating speciesBinIDs folder containing.txt files with binIDs for each species that is represented by at least 10 high quality MAGs (completeness >= 90 & contamination <= 10) ... "
        Rscript {input.script}

        nSpecies=$(ls $(basename {output})|wc -l)
        nSpeciesTot=$(cat $(basename {output})/*|wc -l)
        nMAGsTot=$(paste {input.binning}|wc -l)
        echo -e "\nIdentified $nSpecies species represented by at least 10 high quality MAGs, totaling $nSpeciesTot MAGs out of $nMAGsTot total MAGs generated ... "

        echo -e "\nMoving speciesBinIDs folder to pangenome directory: $(dirname {output})"
        mv $(basename {output}) $(dirname {output})

        echo -e "\nOrganizing prokka folder according to taxonomy ... "
        echo -e "\nGFF files of identified species with at least 10 HQ MAGs will be copied to prokka/organzied/speciesSubfolder for roary input ... "
        cd $(dirname {output})
        mkdir -p prokka/organized

        for species in speciesBinIDs/*.txt;do
            speciesID=$(echo $(basename $species)|sed 's/.txt//g');
            echo -e "\nCreating folder and organizing prokka output for species $speciesID ... "
            mkdir -p prokka/organized/$speciesID
            while read line;do
                binID=$(echo $line|sed 's/.bin/_bin/g')
                echo "Copying GFF prokka output of bin $binID"
                cp prokka/unorganized/$binID/*.gff prokka/organized/$speciesID/
            done< $species
        done
        echo -e "\nDone"
        """

# NOTE(review): near-duplicate of prepareRoary driven by mOTUs2 taxonomy instead of
# GTDBtk; it declares the same output and benchmark paths as prepareRoary, so the
# two rules conflict if both are enabled — confirm which one should be active.
rule prepareRoaryMOTUS2:
    input:
        taxonomy = rules.taxonomyVis.output.text,
        binning = rules.binningVis.output.text,
        script = f'{config["path"]["root"]}/{config["folder"]["scripts"]}/{config["scripts"]["prepRoary"]}'
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["pangenome"]}/speciesBinIDs')
    benchmark:
        f'{config["path"]["root"]}/benchmarks/prepareRoary.benchmark.txt'
    message:
        """
        This rule matches the results from classifyGenomes->taxonomyVis with the completeness & contamination CheckM results
        from the metaWRAP reassembly->binningVis results, identifies speceies represented by at least 10 high quality MAGs
        (completeness >= 90 & contamination <= 10), and outputs text files with bin IDs for each such species.
        Also organizes the prokka output folders based on taxonomy.
        Note: Do not run this before finishing all prokka jobs!
        """
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u
        cd $(dirname {input.taxonomy})

        echo -e "\nCreating speciesBinIDs folder containing.txt files with binIDs for each species that is represented by at least 10 high quality MAGs (completeness >= 90 & contamination <= 10) ... "
        Rscript {input.script}

        nSpecies=$(ls $(basename {output})|wc -l)
        nSpeciesTot=$(cat $(basename {output})/*|wc -l)
        nMAGsTot=$(paste {input.binning}|wc -l)
        echo -e "\nIdentified $nSpecies species represented by at least 10 high quality MAGs, totaling $nSpeciesTot MAGs out of $nMAGsTot total MAGs generated ... "

        echo -e "\nMoving speciesBinIDs folder to pangenome directory: $(dirname {output})"
        mv $(basename {output}) $(dirname {output})

        echo -e "\nOrganizing prokka folder according to taxonomy ... "
        echo -e "\nGFF files of identified species with at least 10 HQ MAGs will be copied to prokka/organzied/speciesSubfolder for roary input ... "
        cd $(dirname {output})
        mkdir -p prokka/organized

        for species in speciesBinIDs/*.txt;do
            speciesID=$(echo $(basename $species)|sed 's/.txt//g');
            echo -e "\nCreating folder and organizing prokka output for species $speciesID ... "
            mkdir -p prokka/organized/$speciesID
            while read line;do
                binID=$(echo $line|sed 's/.bin/_bin/g')
                echo "Copying GFF prokka output of bin $binID"
                cp prokka/unorganized/$binID/*.gff prokka/organized/$speciesID/
            done< $species
        done
        echo -e "\nDone"
        """

rule roaryTop10:
    input:
        f'{config["path"]["root"]}/{config["folder"]["pangenome"]}/prokka/organized/'
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["pangenome"]}/roary/top10/')
    benchmark:
        f'{config["path"]["root"]}/benchmarks/roaryTop10.roary.benchmark.txt'
    message:
        """
        Runs pangenome for ~692 MAGs belonging to 10 species: Agathobacter rectale, Bacteroides uniformis,
        Ruminococcus_E bromii_B, Gemmiger sp003476825, Blautia_A wexlerae, Dialister invisus, Anaerostipes hadrus,
        Fusicatenibacter saccharivorans, Eubacterium_E hallii, and NA
        """
    shell:
        """
        set +u;source activate prokkaroary;set -u
        mkdir -p $(dirname {output})
        cd $SCRATCHDIR

        # Hardcoded top-10 species folders copied into one working dir for roary
        cp -r {input}/Agathobacter_rectale/* .
        cp -r {input}/Bacteroides_uniformis/* .
        cp -r {input}/Ruminococcus_E_bromii_B/* .
        cp -r {input}/Gemmiger_sp003476825/* .
        cp -r {input}/Blautia_A_wexlerae/* .
        cp -r {input}/Dialister_invisus/* .
        cp -r {input}/Anaerostipes_hadrus/* .
        cp -r {input}/Fusicatenibacter_saccharivorans/* .
        cp -r {input}/Eubacterium_E_hallii/* .
        cp -r {input}/NA/* .

        roary -s -p {config[cores][roary]} -i {config[params][roaryI]} -cd {config[params][roaryCD]} -f yes_al -e -v *.gff

        cd yes_al
        create_pan_genome_plots.R
        cd .. 
mkdir -p {output} mv yes_al/* {output} """ rule phylophlan: input: f'/home/zorrilla/workspace/european/dna_bins' output: directory(f'/scratch/zorrilla/phlan/out') benchmark: f'/scratch/zorrilla/phlan/logs/bench.txt' shell: """ cd $SCRATCHDIR cp -r {input} . cp $(dirname {output})/*.cfg . mkdir -p logs phylophlan -i dna_bins \ -d phylophlan \ -f 02_tol.cfg \ --genome_extension fa \ --diversity low \ --fast \ -o out \ --nproc 128 \ --verbose 2>&1 | tee logs/phylophlan.logs cp -r out $(dirname {output}) """ rule phylophlanPlant: input: f'/home/zorrilla/workspace/china_soil/dna_bins' output: directory(f'/home/zorrilla/workspace/china_soil/phlan/') benchmark: f'/scratch/zorrilla/phlan/logs/benchPlant.txt' shell: """ cd $SCRATCHDIR cp -r {input} . cp /scratch/zorrilla/phlan/*.cfg . mkdir -p logs phylophlan -i dna_bins \ -d phylophlan \ -f 02_tol.cfg \ --genome_extension fa \ --diversity low \ --fast \ -o $(basename {output}) \ --nproc 128 \ --verbose 2>&1 | tee logs/phylophlan.logs cp -r $(basename {output}) $(dirname {output}) """ rule phylophlanMeta: input: f'/home/zorrilla/workspace/european/dna_bins' output: directory(f'/home/zorrilla/workspace/european/phlan/dist') benchmark: f'/scratch/zorrilla/phlan/logs/bench.txt' shell: """ cd {input} cd ../ phylophlan_metagenomic -i $(basename {input}) -o $(basename {output})_dist --nproc 2 --only_input mv $(basename {output})_dist $(basename {output}) mv -r $(basename {output}) $(dirname {output}) """ rule phylophlanMetaAll: input: lab=f'/home/zorrilla/workspace/korem/dna_bins', gut=f'/home/zorrilla/workspace/european/dna_bins' , plant=f'/home/zorrilla/workspace/china_soil/dna_bins' , soil=f'/home/zorrilla/workspace/straya/dna_bins' , ocean=f'/scratch/zorrilla/dna_bins' output: directory(f'/home/zorrilla/workspace/european/phlan/all') benchmark: f'/scratch/zorrilla/phlan/logs/allMetaBench.txt' shell: """ mkdir -p {output} cd $SCRATCHDIR mkdir allMAGs cp {input.lab}/* allMAGs cp {input.gut}/* allMAGs cp {input.plant}/* allMAGs 
        cp {input.soil}/* allMAGs
        cp {input.ocean}/* allMAGs
        phylophlan_metagenomic -i allMAGs -o all --nproc 4 --only_input
        mv all_distmat.tsv $(dirname {output})
        """

# Render the phylophlan tree with graphlan (hardcoded workspace path).
rule drawTree:
    input:
        f'/home/zorrilla/workspace/china_soil/phlan'
    shell:
        """
        cd {input}
        graphlan.py dna_bins.tre.iqtree tree.out
        """

# Run the nmds.R ordination script on phylophlan distance matrices.
rule makePCA:
    input:
        f'/home/zorrilla/workspace/european/phlan'
    shell:
        """
        cd $SCRATCHDIR
        echo -e "\nCopying files to scratch dir: $SCRATCHDIR"
        cp {input}/*.tsv {input}/*.ids {input}/*.R .
        echo -e "\nRunning nmds.R script ... "
        Rscript nmds.R
        rm *.tsv *.ids *.R
        mkdir -p nmds
        mv *.pdf nmds
        mv nmds {input}
        """

# Dereplicate MAGs with dRep (completeness >= 50, contamination <= 10).
rule drep:
    input:
        f'{config["path"]["root"]}/dna_bins'
    output:
        directory(f'{config["path"]["root"]}/drep_drep')
    benchmark:
        f'{config["path"]["root"]}/benchmarks/drep_drep.benchmark.txt'
    shell:
        """
        set +u;source activate drep;set -u
        cp -r {input} $SCRATCHDIR
        cd $SCRATCHDIR
        dRep dereplicate drep_drep -g $(basename {input})/*.fa -p 48 -comp 50 -con 10
        mv drep_drep $(dirname {input})
        """

# Pairwise genome comparison with dRep (no dereplication).
rule drepComp:
    input:
        f'{config["path"]["root"]}/dna_bins'
    output:
        directory(f'{config["path"]["root"]}/drep_comp')
    benchmark:
        f'{config["path"]["root"]}/benchmarks/drep_comp.benchmark.txt'
    shell:
        """
        set +u;source activate drep;set -u
        cp -r {input} $SCRATCHDIR
        cd $SCRATCHDIR
        dRep compare drep_comp -g $(basename {input})/*.fa -p 48
        mv drep_comp $(dirname {input})
        """

================================================ FILE: workflow/rules/Snakefile_single_end.smk.py ================================================

configfile: "config.yaml"

import os
import glob

def get_ids_from_path_pattern(path_pattern):
    """Return sorted basenames (extension stripped) of all paths matching the glob pattern."""
    ids = sorted([os.path.basename(os.path.splitext(val)[0])
                  for val in (glob.glob(path_pattern))])
    return ids

# Make sure that final_bins/ folder contains all bins in single folder for binIDs
# wildcard to work. Use extractProteinBins rule or perform manually. 
binIDs = get_ids_from_path_pattern('final_bins/*.faa')
IDs = get_ids_from_path_pattern('assemblies/*')

# Single-end variant: one fastq per sample (no _R1/_R2 pair).
DATA_READS = f'{config["path"]["root"]}/{config["folder"]["data"]}/{{IDs}}/{{IDs}}.fastq.gz'

# Inserting space here to avoid having to change the hardcoded line 22 edit in the metabagpipes parser to expand wildcards

rule all:
    input:
        expand(f'{config["path"]["root"]}/GTDBtk/{{IDs}}', IDs=IDs)
    message:
        """
        WARNING: Be very careful when adding/removing any lines above this message.
        The metaBAGpipes.sh parser is presently hardcoded to edit line 22 of this Snakefile
        to expand target rules accordingly, therefore adding/removing any lines before this message
        will likely result in parser malfunction.
        """
    shell:
        """
        echo {input}
        """

rule createFolders:
    input:
        config["path"]["root"]
    message:
        """
        Very simple rule to check that the metaBAGpipes.sh parser, Snakefile, and config.yaml file are set up correctly.
        Generates folders from config.yaml config file, not strictly necessary to run this rule.
        """
    shell:
        """
        cd {input}
        echo -e "Setting up result folders in the following work directory: $(echo {input}) \n"

        # Generate folders.txt by extracting folder names from config.yaml file
        paste config.yaml |cut -d':' -f2|tail -n +4|head -n 18|sed '/^$/d' > folders.txt # NOTE: hardcoded number (18) for folder names, increase number if new folders are introduced.

        while read line;do
            echo "Creating $line folder ... "
            mkdir -p $line;
        done < folders.txt

        echo -e "\nDone creating folders. \n"
        rm folders.txt
        """

rule downloadToy:
    input:
        f'{config["path"]["root"]}/{config["folder"]["scripts"]}/{config["scripts"]["toy"]}'
    message:
        """
        Downloads toy dataset into config.yaml data folder and organizes into sample-specific sub-folders.
        Requires download_toydata.txt to be present in scripts folder.
        Modify this rule to download a real dataset by replacing the links in the download_toydata.txt
        file with links to files from your dataset of intertest. 
        """
    shell:
        """
        cd {config[path][root]}/{config[folder][data]}

        # Download each link in download_toydata.txt
        echo -e "\nBegin downloading toy dataset ... \n"
        while read line;do
            wget $line;
        done < {input}
        echo -e "\nDone donwloading dataset.\n"

        # Rename downloaded files, this is only necessary for toy dataset (will cause error if used for real dataset)
        echo -ne "\nRenaming downloaded files ... "
        for file in *;do
            mv $file ./$(echo $file|sed 's/?download=1//g');
        done
        echo -e " done. \n"

        # Organize data into sample specific sub-folders
        echo -ne "\nGenerating list of unique sample IDs ... "
        for file in *.gz; do echo $file; done | sed 's/_.*$//g' | sed 's/.fastq.gz//g' | uniq > ID_samples.txt
        echo -e " done.\n $(less ID_samples.txt|wc -l) samples identified.\n"

        echo -ne "\nOrganizing downloaded files into sample specific sub-folders ... "
        while read line; do
            mkdir -p $line;
            mv $line*.gz $line;
        done < ID_samples.txt
        echo -e " done. \n"
        rm ID_samples.txt
        """

rule organizeData:
    input:
        f'{config["path"]["root"]}/{config["folder"]["data"]}'
    message:
        """
        Sorts paired end raw reads into sample specific sub folders within the dataset folder specified in the config.yaml file.
        Assumes all samples are present in abovementioned dataset folder.
        Note: This rule is meant to be run on real datasets.
        Do not run for toy dataset, as downloadToy rule above sorts the downloaded data already.
        """
    shell:
        """
        cd {input}
        echo -ne "\nGenerating list of unique sample IDs ... "

        # Create list of unique sample IDs
        for file in *.gz; do echo $file; done | sed 's/_.*$//g' | sed 's/.fastq.gz//g' | uniq > ID_samples.txt
        echo -e " done.\n $(less ID_samples.txt|wc -l) samples identified.\n"

        # Create folder and move corresponding files for each sample
        echo -ne "\nOrganizing dataset into sample specific sub-folders ... "
        while read line; do
            mkdir -p $line;
            mv $line*.gz $line;
        done < ID_samples.txt
        echo -e " done. 
\n" rm ID_samples.txt """ rule qfilter: input: READS = DATA_READS output: f'{config["path"]["root"]}/{config["folder"]["qfiltered"]}/{{IDs}}/{{IDs}}.fastq.gz', shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; mkdir -p $(dirname $(dirname {output})) mkdir -p $(dirname {output}) fastp --thread {config[cores][fastp]} \ -i {input} \ -o {output} \ -j $(dirname {output})/$(echo $(basename $(dirname {output}))).json \ -h $(dirname {output})/$(echo $(basename $(dirname {output}))).html """ rule qfilterVis: input: f'{config["path"]["root"]}/{config["folder"]["qfiltered"]}' output: text = f'{config["path"]["root"]}/{config["folder"]["stats"]}/qfilter.stats', plot = f'{config["path"]["root"]}/{config["folder"]["stats"]}/qfilterVis.pdf' shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; mkdir -p $(dirname {output.text}) cd {input} echo -e "\nGenerating quality filtering results file qfilter.stats: ... " for folder in */;do for file in $folder*json;do ID=$(echo $file|sed 's|/.*$||g') readsBF=$(head -n 25 $file|grep total_reads|cut -d ':' -f2|sed 's/,//g'|head -n 1) readsAF=$(head -n 25 $file|grep total_reads|cut -d ':' -f2|sed 's/,//g'|tail -n 1) basesBF=$(head -n 25 $file|grep total_bases|cut -d ':' -f2|sed 's/,//g'|head -n 1) basesAF=$(head -n 25 $file|grep total_bases|cut -d ':' -f2|sed 's/,//g'|tail -n 1) q20BF=$(head -n 25 $file|grep q20_rate|cut -d ':' -f2|sed 's/,//g'|head -n 1) q20AF=$(head -n 25 $file|grep q20_rate|cut -d ':' -f2|sed 's/,//g'|tail -n 1) q30BF=$(head -n 25 $file|grep q30_rate|cut -d ':' -f2|sed 's/,//g'|head -n 1) q30AF=$(head -n 25 $file|grep q30_rate|cut -d ':' -f2|sed 's/,//g'|tail -n 1) percent=$(awk -v RBF="$readsBF" -v RAF="$readsAF" 'BEGIN{{print RAF/RBF}}' ) echo "$ID $readsBF $readsAF $basesBF $basesAF $percent $q20BF $q20AF $q30BF $q30AF" >> qfilter.stats echo "Sample $ID retained $percent * 100 % of reads ... " done done echo "Done summarizing quality filtering results ... 
\nMoving to /stats/ folder and running plotting script ... " mv qfilter.stats {config[path][root]}/{config[folder][stats]} cd {config[path][root]}/{config[folder][stats]} Rscript {config[path][root]}/{config[folder][scripts]}/{config[scripts][qfilterVis]} echo "Done. " rm Rplots.pdf """ rule megahit: input: rules.qfilter.output output: f'{config["path"]["root"]}/{config["folder"]["assemblies"]}/{{IDs}}/contigs.fasta.gz' benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.megahit.benchmark.txt' shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; cd $TMPDIR echo -n "Copying qfiltered reads to $TMPDIR ... " cp {input} $TMPDIR echo "done. " echo -n "Running megahit ... " megahit -t {config[cores][megahit]} \ --verbose \ -r $(basename {input}) \ -o tmp; echo "done. " echo "Renaming assembly ... " mv tmp/final.contigs.fa contigs.fasta echo "Fixing contig header names: replacing spaces with hyphens ... " sed -i 's/ /-/g' contigs.fasta echo "Zipping and moving assembly ... " gzip contigs.fasta mkdir -p $(dirname {output}) mv contigs.fasta.gz $(dirname {output}) echo "Done. " """ rule assemblyVis: input: f'{config["path"]["root"]}/{config["folder"]["assemblies"]}' output: text = f'{config["path"]["root"]}/{config["folder"]["stats"]}/assembly.stats', plot = f'{config["path"]["root"]}/{config["folder"]["stats"]}/assemblyVis.pdf', shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; mkdir -p $(dirname {output.text}) cd {input} echo -e "\nGenerating assembly results file assembly.stats: ... 
" for folder in */;do for file in $folder*.gz;do ID=$(echo $file|sed 's|/contigs.fasta.gz||g') N=$(less $file|grep -c ">"); L=$(less $file|grep ">"|cut -d '-' -f4|sed 's/len=//'|awk '{{sum+=$1}}END{{print sum}}'); T=$(less $file|grep ">"|cut -d '-' -f4|sed 's/len=//'|awk '$1>=1000{{c++}} END{{print c+0}}'); S=$(less $file|grep ">"|cut -d '-' -f4|sed 's/len=//'|awk '$1>=1000'|awk '{{sum+=$1}}END{{print sum}}'); echo $ID $N $L $T $S>> assembly.stats; echo -e "Sample $ID has a total of $L bp across $N contigs, with $S bp present in $T contigs >= 1000 bp ... " done; done echo "Done summarizing assembly results ... \nMoving to /stats/ folder and running plotting script ... " mv assembly.stats {config[path][root]}/{config[folder][stats]} cd {config[path][root]}/{config[folder][stats]} Rscript {config[path][root]}/{config[folder][scripts]}/{config[scripts][assemblyVis]} echo "Done. " rm Rplots.pdf """ rule metabat: input: contigs = rules.megahit.output, READS = f'{config["path"]["root"]}/{config["folder"]["qfiltered"]}' output: directory(f'{config["path"]["root"]}/{config["folder"]["metabat"]}/{{IDs}}/{{IDs}}.metabat-bins') benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.metabat.benchmark.txt' message: """ Cross map all samples with bwa then use the output of this rule to create contig abundance/depth files to be used for binning with metabat2 and maxbin2. After depth files are copied back to workspace and metabat2 finishes we avoid the need to copy bam files back to workspace saving space as well as reducing total nubmer of jobs to run. """ shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; cd $TMPDIR cp {input.contigs} . mkdir -p {output} # Define the focal sample ID, fsample: # The one sample that all other samples will be mapped against mapping sample msampleID in for loop fsampleID=$(echo $(basename $(dirname {input.contigs}))) echo -e "\nFocal sample: $fsampleID ... " echo "Renaming and unzipping assembly ... 
" mv $(basename {input.contigs}) $(echo $fsampleID|sed 's/$/.fa.gz/g') gunzip $(echo $fsampleID|sed 's/$/.fa.gz/g') echo -e "\nIndexing assembly ... " bwa index $fsampleID.fa for folder in {input.READS}/*;do id=$(basename $folder) echo -e "\nCopying sample $id to be mapped againts the focal sample $fsampleID ..." cp $folder/*.gz . # Maybe I should be piping the lines below to reduce I/O ? echo -e "\nMapping sample to assembly ... " bwa mem -t {config[cores][metabat]} $fsampleID.fa *.fastq.gz > $id.sam echo -e "\nConverting SAM to BAM with samtools view ... " samtools view -@ {config[cores][metabat]} -Sb $id.sam > $id.bam echo -e "\nSorting BAM file with samtools sort ... " samtools sort -@ {config[cores][metabat]} -o $id.sort $id.bam echo -e "\nRunning jgi_summarize_bam_contig_depths script to generate contig abundance/depth file ... " jgi_summarize_bam_contig_depths --outputDepth $id.depth $id.sort echo -e "\nCopying depth file to workspace" mv $id.depth {output} echo -e "\nRemoving temporary files ... " rm *.fastq.gz *.sam *.bam done nSamples=$(ls {input.READS}|wc -l) echo -e "\nDone mapping focal sample $fsampleID agains $nSamples samples in dataset folder." echo -e "\nRunning jgi_summarize_bam_contig_depths for all sorted bam files ... " jgi_summarize_bam_contig_depths --outputDepth $id.all.depth *.sort echo -e "\nRunning metabat2 ... " metabat2 -i $fsampleID.fa -a $id.all.depth -o $fsampleID mv *.fa $id.all.depth $(dirname {output}) """ rule maxbin: input: assembly = rules.megahit.output, depth = rules.metabat.output output: directory(f'{config["path"]["root"]}/{config["folder"]["maxbin"]}/{{IDs}}/{{IDs}}.maxbin-bins') benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.maxbin.benchmark.txt' message: """ Note that this rule uses of the output depth of metabat2 as an input to bin using maxbin2. 
""" shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; cp -r {input.assembly} {input.depth} $TMPDIR mkdir -p $(dirname $(dirname {output})) cd $TMPDIR echo -e "\nUnzipping assembly ... " gunzip contigs.fasta.gz echo -e "\nGenerating list of depth files based on metabat2 output ... " find $(basename {input.depth}) -name "*.depth" > abund.list echo -e "\nRunning maxbin2 ... " run_MaxBin.pl -contig contigs.fasta -out $(basename $(dirname {output})) -abund_list abund.list rm contigs.fasta *.gz mkdir $(basename {output}) mkdir -p $(dirname {output}) mv *.fasta $(basename {output}) mv $(basename {output}) *.summary *.abundance $(dirname {output}) """ rule concoct: input: contigs = rules.megahit.output, reads = f'{config["path"]["root"]}/{config["folder"]["qfiltered"]}' output: directory(f'{config["path"]["root"]}/{config["folder"]["concoct"]}/{{IDs}}/{{IDs}}.concoct-bins') benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.concoct.benchmark.txt' shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; mkdir -p $(dirname $(dirname {output})) fsampleID=$(echo $(basename $(dirname {input.contigs}))) echo -e "\nCopying focal sample assembly $fsampleID to TMPDIR ... " cp {input.contigs} $TMPDIR cd $TMPDIR echo "Unzipping assembly ... " gunzip $(basename {input.contigs}) echo -e "Done. \nCutting up contigs to 10kbp chunks (default), do not use this for mapping!" cut_up_fasta.py -c {config[params][cutfasta]} -o 0 -m contigs.fasta -b assembly_c10k.bed > assembly_c10k.fa echo -e "\nIndexing assembly of original contigs for mapping (not 10kbp chunks assembly file) ... " bwa index contigs.fasta echo -e "Done. \nPreparing to map focal sample against other samples ... " for folder in {input.reads}/*;do id=$(basename $folder) echo -e "\nCopying sample $id to be mapped againts the focal sample $fsampleID ..." cp $folder/*.gz . # Maybe I should be piping the lines below to reduce I/O ? echo -e "\nMapping sample to assembly ... 
" bwa mem -t {config[cores][concoct]} contigs.fasta *.fastq.gz > $id.sam echo -e "\nConverting SAM to BAM with samtools view ... " samtools view -@ {config[cores][concoct]} -Sb $id.sam > $id.bam echo -e "\nSorting BAM file with samtools sort ... " samtools sort -@ {config[cores][concoct]} -o $id.sort $id.bam echo -e "\nIndexing sorted BAM file with samtools index ... " samtools index $id.sort echo -e "\nRemoving temporary files ... " rm *.fastq.gz *.sam *.bam done echo -e "\nSummarizing sorted and indexed BAM files with concoct_coverage_table.py ... " concoct_coverage_table.py assembly_c10k.bed *.sort > coverage_table.tsv echo -e "\nRunning CONCOCT ... " concoct --coverage_file coverage_table.tsv --composition_file assembly_c10k.fa \ -b $(basename $(dirname {output})) \ -t {config[cores][concoct]} \ -c {config[params][concoct]} echo -e "\nMerging clustering results into original contigs with merge_cutup_clustering.py ... " merge_cutup_clustering.py $(basename $(dirname {output}))_clustering_gt1000.csv > $(basename $(dirname {output}))_clustering_merged.csv echo -e "\nExtracting bins ... 
" mkdir -p $(basename {output}) extract_fasta_bins.py contigs.fasta $(basename $(dirname {output}))_clustering_merged.csv --output_path $(basename {output}) mkdir -p $(dirname {output}) mv $(basename {output}) *.txt *.csv $(dirname {output}) """ rule binRefine: input: concoct = f'{config["path"]["root"]}/{config["folder"]["concoct"]}/{{IDs}}/{{IDs}}.concoct-bins', metabat = f'{config["path"]["root"]}/{config["folder"]["metabat"]}/{{IDs}}/{{IDs}}.metabat-bins', maxbin = f'{config["path"]["root"]}/{config["folder"]["maxbin"]}/{{IDs}}/{{IDs}}.maxbin-bins' output: directory(f'{config["path"]["root"]}/{config["folder"]["refined"]}/{{IDs}}') benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.binRefine.benchmark.txt' shell: """ set +u;source activate {config[envs][metawrap]};set -u; mkdir -p $(dirname {output}) mkdir -p {output} cd $TMPDIR echo "Copying bins from CONCOCT, metabat2, and maxbin2 to tmpdir ... " cp -r {input.concoct} {input.metabat} {input.maxbin} $TMPDIR echo "Renaming bin folders to avoid errors with metaWRAP ... " mv $(basename {input.concoct}) $(echo $(basename {input.concoct})|sed 's/-bins//g') mv $(basename {input.metabat}) $(echo $(basename {input.metabat})|sed 's/-bins//g') mv $(basename {input.maxbin}) $(echo $(basename {input.maxbin})|sed 's/-bins//g') echo "Running metaWRAP bin refinement module ... " metaWRAP bin_refinement -o . 
\ -A $(echo $(basename {input.concoct})|sed 's/-bins//g') \ -B $(echo $(basename {input.metabat})|sed 's/-bins//g') \ -C $(echo $(basename {input.maxbin})|sed 's/-bins//g') \ -t {config[cores][refine]} \ -m {config[params][refineMem]} \ -c {config[params][refineComp]} \ -x {config[params][refineCont]} rm -r $(echo $(basename {input.concoct})|sed 's/-bins//g') $(echo $(basename {input.metabat})|sed 's/-bins//g') $(echo $(basename {input.maxbin})|sed 's/-bins//g') work_files mv * {output} """ rule binReassemble: input: READS = rules.qfilter.output, refinedBins = rules.binRefine.output output: directory(f'{config["path"]["root"]}/{config["folder"]["reassembled"]}/{{IDs}}') benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.binReassemble.benchmark.txt' shell: """ set +u;source activate {config[envs][metawrap]};set -u; mkdir -p $(dirname {output}) cp -r {input.refinedBins}/metawrap_*_bins {input.READS} $TMPDIR cd $TMPDIR echo "Running metaWRAP bin reassembly ... " metaWRAP reassemble_bins -o $(basename {output}) \ -b metawrap_*_bins \ -1 $(basename {input.READS}) \ -2 $(basename {input.READS}) \ -t {config[cores][reassemble]} \ -m {config[params][reassembleMem]} \ -c {config[params][reassembleComp]} \ -x {config[params][reassembleCont]} rm -r metawrap_*_bins rm -r $(basename {output})/work_files rm *.fastq.gz mv * $(dirname {output}) """ rule binningVis: input: f'{config["path"]["root"]}' output: text = f'{config["path"]["root"]}/{config["folder"]["stats"]}/reassembled_bins.stats', plot = f'{config["path"]["root"]}/{config["folder"]["stats"]}/binningVis.pdf' message: """ Generate bar plot with number of bins and density plot of bin contigs, total length, completeness, and contamination across different tools. """ shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; # READ CONCOCT BINS echo "Generating concoct_bins.stats file containing bin ID, number of contigs, and length ... 
" cd {input}/{config[folder][concoct]} for folder in */;do var=$(echo $folder|sed 's|/||g'); # Define sample name for bin in $folder*concoct-bins/*.fa;do name=$(echo $bin | sed "s|^.*/|$var.bin.|g" | sed 's/.fa//g'); # Define bin name N=$(less $bin | grep -c ">"); L=$(less $bin |grep ">"|cut -d '-' -f4|sed 's/len=//g'|awk '{{sum+=$1}}END{{print sum}}') echo "Reading bin $bin ... Contigs: $N , Length: $L " echo $name $N $L >> concoct_bins.stats; done; done mv *.stats {input}/{config[folder][reassembled]} echo "Done reading CONCOCT bins, moving concoct_bins.stats file to $(echo {input}/{config[folder][reassembled]}) ." # READ METABAT2 BINS echo "Generating metabat_bins.stats file containing bin ID, number of contigs, and length ... " cd {input}/{config[folder][metabat]} for folder in */;do var=$(echo $folder | sed 's|/||'); # Define sample name for bin in $folder*metabat-bins/*.fa;do name=$(echo $bin|sed 's/.fa//g'|sed 's|^.*/||g'|sed "s/^/$var./g"); # Define bin name N=$(less $bin | grep -c ">"); L=$(less $bin |grep ">"|cut -d '-' -f4|sed 's/len=//g'|awk '{{sum+=$1}}END{{print sum}}') echo "Reading bin $bin ... Contigs: $N , Length: $L " echo $name $N $L >> metabat_bins.stats; done; done mv *.stats {input}/{config[folder][reassembled]} echo "Done reading metabat2 bins, moving metabat_bins.stats file to $(echo {input}/{config[folder][reassembled]}) ." # READ MAXBIN2 BINS echo "Generating maxbin_bins.stats file containing bin ID, number of contigs, and length ... " cd {input}/{config[folder][maxbin]} for folder in */;do for bin in $folder*maxbin-bins/*.fasta;do name=$(echo $bin | sed 's/.fasta//g' | sed 's|^.*/||g'); # Define bin name N=$(less $bin | grep -c ">"); L=$(less $bin |grep ">"|cut -d '-' -f4|sed 's/len=//g'|awk '{{sum+=$1}}END{{print sum}}') echo "Reading bin $bin ... 
Contigs: $N , Length: $L " echo $name $N $L >> maxbin_bins.stats; done; done mv *.stats {input}/{config[folder][reassembled]} echo "Done reading maxbin2 bins, moving maxbin_bins.stats file to $(echo {input}/{config[folder][reassembled]}) ." # READ METAWRAP REFINED BINS echo "Generating refined_bins.stats file containing bin ID, number of contigs, and length ... " cd {input}/{config[folder][refined]} for folder in */;do samp=$(echo $folder | sed 's|/||'); # Define sample name for bin in $folder*metawrap_*_bins/*.fa;do name=$(echo $bin | sed 's/.fa//g'|sed 's|^.*/||g'|sed "s/^/$samp./g"); # Define bin name N=$(less $bin | grep -c ">"); L=$(less $bin |grep ">"|cut -d '-' -f4|sed 's/len_//g'|awk '{{sum+=$1}}END{{print sum}}') echo "Reading bin $bin ... Contigs: $N , Length: $L " echo $name $N $L >> refined_bins.stats; done; done echo "Done reading metawrap refined bins ... " # READ METAWRAP REFINED CHECKM OUTPUT echo "Generating CheckM summary files across samples: concoct.checkm, metabat.checkm, maxbin.checkm, and refined.checkm ... " for folder in */;do var=$(echo $folder|sed 's|/||g'); # Define sample name paste $folder*concoct.stats|tail -n +2 | sed "s/^/$var.bin./g" >> concoct.checkm paste $folder*metabat.stats|tail -n +2 | sed "s/^/$var./g" >> metabat.checkm paste $folder*maxbin.stats|tail -n +2 >> maxbin.checkm paste $folder*metawrap_*_bins.stats|tail -n +2|sed "s/^/$var./g" >> refined.checkm done echo "Done reading metawrap refined output, moving refined_bins.stats, concoct.checkm, metabat.checkm, maxbin.checkm, and refined.checkm files to $(echo {input}/{config[folder][reassembled]}) ." mv *.stats *.checkm {input}/{config[folder][reassembled]} # READ METAWRAP REASSEMBLED BINS echo "Generating reassembled_bins.stats file containing bin ID, number of contigs, and length ... 
" cd {input}/{config[folder][reassembled]} for folder in */;do samp=$(echo $folder | sed 's|/||'); # Define sample name for bin in $folder*reassembled_bins/*.fa;do name=$(echo $bin | sed 's/.fa//g' | sed 's|^.*/||g' | sed "s/^/$samp./g"); # Define bin name N=$(less $bin | grep -c ">"); # Need to check if bins are original (megahit-assembled) or strict/permissive (metaspades-assembled) if [[ $name == *.strict ]] || [[ $name == *.permissive ]];then L=$(less $bin |grep ">"|cut -d '_' -f4|awk '{{sum+=$1}}END{{print sum}}') else L=$(less $bin |grep ">"|cut -d '-' -f4|sed 's/len_//g'|awk '{{sum+=$1}}END{{print sum}}') fi echo "Reading bin $bin ... Contigs: $N , Length: $L " echo $name $N $L >> reassembled_bins.stats; done; done echo "Done reading metawrap reassembled bins ... " # READ METAWRAP REFINED CHECKM OUTPUT echo "Generating CheckM summary file reassembled.checkm across samples for reassembled bins ... " for folder in */;do var=$(echo $folder|sed 's|/||g'); paste $folder*reassembled_bins.stats|tail -n +2|sed "s/^/$var./g"; done >> reassembled.checkm echo "Done generating all statistics files for binning results ... running plotting script ... " # RUN PLOTTING R SCRIPT mv *.stats *.checkm {config[path][root]}/{config[folder][stats]} cd {config[path][root]}/{config[folder][stats]} Rscript {config[path][root]}/{config[folder][scripts]}/{config[scripts][binningVis]} rm Rplots.pdf # Delete redundant pdf file echo "Done. " """ rule GTDBtk: input: f'{config["path"]["root"]}/{config["folder"]["reassembled"]}/{{IDs}}/reassembled_bins' output: directory(f'{config["path"]["root"]}/GTDBtk/{{IDs}}') benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.GTDBtk.benchmark.txt' message: """ The folder dna_bins_organized assumes subfolders containing dna bins for refined and reassembled bins. Note: slightly modified inputs/outputs for european dataset. 
""" shell: """ set +u;source activate gtdbtk-tmp;set -u; export GTDBTK_DATA_PATH=/g/scb2/patil/zorrilla/conda/envs/gtdbtk/share/gtdbtk-1.1.0/db/ cd $SCRATCHDIR cp -r {input} . gtdbtk classify_wf --genome_dir $(basename {input}) --out_dir GTDBtk -x fa --cpus {config[cores][gtdbtk]} mkdir -p {output} mv GTDBtk/* {output} """ rule classifyGenomes: input: bins = f'{config["path"]["root"]}/{config["folder"]["reassembled"]}/{{IDs}}/reassembled_bins', script = f'{config["path"]["root"]}/{config["folder"]["scripts"]}/classify-genomes' output: directory(f'{config["path"]["root"]}/{config["folder"]["classification"]}/{{IDs}}') benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.classify-genomes.benchmark.txt' shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; mkdir -p {output} cd $TMPDIR cp -r {input.script}/* {input.bins}/* . echo "Begin classifying bins ... " for bin in *.fa; do echo -e "\nClassifying $bin ... " $PWD/classify-genomes $bin -t {config[cores][classify]} -o $(echo $bin|sed 's/.fa/.taxonomy/') cp *.taxonomy {output} rm *.taxonomy rm $bin done echo "Done classifying bins. " """ rule taxonomyVis: input: f'{config["path"]["root"]}/{config["folder"]["classification"]}' output: text = f'{config["path"]["root"]}/{config["folder"]["stats"]}/classification.stats', plot = f'{config["path"]["root"]}/{config["folder"]["stats"]}/taxonomyVis.pdf' message: """ Generate bar plot with most common taxa (n>15) and density plots with mapping statistics. """ shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; cd {input} echo -e "\nBegin reading classification result files ... 
\n" for folder in */;do for file in $folder*.taxonomy;do # Define sample ID to append to start of each bin name in summary file sample=$(echo $folder|sed 's|/||') # Define bin name with sample ID, shorten metaWRAP naming scheme (orig/permissive/strict) fasta=$(echo $file | sed 's|^.*/||' | sed 's/.taxonomy//g' | sed 's/orig/o/g' | sed 's/permissive/p/g' | sed 's/strict/s/g' | sed "s/^/$sample./g"); # Extract NCBI ID NCBI=$(less $file | grep NCBI | cut -d ' ' -f4); # Extract consensus taxonomy tax=$(less $file | grep tax | sed 's/Consensus taxonomy: //g'); # Extract consensus motus motu=$(less $file | grep mOTUs | sed 's/Consensus mOTUs: //g'); # Extract number of detected genes detect=$(less $file | grep detected | sed 's/Number of detected genes: //g'); # Extract percentage of agreeing genes percent=$(less $file | grep agreeing | sed 's/Percentage of agreeing genes: //g' | sed 's/%//g'); # Extract number of mapped genes map=$(less $file | grep mapped | sed 's/Number of mapped genes: //g'); # Extract COG IDs, need to use set +e;...;set -e to avoid erroring out when reading .taxonomy result file for bin with no taxonomic annotation set +e cog=$(less $file | grep COG | cut -d$'\t' -f1 | tr '\n' ',' | sed 's/,$//g'); set -e # Display and store extracted results echo -e "$fasta \t $NCBI \t $tax \t $motu \t $detect \t $map \t $percent \t $cog" echo -e "$fasta \t $NCBI \t $tax \t $motu \t $detect \t $map \t $percent \t $cog" >> classification.stats; done; done echo -e "\nDone generating classification.stats summary file, moving to stats/ directory and running taxonomyVis.R script ... " mv classification.stats {config[path][root]}/{config[folder][stats]} cd {config[path][root]}/{config[folder][stats]} Rscript {config[path][root]}/{config[folder][scripts]}/{config[scripts][taxonomyVis]} rm Rplots.pdf # Delete redundant pdf file echo "Done. 
" """ rule abundance: input: bins = f'{config["path"]["root"]}/{config["folder"]["reassembled"]}/{{IDs}}/reassembled_bins', READS = rules.qfilter.output output: directory(f'{config["path"]["root"]}/{config["folder"]["abundance"]}/{{IDs}}') benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.abundance.benchmark.txt' message: """ Calculate bin abundance fraction using the following: binAbundanceFraction = ( X / Y / Z) * 1000000 X = # of reads mapped to bin_i from sample_k Y = length of bin_i (bp) Z = # of reads mapped to all bins in sample_k Note: 1000000 scaling factor converts length in bp to Mbp """ shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; mkdir -p {output} cd $TMPDIR echo -e "\nCopying quality filtered single end reads and generated MAGs to TMPDIR ... " cp {input.READS} {input.bins}/* . echo -e "\nConcatenating all bins into one FASTA file ... " cat *.fa > $(basename {output}).fa echo -e "\nCreating bwa index for concatenated FASTA file ... " bwa index $(basename {output}).fa echo -e "\nMapping quality filtered single end reads to concatenated FASTA file with bwa mem ... " bwa mem -t {config[cores][abundance]} $(basename {output}).fa \ $(basename {input.READS}) > $(basename {output}).sam echo -e "\nConverting SAM to BAM with samtools view ... " samtools view -@ {config[cores][abundance]} -Sb $(basename {output}).sam > $(basename {output}).bam echo -e "\nSorting BAM file with samtools sort ... " samtools sort -@ {config[cores][abundance]} -o $(basename {output}).sort.bam $(basename {output}).bam echo -e "\nExtracting stats from sorted BAM file with samtools flagstat ... " samtools flagstat $(basename {output}).sort.bam > map.stats echo -e "\nCopying sample_map.stats file to root/abundance/sample for bin concatenation and deleting temporary FASTA file ... " cp map.stats {output}/$(basename {output})_map.stats rm $(basename {output}).fa echo -e "\nRepeat procedure for each bin ... 
" for bin in *.fa;do echo -e "\nSetting up temporary sub-directory to map against bin $bin ... " mkdir -p $(echo "$bin"| sed "s/.fa//") mv $bin $(echo "$bin"| sed "s/.fa//") cd $(echo "$bin"| sed "s/.fa//") echo -e "\nCreating bwa index for bin $bin ... " bwa index $bin echo -e "\nMapping quality filtered single end reads to bin $bin with bwa mem ... " bwa mem -t {config[cores][abundance]} $bin ../$(basename {input.READS}) > $(echo "$bin"|sed "s/.fa/.sam/") echo -e "\nConverting SAM to BAM with samtools view ... " samtools view -@ {config[cores][abundance]} -Sb $(echo "$bin"|sed "s/.fa/.sam/") > $(echo "$bin"|sed "s/.fa/.bam/") echo -e "\nSorting BAM file with samtools sort ... " samtools sort -@ {config[cores][abundance]} -o $(echo "$bin"|sed "s/.fa/.sort.bam/") $(echo "$bin"|sed "s/.fa/.bam/") echo -e "\nExtracting stats from sorted BAM file with samtools flagstat ... " samtools flagstat $(echo "$bin"|sed "s/.fa/.sort.bam/") > $(echo "$bin"|sed "s/.fa/.map/") echo -e "\nAppending bin length to bin.map stats file ... " echo -n "Bin Length = " >> $(echo "$bin"|sed "s/.fa/.map/") # Need to check if bins are original (megahit-assembled) or strict/permissive (metaspades-assembled) if [[ $bin == *.strict.fa ]] || [[ $bin == *.permissive.fa ]];then less $bin |grep ">"|cut -d '_' -f4|awk '{{sum+=$1}}END{{print sum}}' >> $(echo "$bin"|sed "s/.fa/.map/") else less $bin |grep ">"|cut -d '-' -f4|sed 's/len_//g'|awk '{{sum+=$1}}END{{print sum}}' >> $(echo "$bin"|sed "s/.fa/.map/") fi paste $(echo "$bin"|sed "s/.fa/.map/") echo -e "\nCalculating abundance for bin $bin ... 
" echo -n "$bin"|sed "s/.fa//" >> $(echo "$bin"|sed "s/.fa/.abund/") echo -n $'\t' >> $(echo "$bin"|sed "s/.fa/.abund/") X=$(less $(echo "$bin"|sed "s/.fa/.map/")|grep "mapped ("|awk -F' ' '{{print $1}}') Y=$(less $(echo "$bin"|sed "s/.fa/.map/")|tail -n 1|awk -F' ' '{{print $4}}') Z=$(less "../map.stats"|grep "mapped ("|awk -F' ' '{{print $1}}') awk -v x="$X" -v y="$Y" -v z="$Z" 'BEGIN{{print (x/y/z) * 1000000}}' >> $(echo "$bin"|sed "s/.fa/.abund/") paste $(echo "$bin"|sed "s/.fa/.abund/") echo -e "\nRemoving temporary files for bin $bin ... " rm $bin cp $(echo "$bin"|sed "s/.fa/.map/") {output} mv $(echo "$bin"|sed "s/.fa/.abund/") ../ cd .. rm -r $(echo "$bin"| sed "s/.fa//") done echo -e "\nDone processing all bins, summarizing results into sample.abund file ... " cat *.abund > $(basename {output}).abund echo -ne "\nSumming calculated abundances to obtain normalization value ... " norm=$(less $(basename {output}).abund |awk '{{sum+=$2}}END{{print sum}}'); echo $norm echo -e "\nGenerating column with abundances normalized between 0 and 1 ... 
" awk -v NORM="$norm" '{{printf $1"\t"$2"\t"$2/NORM"\\n"}}' $(basename {output}).abund > abundance.txt rm $(basename {output}).abund mv abundance.txt $(basename {output}).abund mv $(basename {output}).abund {output} """ rule abundanceVis: input: abundance = f'{config["path"]["root"]}/{config["folder"]["abundance"]}', taxonomy = rules.taxonomyVis.output.text output: text = f'{config["path"]["root"]}/{config["folder"]["stats"]}/abundance.stats', plot = f'{config["path"]["root"]}/{config["folder"]["stats"]}/abundanceVis.pdf' message: """ Generate stacked bar plots showing composition of samples """ shell: """ set +u;source activate {config[envs][metabagpipes]};set -u cd {input.abundance} for folder in */;do # Define sample ID sample=$(echo $folder|sed 's|/||g') # Same as in taxonomyVis rule, modify bin names by adding sample ID and shortening metaWRAP naming scheme (orig/permissive/strict) paste $sample/$sample.abund | sed 's/orig/o/g' | sed 's/permissive/p/g' | sed 's/strict/s/g' | sed "s/^/$sample./g" >> abundance.stats done mv abundance.stats {config[path][root]}/{config[folder][stats]} cd {config[path][root]}/{config[folder][stats]} Rscript {config[path][root]}/{config[folder][scripts]}/{config[scripts][abundanceVis]} """ rule extractProteinBins: message: "Extract ORF annotated protein fasta files for each bin from reassembly checkm files." shell: """ cd {config[path][root]} mkdir -p {config[folder][proteinBins]} echo -e "Begin moving and renaming ORF annotated protein fasta bins from reassembled_bins/ to final_bins/ ... \n" for folder in reassembled_bins/*/;do echo "Moving bins from sample $(echo $(basename $folder)) ... 
" for bin in $folder*reassembled_bins.checkm/bins/*;do var=$(echo $bin/genes.faa | sed 's|reassembled_bins/||g'|sed 's|/reassembled_bins.checkm/bins||'|sed 's|/genes||g'|sed 's|/|_|g'|sed 's/permissive/p/g'|sed 's/orig/o/g'|sed 's/strict/s/g'); cp $bin/*.faa {config[path][root]}/{config[folder][proteinBins]}/$var; done; done """ rule carveme: input: bin = f'{config["path"]["root"]}/{config["folder"]["proteinBins"]}/{{binIDs}}.faa', media = f'{config["path"]["root"]}/{config["folder"]["scripts"]}/{config["scripts"]["carveme"]}' output: f'{config["path"]["root"]}/{config["folder"]["GEMs"]}/{{binIDs}}.xml' benchmark: f'{config["path"]["root"]}/benchmarks/{{binIDs}}.carveme.benchmark.txt' message: """ Make sure that the input files are ORF annotated and preferably protein fasta. If given raw fasta files, Carveme will run without errors but each contig will be treated as one gene. """ shell: """ echo "Activating {config[envs][metabagpipes]} conda environment ... " set +u;source activate {config[envs][metabagpipes]};set -u mkdir -p $(dirname {output}) mkdir -p logs cp {input.bin} {input.media} $TMPDIR cd $TMPDIR echo "Begin carving GEM ... " #carve -g {config[params][carveMedia]} \ # -v \ # --mediadb $(basename {input.media}) \ # --fbc2 \ # -o $(echo $(basename {input.bin}) | sed 's/.faa/.xml/g') $(basename {input.bin}) carve -v \ --fbc2 \ -o $(echo $(basename {input.bin}) | sed 's/.faa/.xml/g') $(basename {input.bin}) echo "Done carving GEM. " [ -f *.xml ] && mv *.xml $(dirname {output}) """ rule modelVis: input: f'{config["path"]["root"]}/{config["folder"]["GEMs"]}' output: text = f'{config["path"]["root"]}/{config["folder"]["stats"]}/GEMs.stats', plot = f'{config["path"]["root"]}/{config["folder"]["stats"]}/modelVis.pdf' message: """ Generate bar plot with GEMs generated across samples and density plots showing number of unique metabolites, reactions, and genes across GEMs. 
""" shell: """ set +u;source activate {config[envs][metabagpipes]};set -u; cd {input} echo -e "\nBegin reading models ... \n" for model in *.xml;do id=$(echo $model|sed 's/.xml//g'); mets=$(less $model| grep "species id="|cut -d ' ' -f 8|sed 's/..$//g'|sort|uniq|wc -l); rxns=$(less $model|grep -c 'reaction id='); genes=$(less $model|grep 'fbc:geneProduct fbc:id='|grep -vic spontaneous); echo "Model: $id has $mets mets, $rxns reactions, and $genes genes ... " echo "$id $mets $rxns $genes" >> GEMs.stats; done echo -e "\nDone generating GEMs.stats summary file, moving to stats/ folder and running modelVis.R script ... " mv GEMs.stats {config[path][root]}/{config[folder][stats]} cd {config[path][root]}/{config[folder][stats]} Rscript {config[path][root]}/{config[folder][scripts]}/{config[scripts][modelVis]} rm Rplots.pdf # Delete redundant pdf file echo "Done. " """ rule organizeGEMs: input: f'{config["path"]["root"]}/{config["folder"]["refined"]}' message: """ Organizes GEMs into sample specific subfolders. Necessary to run smetana per sample using the IDs wildcard. """ shell: """ cd {input} for folder in */;do echo -n "Creating GEM subfolder for sample $folder ... " mkdir -p ../{config[folder][GEMs]}/$folder; echo -n "moving GEMs ... " mv ../{config[folder][GEMs]}/$(echo $folder|sed 's|/||')_*.xml ../{config[folder][GEMs]}/$folder; echo "done. 
" done """ rule memote: input: f'{config["path"]["root"]}/{config["folder"]["GEMs"]}/{{IDs}}' output: directory(f'{config["path"]["root"]}/{config["folder"]["memote"]}/{{IDs}}') benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.memote.benchmark.txt' shell: """ set +u;source activate {config[envs][metabagpipes]};set -u mkdir -p $(dirname {output}) mkdir -p {output} cp {input}/*.xml $TMPDIR cd $TMPDIR for model in *.xml;do memote report snapshot --filename $(echo $model|sed 's/.xml/.html/') $model memote run $model > $(echo $model|sed 's/.xml/-summary.txt/') mv *.txt *.html {output} rm $model done """ rule smetana: input: f'{config["path"]["root"]}/{config["folder"]["GEMs"]}/{{IDs}}' output: f'{config["path"]["root"]}/{config["folder"]["SMETANA"]}/{{IDs}}_detailed.tsv' benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.smetana.benchmark.txt' shell: """ set +u;source activate {config[envs][metabagpipes]};set -u mkdir -p {config[path][root]}/{config[folder][SMETANA]} cp {config[path][root]}/{config[folder][scripts]}/{config[scripts][carveme]} {input}/*.xml $TMPDIR cd $TMPDIR smetana -o $(basename {input}) --flavor fbc2 \ --mediadb media_db.tsv -m {config[params][smetanaMedia]} \ --detailed \ --solver {config[params][smetanaSolver]} -v *.xml mv *.tsv $(dirname {output}) """ rule motus2: input: rules.qfilter.output output: directory(f'{config["path"]["root"]}/test/motus2/{{IDs}}') benchmark: f'{config["path"]["root"]}/benchmarks/{{IDs}}.motus2.benchmark.txt' shell: """ set +u;source activate {config[envs][metabagpipes]};set -u cp {input} $TMPDIR cd $TMPDIR motus profile -s $(basename {input}) -o $(basename {input}).motus2 -t 12 mkdir -p {output} rm $(basename {input}) mv * {output} """ rule grid: input: bins = f'{config["path"]["root"]}/{config["folder"]["reassembled"]}/{{IDs}}/reassembled_bins', reads = rules.qfilter.output output: directory(f'{config["path"]["root"]}/{config["folder"]["GRiD"]}/{{IDs}}') benchmark: 
f'{config["path"]["root"]}/benchmarks/{{IDs}}.grid.benchmark.txt' shell: """ set +u;source activate {config[envs][metabagpipes]};set -u cp -r {input.bins} {input.reads} $TMPDIR cd $TMPDIR mkdir MAGdb out update_database -d MAGdb -g $(basename {input.bins}) -p MAGdb rm -r $(basename {input.bins}) grid multiplex -r . -e fastq.gz -d MAGdb -p -c 0.2 -o out -n {config[cores][grid]} rm $(basename {input.reads}) mkdir {output} mv out/* {output} """ ================================================ FILE: workflow/rules/kallisto2concoctTable.smk ================================================ rule kallisto2concoctTable: input: f'{config["path"]["root"]}/{config["folder"]["kallisto"]}/{{focal}}/' output: f'{config["path"]["root"]}/{config["folder"]["concoct"]}/{{focal}}/cov/coverage_table.tsv' message: """ This rule is necessary for the crossMapParallel implementation subworkflow. It summarizes the individual concoct input tables for a given focal sample. Note: silence output if not using parallel mapping approach """ shell: """ # Activate metagem environment set +u;source activate {config[envs][metagem]};set -u; # Create output folder mkdir -p $(dirname {output}) # Compile individual mapping results into coverage table for given assembly python {config[path][root]}/{config[folder][scripts]}/{config[scripts][kallisto2concoct]} \ --samplenames <(for s in {input}/*; do echo $s|sed 's|^.*/||'; done) \ $(find {input} -name "*.gz") > {output} """ ================================================ FILE: workflow/rules/maxbin_single.smk ================================================ rule maxbin_single: input: assembly = rules.megahit.output, R1 = rules.qfilter.output.R1, R2 = rules.qfilter.output.R2 output: directory(f'{config["path"]["root"]}/{config["folder"]["maxbin"]}/{{IDs}}/{{IDs}}.maxbin-bins') benchmark: f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.maxbin.benchmark.txt' shell: """ # Activate metagem environment set +u;source activate 
{config[envs][metagem]};set -u; # Create output folder mkdir -p $(dirname {output}) # Make job specific scratch dir fsampleID=$(echo $(basename $(dirname {input.assembly}))) echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][maxbin]}/${{fsampleID}} ... " mkdir -p {config[path][scratch]}/{config[folder][maxbin]}/${{fsampleID}} # Move into scratch dir cd {config[path][scratch]}/{config[folder][maxbin]}/${{fsampleID}} # Copy files to tmp cp -r {input.assembly} {input.R1} {input.R2} . echo -e "\nUnzipping assembly ... " gunzip $(basename {input.assembly}) echo -e "\nRunning maxbin2 ... " run_MaxBin.pl -contig $(echo $(basename {input.assembly})|sed 's/.gz//') \ -out $(basename $(dirname {output})) \ -reads $(basename {input.R1}) \ -reads2 $(basename {input.R2}) \ -thread {config[cores][maxbin]} rm $(echo $(basename {input.assembly})|sed 's/.gz//') mkdir $(basename {output}) mv *.fasta *.summary *.abundance *.abund1 *.abund2 $(basename {output}) mv $(basename {output}) $(dirname {output}) """ ================================================ FILE: workflow/rules/metabat_single.smk ================================================ rule metabat_single: input: assembly = rules.megahit.output, R1 = rules.qfilter.output.R1, R2 = rules.qfilter.output.R2 output: directory(f'{config["path"]["root"]}/{config["folder"]["metabat"]}/{{IDs}}/{{IDs}}.metabat-bins') benchmark: f'{config["path"]["root"]}/{config["folder"]["benchmarks"]}/{{IDs}}.metabat.benchmark.txt' message: """ Implementation of metabat2 where only coverage information from the focal sample is used for binning. Use with the crossMapParallel subworkflow, where cross sample coverage information is only used by CONCOCT. 
""" shell: """ # Activate metagem environment set +u;source activate {config[envs][metagem]};set -u; # Make job specific scratch dir fsampleID=$(echo $(basename $(dirname {input.assembly}))) echo -e "\nCreating temporary directory {config[path][scratch]}/{config[folder][metabat]}/${{fsampleID}} ... " mkdir -p {config[path][scratch]}/{config[folder][metabat]}/${{fsampleID}} # Move into scratch dir cd {config[path][scratch]}/{config[folder][metabat]}/${{fsampleID}} # Copy files cp {input.assembly} {input.R1} {input.R2} . echo -e "\nFocal sample: $fsampleID ... " echo "Renaming and unzipping assembly ... " mv $(basename {input.assembly}) $(echo $fsampleID|sed 's/$/.fa.gz/g') gunzip $(echo $fsampleID|sed 's/$/.fa.gz/g') echo -e "\nIndexing assembly ... " bwa index $fsampleID.fa id=$(basename {output}) echo -e "\nMapping reads from sample against assembly $fsampleID ..." bwa mem -t {config[cores][metabat]} $fsampleID.fa *.fastq.gz > $id.sam echo -e "\nDeleting no-longer-needed fastq files ... " rm *.gz echo -e "\nConverting SAM to BAM with samtools view ... " samtools view -@ {config[cores][metabat]} -Sb $id.sam > $id.bam echo -e "\nDeleting no-longer-needed sam file ... " rm $id.sam echo -e "\nSorting BAM file with samtools sort ... " samtools sort -@ {config[cores][metabat]} -o $id.sort $id.bam echo -e "\nDeleting no-longer-needed bam file ... " rm $id.bam # Run metabat2 echo -e "\nRunning metabat2 ... 
" jgi_summarize_bam_contig_depths --outputDepth $id.depth.txt $id.sort metabat2 -i $fsampleID.fa -a $id.depth.txt -s \ {config[params][metabatMin]} \ -v --seed {config[params][seed]} \ -t 0 -m {config[params][minBin]} \ -o $(basename $(dirname {output})) rm $fsampleID.fa rm $id.depth.txt # Move files to output dir mv *.fa {output} """ ================================================ FILE: workflow/scripts/assemblyVis.R ================================================ library(ggplot2) library(gridExtra) assembly = read.delim("assembly.stats",stringsAsFactors = FALSE,header = FALSE,sep = " ") colnames(assembly) = c("sample","contigs","length_total") assembly$ave = assembly$length_total/assembly$contigs aveplot = ggplot(data=assembly) + geom_density(aes(x=ave,fill="All contigs")) + xlab("Average contig length") + ggtitle("Average contig length across samples") + theme(legend.title = element_blank())+ theme(legend.position = "none") contplot = ggplot(data=assembly) + geom_density(aes(x=contigs,fill="Total contigs")) + ggtitle("Contigs across samples") + theme(legend.title = element_blank()) + scale_x_log10() + theme(legend.position = "none") scatplot = ggplot(data=assembly) + geom_point(aes(x=length_total,y=contigs)) + ggtitle("Total length vs number of contigs")+ xlab("Total length") + ylab("Number of contigs") + expand_limits(x = 0, y = 0) barplot = ggplot(data=assembly) + geom_bar(aes(x=reorder(sample,-length_total),y=length_total,fill="Contigs"),stat = "identity",color="black",size=0.2) + ggtitle("Total length across assemblies") + ylab("Length (bp)") + xlab("Sample ID") + #scale_y_log10() + coord_flip() + theme(legend.position = "none") assemblyplot=grid.arrange(barplot,arrangeGrob(scatplot,aveplot,contplot,nrow=3),ncol=2,nrow=1) ggsave("assemblyVis.pdf",plot= assemblyplot,device = "pdf",dpi = 300, width = 30, height = 40, units = "cm") ================================================ FILE: workflow/scripts/assemblyVis_alternative.R 
================================================ library(ggplot2) library(gridExtra) assembly = read.delim("500assembly.stats",stringsAsFactors = FALSE,header = FALSE,sep = " ") colnames(assembly) = c("sample","contigs","length_total","gt1000","gt1000_total") assembly$ave = assembly$length_total/assembly$contigs assembly$gt1000_ave = assembly$gt1000_total/assembly$gt1000 aveplot = ggplot(data=assembly) + geom_density(aes(x=ave,fill="All contigs")) + geom_density(aes(x=gt1000_ave,fill="Contigs >= 1000bp")) + xlab("Average contig length") + ggtitle("Average contig length across samples") + theme(legend.title = element_blank()) contplot = ggplot(data=assembly) + geom_density(aes(x=contigs,fill="Total contigs")) + geom_density(aes(x=gt1000,fill="Contigs >= 1000")) + ggtitle("Contigs across samples") + theme(legend.title = element_blank()) + scale_x_log10() contplot2 = ggplot(data=assembly) + geom_point(aes(x=contigs,y=gt1000)) + xlab("Total contigs") + ylab("Contigs >= 1000bp") + ggtitle("Total contigs vs >=1000bp across samples") + geom_abline(slope = 1,intercept=0) + expand_limits(x = 0, y = 0) lenplot = ggplot(data=assembly) + geom_density(aes(x=length_total,fill="Total length")) + geom_density(aes(x=gt1000_total,fill="Length >= 1000")) + xlab("Length") + ggtitle("Length across samples") + theme(legend.title = element_blank()) + scale_x_log10() lenplot2= ggplot(data=assembly) + geom_point(aes(x=length_total,y=gt1000_total)) + ggtitle("Total length vs >= 1000bp across samples")+ xlab("Total length") + ylab("Length of contigs >= 1000 bp") + geom_abline(slope = 1,intercept=0) + expand_limits(x = 0, y = 0) fracplot = ggplot(data=assembly) + geom_density(aes(100*gt1000/contigs,fill="Contigs")) + geom_density(aes(100*gt1000_total/length_total,fill="Length")) + ggtitle("% Information captured by contigs >= 1000 bp") + xlab("% Information") + theme(legend.title = element_blank()) assemblyplot=grid.arrange(fracplot,contplot2,contplot,aveplot,lenplot2,lenplot,ncol=3,nrow=2) 
ggsave("assemblyVis_old.pdf",plot= assemblyplot,device = "pdf",dpi = 300, width = 40, height = 20, units = "cm") ================================================ FILE: workflow/scripts/binFilter.py ================================================ #!/usr/bin/env python """ Based on the checkm results, approves bins according to the leves of contamination and completeness. Copies approved bins to output directory. @author: alneberg """ from __future__ import print_function import sys import os import argparse import pandas as pd from shutil import copyfile def main(args): # Read in the checkm table df = pd.read_table(args.checkm_stats, index_col=0) # extract the ids for all rows that meet the requirements filtered_df = df[(df['Completeness'] >= args.min_completeness) & (df['Contamination'] <= args.max_contamination)] approved_bins = list(filtered_df.index) # copy the approved bins to the new output directory for approved_bin_int in approved_bins: approved_bin = str(approved_bin_int) bin_source = os.path.join(args.bin_directory, approved_bin) bin_source += '.' 
+ args.extension bin_destination = os.path.join(args.output_directory) bin_destination += '/' + os.path.basename(bin_source) sys.stderr.write("Copying approved bin {} from {} to {}\n".format(approved_bin, bin_source, bin_destination)) copyfile(bin_source, bin_destination) sys.stderr.write("\nApproved {} bins\n\n".format(len(approved_bins))) if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("bin_directory", help=("Input fasta files should be within directory.")) parser.add_argument("checkm_stats", help="Checkm qa stats in tab_table format") parser.add_argument("output_directory", help="Directory where to put approved bins") parser.add_argument("--min_completeness", default=85, type=float, help="default=85") parser.add_argument("--max_contamination", default=5, type=float, help="default=5") parser.add_argument("--extension", default='fa') args = parser.parse_args() main(args) ================================================ FILE: workflow/scripts/binningVis.R ================================================ library(gridExtra) library(dplyr) library(ggplot2) concoctCheckm = read.delim("concoct.checkm",stringsAsFactors = FALSE,header = FALSE) colnames(concoctCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size","set") concoctBins= read.delim("concoct_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ") colnames(concoctBins) = c("bin","contigs","length") concoct = left_join(concoctCheckm,concoctBins%>%select(-length),by="bin") %>% filter(contamination<=10,completeness>=50) %>% distinct() %>% select(-set) concoct$sample = gsub("\\..*$","",concoct$bin) concoct$binner = "CONCOCT" metabatCheckm = read.delim("metabat.checkm",stringsAsFactors = FALSE,header = FALSE) colnames(metabatCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size","set") metabatBins= read.delim("metabat_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ") colnames(metabatBins) = 
c("bin","contigs","length") metabat = left_join(metabatCheckm,metabatBins%>%select(-length),by="bin") %>% filter(contamination<=10,completeness>=50)%>% distinct() %>% select(-set) metabat$sample = gsub("\\..*$","",metabat$bin) metabat$binner = "MetaBAT2" maxbinCheckm = read.delim("maxbin.checkm",stringsAsFactors = FALSE,header = FALSE) colnames(maxbinCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size","set") maxbinBins= read.delim("maxbin_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ") colnames(maxbinBins) = c("bin","contigs","length") maxbin = left_join(maxbinCheckm,maxbinBins%>%select(-length),by="bin") %>% filter(contamination<=10,completeness>=50)%>% distinct() %>% select(-set) maxbin$contigs = as.numeric(maxbin$contigs) maxbin$sample = gsub("\\..*$","",maxbin$bin) maxbin$binner = "MaxBin2" refinedCheckm = read.delim("refined.checkm",stringsAsFactors = FALSE,header = FALSE) colnames(refinedCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size","set") refinedBins= read.delim("refined_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ") colnames(refinedBins) = c("bin","contigs","length") refined = left_join(refinedCheckm,refinedBins%>%select(-length),by="bin") %>% filter(contamination<=10,completeness>=50)%>% distinct() %>% select(-set) refined$sample = gsub("\\..*$","",refined$bin) refined$binner = "metaWRAP_refined" reassembledCheckm = read.delim("reassembled.checkm",stringsAsFactors = FALSE,header = FALSE) colnames(reassembledCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size") reassembledBins= read.delim("reassembled_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ") colnames(reassembledBins) = c("bin","contigs","length") reassembled = left_join(reassembledCheckm,reassembledBins%>%select(-length),by="bin") %>% filter(contamination<=10,completeness>=50)%>% distinct() reassembled$sample = gsub("\\..*$","",reassembled$bin) reassembled$binner = 
"metaWRAP_reassembled" #bins <- as.data.frame(matrix(0,nrow = 5,ncol=2)) #colnames(bins) = c("variable","value") #bins$variable = c("maxbin2","refined","CONCOCT","metabat2","reassembled") #bins$value = c(as.numeric(dim(maxbin)[1]),as.numeric(dim(refined)[1]),as.numeric(dim(concoct)[1]),as.numeric(dim(metabat)[1]),as.numeric(dim(reassembled)[1])) rbind(concoct,metabat,maxbin,refined,reassembled) %>% group_by(binner,sample) %>% summarize(count=n()) -> bins binplot = ggplot(data = bins,aes(x=reorder(binner,-count),y=count,fill= binner)) + geom_bar(stat = "identity",color="black") + ylab("Generated bins") + xlab("Binning tool") + theme(legend.title = element_blank()) + ggtitle("Number of MQ bins") + coord_flip() + theme(legend.position = "none")+ facet_wrap(~sample,ncol=1) compplot = ggplot() + geom_density(data=concoct,aes(completeness,color="CONCOCT")) + geom_density(data=maxbin,aes(completeness,color="maxbin2")) + geom_density(data=metabat,aes(completeness,color="metabat2")) + geom_density(data=refined,aes(completeness,color="refined")) + geom_density(data=reassembled,aes(completeness,color="reassembled")) + ggtitle("Completeness") + theme(axis.text.y=element_blank()) + theme(legend.position = "none") contplot = ggplot() + geom_density(data=concoct,aes(contamination,color="CONCOCT")) + geom_density(data=maxbin,aes(contamination,color="maxbin2")) + geom_density(data=metabat,aes(contamination,color="metabat2")) + geom_density(data=refined,aes(contamination,color="refined")) + geom_density(data=reassembled,aes(contamination,color="reassembled")) + ggtitle("Contamination") + theme(axis.text.y=element_blank()) + theme(legend.position = "none") lengthplot = ggplot() + geom_density(data=concoct,aes(size,color="CONCOCT")) + geom_density(data=maxbin,aes(size,color="maxbin2")) + geom_density(data=metabat,aes(size,color="metabat2")) + geom_density(data=refined,aes(size,color="refined")) + geom_density(data=reassembled,aes(size,color="reassembled")) + ggtitle("BP Length") + 
theme(legend.position = "none") + theme(axis.text.y=element_blank()) contigplot = ggplot() + geom_density(data=concoct,aes(contigs,color="CONCOCT")) + geom_density(data=maxbin,aes(contigs,color="maxbin2")) + geom_density(data=metabat,aes(contigs,color="metabat2")) + geom_density(data=refined,aes(contigs,color="refined")) + geom_density(data=reassembled,aes(contigs,color="reassembled")) + ggtitle("Number of contigs") + theme(legend.position = "none") + theme(axis.text.y=element_blank()) densities1= grid.arrange(compplot,lengthplot,nrow=2,ncol=1) densities2=grid.arrange(contplot,contigplot,nrow=2,ncol=1) plot=grid.arrange(binplot,densities1,densities2,nrow=1,ncol=3) ggsave("binningVis.pdf",plot=plot, height = 8, width = 12) ================================================ FILE: workflow/scripts/binningVis_perSample.R ================================================ library(gridExtra) library(dplyr) library(ggplot2) concoctCheckm = read.delim("concoct.checkm",stringsAsFactors = FALSE,header = FALSE) colnames(concoctCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size","set") concoctBins= read.delim("concoct_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ") colnames(concoctBins) = c("bin","contigs","length") concoct = left_join(concoctCheckm,concoctBins%>%select(-length),by="bin") %>% filter(contamination<=10,completeness>=50) %>% distinct() %>% select(-set) concoct$sample = gsub("\\..*$","",concoct$bin) concoct$binner = "CONCOCT" metabatCheckm = read.delim("metabat.checkm",stringsAsFactors = FALSE,header = FALSE) colnames(metabatCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size","set") metabatBins= read.delim("metabat_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ") colnames(metabatBins) = c("bin","contigs","length") metabat = left_join(metabatCheckm,metabatBins%>%select(-length),by="bin") %>% filter(contamination<=10,completeness>=50)%>% distinct() %>% select(-set) metabat$sample = 
gsub("\\..*$","",metabat$bin) metabat$binner = "MetaBAT2" maxbinCheckm = read.delim("maxbin.checkm",stringsAsFactors = FALSE,header = FALSE) colnames(maxbinCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size","set") maxbinBins= read.delim("maxbin_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ") colnames(maxbinBins) = c("bin","contigs","length") maxbin = left_join(maxbinCheckm,maxbinBins%>%select(-length),by="bin") %>% filter(contamination<=10,completeness>=50)%>% distinct() %>% select(-set) maxbin$contigs = as.numeric(maxbin$contigs) maxbin$sample = gsub("\\..*$","",maxbin$bin) maxbin$binner = "MaxBin2" refinedCheckm = read.delim("refined.checkm",stringsAsFactors = FALSE,header = FALSE) colnames(refinedCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size","set") refinedBins= read.delim("refined_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ") colnames(refinedBins) = c("bin","contigs","length") refined = left_join(refinedCheckm,refinedBins%>%select(-length),by="bin") %>% filter(contamination<=10,completeness>=50)%>% distinct() %>% select(-set) refined$sample = gsub("\\..*$","",refined$bin) refined$binner = "metaWRAP_refined" reassembledCheckm = read.delim("reassembled.checkm",stringsAsFactors = FALSE,header = FALSE) colnames(reassembledCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size") reassembledBins= read.delim("reassembled_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ") colnames(reassembledBins) = c("bin","contigs","length") reassembled = left_join(reassembledCheckm,reassembledBins%>%select(-length),by="bin") %>% filter(contamination<=10,completeness>=50)%>% distinct() reassembled$sample = gsub("\\..*$","",reassembled$bin) reassembled$binner = "metaWRAP_reassembled" #bins <- as.data.frame(matrix(0,nrow = 5,ncol=2)) #colnames(bins) = c("variable","value") #bins$variable = c("maxbin2","refined","CONCOCT","metabat2","reassembled") #bins$value = 
c(as.numeric(dim(maxbin)[1]),as.numeric(dim(refined)[1]),as.numeric(dim(concoct)[1]),as.numeric(dim(metabat)[1]),as.numeric(dim(reassembled)[1])) rbind(concoct,metabat,maxbin,refined,reassembled) %>% group_by(binner,sample) %>% summarize(count=n()) -> bins binplot = ggplot(data = bins,aes(x=reorder(binner,-count),y=count,fill= binner)) + geom_bar(stat = "identity",color="black") + ylab("Generated bins") + xlab("Binning tool") + theme(legend.title = element_blank()) + ggtitle("Number of MQ bins") + coord_flip() + theme(legend.position = "none")+ facet_wrap(~sample,ncol=1) compplot = ggplot() + geom_density(data=concoct,aes(completeness,color="CONCOCT")) + geom_density(data=maxbin,aes(completeness,color="maxbin2")) + geom_density(data=metabat,aes(completeness,color="metabat2")) + geom_density(data=refined,aes(completeness,color="refined")) + geom_density(data=reassembled,aes(completeness,color="reassembled")) + ggtitle("Completeness") + theme(axis.text.y=element_blank()) + theme(legend.position = "none") contplot = ggplot() + geom_density(data=concoct,aes(contamination,color="CONCOCT")) + geom_density(data=maxbin,aes(contamination,color="maxbin2")) + geom_density(data=metabat,aes(contamination,color="metabat2")) + geom_density(data=refined,aes(contamination,color="refined")) + geom_density(data=reassembled,aes(contamination,color="reassembled")) + ggtitle("Contamination") + theme(axis.text.y=element_blank()) + theme(legend.position = "none") lengthplot = ggplot() + geom_density(data=concoct,aes(size,color="CONCOCT")) + geom_density(data=maxbin,aes(size,color="maxbin2")) + geom_density(data=metabat,aes(size,color="metabat2")) + geom_density(data=refined,aes(size,color="refined")) + geom_density(data=reassembled,aes(size,color="reassembled")) + ggtitle("BP Length") + theme(legend.position = "none") + theme(axis.text.y=element_blank()) contigplot = ggplot() + geom_density(data=concoct,aes(contigs,color="CONCOCT")) + geom_density(data=maxbin,aes(contigs,color="maxbin2")) 
+ geom_density(data=metabat,aes(contigs,color="metabat2")) + geom_density(data=refined,aes(contigs,color="refined")) + geom_density(data=reassembled,aes(contigs,color="reassembled")) + ggtitle("Number of contigs") + theme(legend.position = "none") + theme(axis.text.y=element_blank())

densities1= grid.arrange(compplot,lengthplot,nrow=2,ncol=1)
densities2=grid.arrange(contplot,contigplot,nrow=2,ncol=1)
plot=grid.arrange(binplot,densities1,densities2,nrow=1,ncol=3)
ggsave("binningVis.pdf",plot=plot, height = 6, width = 12)

================================================
FILE: workflow/scripts/compositionVis.R
================================================
library(tidyverse)
library(tidytext)

# Parse GTDB-Tk output: keep genome id + lineage string, then split the
# semicolon-delimited lineage into one column per taxonomic rank.
gtdb_ranks <- c("kingdom", "phylum", "class", "order", "family", "genus", "species")
taxonomy <- read.delim("GTDBTk.stats", header = TRUE) %>%
  select(user_genome, classification) %>%
  separate(classification, into = gtdb_ranks, sep = ";")

# Abundance table: genome id, absolute abundance, relative abundance.
abundance <- read.delim("abundance.stats", header = FALSE)
colnames(abundance) <- c("user_genome", "absolute_ab", "rel_ab")

# Join taxonomy with abundances; the sample id is the genome-name prefix
# before the first dot.
taxab <- left_join(taxonomy, abundance, by = "user_genome")
taxab$sample <- gsub("\\..*$", "", taxab$user_genome)

# Strip the GTDB "s__" prefix; genomes with an empty species label are
# relabelled "Undefined sp." (and excluded from the plot below).
taxab$species <- gsub("s__$", "Undefined sp.", taxab$species)
taxab$species <- gsub("s__", "", taxab$species)

# One horizontal bar panel per sample, species ordered within each panel
# by decreasing relative abundance (tidytext reorder_within).
taxab %>%
  filter(species != "Undefined sp.") %>%
  ggplot() +
  geom_bar(aes(x = reorder_within(species, -rel_ab, sample), y = rel_ab * 100),
           stat = "identity") +
  scale_x_reordered() +
  facet_wrap(~sample, scales = "free") +
  ylab("Relative abundance (%)") +
  xlab("Species") +
  coord_flip()

ggsave("compositionVis.pdf", width = 12, height = 8)

================================================
FILE: workflow/scripts/compositionVis_old.R
================================================
library(gridExtra)
library(dplyr)
library(ggplot2)
classification = read.delim("classification.stats",stringsAsFactors = FALSE,header = FALSE)
colnames(classification)=c("fasta","NCBI","taxonomy","motu","detect","map","percent","cog") # add descriptive column names based on classify-genomes output
classification$percent=as.numeric(classification$percent) # force percentage to be numeric, will default to chr if any bin in dataset cannot be assigned a taxonomy classification$taxonomy=substr(classification$taxonomy,1,50) # subset taxonomy name, some can get very long, making plot labels obscene classification$percent[is.na(classification$percent)] <- 0 # replace NAs with zero's (occurs when bin taxonomy cannot be assigned due to no marker genes) classification$fasta=gsub(" $","",classification$fasta) # make sure there are not trailing white spaces classification$sample=gsub("\\..*$","",classification$fasta) # add sample info from bin name abundance = read.delim("abundance.stats",stringsAsFactors = FALSE,header=FALSE) colnames(abundance) = c("fasta","ab","abNorm") classification = left_join(classification,abundance,by="fasta") plot = ggplot(classification, aes(x = sample, y = abNorm, fill = taxonomy)) + geom_bar(stat = "identity") + ggtitle("Taxonomic composition of samples based on MAGs") + xlab("Sample ID") + ylab("Normalized relative abundance") + coord_flip() ggsave("abundanceVis.pdf",plot=plot, height = 8, width = 12) ================================================ FILE: workflow/scripts/download_toydata.txt ================================================ https://zenodo.org/record/3534949/files/sample1_1.fastq.gz?download=1 https://zenodo.org/record/3534949/files/sample1_2.fastq.gz?download=1 https://zenodo.org/record/3534949/files/sample2_1.fastq.gz?download=1 https://zenodo.org/record/3534949/files/sample2_2.fastq.gz?download=1 https://zenodo.org/record/3534949/files/sample3_1.fastq.gz?download=1 https://zenodo.org/record/3534949/files/sample3_2.fastq.gz?download=1 ================================================ FILE: workflow/scripts/env_setup.sh ================================================ #!/bin/bash echo ' ================================================================================================================================= Developed 
by: Francisco Zorrilla, Kiran R. Patil, and Aleksej Zelezniak___________________________________________________________ Publication: doi.org/10.1101/2020.12.31.424982___________________________/\\\\\\\\\\\\___/\\\\\\\\\\\\\\\___/\\\\____________/\\\\_ ________________________________________________________________________/\\\//////////___\/\\\///////////___\/\\\\\\________/\\\\\\_ ____________________________________________/\\\________________________/\\\______________\/\\\______________\/\\\//\\\____/\\\//\\\_ _______/\\\\\__/\\\\\________/\\\\\\\\____/\\\\\\\\\\\___/\\\\\\\\\_____\/\\\____/\\\\\\\__\/\\\\\\\\\\\______\/\\\\///\\\/\\\/_\/\\\_ ______/\\\///\\\\\///\\\____/\\\/////\\\__\////\\\////___\////////\\\____\/\\\___\/////\\\__\/\\\///////_______\/\\\__\///\\\/___\/\\\_ ______\/\\\_\//\\\__\/\\\___/\\\\\\\\\\\______\/\\\_________/\\\\\\\\\\___\/\\\_______\/\\\__\/\\\______________\/\\\____\///_____\/\\\_ _______\/\\\__\/\\\__\/\\\__\//\\///////_______\/\\\_/\\____/\\\/////\\\___\/\\\_______\/\\\__\/\\\______________\/\\\_____________\/\\\_ ________\/\\\__\/\\\__\/\\\___\//\\\\\\\\\\_____\//\\\\\____\//\\\\\\\\/\\__\//\\\\\\\\\\\\/___\/\\\\\\\\\\\\\\\__\/\\\_____________\/\\\_ _________\///___\///___\///_____\//////////_______\/////______\////////\//____\////////////_____\///////////////___\///______________\///__ ============================================================================================================================================= A Snakemake-based pipeline desinged to predict metabolic interactions directly from metagenomics data using high performance computer clusters =============================================================================================================================================== ' #check if conda is installed/available echo -ne "Checking if conda is available ... " if ! command -v conda &> /dev/null ; then echo -e "\nWARNING: Conda is not available! 
Please load your cluster's conda module or install locally and re-run the env_setup.sh script using:\n\nbash env_setup.sh\n" && exit else condav=$(conda --version | cut -d ' ' -f2) echo -e "detected version $condav!" fi #check if mamba or mamba env are available echo -ne "Checking if mamba environment is available ... " repl="etc\/profile\.d\/conda\.sh" source $(which conda | sed -e "s/condabin\/conda/${repl}/" | sed -e "s/bin\/conda/${repl}/") if conda info --envs | grep -q mamba ; then conda activate mamba if command -v mamba &> /dev/null ; then #mamba env installed and activated mambav=$(mamba --version|head -n1|cut -d ' ' -f2) && echo -e "detected version $mambav!\n" else #mamba not installed in mamba env conda install mamba && echo "Installed mamba\n" fi else while true; do read -p "Do you wish to create an environment for mamba installation? This is recommended for faster setup (y/n)" yn case $yn in [Yy]* ) echo "conda create -n mamba mamba -c conda-forge"|bash; break;; [Nn]* ) echo -e "\nPlease set up mamba before proceeding.\n"; exit;; * ) echo "Please answer yes or no.";; esac done fi conda activate mamba && echo "activated mamba environment!" while true; do read -p "Do you wish to download and set up metaGEM conda environment? (y/n)" yn case $yn in [Yy]* ) echo "mamba env create --prefix ./envs/metagem -f envs/metaGEM_env.yml && source activate envs/metagem && pip install --user memote carveme smetana && echo "|bash; break;; [Nn]* ) echo -e "\nSkipping metaGEM env setup, note that you will need this for refinement & reassembly of MAGs.\n"; break;; * ) echo "Please answer yes or no.";; esac done while true; do read -p "Do you wish to download the GTDB-tk database (~25 Gb)? 
(y/n)" yn case $yn in [Yy]* ) echo "download-db.sh && source deactivate && source activate mamba"|bash; break;; [Nn]* ) echo -e "\nSkipping GTDB-tk database download, note that you will need this for taxonomic classification of MAGs.\n"; break;; * ) echo "Please answer yes or no.";; esac done while true; do read -p "Do you wish to download and set up metaWRAP conda environment? (y/n)" yn case $yn in [Yy]* ) echo "mamba env create --prefix ./envs/metawrap -f envs/metaWRAP_env.yml"|bash; break;; [Nn]* ) echo -e "\nSkipping metaWRAP env setup, note that you will need this for refinement & reassembly of MAGs.\n"; break;; * ) echo "Please answer yes or no.";; esac done while true; do read -p "Do you wish to download the CheckM database (~275 Mb)? (y/n)" yn case $yn in [Yy]* ) echo "wget https://data.ace.uq.edu.au/public/CheckM_databases/checkm_data_2015_01_16.tar.gz"|bash; break;; [Nn]* ) echo -e "\nSkipping CheckM database download, note that you will need this for bin refinement & reassembly.\n"; break;; * ) echo "Please answer yes or no.";; esac done while true; do read -p "Do you wish to download and set up prokka + roary conda environment? (y/n)" yn case $yn in [Yy]* ) echo "mamba env create --prefix ./envs/prokkaroary -f envs/prokkaroary_env.yml"|bash; break;; [Nn]* ) echo -e "\nSkipping prokka-roary env setup, note that you will need this for pangenome analysis of MAGs.\n"; break;; * ) echo "Please answer yes or no.";; esac done echo 'Please ensure that the installation directory is present in your $PATH variable if installation issues arise with any tools.' echo "" ================================================ FILE: workflow/scripts/kallisto2concoct.py ================================================ #!/usr/bin/env python """A script to create a concoct input table from kallisto abundance.txt output files. 
https://github.com/EnvGen/toolbox/blob/master/scripts/kallisto_concoct/input_table.py""" import argparse import pandas as pd import os import sys def samplenames_from_file(name_file): if name_file: with open(name_file, 'r') as name_file_h: return [l.strip() for l in name_file_h] else: return None def main(args): sample_dfs = [] samplenames = samplenames_from_file(args.samplenames) for i, sample in enumerate(args.quantfiles): if samplenames: samplename = samplenames[i] else: samplename = os.path.basename(sample) sample_df = pd.read_table(sample, index_col=0) sample_dfs.append((samplename, sample_df)) kallisto_df = pd.DataFrame(index=sample_df.index) for sample, sample_df in sample_dfs: kallisto_df['kallisto_coverage_{0}'.format(sample)] = 200*sample_df['est_counts'].divide(sample_df['length']) kallisto_df.to_csv(sys.stdout, sep="\t", float_format="%.6f") if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("quantfiles", nargs='+', help="Kallisto abundance.txt files") parser.add_argument("--samplenames", default=None, help="File with sample names, one line each, Should be the same order and the same number as the abundance.txt files") args = parser.parse_args() main(args) ================================================ FILE: workflow/scripts/media_db.tsv ================================================ medium description compound name M1 M1 3mb 3mb M1 M1 4abz 4abz M1 M1 ac ac M1 M1 btn btn M1 M1 but but M1 M1 ca2 ca2 M1 M1 cbl1 cbl1 M1 M1 cbl2 cbl2 M1 M1 cellb cellb M1 M1 cl cl M1 M1 cobalt2 cobalt2 M1 M1 cu2 cu2 M1 M1 cys__L cys__L M1 M1 fe2 fe2 M1 M1 fe3 fe3 M1 M1 fol fol M1 M1 fru fru M1 M1 glc__D glc__D M1 M1 h h M1 M1 h2o h2o M1 M1 hco3 hco3 M1 M1 his__L his__L M1 M1 k k M1 M1 lipoate lipoate M1 M1 malt malt M1 M1 mg2 mg2 M1 M1 mn2 mn2 M1 M1 mndn mndn M1 M1 mobd mobd M1 M1 na1 na1 M1 M1 nac nac M1 M1 ni2 ni2 M1 M1 no3 no3 M1 M1 pheme pheme M1 M1 pi pi M1 M1 pnto__R pnto__R M1 M1 ppa ppa M1 M1 pydxn pydxn M1 M1 
ribflv ribflv M1 M1 slnt slnt M1 M1 so4 so4 M1 M1 thm thm M1 M1 tungs tungs M1 M1 zn2 zn2 M2 M2 4abz 4abz M2 M2 ac ac M2 M2 ade ade M2 M2 ala__L ala__L M2 M2 arg__L arg__L M2 M2 ascb__L ascb__L M2 M2 asn__L asn__L M2 M2 asp__L asp__L M2 M2 btn btn M2 M2 ca2 ca2 M2 M2 cit cit M2 M2 cl cl M2 M2 cobalt2 cobalt2 M2 M2 cu2 cu2 M2 M2 cys__L cys__L M2 M2 fe2 fe2 M2 M2 fe3 fe3 M2 M2 fol fol M2 M2 glc__D glc__D M2 M2 gln__L gln__L M2 M2 glu__L glu__L M2 M2 gly gly M2 M2 gthrd gthrd M2 M2 gua gua M2 M2 h h M2 M2 h2o h2o M2 M2 his__L his__L M2 M2 ile__L ile__L M2 M2 inost inost M2 M2 k k M2 M2 leu__L leu__L M2 M2 lipoate lipoate M2 M2 lys__L lys__L M2 M2 met__L met__L M2 M2 mg2 mg2 M2 M2 mn2 mn2 M2 M2 mobd mobd M2 M2 na1 na1 M2 M2 nac nac M2 M2 nh4 nh4 M2 M2 ni2 ni2 M2 M2 phe__L phe__L M2 M2 pi pi M2 M2 pnto__R pnto__R M2 M2 pro__L pro__L M2 M2 pydam pydam M2 M2 pydxn pydxn M2 M2 ribflv ribflv M2 M2 ser__L ser__L M2 M2 so4 so4 M2 M2 thm thm M2 M2 thr__L thr__L M2 M2 trp__L trp__L M2 M2 tyr__L tyr__L M2 M2 ura ura M2 M2 val__L val__L M2 M2 xan xan M2 M2 zn2 zn2 M3 M3 3mb 3mb M3 M3 4abz 4abz M3 M3 ac ac M3 M3 ade ade M3 M3 ala__L ala__L M3 M3 arg__L arg__L M3 M3 ascb__L ascb__L M3 M3 asn__L asn__L M3 M3 asp__L asp__L M3 M3 btn btn M3 M3 but but M3 M3 ca2 ca2 M3 M3 cbl1 cbl1 M3 M3 cbl2 cbl2 M3 M3 cellb cellb M3 M3 cit cit M3 M3 cl cl M3 M3 cobalt2 cobalt2 M3 M3 cu2 cu2 M3 M3 cys__L cys__L M3 M3 fe2 fe2 M3 M3 fe3 fe3 M3 M3 fol fol M3 M3 fru fru M3 M3 glc__D glc__D M3 M3 gln__L gln__L M3 M3 glu__L glu__L M3 M3 gly gly M3 M3 gthrd gthrd M3 M3 gua gua M3 M3 h h M3 M3 h2o h2o M3 M3 hco3 hco3 M3 M3 his__L his__L M3 M3 ile__L ile__L M3 M3 inost inost M3 M3 k k M3 M3 lcts lcts M3 M3 leu__L leu__L M3 M3 lipoate lipoate M3 M3 lys__L lys__L M3 M3 malt malt M3 M3 met__L met__L M3 M3 mg2 mg2 M3 M3 mn2 mn2 M3 M3 mndn mndn M3 M3 mobd mobd M3 M3 na1 na1 M3 M3 nac nac M3 M3 nad nad M3 M3 nh4 nh4 M3 M3 ni2 ni2 M3 M3 no3 no3 M3 M3 phe__L phe__L M3 M3 pheme pheme M3 M3 pi pi M3 M3 pnto__R pnto__R 
M3 M3 ppa ppa M3 M3 pro__L pro__L M3 M3 pydam pydam M3 M3 pydxn pydxn M3 M3 ribflv ribflv M3 M3 ser__L ser__L M3 M3 slnt slnt M3 M3 so4 so4 M3 M3 thm thm M3 M3 thr__L thr__L M3 M3 trp__L trp__L M3 M3 tungs tungs M3 M3 tyr__L tyr__L M3 M3 ura ura M3 M3 val__L val__L M3 M3 xan xan M3 M3 zn2 zn2 M4 M4 3mb 3mb M4 M4 4abz 4abz M4 M4 ac ac M4 M4 ade ade M4 M4 ala__L ala__L M4 M4 arg__L arg__L M4 M4 ascb__L ascb__L M4 M4 asn__L asn__L M4 M4 asp__L asp__L M4 M4 btn btn M4 M4 but but M4 M4 ca2 ca2 M4 M4 cbl1 cbl1 M4 M4 cbl2 cbl2 M4 M4 cellb cellb M4 M4 cit cit M4 M4 cl cl M4 M4 cobalt2 cobalt2 M4 M4 cu2 cu2 M4 M4 cys__L cys__L M4 M4 fe2 fe2 M4 M4 fe3 fe3 M4 M4 fol fol M4 M4 fru fru M4 M4 glc__D glc__D M4 M4 gln__L gln__L M4 M4 glu__L glu__L M4 M4 gly gly M4 M4 gthrd gthrd M4 M4 gua gua M4 M4 h h M4 M4 h2o h2o M4 M4 hco3 hco3 M4 M4 his__L his__L M4 M4 ile__L ile__L M4 M4 inost inost M4 M4 k k M4 M4 lcts lcts M4 M4 leu__L leu__L M4 M4 lipoate lipoate M4 M4 lys__L lys__L M4 M4 malt malt M4 M4 met__L met__L M4 M4 mg2 mg2 M4 M4 mn2 mn2 M4 M4 mndn mndn M4 M4 mobd mobd M4 M4 na1 na1 M4 M4 nac nac M4 M4 nad nad M4 M4 nh4 nh4 M4 M4 ni2 ni2 M4 M4 no3 no3 M4 M4 phe__L phe__L M4 M4 pheme pheme M4 M4 pi pi M4 M4 pnto__R pnto__R M4 M4 ppa ppa M4 M4 pro__L pro__L M4 M4 pydam pydam M4 M4 pydxn pydxn M4 M4 ribflv ribflv M4 M4 ser__L ser__L M4 M4 slnt slnt M4 M4 so4 so4 M4 M4 thm thm M4 M4 thr__L thr__L M4 M4 trp__L trp__L M4 M4 tungs tungs M4 M4 tyr__L tyr__L M4 M4 ura ura M4 M4 val__L val__L M4 M4 xan xan M4 M4 zn2 zn2 M5 M5 4abz 4abz M5 M5 ac ac M5 M5 ade ade M5 M5 ala__L ala__L M5 M5 arg__L arg__L M5 M5 ascb__L ascb__L M5 M5 asn__L asn__L M5 M5 asp__L asp__L M5 M5 btn btn M5 M5 ca2 ca2 M5 M5 cbl1 cbl1 M5 M5 cbl2 cbl2 M5 M5 cellb cellb M5 M5 cit cit M5 M5 cl cl M5 M5 cobalt2 cobalt2 M5 M5 cu2 cu2 M5 M5 cys__L cys__L M5 M5 fe2 fe2 M5 M5 fe3 fe3 M5 M5 fol fol M5 M5 fru fru M5 M5 glc__D glc__D M5 M5 gln__L gln__L M5 M5 glu__L glu__L M5 M5 gly gly M5 M5 gthrd gthrd M5 M5 gua gua M5 M5 h h M5 
M5 h2o h2o M5 M5 hco3 hco3 M5 M5 his__L his__L M5 M5 ile__L ile__L M5 M5 inost inost M5 M5 k k M5 M5 lcts lcts M5 M5 leu__L leu__L M5 M5 lipoate lipoate M5 M5 lys__L lys__L M5 M5 malt malt M5 M5 met__L met__L M5 M5 mg2 mg2 M5 M5 mn2 mn2 M5 M5 mndn mndn M5 M5 mobd mobd M5 M5 na1 na1 M5 M5 nac nac M5 M5 nad nad M5 M5 nh4 nh4 M5 M5 ni2 ni2 M5 M5 no3 no3 M5 M5 phe__L phe__L M5 M5 pheme pheme M5 M5 pi pi M5 M5 pnto__R pnto__R M5 M5 pro__L pro__L M5 M5 pydam pydam M5 M5 pydxn pydxn M5 M5 ribflv ribflv M5 M5 ser__L ser__L M5 M5 slnt slnt M5 M5 so4 so4 M5 M5 thm thm M5 M5 thr__L thr__L M5 M5 trp__L trp__L M5 M5 tungs tungs M5 M5 tyr__L tyr__L M5 M5 ura ura M5 M5 val__L val__L M5 M5 xan xan M5 M5 zn2 zn2 M7 M7 3mb 3mb M7 M7 4abz 4abz M7 M7 ac ac M7 M7 ade ade M7 M7 ala__L ala__L M7 M7 arg__L arg__L M7 M7 ascb__L ascb__L M7 M7 asn__L asn__L M7 M7 asp__L asp__L M7 M7 btn btn M7 M7 but but M7 M7 ca2 ca2 M7 M7 cbl1 cbl1 M7 M7 cbl2 cbl2 M7 M7 cit cit M7 M7 cl cl M7 M7 cobalt2 cobalt2 M7 M7 cu2 cu2 M7 M7 cys__L cys__L M7 M7 fe2 fe2 M7 M7 fe3 fe3 M7 M7 fol fol M7 M7 fru fru M7 M7 glc__D glc__D M7 M7 gln__L gln__L M7 M7 glu__L glu__L M7 M7 gly gly M7 M7 gthrd gthrd M7 M7 gua gua M7 M7 h h M7 M7 h2o h2o M7 M7 hco3 hco3 M7 M7 his__L his__L M7 M7 ile__L ile__L M7 M7 inost inost M7 M7 k k M7 M7 leu__L leu__L M7 M7 lipoate lipoate M7 M7 lys__L lys__L M7 M7 met__L met__L M7 M7 mg2 mg2 M7 M7 mn2 mn2 M7 M7 mndn mndn M7 M7 mobd mobd M7 M7 na1 na1 M7 M7 nac nac M7 M7 nad nad M7 M7 nh4 nh4 M7 M7 ni2 ni2 M7 M7 no3 no3 M7 M7 phe__L phe__L M7 M7 pheme pheme M7 M7 pi pi M7 M7 pnto__R pnto__R M7 M7 ppa ppa M7 M7 pro__L pro__L M7 M7 pydam pydam M7 M7 pydxn pydxn M7 M7 ribflv ribflv M7 M7 ser__L ser__L M7 M7 slnt slnt M7 M7 so4 so4 M7 M7 thm thm M7 M7 thr__L thr__L M7 M7 trp__L trp__L M7 M7 tungs tungs M7 M7 tyr__L tyr__L M7 M7 ura ura M7 M7 val__L val__L M7 M7 xan xan M7 M7 zn2 zn2 M8 M8 3mb 3mb M8 M8 4abz 4abz M8 M8 ac ac M8 M8 ade ade M8 M8 ala__L ala__L M8 M8 arg__L arg__L M8 M8 ascb__L ascb__L 
M8 M8 asn__L asn__L M8 M8 asp__L asp__L M8 M8 btn btn M8 M8 but but M8 M8 ca2 ca2 M8 M8 cbl1 cbl1 M8 M8 cbl2 cbl2 M8 M8 cellb cellb M8 M8 cit cit M8 M8 cl cl M8 M8 cobalt2 cobalt2 M8 M8 cu2 cu2 M8 M8 cys__L cys__L M8 M8 fe2 fe2 M8 M8 fe3 fe3 M8 M8 fol fol M8 M8 fru fru M8 M8 glc__D glc__D M8 M8 gln__L gln__L M8 M8 glu__L glu__L M8 M8 gly gly M8 M8 gthrd gthrd M8 M8 gua gua M8 M8 h h M8 M8 h2o h2o M8 M8 hco3 hco3 M8 M8 his__L his__L M8 M8 ile__L ile__L M8 M8 inost inost M8 M8 k k M8 M8 lcts lcts M8 M8 leu__L leu__L M8 M8 lipoate lipoate M8 M8 lys__L lys__L M8 M8 malt malt M8 M8 met__L met__L M8 M8 mg2 mg2 M8 M8 mn2 mn2 M8 M8 mndn mndn M8 M8 mobd mobd M8 M8 na1 na1 M8 M8 nac nac M8 M8 nad nad M8 M8 nh4 nh4 M8 M8 ni2 ni2 M8 M8 no3 no3 M8 M8 phe__L phe__L M8 M8 pheme pheme M8 M8 pi pi M8 M8 pnto__R pnto__R M8 M8 ppa ppa M8 M8 pro__L pro__L M8 M8 pydam pydam M8 M8 pydxn pydxn M8 M8 ribflv ribflv M8 M8 ser__L ser__L M8 M8 slnt slnt M8 M8 so4 so4 M8 M8 thm thm M8 M8 thr__L thr__L M8 M8 trp__L trp__L M8 M8 tungs tungs M8 M8 tyr__L tyr__L M8 M8 ura ura M8 M8 val__L val__L M8 M8 xan xan M8 M8 zn2 zn2 M9 M9 3mb 3mb M9 M9 4abz 4abz M9 M9 ac ac M9 M9 ade ade M9 M9 ala__L ala__L M9 M9 arg__L arg__L M9 M9 ascb__L ascb__L M9 M9 asn__L asn__L M9 M9 asp__L asp__L M9 M9 btn btn M9 M9 but but M9 M9 ca2 ca2 M9 M9 cbl1 cbl1 M9 M9 cbl2 cbl2 M9 M9 cit cit M9 M9 cl cl M9 M9 cobalt2 cobalt2 M9 M9 cu2 cu2 M9 M9 cys__L cys__L M9 M9 fe2 fe2 M9 M9 fe3 fe3 M9 M9 fol fol M9 M9 gln__L gln__L M9 M9 glu__L glu__L M9 M9 gly gly M9 M9 gthrd gthrd M9 M9 gua gua M9 M9 h h M9 M9 h2o h2o M9 M9 hco3 hco3 M9 M9 his__L his__L M9 M9 ile__L ile__L M9 M9 inost inost M9 M9 k k M9 M9 leu__L leu__L M9 M9 lipoate lipoate M9 M9 lys__L lys__L M9 M9 met__L met__L M9 M9 mg2 mg2 M9 M9 mn2 mn2 M9 M9 mndn mndn M9 M9 mobd mobd M9 M9 na1 na1 M9 M9 nac nac M9 M9 nad nad M9 M9 nh4 nh4 M9 M9 ni2 ni2 M9 M9 no3 no3 M9 M9 phe__L phe__L M9 M9 pheme pheme M9 M9 pi pi M9 M9 pnto__R pnto__R M9 M9 ppa ppa M9 M9 pro__L pro__L M9 M9 
pydam pydam M9 M9 pydxn pydxn M9 M9 ribflv ribflv M9 M9 ser__L ser__L M9 M9 slnt slnt M9 M9 so4 so4 M9 M9 thm thm M9 M9 thr__L thr__L M9 M9 trp__L trp__L M9 M9 tungs tungs M9 M9 tyr__L tyr__L M9 M9 ura ura M9 M9 val__L val__L M9 M9 xan xan M9 M9 zn2 zn2 M10 M10 3mb 3mb M10 M10 4abz 4abz M10 M10 ac ac M10 M10 ade ade M10 M10 ala__L ala__L M10 M10 arg__L arg__L M10 M10 ascb__L ascb__L M10 M10 asn__L asn__L M10 M10 asp__L asp__L M10 M10 btn btn M10 M10 but but M10 M10 ca2 ca2 M10 M10 cbl1 cbl1 M10 M10 cbl2 cbl2 M10 M10 cellb cellb M10 M10 cit cit M10 M10 cl cl M10 M10 cobalt2 cobalt2 M10 M10 cu2 cu2 M10 M10 cys__L cys__L M10 M10 fe2 fe2 M10 M10 fe3 fe3 M10 M10 fol fol M10 M10 fru fru M10 M10 glc__D glc__D M10 M10 gln__L gln__L M10 M10 glu__L glu__L M10 M10 gly gly M10 M10 gthrd gthrd M10 M10 gua gua M10 M10 h h M10 M10 h2o h2o M10 M10 hco3 hco3 M10 M10 his__L his__L M10 M10 ile__L ile__L M10 M10 inost inost M10 M10 k k M10 M10 lcts lcts M10 M10 leu__L leu__L M10 M10 lipoate lipoate M10 M10 lys__L lys__L M10 M10 malt malt M10 M10 met__L met__L M10 M10 mg2 mg2 M10 M10 mn2 mn2 M10 M10 mndn mndn M10 M10 mobd mobd M10 M10 na1 na1 M10 M10 nac nac M10 M10 nad nad M10 M10 nh4 nh4 M10 M10 ni2 ni2 M10 M10 no3 no3 M10 M10 phe__L phe__L M10 M10 pheme pheme M10 M10 pi pi M10 M10 pnto__R pnto__R M10 M10 ppa ppa M10 M10 pro__L pro__L M10 M10 pydam pydam M10 M10 pydxn pydxn M10 M10 ribflv ribflv M10 M10 ser__L ser__L M10 M10 slnt slnt M10 M10 so4 so4 M10 M10 thm thm M10 M10 thr__L thr__L M10 M10 trp__L trp__L M10 M10 tungs tungs M10 M10 tyr__L tyr__L M10 M10 ura ura M10 M10 val__L val__L M10 M10 xan xan M10 M10 zn2 zn2 M11 M11 3mb 3mb M11 M11 4abz 4abz M11 M11 ac ac M11 M11 ade ade M11 M11 ala__L ala__L M11 M11 arg__L arg__L M11 M11 ascb__L ascb__L M11 M11 asn__L asn__L M11 M11 asp__L asp__L M11 M11 btn btn M11 M11 but but M11 M11 ca2 ca2 M11 M11 cbl1 cbl1 M11 M11 cbl2 cbl2 M11 M11 cellb cellb M11 M11 cit cit M11 M11 cl cl M11 M11 cobalt2 cobalt2 M11 M11 cu2 cu2 M11 M11 cys__L cys__L 
M11 M11 fe2 fe2 M11 M11 fe3 fe3 M11 M11 fol fol M11 M11 fru fru M11 M11 glc__D glc__D M11 M11 gln__L gln__L M11 M11 glu__L glu__L M11 M11 gly gly M11 M11 gthrd gthrd M11 M11 gua gua M11 M11 h h M11 M11 h2o h2o M11 M11 hco3 hco3 M11 M11 his__L his__L M11 M11 ile__L ile__L M11 M11 inost inost M11 M11 k k M11 M11 lcts lcts M11 M11 leu__L leu__L M11 M11 lipoate lipoate M11 M11 lys__L lys__L M11 M11 malt malt M11 M11 met__L met__L M11 M11 mg2 mg2 M11 M11 mn2 mn2 M11 M11 mndn mndn M11 M11 mobd mobd M11 M11 na1 na1 M11 M11 nac nac M11 M11 nad nad M11 M11 nh4 nh4 M11 M11 ni2 ni2 M11 M11 no3 no3 M11 M11 pheme pheme M11 M11 pi pi M11 M11 pnto__R pnto__R M11 M11 ppa ppa M11 M11 pro__L pro__L M11 M11 pydam pydam M11 M11 pydxn pydxn M11 M11 ribflv ribflv M11 M11 ser__L ser__L M11 M11 slnt slnt M11 M11 so4 so4 M11 M11 thm thm M11 M11 thr__L thr__L M11 M11 tungs tungs M11 M11 ura ura M11 M11 val__L val__L M11 M11 xan xan M11 M11 zn2 zn2 M13 M13 ca2 ca2 M13 M13 cbl1 cbl1 M13 M13 cbl2 cbl2 M13 M13 cl cl M13 M13 cobalt2 cobalt2 M13 M13 cu2 cu2 M13 M13 cys__L cys__L M13 M13 fe2 fe2 M13 M13 fe3 fe3 M13 M13 glc__D glc__D M13 M13 h h M13 M13 h2o h2o M13 M13 his__L his__L M13 M13 k k M13 M13 mg2 mg2 M13 M13 mn2 mn2 M13 M13 mndn mndn M13 M13 na1 na1 M13 M13 nh4 nh4 M13 M13 ni2 ni2 M13 M13 pheme pheme M13 M13 pi pi M13 M13 so4 so4 M13 M13 zn2 zn2 M14 M14 ade ade M14 M14 ala__L ala__L M14 M14 arg__L arg__L M14 M14 ascb__L ascb__L M14 M14 asp__L asp__L M14 M14 btn btn M14 M14 ca2 ca2 M14 M14 cl cl M14 M14 cobalt2 cobalt2 M14 M14 cu2 cu2 M14 M14 cys__L cys__L M14 M14 fe2 fe2 M14 M14 fe3 fe3 M14 M14 glc__D glc__D M14 M14 glu__L glu__L M14 M14 gly gly M14 M14 h h M14 M14 h2o h2o M14 M14 his__L his__L M14 M14 ile__L ile__L M14 M14 k k M14 M14 leu__L leu__L M14 M14 lys__L lys__L M14 M14 met__L met__L M14 M14 mg2 mg2 M14 M14 mn2 mn2 M14 M14 na1 na1 M14 M14 nac nac M14 M14 ni2 ni2 M14 M14 phe__L phe__L M14 M14 pi pi M14 M14 pnto__R pnto__R M14 M14 pro__L pro__L M14 M14 pydam pydam M14 M14 ser__L 
ser__L M14 M14 so4 so4 M14 M14 thiog thiog M14 M14 thm thm M14 M14 thr__L thr__L M14 M14 trp__L trp__L M14 M14 tyr__L tyr__L M14 M14 ura ura M14 M14 val__L val__L M14 M14 zn2 zn2 M15A M15A ca2 ca2 M15A M15A cl cl M15A M15A cobalt2 cobalt2 M15A M15A cu2 cu2 M15A M15A fe2 fe2 M15A M15A fe3 fe3 M15A M15A glc__D glc__D M15A M15A h h M15A M15A h2o h2o M15A M15A k k M15A M15A mg2 mg2 M15A M15A mn2 mn2 M15A M15A mobd mobd M15A M15A na1 na1 M15A M15A nh4 nh4 M15A M15A ni2 ni2 M15A M15A pi pi M15A M15A so4 so4 M15A M15A zn2 zn2 M15B M15B ca2 ca2 M15B M15B cl cl M15B M15B cobalt2 cobalt2 M15B M15B cu2 cu2 M15B M15B fe2 fe2 M15B M15B fe3 fe3 M15B M15B glc__D glc__D M15B M15B h h M15B M15B h2o h2o M15B M15B k k M15B M15B mg2 mg2 M15B M15B mn2 mn2 M15B M15B mobd mobd M15B M15B na1 na1 M15B M15B nh4 nh4 M15B M15B ni2 ni2 M15B M15B pi pi M15B M15B so4 so4 M15B M15B zn2 zn2 M16 M16 4abz 4abz M16 M16 asp__L asp__L M16 M16 btn btn M16 M16 ca2 ca2 M16 M16 cl cl M16 M16 cobalt2 cobalt2 M16 M16 cu2 cu2 M16 M16 cys__L cys__L M16 M16 fe2 fe2 M16 M16 fe3 fe3 M16 M16 glu__L glu__L M16 M16 h h M16 M16 h2co3 h2co3 M16 M16 h2o h2o M16 M16 k k M16 M16 lac__L lac__L M16 M16 mg2 mg2 M16 M16 mn2 mn2 M16 M16 mobd mobd M16 M16 na1 na1 M16 M16 nac nac M16 M16 nh4 nh4 M16 M16 ni2 ni2 M16 M16 pi pi M16 M16 pnto__R pnto__R M16 M16 ptrc ptrc M16 M16 pydx pydx M16 M16 ser__L ser__L M16 M16 so4 so4 M16 M16 thiog thiog M16 M16 thm thm M16 M16 tyr__L tyr__L M16 M16 zn2 zn2 MILK MILK h2o H2O MILK MILK o2 O2 MILK MILK co2 CO2 MILK MILK ca2 Ca2+ MILK MILK cl Cl- MILK MILK cobalt2 Co2+ MILK MILK cu2 Cu2+ MILK MILK fe2 Fe2+ MILK MILK fe3 Fe3+ MILK MILK h H+ MILK MILK k K+ MILK MILK mg2 Mg MILK MILK mn2 Mn2+ MILK MILK mobd Molybdate MILK MILK na1 Na+ MILK MILK nh4 Ammonium MILK MILK ni2 Ni2+ MILK MILK pi Phosphate MILK MILK so4 Sulfate MILK MILK zn2 Zn2+ MILK MILK ala__L L-Alanine MILK MILK asn__L L-Asparagine MILK MILK asp__L L-Aspartate MILK MILK glu__L L-Glutamate MILK MILK gln__L L-Glutamine MILK MILK gly 
Glycine MILK MILK his__L L-Histidine MILK MILK ile__L L-Isoleucine MILK MILK leu__L L-Leucine MILK MILK lys__L L-Lysine MILK MILK orn Ornithine MILK MILK phe__L L-Phenylalanine MILK MILK peamn Phylethylamine MILK MILK pro__L L-Proline MILK MILK ser__L L-Serine MILK MILK thr__L L-Threonine MILK MILK trp__L tryptophan MILK MILK tyr__L L-Tyrosine MILK MILK val__L L-Valine MILK MILK lcts Lactose MILK MILK glc__D D-Glucose MILK MILK gal D-Galactose MILK MILK gal_bD Beta D-Galactose MILK MILK cit Citrate MILK MILK lac__D D-Lactate MILK MILK lac__L L-Lactate MILK MILK for Formate MILK MILK ac Acetate MILK MILK oxa Oxalate MILK MILK pydx Pyridoxal MILK MILK cbl1 Vitamin B12 MILK MILK thm Thiamin MILK MILK pnto__R (R)-Pantothenate MILK MILK fol Folate MILK MILK ribflv Riboflavin MILK MILK nac Isonicotinic acid MILK MILK btn Biotin MILK MILK but Butyrate MILK MILK caproic Caproic acid MILK MILK octa octanoate MILK MILK dca decanoate MILK MILK ddca dodecanoate MILK MILK ttdca tetradecanoate MILK MILK ptdca Pentadecanoate MILK MILK hdca hexadecanoate MILK MILK ocdca octadecanoate MILK MILK arach arachidic acid MILK MILK ttdcea Tetradecenoate (n-C14:1) MILK MILK hdcea hexadecanoate (n-C16:1) MILK MILK ocdcea octadecanoate (n-C18:1) MILK MILK lnlc linoleic acid MILK MILK arachd arachidonic acid MILK MILK ade Adenine MILK MILK gua Guanine MILK MILK ins Inosine MILK MILK thymd Thymidine MILK MILK ura Uracil MILK MILK xan Xanthine
================================================ FILE: workflow/scripts/modelVis.R ================================================
# modelVis.R: summarize per-model statistics from GEMs.stats into modelVis.pdf.
library(gridExtra)
library(dplyr)
library(ggplot2)

# GEMs.stats: space-separated, headerless; one row per model with
# metabolite / reaction / gene counts (columns renamed below).
gems = read.delim("GEMs.stats",stringsAsFactors = FALSE,header=FALSE, sep = " ")
# Derive sample ID from the bin name: everything up to the first underscore.
gems$V5 = gsub("_.*$","",gems$V1)
colnames(gems) = c("bin","mets","rxns","genes","sample")

# Horizontal bar chart: number of GEMs per sample, largest first.
samplesplot = gems %>%
  count(sample) %>%
  ggplot(aes(x=reorder(sample,-n),y=n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Number of GEMs across samples") +
  ylab("Number of GEMs carved") +
  xlab("Sample ID")

# Density panels for the three model-size statistics across all GEMs.
metplot = ggplot() +
  geom_density(data=gems,aes(mets),fill="#7fc97f") +
  ggtitle("Unique metabolites across GEMs") +
  theme(legend.position = "none") +
  theme(axis.text.y=element_blank())

rxnplot = ggplot() +
  geom_density(data=gems,aes(rxns),fill="#beaed4") +
  ggtitle("Reactions across GEMs") +
  theme(legend.position = "none") +
  theme(axis.text.y=element_blank())

geneplot = ggplot() +
  geom_density(data=gems,aes(genes),fill="#fdc086") +
  ggtitle("Genes across GEMs") +
  theme(legend.position = "none") +
  theme(axis.text.y=element_blank())

# Layout: per-sample bar chart on the left, density panels stacked on the right.
plot=grid.arrange(samplesplot,arrangeGrob(metplot,rxnplot,geneplot,nrow=3,ncol=1),nrow=1,ncol=2,heights=c(60),widths=c(30,30))
ggsave("modelVis.pdf",plot=plot, height = 8, width = 12)
================================================ FILE: workflow/scripts/prepareRoaryInput.R ================================================
# Prepare roary input script
library(dplyr)
library(tidyr)

# Load in classification just as in taxonomyVis.R
# GTDBtk.stats has a header; bin ID is taken from the user_genome column.
classification = read.delim("GTDBtk.stats",stringsAsFactors = FALSE,header = TRUE)
classification$bin = classification$user_genome
# Keep bin ID + lineage string (columns 2 and 20) and split the lineage into
# the seven taxonomic ranks.
# NOTE(review): the pattern "*.__" is intended to strip rank prefixes such as
# "d__"/"p__" before splitting -- confirm it behaves as expected under TRE regex.
gtdbtk_class = classification[,c(2,20)]
gtdbtk_class$classification = gsub("*.__","",gtdbtk_class$classification)
gtdbtk_class %>% separate(classification,c("domain","phylum","class","order","family","genus","species"),sep = ";") -> gtdbtk_class
# Empty ranks are replaced with the literal string 'NA' (not R's NA value).
gtdbtk_class[gtdbtk_class==""]<-'NA'
# Sample ID = bin name up to the first underscore.
gtdbtk_class$sample = gsub("_.*$","",gtdbtk_class$bin)

# Load in refined+reassembled consensus bins just as in binningVis.R
reassembledCheckm = read.delim("reassembled.checkm",stringsAsFactors = FALSE,header = FALSE)
colnames(reassembledCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size")
reassembledBins= read.delim("reassembled_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ")
colnames(reassembledBins) = c("bin","contigs","length")
reassembled = left_join(reassembledCheckm,reassembledBins,by="bin")
# Trim trailing whitespace from bin IDs before further normalization.
reassembled$bin=gsub(" $","",reassembled$bin)
# Shorten bin-name components so IDs match across tables:
# "permissive"->"p", "orig"->"o", "strict"->"s", ".bin"->"_bin".
reassembled$bin=gsub("permissive","p",reassembled$bin)
reassembled$bin=gsub("orig","o",reassembled$bin)
reassembled$bin=gsub("strict","s",reassembled$bin)
reassembled$bin=gsub("\\.bin","_bin",reassembled$bin)

# Join dataframes by bin and filter out low completeness bins
bins = left_join(reassembled,gtdbtk_class,by="bin") %>% filter(completeness >= 90) %>% filter(contamination <= 5)

# Identify which species are represented by at least 10 high quality bins
bins %>% group_by(species) %>% count() %>% filter(n>=10) -> species

# Run for loop to generate text file with bin IDs for each identified species
# gsub("$",...) appends "/speciesBinIDs" to the current working directory path.
dir.create(gsub("$","/speciesBinIDs",getwd()))
for (i in species$species) {
  #Remove any forbidden characters if present:
  #(spaces, forward slashes, square brackets, parentheses)
  name = gsub(" ","_",i)
  name = gsub("/","_",name)
  name = gsub("\\[","_",name)
  name = gsub("]","_",name)
  name = gsub("\\(","_",name)
  name = gsub(")","_",name)
  # One text file per species listing its member bin IDs, one per line.
  write.table(bins %>% filter(species == i) %>% select(bin),paste0(paste0("speciesBinIDs/",name),".txt"),sep="\n",row.names=FALSE,col.names = FALSE,quote = FALSE)
}
================================================ FILE: workflow/scripts/prepareRoaryInputGTDBtk.R ================================================
# Prepare roary input script
# Variant of prepareRoaryInput.R that groups bins by a whole "taxonomy" string
# instead of a parsed per-rank species column.
library(dplyr)

# Load in classification just as in taxonomyVis.R
classification = read.delim("GTDBtk.stats",stringsAsFactors = FALSE,header = TRUE)
classification$bin = classification$user_genome

# Load in refined+reassembled consensus bins just as in binningVis.R
reassembledCheckm = read.delim("reassembled.checkm",stringsAsFactors = FALSE,header = FALSE)
colnames(reassembledCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size")
reassembledBins= read.delim("reassembled_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ")
colnames(reassembledBins) = c("bin","contigs","length")
reassembled = left_join(reassembledCheckm,reassembledBins,by="bin")
# Normalize bin IDs: trim trailing space, shorten name components.
reassembled$bin=gsub(" $","",reassembled$bin)
reassembled$bin=gsub("permissive","p",reassembled$bin)
reassembled$bin=gsub("orig","o",reassembled$bin)
reassembled$bin=gsub("strict","s",reassembled$bin)
reassembled$bin=gsub("\\.bin","_bin",reassembled$bin)

# Join dataframes by bin and filter out low completeness bins
bins = left_join(reassembled,classification,by="bin") %>% filter(completeness >= 90) %>% filter(contamination <= 5)
# NOTE(review): assumes GTDBtk.stats provides a "taxonomy" column; GTDB-Tk's
# summary file typically names this column "classification" -- verify upstream.
bins$taxonomy = gsub("^ ","",bins$taxonomy)
bins$taxonomy = gsub(" $","",bins$taxonomy)

# Identify which species are represented by at least 10 high quality bins
bins %>% group_by(taxonomy) %>% count() %>% filter(n>=10) -> species

# Run for loop to generate text file with bin IDs for each identified species
dir.create(gsub("$","/speciesBinIDs",getwd()))
for (i in species$taxonomy) {
  # Replace characters that are unsafe in file names with underscores.
  name = gsub(" ","_",i)
  name = gsub("/","_",name)
  name = gsub("\\[","_",name)
  name = gsub("]","_",name)
  name = gsub("\\(","_",name)
  name = gsub(")","_",name)
  write.table(bins %>% filter(taxonomy == i) %>% select(bin),paste0(paste0("speciesBinIDs/",name),".txt"),sep="\n",row.names=FALSE,col.names = FALSE,quote = FALSE)
}
================================================ FILE: workflow/scripts/prepareRoaryInput_old.R ================================================
# Prepare roary input script
# Legacy variant driven by the headerless classification.stats table.
library(dplyr)

# Load in classification just as in taxonomyVis.R
classification = read.delim("classification.stats",stringsAsFactors = FALSE,header = FALSE)
colnames(classification)=c("bin","NCBI","taxonomy","motu","detect","map","percent","cog")
classification$percent=as.numeric(classification$percent)
# Truncate long lineages; non-numeric percents became NA above, treat as 0;
# trim trailing whitespace from bin IDs.
classification$taxonomy=substr(classification$taxonomy,1,40)
classification$percent[is.na(classification$percent)] <- 0
classification$bin=gsub(" $","",classification$bin)

# Load in refined+reassembled consensus bins just as in binningVis.R
reassembledCheckm = read.delim("reassembled.checkm",stringsAsFactors = FALSE,header = FALSE)
colnames(reassembledCheckm) = c("bin","completeness","contamination","GC","lineage","N50","size")
reassembledBins= read.delim("reassembled_bins.stats",stringsAsFactors = FALSE,header = FALSE, sep = " ")
colnames(reassembledBins) = c("bin","contigs","length")
reassembled = left_join(reassembledCheckm,reassembledBins,by="bin")
reassembled$bin=gsub(" $","",reassembled$bin)
reassembled$bin=gsub("permissive","p",reassembled$bin)
reassembled$bin=gsub("orig","o",reassembled$bin)
reassembled$bin=gsub("strict","s",reassembled$bin)
# NOTE(review): unlike the newer variants, this legacy script applies no
# "\.bin" -> "_bin" substitution and no contamination filter below.

# Join dataframes by bin and filter out low completeness bins
bins = left_join(reassembled,classification,by="bin") %>% filter(completeness >= 90)
bins$taxonomy = gsub("^ ","",bins$taxonomy)
bins$taxonomy = gsub(" $","",bins$taxonomy)

# Identify which species are represented by at least 10 high quality bins
bins %>% group_by(taxonomy) %>% count() %>% filter(n>=10) -> species

# Run for loop to generate text file with bin IDs for each identified species
dir.create(gsub("$","/speciesBinIDs",getwd()))
for (i in species$taxonomy) {
  # Replace characters that are unsafe in file names with underscores.
  name = gsub(" ","_",i)
  name = gsub("/","_",name)
  name = gsub("\\[","_",name)
  name = gsub("]","_",name)
  name = gsub("\\(","_",name)
  name = gsub(")","_",name)
  write.table(bins %>% filter(taxonomy == i) %>% select(bin),paste0(paste0("speciesBinIDs/",name),".txt"),sep="\n",row.names=FALSE,col.names = FALSE,quote = FALSE)
}
================================================ FILE: workflow/scripts/qfilterVis.R ================================================
# qfilterVis.R: visualize per-sample read QC statistics from qfilter.stats.
library(gridExtra)
library(dplyr)
library(ggplot2)

# Space-separated, headerless QC summary, one row per sample.
# BF/AF suffixes = before/after filtering; q20/q30 are fractions of bases.
qfilter = read.delim("qfilter.stats",stringsAsFactors = FALSE, header = FALSE, sep = " ")
colnames(qfilter) = c("ID","readsBF","readsAF","basesBF","basesAF","percentReads","q20BF","q20AF","q30BF","q30AF")

# Density of read counts per sample, pre- vs post-filtering.
reads = ggplot(data = qfilter) +
  geom_density(aes(readsBF,fill="Pre-filtering"),alpha=0.8) +
  geom_density(aes(readsAF,fill="Post-filtering"),alpha=0.8) +
  ggtitle("Number of reads across samples") +
  xlab("Total reads") +
  theme(legend.title = element_blank())
# qfilterVis.R (continued): remaining density panels and the summary bar
# chart, combined and written to qfilterVis.pdf.

# Density of total bases per sample, pre- vs post-filtering.
bases = ggplot(data = qfilter) +
  geom_density(aes(basesBF,fill="Pre-filtering"),alpha=0.8) +
  geom_density(aes(basesAF,fill="Post-filtering"),alpha=0.8) +
  ggtitle("Number of bases across samples") +
  xlab("Total bases") +
  theme(legend.title = element_blank())

# Density of percent Q20 bases per sample (fractions scaled to percent).
q20 = ggplot(data = qfilter) +
  geom_density(aes(q20BF*100,fill="Pre-filtering"),alpha=0.8) +
  geom_density(aes(q20AF*100,fill="Post-filtering"),alpha=0.8) +
  ggtitle("Percent Q20 bases across samples") +
  xlab("Percent of Q20 bases") +
  theme(legend.title = element_blank())

# Density of percent Q30 bases per sample.
q30 = ggplot(data = qfilter) +
  geom_density(aes(q30BF*100,fill="Pre-filtering"),alpha=0.8) +
  geom_density(aes(q30AF*100,fill="Post-filtering"),alpha=0.8) +
  ggtitle("Percent Q30 bases across samples") +
  xlab("Percent of Q30 bases") +
  theme(legend.title = element_blank())

# Per-sample summary: four geom_bar layers drawn with stat = "identity" on top
# of each other (not position = "stack"); this reads as a stacked bar only if
# each successive quantity is smaller than the previous for every sample
# (raw bases >= filtered bases >= Q20 bases >= Q30 bases) -- NOTE(review):
# relies on that ordering holding in the data.
bar = ggplot(data = qfilter) +
  geom_bar(aes(x=reorder(ID,-basesBF),y=basesBF,fill="Pre-filtering"),color="black",stat = "identity") +
  geom_bar(aes(x=reorder(ID,-basesBF),y=basesAF,fill="Post-filtering"),color="black",stat = "identity") +
  geom_bar(aes(x=reorder(ID,-basesBF),y=q20AF*basesAF,fill="Q20 bases"),color="black",stat = "identity") +
  geom_bar(aes(x=reorder(ID,-basesBF),y=q30AF*basesAF,fill="Q30 bases"),color="black",stat = "identity") +
  coord_flip() +
  ggtitle("Raw read QC summary stacked bar plot") +
  xlab("Sample ID") +
  ylab("Base pairs") +
  theme(legend.title = element_blank())

# Layout: summary bar chart on the left, the four density panels stacked right.
qfilt = grid.arrange(bar,arrangeGrob(reads,bases,q20,q30,nrow=4,ncol=1),ncol=2,nrow=1)
ggsave("qfilterVis.pdf",plot = qfilt,device = "pdf",height = 6, width = 8)
================================================ FILE: workflow/scripts/taxonomyVis.R ================================================
# taxonomyVis.R: summarize per-MAG taxonomic classifications from
# classification.stats into taxonomyVis.pdf.
library(gridExtra)
library(dplyr)
library(ggplot2)

# Headerless classification table, one row per MAG; column meanings assigned
# below (motu/detect/map/percent suggest marker-gene-based classification
# output -- TODO confirm producer tool).
classification = read.delim("classification.stats",stringsAsFactors = FALSE,header = FALSE)
colnames(classification)=c("fasta","NCBI","taxonomy","motu","detect","map","percent","cog")
classification$percent=as.numeric(classification$percent)
# Truncate long lineage strings so axis labels stay readable.
classification$taxonomy=substr(classification$taxonomy,1,40)
# Non-numeric percent values became NA in the coercion above; treat them as 0.
classification$percent[is.na(classification$percent)] <- 0
# Strip trailing whitespace from MAG identifiers.
classification$fasta=gsub(" $","",classification$fasta)

# Horizontal bar chart of taxa represented by more than 10 MAGs.
taxplot = classification %>%
  count(taxonomy) %>%
  filter(n>10) %>%
  ggplot(aes(x=reorder(taxonomy,-n),y=n)) +
  geom_bar(stat = "identity") +
  ggtitle("Taxonomy of reconstructed MAGs") +
  xlab("Taxonomy") +
  ylab("Count") +
  coord_flip()

# Density of marker genes mapped per MAG.
mapplot = ggplot(classification)+
  geom_density(aes(map),fill="#7fc97f") +
  ggtitle("Density of marker genes mapped ") +
  xlab("Number of marker genes mapped to MAG") +
  ylab("Density")

# Density of marker genes detected per MAG.
detplot = ggplot(classification)+
  geom_density(aes(detect),fill="#beaed4") +
  ggtitle("Density of marker genes detected") +
  xlab("Number of marker genes detected in MAG") +
  ylab("Density")

# Density of the agreement percentage.
# Fix: the x-axis label previously misspelled "taxonomy" as "taxomy".
perplot = ggplot(classification)+
  geom_density(aes(percent),fill="#fdc086") +
  ggtitle("Density of agreeing percentage of marker genes") +
  xlab("Percentage of mapped marker genes agreeing with assigned taxonomy") +
  ylab("Density")

# Layout: taxonomy bar chart on the left, density panels stacked on the right.
plotax = grid.arrange(taxplot,arrangeGrob(detplot,mapplot,perplot,nrow=3,ncol = 1),nrow=1,ncol=2)
ggsave("taxonomyVis.pdf",plot = plotax,device = "pdf",dpi = 300, width = 40, height = 20, units = "cm")