Repository: assemblerflow/flowcraft
Branch: master
Commit: 66cef2555892
Files: 328
Total size: 1.0 MB

Directory structure:
gitextract_0fzlskug/

├── .gitignore
├── .gitmodules
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── changelog.md
├── docker/
│   └── Dockerfile
├── docs/
│   ├── Makefile
│   ├── _static/
│   │   └── custom.css
│   ├── about/
│   │   └── about.rst
│   ├── conf.py
│   ├── dev/
│   │   ├── containers.rst
│   │   ├── create_process.rst
│   │   ├── create_recipe.rst
│   │   ├── create_recipes.rst
│   │   ├── create_template.rst
│   │   ├── general_orientation.rst
│   │   ├── pipeline_reporting.rst
│   │   ├── process_dotfiles.rst
│   │   └── reports.rst
│   ├── flowcraft.flowcraft.rst
│   ├── flowcraft.generator.components.annotation.rst
│   ├── flowcraft.generator.components.assembly.rst
│   ├── flowcraft.generator.components.assembly_processing.rst
│   ├── flowcraft.generator.components.distance_estimation.rst
│   ├── flowcraft.generator.components.downloads.rst
│   ├── flowcraft.generator.components.metagenomics.rst
│   ├── flowcraft.generator.components.mlst.rst
│   ├── flowcraft.generator.components.patlas_mapping.rst
│   ├── flowcraft.generator.components.reads_quality_control.rst
│   ├── flowcraft.generator.components.rst
│   ├── flowcraft.generator.components.typing.rst
│   ├── flowcraft.generator.engine.rst
│   ├── flowcraft.generator.error_handling.rst
│   ├── flowcraft.generator.footer_skeleton.rst
│   ├── flowcraft.generator.header_skeleton.rst
│   ├── flowcraft.generator.inspect.rst
│   ├── flowcraft.generator.pipeline_parser.rst
│   ├── flowcraft.generator.process.rst
│   ├── flowcraft.generator.process_details.rst
│   ├── flowcraft.generator.recipe.rst
│   ├── flowcraft.generator.rst
│   ├── flowcraft.rst
│   ├── flowcraft.templates.assembly_report.rst
│   ├── flowcraft.templates.fastqc.rst
│   ├── flowcraft.templates.fastqc_report.rst
│   ├── flowcraft.templates.flowcraft_utils.flowcraft_base.rst
│   ├── flowcraft.templates.flowcraft_utils.rst
│   ├── flowcraft.templates.integrity_coverage.rst
│   ├── flowcraft.templates.mapping2json.rst
│   ├── flowcraft.templates.mashdist2json.rst
│   ├── flowcraft.templates.mashscreen2json.rst
│   ├── flowcraft.templates.megahit.rst
│   ├── flowcraft.templates.metaspades.rst
│   ├── flowcraft.templates.pATLAS_consensus_json.rst
│   ├── flowcraft.templates.pipeline_status.rst
│   ├── flowcraft.templates.process_abricate.rst
│   ├── flowcraft.templates.process_assembly.rst
│   ├── flowcraft.templates.process_assembly_mapping.rst
│   ├── flowcraft.templates.rst
│   ├── flowcraft.templates.skesa.rst
│   ├── flowcraft.templates.spades.rst
│   ├── flowcraft.templates.trimmomatic.rst
│   ├── flowcraft.templates.trimmomatic_report.rst
│   ├── flowcraft.tests.data_pipelines.rst
│   ├── flowcraft.tests.rst
│   ├── flowcraft.tests.test_assemblerflow.rst
│   ├── flowcraft.tests.test_engine.rst
│   ├── flowcraft.tests.test_pipeline_parser.rst
│   ├── flowcraft.tests.test_process_details.rst
│   ├── flowcraft.tests.test_processes.rst
│   ├── flowcraft.tests.test_sanity.rst
│   ├── getting_started/
│   │   ├── installation.rst
│   │   └── overview.rst
│   ├── index.rst
│   ├── make.bat
│   ├── setup.rst
│   └── user/
│       ├── available_components.rst
│       ├── basic_usage.rst
│       ├── components/
│       │   ├── abricate.rst
│       │   ├── assembly_mapping.rst
│       │   ├── bowtie.rst
│       │   ├── card_rgi.rst
│       │   ├── check_coverage.rst
│       │   ├── chewbbaca.rst
│       │   ├── diamond.rst
│       │   ├── downsample_fastq.rst
│       │   ├── fast_ani.rst
│       │   ├── fasterq_dump.rst
│       │   ├── fastqc.rst
│       │   ├── fastqc_trimmomatic.rst
│       │   ├── filter_poly.rst
│       │   ├── integrity_coverage.rst
│       │   ├── kraken.rst
│       │   ├── kraken2.rst
│       │   ├── mapping_patlas.rst
│       │   ├── mash_dist.rst
│       │   ├── mash_screen.rst
│       │   ├── mash_sketch_fasta.rst
│       │   ├── mash_sketch_fastq.rst
│       │   ├── maxbin2.rst
│       │   ├── megahit.rst
│       │   ├── metamlst.rst
│       │   ├── metaspades.rst
│       │   ├── midas_species.rst
│       │   ├── mlst.rst
│       │   ├── momps.rst
│       │   ├── patho_typing.rst
│       │   ├── pilon.rst
│       │   ├── process_skesa.rst
│       │   ├── process_spades.rst
│       │   ├── prokka.rst
│       │   ├── reads_download.rst
│       │   ├── remove_host.rst
│       │   ├── retrieve_mapped.rst
│       │   ├── seq_typing.rst
│       │   ├── sistr.rst
│       │   ├── skesa.rst
│       │   ├── spades.rst
│       │   └── trimmomatic.rst
│       ├── pipeline_building.rst
│       ├── pipeline_configuration.rst
│       ├── pipeline_inspect.rst
│       ├── pipeline_reports.rst
│       └── reports/
│           ├── abricate.rst
│           ├── assembly_mapping.rst
│           ├── check_coverage.rst
│           ├── chewbbaca.rst
│           ├── dengue_typing.rst
│           ├── fastqc.rst
│           ├── fastqc_trimmomatic.rst
│           ├── integrity_coverage.rst
│           ├── mash_dist.rst
│           ├── maxbin2.rst
│           ├── mlst.rst
│           ├── patho_typing.rst
│           ├── pilon.rst
│           ├── process_mapping.rst
│           ├── process_newick.rst
│           ├── process_skesa.rst
│           ├── process_spades.rst
│           ├── process_viral_assembly.rst
│           ├── seq_typing.rst
│           ├── sistr.rst
│           ├── trimmomatic.rst
│           └── true_coverage.rst
├── flowcraft/
│   ├── __init__.py
│   ├── bin/
│   │   ├── final_POST.sh
│   │   ├── merge_json.py
│   │   ├── metadata_POST.sh
│   │   ├── parse_fasta.py
│   │   ├── parse_true_coverage.py
│   │   ├── prepare_reports.py
│   │   ├── renamePE_samtoolsFASTQ.py
│   │   ├── report_POST.sh
│   │   ├── set_dotfiles.sh
│   │   └── startup_POST.sh
│   ├── flowcraft.py
│   ├── generator/
│   │   ├── __init__.py
│   │   ├── components/
│   │   │   ├── __init__.py
│   │   │   ├── alignment.py
│   │   │   ├── annotation.py
│   │   │   ├── assembly.py
│   │   │   ├── assembly_processing.py
│   │   │   ├── distance_estimation.py
│   │   │   ├── downloads.py
│   │   │   ├── mapping.py
│   │   │   ├── metagenomics.py
│   │   │   ├── mlst.py
│   │   │   ├── patlas_mapping.py
│   │   │   ├── phylogeny.py
│   │   │   ├── reads_quality_control.py
│   │   │   ├── typing.py
│   │   │   └── variant_calling.py
│   │   ├── engine.py
│   │   ├── error_handling.py
│   │   ├── footer_skeleton.py
│   │   ├── header_skeleton.py
│   │   ├── inspect.py
│   │   ├── pipeline_parser.py
│   │   ├── process.py
│   │   ├── process_collector.py
│   │   ├── process_details.py
│   │   ├── recipe.py
│   │   ├── recipes/
│   │   │   ├── __init__.py
│   │   │   ├── denim.py
│   │   │   ├── innuca.py
│   │   │   └── plasmids.py
│   │   ├── report.py
│   │   ├── templates/
│   │   │   ├── Helper.groovy
│   │   │   ├── abricate.nf
│   │   │   ├── abyss.nf
│   │   │   ├── assembly_mapping.nf
│   │   │   ├── bandage.nf
│   │   │   ├── base_recalibrator.nf
│   │   │   ├── bcalm.nf
│   │   │   ├── bowtie.nf
│   │   │   ├── bwa.nf
│   │   │   ├── card_rgi.nf
│   │   │   ├── check_coverage.nf
│   │   │   ├── chewbbaca.nf
│   │   │   ├── compiler_channels.txt
│   │   │   ├── concoct.nf
│   │   │   ├── containers.config
│   │   │   ├── dengue_typing.nf
│   │   │   ├── diamond.nf
│   │   │   ├── downsample_fastq.nf
│   │   │   ├── fast_ani.nf
│   │   │   ├── fasterq_dump.nf
│   │   │   ├── fastqc.nf
│   │   │   ├── fastqc_trimmomatic.nf
│   │   │   ├── filter_poly.nf
│   │   │   ├── haplotypecaller.nf
│   │   │   ├── init.nf
│   │   │   ├── integrity_coverage.nf
│   │   │   ├── kraken.nf
│   │   │   ├── kraken2.nf
│   │   │   ├── mafft.nf
│   │   │   ├── mapping_patlas.nf
│   │   │   ├── mark_duplicates.nf
│   │   │   ├── mash_dist.nf
│   │   │   ├── mash_screen.nf
│   │   │   ├── mash_sketch_fasta.nf
│   │   │   ├── mash_sketch_fastq.nf
│   │   │   ├── maxbin2.nf
│   │   │   ├── megahit.nf
│   │   │   ├── metabat2.nf
│   │   │   ├── metamlst.nf
│   │   │   ├── metaprob.nf
│   │   │   ├── metaspades.nf
│   │   │   ├── midas_species.nf
│   │   │   ├── mlst.nf
│   │   │   ├── momps.nf
│   │   │   ├── nextflow.config
│   │   │   ├── params.config
│   │   │   ├── patho_typing.nf
│   │   │   ├── patlas_consensus.nf
│   │   │   ├── pilon.nf
│   │   │   ├── pipeline_graph.html
│   │   │   ├── post.txt
│   │   │   ├── process_skesa.nf
│   │   │   ├── process_spades.nf
│   │   │   ├── progressive_mauve.nf
│   │   │   ├── prokka.nf
│   │   │   ├── quast.nf
│   │   │   ├── raxml.nf
│   │   │   ├── reads_download.nf
│   │   │   ├── remove_host.nf
│   │   │   ├── report_compiler.nf
│   │   │   ├── report_post.txt
│   │   │   ├── resources.config
│   │   │   ├── retrieve_mapped.nf
│   │   │   ├── seq_typing.nf
│   │   │   ├── seroba.nf
│   │   │   ├── sistr.nf
│   │   │   ├── skesa.nf
│   │   │   ├── spades.nf
│   │   │   ├── split_assembly.nf
│   │   │   ├── status_compiler.nf
│   │   │   ├── trace_compiler.nf
│   │   │   ├── trimmomatic.nf
│   │   │   ├── true_coverage.nf
│   │   │   ├── unicycler.nf
│   │   │   ├── user.config
│   │   │   └── viral_assembly.nf
│   │   └── utils.py
│   ├── lib/
│   │   └── CheckParams.groovy
│   ├── profiles.config
│   ├── templates/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── assembly_report.py
│   │   ├── compile_reports.py
│   │   ├── dengue_typing_assembly.py
│   │   ├── dengue_typing_reads.py
│   │   ├── downsample_fastq.py
│   │   ├── fasta_spliter.py
│   │   ├── fastqc.py
│   │   ├── fastqc_report.py
│   │   ├── flowcraft_utils/
│   │   │   ├── __init__.py
│   │   │   └── flowcraft_base.py
│   │   ├── integrity_coverage.py
│   │   ├── mapping2json.py
│   │   ├── mashdist2json.py
│   │   ├── mashscreen2json.py
│   │   ├── megahit.py
│   │   ├── metaspades.py
│   │   ├── pATLAS_consensus_json.py
│   │   ├── pipeline_status.py
│   │   ├── process_abricate.py
│   │   ├── process_assembly.py
│   │   ├── process_assembly_mapping.py
│   │   ├── process_concoct.py
│   │   ├── process_mapping.py
│   │   ├── process_metabat.py
│   │   ├── process_newick.py
│   │   ├── process_tsv.py
│   │   ├── process_viral_assembly.py
│   │   ├── skesa.py
│   │   ├── spades.py
│   │   ├── split_fasta.py
│   │   ├── trimmomatic.py
│   │   └── trimmomatic_report.py
│   └── tests/
│       ├── __init__.py
│       ├── broadcast_tests/
│       │   ├── empty_log.txt
│       │   ├── log_with_command.txt
│       │   ├── log_with_command_regex.txt
│       │   └── log_without_command.txt
│       ├── data_pipelines.py
│       ├── pipeline_tests/
│       │   ├── pipe1.txt
│       │   ├── pipe2.txt
│       │   ├── pipe3.txt
│       │   ├── pipe4.txt
│       │   ├── pipe5.txt
│       │   ├── pipe6.txt
│       │   ├── pipe7.txt
│       │   └── pipe8.txt
│       ├── test_assemblerflow.py
│       ├── test_broadcast.py
│       ├── test_engine.py
│       ├── test_pipeline_parser.py
│       ├── test_process_details.py
│       ├── test_processes.py
│       ├── test_recipes.py
│       └── test_sanity.py
├── requirements.txt
└── setup.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# ignore ide folders
.idea/
.vscode/

# ignore python generated files
*.pyc


================================================
FILE: .gitmodules
================================================


================================================
FILE: .travis.yml
================================================
language: python

python:
  - "3.6"

install:
  - pip install pytest
  - pip install coverage
  - pip install pytest-cov
  - python setup.py install

script:
  - py.test --cov=./

after_success:
  - bash <(curl -s https://codecov.io/bash)

================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
 advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
 address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
 professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at odiogosilva@gmail.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to Assemblerflow

Thank you for your interest in contributing to Assemblerflow. All kinds of 
contributions are welcome :tada:!

## Issues

Feel free to [submit issues](https://github.com/assemblerflow/assemblerflow/issues)
and enhancement requests.

## Git branch convention

Contributions with new code (not documentation) should follow this standard procedure:

    <new_branch> >> dev >> master

1. Create a new branch for the new feature/bug fix.
2. Once the new code is finished and **passes all automated tests**, it will be
merged into the `dev` branch. This branch is where all the new code lives and 
serves as an incubator stage while field tests are performed to ensure that everything
is working correctly.
3. Merging the `dev` code into `master` is associated with a new release. Therefore, 
the `master` branch is basically the same as the latest official release on PyPI.

## Contributing

In general, we follow the "fork-and-pull" Git workflow.

 1. **Fork** the repo on GitHub
 2. **Clone** the project to your own machine
 3. **Commit** changes to your own branch
 4. **Push** your work back up to your fork
 5. Submit a **Pull request** so that we can review your changes. Pull requests will be merged first into the `dev` branch, where some field tests are performed, before being merged into `master`.

NOTE: Be sure to merge the latest from "upstream" before making a pull request!
  

================================================
FILE: LICENSE
================================================
                    GNU GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU General Public License is a free, copyleft license for
software and other kinds of works.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.  We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors.  You can apply it to
your programs, too.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights.  Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.

  For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received.  You must make sure that they, too, receive
or can get the source code.  And you must show them these terms so they
know their rights.

  Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.

  For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software.  For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.

  Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so.  This is fundamentally incompatible with the aim of
protecting users' freedom to change the software.  The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable.  Therefore, we
have designed this version of the GPL to prohibit the practice for those
products.  If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.

  Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary.  To prevent this, the GPL assures that
patents cannot be used to render the program non-free.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit.  Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling.  In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage.  For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product.  A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source.  The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information.  But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed.  Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law.  If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it.  (Additional permissions may be written to require their own
removal in certain cases when you modify the work.)  You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10.  If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term.  If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License.  Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License.  If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program.  Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance.  However,
nothing other than this License grants you permission to propagate or
modify any covered work.  These actions infringe copyright if you do
not accept this License.  Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License.  You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations.  If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License.  For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based.  The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version.  For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement).  To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients.  "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License.  You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all.  For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Use with the GNU Affero General Public License.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work.  The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time.  Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number.  If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation.  If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions.  However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:

    <program>  Copyright (C) <year>  <name of author>
    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License.  Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.

  The GNU General Public License does not permit incorporating your program
into proprietary programs.  If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library.  If this is what you want to do, use the GNU Lesser General
Public License instead of this License.  But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.


================================================
FILE: README.md
================================================
# FlowCraft :whale2::package:

![Nextflow version](https://img.shields.io/badge/nextflow->0.27.0-brightgreen.svg)
![Python version](https://img.shields.io/badge/python-3.6-brightgreen.svg)
[![Build Status](https://travis-ci.org/assemblerflow/flowcraft.svg?branch=master)](https://travis-ci.org/assemblerflow/flowcraft)
[![codecov](https://codecov.io/gh/assemblerflow/flowcraft/branch/master/graph/badge.svg)](https://codecov.io/gh/assemblerflow/flowcraft)
[![Codacy Badge](https://api.codacy.com/project/badge/Grade/f518854f780b41a08ca2fb1c14e360f0)](https://www.codacy.com/app/o.diogosilva/assemblerflow?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=ODiogoSilva/assemblerflow&amp;utm_campaign=Badge_Grade)
[![Documentation Status](https://readthedocs.org/projects/flowcraft/badge/?version=latest)](http://flowcraft.readthedocs.io/en/latest/?badge=latest)
[![PyPI version](https://badge.fury.io/py/flowcraft.svg)](https://badge.fury.io/py/flowcraft)
[![Anaconda-Server Badge](https://anaconda.org/bioconda/flowcraft/badges/version.svg)](https://anaconda.org/bioconda/flowcraft)
[![Gitter](https://badges.gitter.im/flowcraft-community/community.svg)](https://gitter.im/flowcraft-community/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)

<p align="center">
  <img width="360" src="docs/resources/logo_large.png" alt="nextflow_logo"/>
</p>

A [Nextflow](https://www.nextflow.io/) pipeline assembler for genomics.
Pick your modules. Assemble them. Run the pipeline.

(Previously known as Assemblerflow)

## The premise

#### Build a pipeline

What if building your own genomics pipeline would be as simple as:

```
flowcraft.py build -t "trimmomatic fastqc skesa pilon" -o my_pipeline.nf
```

Seems pretty simple, right? What if we could run this pipeline with a single command on any linux machine or cluster by leveraging
the awesomeness of [nextflow](https://www.nextflow.io/) and [docker](https://www.docker.com/)/[singularity](http://singularity.lbl.gov/)
containers without having to install any of the pipeline dependencies?

#### Run the pipeline

```
nextflow run my_pipeline.nf --fastq path/to/fastq

N E X T F L O W  ~  version 0.30.1
Launching `my_pipeline.nf` [admiring_lamarck] - revision: 82cc9cd2ed

============================================================
                M Y   P I P E L I N E
============================================================
Built using flowcraft v1.2.1

 Input FastQ                 : 2
 Input samples               : 1
 Reports are found in        : ./reports
 Results are found in        : ./results
 Profile                     : standard

Starting pipeline at Tue Jun 12 19:38:26 WEST 2018

[warm up] executor > local
[7c/eb5f2f] Submitted process > integrity_coverage_1_1 (02AR0553)
(...)
[31/7d90a1] Submitted process > compile_pilon_report_1_6

Completed at: Tue Jun 12 19:58:32 WEST 2018
Duration    : 20m 6s
Success     : true
Exit status : 0
```

Congratulations! You just built and executed your own pipeline with
only two commands! :tada:

## Installation

FlowCraft is available as a bioconda package, which already brings
nextflow:

```
conda install flowcraft
```

#### Container engines

Pipelines built with FlowCraft require at least one container
engine to be installed, among `docker`, `singularity` or `shifter`.
If you already have any one of these installed, you're good to go.
If not, we recommend installing singularity, though it should be installed with
root privileges and be accessible on all compute nodes.

## How to use it

The complete user guide of FlowCraft can be found on [readthedocs.org](http://flowcraft.readthedocs.io/en/latest/?badge=latest).
For a quick and dirty demonstration, see below.

### Quick guide

#### Building a pipeline

FlowCraft comes with a number of [ready-to-use components](http://flowcraft.readthedocs.io/en/latest/user/available_components.html) to build your
own pipeline. As long as you follow some basic rules (for example, the output type of one process
must match the input type of the next process), assembling a pipeline is done
using the `build` mode and the `-t` option:

```
flowcraft build -t "trimmomatic spades abricate" -o my_pipeline.nf -n "assembly pipe"
```

This command will generate everything that is necessary to run the
pipeline automatically, but the main pipeline executable
file will be `my_pipeline.nf`. This file will contain a nextflow pipeline
for genome assembly that starts with `trimmomatic` and finishes with anti-microbial
gene annotation using `abricate`.

#### Wait... what about the software parameters?

Each component in the pipeline has its own set of parameters that can be
modified before or when executing the pipeline. These parameters are
described in the documentation of each process and you can check the options
of your particular pipeline using the `help` option:

```
$ nextflow run my_pipeline.nf --help
N E X T F L O W  ~  version 0.30.1
Launching `my_pipeline.nf` [prickly_picasso] - revision: 2e1a226e6d

============================================================
                F L O W C R A F T
============================================================
Built using flowcraft v1.2.1


Usage: 
    nextflow run my_pipeline.nf

       --fastq                     Path expression to paired-end fastq files. (default: fastq/*_{1,2}.*) (default: 'fastq/*_{1,2}.*')
       
       Component 'INTEGRITY_COVERAGE_1_1'
       ----------------------------------
       --genomeSize_1_1            Genome size estimate for the samples in Mb. It is used to estimate the coverage and other assembly parameters andchecks (default: 1)
       --minCoverage_1_1           Minimum coverage for a sample to proceed. By default it's setto 0 to allow any coverage (default: 0)
       
       Component 'TRIMMOMATIC_1_2'
       ---------------------------
       --adapters_1_2              Path to adapters files, if any. (default: 'None')
       --trimSlidingWindow_1_2     Perform sliding window trimming, cutting once the average quality within the window falls below a threshold (default: '5:20')
       --trimLeading_1_2           Cut bases off the start of a read, if below a threshold quality (default: 3)
       --trimTrailing_1_2          Cut bases of the end of a read, if below a threshold quality (default: 3)
       --trimMinLength_1_2         Drop the read if it is below a specified length  (default: 55)
       
       Component 'FASTQC_1_3'
       ----------------------
       --adapters_1_3              Path to adapters files, if any. (default: 'None')
       
       Component 'ASSEMBLY_MAPPING_1_5'
       --------------------------------
       --minAssemblyCoverage_1_5   In auto, the default minimum coverage for each assembled contig is 1/3 of the assembly mean coverage or 10x, if the mean coverage is below 10x (default: 'auto')
       --AMaxContigs_1_5           A warning is issued if the number of contigs is overthis threshold. (default: 100)
       --genomeSize_1_5            Genome size estimate for the samples. It is used to check the ratio of contig number per genome MB (default: 2.1)
```

This help message is dynamically generated depending on the pipeline you build.
Since this pipeline starts with `trimmomatic`, which receives fastq files as input,
`--fastq` is the default parameter for providing paired-end fastq files.

#### Running a pipeline

Now that we have our nextflow pipeline built, we are ready to execute it by
providing input data. By default, FlowCraft pipelines will run locally and use
`singularity` to run the containers of each component. This can be
changed in multiple ways, but for convenience FlowCraft has already defined
profiles for most configurations of `executors` and `container` engines.

Running a pipeline locally with `singularity` can be done with:

```
# Pattern for paired-end fastq is '<sample>_1.fastq.gz <sample>_2.fastq.gz'
nextflow run my_pipeline --fastq "path/to/fastq/*_{1,2}.*"
```

If you want to run a pipeline in a cluster with SLURM and singularity, just use
the appropriate profile:

```
nextflow run my_pipeline --fastq "path/to/fastq/*_{1,2}.*" -profile slurm_sing
```

During the execution of the pipeline, the results and reports for each component
are continuously saved to the `results` and `reports` directory, respectively.

#### Inspecting a pipeline progress

Since version 1.2.0, it is possible to inspect the progress of a nextflow pipeline
using the `flowcraft inspect` mode. To check the progress in a terminal, simply
type:

```
flowcraft inspect
```

Run this command in the directory where the pipeline is running. Alternatively, you can view the progress
in FlowCraft's web service by using the `broadcast` option:

```
flowcraft inspect -m broadcast
```

<img src="https://github.com/assemblerflow/flowcraft-webapp/raw/master/flowcraft-webapp/frontend/resources/fc_short_demo.gif"/>

## Why not just write a Nextflow pipeline?

In many cases, building a static nextflow pipeline is sufficient for our goals.
However, when building our own pipelines, we often felt the need to add dynamism
to this process, particularly if we take into account how fast new tools arise
and existing ones change. Our biological goals also change over time and we
might need different pipelines to answer different questions. FlowCraft makes
this very easy, by having a set of pre-made and ready-to-use components that can
be freely assembled.

For instance, changing the assembly software in a genome assembly pipeline becomes
as easy as:

```
# Use spades
trimmomatic spades pilon
# Use skesa
trimmomatic skesa pilon
```

![example1](https://github.com/assemblerflow/flowcraft/raw/master/docs/resources/example_3.png)

If you are interested in having some sort of genome annotation, simply add those
components at the end, using a fork syntax:

```
# Run prokka and abricate at the end of the assembly
trimmomatic spades pilon (prokka | abricate)
```

![example2](https://github.com/assemblerflow/flowcraft/raw/master/docs/resources/example_1.png)

On the other hand, if you are interested in just performing allele calling for wgMLST,
simply add `chewbbaca`:

```
trimmomatic spades pilon chewbbaca
```

![example3](https://github.com/assemblerflow/flowcraft/raw/master/docs/resources/example_2.png)

Since nextflow handles parallelism of large sets of data so well, simple pipelines
of two components are also useful to build:

```
trimmomatic fastqc
```

As the number of existing components grows, so does your freedom to build pipelines.

## Roadmap

You can see what we're planning next on our [roadmap guide](https://github.com/assemblerflow/flowcraft/wiki/Roadmap).

## Developer guide

### Adding new components

Is there a missing component that you would like to see included? We would love
to expand! You could make a component request in our
[issue tracker](https://github.com/assemblerflow/flowcraft/issues).

If you want to be part of the team, you can contribute with the code as well. Each component
in FlowCraft can be independently added without having to worry about
the rest of the code base. You'll just need to have some knowledge of python
and nextflow. [Check the developer documentation for how-to guides](http://assemblerflow.readthedocs.io/en/latest/)


================================================
FILE: changelog.md
================================================
# Changelog

## 1.4.2

### New components

- `Bwa`: align short paired-end sequencing reads to long reference sequences
- `MarkDuplicates`: Identifies duplicate reads
- `BaseRecalibrator`: Detects systematic errors in base quality scores
- `Haplotypecaller`: Call germline SNPs and indels via local re-assembly of haplotypes

- `Seroba`: Serotyping of *Streptococcus pneumoniae* sequencing data (FastQ)
- `Concoct`: Clustering metagenomic assembled contigs with coverage and composition
- `MetaBAT2`: A robust statistical framework for reconstructing genomes from metagenomic data

### Minor/Other changes

- added manifest information to the `nextflow.config` file to allow for remote execution
- Added checks for the DAG's dot files in the compile_reports component

## 1.4.1

### New features

- Added support for the report system to:
    - `maxbin2`
- Added new `manifest.config` with the pipeline metadata

### New components

- `Kraken2`: Taxonomic identification on FastQ files

### Bug fixes

- Fix bug in `momps` component related to the introduction of the clear input parameter
- Fixed bug with the `-ft` parameters not retrieving the dockerhub tags for 
all the components.
- Fixed bug in the `megahit` process where the fastg mode would break the process
- Fix inspect and report mode to fetch the nextflow file independently of its 
position in the `nextflow run` command inside the .nextflow.log file.
- Fix parsing of .nextflow.log file when searching for `nextflow run` command.
- Fixed bug between mash_sketch_fasta and mash_dist.

### Minor/Other changes

- Added option to `dengue_typing` to retrieve closest reference sequence and link it 
with a secondary channel into `mafft`
- New version of DEN-IM recipe
- Now prints an ordered list of components
- Moved taxonomy results from `results/annotation/` to `results/taxonomy/`


## 1.4.0

### New features

- Added new `recipe` system to flowcraft along with 6 starting recipes.
Recipes are pre-made and curated pipelines that address specific questions.
To create a recipe, the `-r <recipe_name>` can be used. To list available
recipes, the `--recipe-list` and `--recipe-list-short` options were added. 
- Added `-ft` or `--fetch-tags` which allows to retrieve all DockerHub 
container tags.
- Added function to collect all the components from the components classes,
replacing the current process_map dictionary implementation. Now, it will be
generated from the engine rather than hardcoded into the dict.

### Components changes

- Added new `disableRR` param in the `spades` component that disables repeat
resolution
- The `abyss` and `spades` components emit GFA in a secondary channel.
- The new `bandage` component can accept either FASTA from a primary channel
  or GFA from a secondary channel.
- Updated skesa to version 2.3.0.
- Updated mash based components for the latest version - 1.6.0-1.

### New components

- Added component `abyss`.
- Added component `bandage`.
- Added component `unicycler`.
- Added component `prokka`.
- Added component `bcalm`.
- Added component `diamond`.

### Minor/Other changes

- Added removal of duplicate IDs from `reads_download` component input.
- Added seed parameter to `downsample_fastq` component.
- Added bacmet database to `abricate` component.
- Added default docker option to avoid docker permission errors.
- Changed the default URL generated by inspect and report commands.
- Added directives to `-L` parameter of build module.


### Bug fixes

- Fixed forks with same source process name.
- Fixed `inspect` issue when tasks took more than a day in duration.
- Added hardware address to `inspect` and `report` hash.

## 1.3.1

### Features

- Added a new `clearInput` parameter to components that change their input.
The aim of this option is to allow the controlled removal of temporary files,
which is particularly useful in very large workflows.

### Components changes

- Updated images for components `mash_dist`, `mash_screen` and 
`mapping_patlas`.

### New components

- Added component `fast_ani`.

### Minor/Other changes

- Added `--export-directives` option to `build` mode to export component's 
directives in JSON format to standard output.
- Added more date information in `inspect` mode, including the year and the
locale of the executing system.

## 1.3.0

### Features
- Added `report` run mode to Flowcraft that displays the report of any given
pipeline in the Flowcraft's web application. The `report` mode can be executed
after a pipeline ended or during the pipeline execution using the `--watch`
option.
- Added standalone report HTML at the end of the pipeline execution.
- Components with support for the new report system:
    - `abricate`
    - `assembly_mapping`
    - `check_coverage`
    - `chewbbaca`
    - `dengue_typing`
    - `fastqc`
    - `fastqc_trimmomatic`
    - `integrity_coverage`
    - `mlst`
    - `patho_typing`
    - `pilon`
    - `process_mapping`
    - `process_newick`
    - `process_skesa`
    - `process_spades`
    - `process_viral_assembly`
    - `seq_typing`
    - `trimmomatic`
    - `true_coverage`

### Minor/Other changes

- Refactored report json for components `mash_dist`, `mash_screen` and 
`mapping_patlas`

### Bug fixes
- Fixed issue where `seq_typing` and `patho_typing` processes were not feeding
report data to report compiler.
- Fixed fail messages for `process_assembly` and `process_viral_assembly` 
components

## 1.2.2

### Components changes

- `mapping_patlas`: refactored to remove temporary files used to create
sam and bam files and added data to .report.json. Updated databases to pATLAS
version 1.5.2.
- `mash_screen` and `mash_dist`: added data to .report.json. Updated databases 
to pATLAS version 1.5.2.
- Added new options to `abricate` component. Users can now provide custom database
directories, minimum coverage and minimum identity parameters.

### New components

- Added component `fasterq_dump`
- Added component `mash_sketch_fasta`
- Added component `mash_sketch_fastq`
- Added component `downsample_fastq` for FastQ read sub sampling using seqtk
- Added component `momps` for typing of Legionella pneumophila
- Added component `split_assembly`
- Added component `mafft`
- Added component `raxml`
- Added component `viral_assembly`
- Added component `progressive_mauve`
- Added component `dengue_typing`

### Minor/Other changes

- Added check for `params.accessions` that enables to report a proper
error when it is set to `null`.
- Added `build` option to export component parameters information in JSON format. 
- Fixed minor issue preventing the `maxbin2` and `split_assembly` components 
from being used multiple times in a pipeline
- Added a catch to the `filter_poly` process for cases where the input file is empty. 
- spades template now reports the exit code of spades' execution

### Bug fixes

- Removed the need for the nf process templates to have an empty line
at the beginning of the template files.
- Fixed issue when the `inspect` mode was executed on a pipeline directory
with failed processes but with the work directory removed (the log files
were no longer available).
- Fixed issue when the `inspect` mode was executed on a pipeline without the 
memory directory defined.
- Fixed issue in the `inspect` mode, where there is a rare race condition between
tags in the log and trace files.
- Fixed bug on `midas_species` process where the output file was not being 
linked correctly, causing the process to fail
- Fixed bug on `bowtie` where the reference parameter was missing the pid
- Fixed bug on `filter_poly` where the tag was missing

## 1.2.1

### Improvements

- The parameter system has been revamped, and parameters are now component-specific
and independent by default. This allows a better fine-tuning of the parameters
and also the execution of the same component multiple times (for instance in a fork)
with different parameters. The old parameter system that merged identical parameters
is still available by using the `--merge-params` flag when building the pipeline.
- Added a global `--clearAtCheckpoint` parameter that, when set to true, will remove
temporary files that are no longer necessary for downstream steps of the pipeline
from the work directory. This option is currently supported for the `trimmomatic`,
`fastqc_trimmomatic`, `skesa` and `spades` components. 

### New components

- `maxbin2`: An automatic tool for binning metagenomic sequences.
- `bowtie2`: Align short paired-end sequencing reads to long reference
sequences.
- `retrieve_mapped`: Retrieves the mapped reads of a previous bowtie2 mapping process.

### New recipes

- `plasmids`: A recipe to perform mapping, mash screen on reads
and also mash dist for assembly based approaches (all to detect
plasmids). This also includes annotation with abricate for the assembly.
- `plasmids_mapping`: A recipe to perform mapping for plasmids.
- `plasmids_mash`: A recipe to perform mash screen for plasmids.
- `plasmids_assembly`: A recipe to perform mash dist for plasmid
assemblies.

### Minor/Other changes

- Added "smart" check when the user provides a typo in pipeline string
for a given process, outputting some "educated" guesses to the
terminal.
- Added "-cr" option to show current recipe `pipeline_string`.
- Changed the way recipes were being parsed by `proc_collector` for the
usage of `-l` and `-L` options.
- Added check for non-ascii characters in colored_print.
- Fixed log when a file with the pipeline is provided to -t option
instead of a string.

### Bug fixes

- Fixed pipeline names that contain new line characters.
- Fixed pipeline generation when automatic dependencies were added right after a fork
- **Template: sistr.nf**: Fixed comparison that determined process status.
- Fixed issue with `--version` option.

## 1.2.0

### New components

- `card_rgi`: Anti-microbial resistance gene screening for assemblies
- `filter_poly`: Runs PrinSeq on paired-end FastQ files to remove low complexity sequences
- `kraken`: Taxonomic identification on FastQ files
- `megahit`: Metagenomic assembler for paired-end FastQ files
- `metaprob`: Performs read binning on metagenomic FastQ files
- `metamlst`: Checks the Sequence Type of metagenomic FastQ reads using Multilocus Sequence Typing
- `metaspades`: Metagenomic assembler for paired-end FastQ files
- `midas_species`: Taxonomic identification on FastQ files at the species level
- `remove_host`: Read mapping with Bowtie2 against the target host genome (default hg19) and removes the mapping reads
- `sistr`: Salmonella *in silico* typing component for assemblies. 

### Features

- Added `inspect` run mode to flowcraft for displaying the progress overview
  during a nextflow run. This run mode has `overview` and `broadcast` options
  for viewing the progress of a pipeline.

### Minor/Other changes

- Changed `mapping_patlas` docker container tag and variable
(PR [#76](https://github.com/assemblerflow/assemblerflow/pull/76)).
- The `env` scope of nextflow.config now extends the `PYTHONPATH`
environmental variable.
- Updated indexes for both `mapping_patlas` and `mash` based processes.
- New logo!

### Bug Fixes

- **Template: fastqc_report.py**: Added fix to trim range evaluation.
- **Script: merge_json.py**: Fixed chewbbaca JSON merge function.


================================================
FILE: docker/Dockerfile
================================================
FROM python:3.6-alpine3.7
# MAINTAINER is deprecated; the maintainer is recorded as an image label.
LABEL maintainer="Bruno Gonçalves <bfgoncalves@medicina.ulisboa.pt>"

WORKDIR /flowcraft

# Clone and install FlowCraft.
# git is only needed for the clone, so it is installed as a virtual package
# and removed within the same RUN instruction: deleting it in a later layer
# would not reduce the final image size.
RUN apk add --no-cache --virtual .build-deps git \
    && git clone https://github.com/assemblerflow/flowcraft.git \
    && cd flowcraft \
    && python setup.py install \
    && apk del .build-deps

WORKDIR /flowcraft

================================================
FILE: docs/Makefile
================================================
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS   ?=
SPHINXBUILD  ?= sphinx-build
PAPER        ?=
BUILDDIR      = _build

# Internal variables.
PAPEROPT_a4     = -D latex_elements.papersize=a4
PAPEROPT_letter = -D latex_elements.papersize=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

.PHONY: help
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html        to make standalone HTML files"
	@echo "  dirhtml     to make HTML files named index.html in directories"
	@echo "  singlehtml  to make a single large HTML file"
	@echo "  pickle      to make pickle files"
	@echo "  json        to make JSON files"
	@echo "  htmlhelp    to make HTML files and an HTML help project"
	@echo "  qthelp      to make HTML files and a qthelp project"
	@echo "  applehelp   to make an Apple Help Book"
	@echo "  devhelp     to make HTML files and a Devhelp project"
	@echo "  epub        to make an epub"
	@echo "  epub3       to make an epub3"
	@echo "  latex       to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf    to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja  to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  lualatexpdf to make LaTeX files and run them through lualatex"
	@echo "  xelatexpdf  to make LaTeX files and run them through xelatex"
	@echo "  text        to make text files"
	@echo "  man         to make manual pages"
	@echo "  texinfo     to make Texinfo files"
	@echo "  info        to make Texinfo files and run them through makeinfo"
	@echo "  gettext     to make PO message catalogs"
	@echo "  changes     to make an overview of all changed/added/deprecated items"
	@echo "  xml         to make Docutils-native XML files"
	@echo "  pseudoxml   to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck   to check all external links for integrity"
	@echo "  doctest     to run all doctests embedded in the documentation (if enabled)"
	@echo "  coverage    to run coverage check of the documentation (if enabled)"
	@echo "  dummy       to check syntax errors of document sources"

.PHONY: clean
clean:
	rm -rf $(BUILDDIR)/*

.PHONY: html
html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

.PHONY: dirhtml
dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

.PHONY: singlehtml
singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

.PHONY: pickle
pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

.PHONY: json
json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

.PHONY: htmlhelp
htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

.PHONY: qthelp
qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Templates.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Templates.qhc"

.PHONY: applehelp
applehelp:
	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
	@echo
	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
	@echo "N.B. You won't be able to view it unless you put it in" \
	      "~/Library/Documentation/Help or install it in your application" \
	      "bundle."

.PHONY: devhelp
devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/Templates"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Templates"
	@echo "# devhelp"

.PHONY: epub
epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

.PHONY: epub3
epub3:
	$(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3
	@echo
	@echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3."

.PHONY: latex
latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

.PHONY: latexpdf
latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

.PHONY: latexpdfja
latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

.PHONY: lualatexpdf
lualatexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through lualatex..."
	$(MAKE) PDFLATEX=lualatex -C $(BUILDDIR)/latex all-pdf
	@echo "lualatex finished; the PDF files are in $(BUILDDIR)/latex."

.PHONY: xelatexpdf
xelatexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through xelatex..."
	$(MAKE) PDFLATEX=xelatex -C $(BUILDDIR)/latex all-pdf
	@echo "xelatex finished; the PDF files are in $(BUILDDIR)/latex."

.PHONY: text
text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

.PHONY: man
man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

.PHONY: texinfo
texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

# Build the Texinfo sources, then run them through makeinfo.
# The recursive invocation uses $(MAKE) (not a literal `make`) so that
# command-line flags, -n/-t/-q handling and the jobserver are propagated,
# consistent with the other recursive targets in this file.
.PHONY: info
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

.PHONY: gettext
gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

.PHONY: changes
changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

.PHONY: linkcheck
linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

.PHONY: doctest
doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

.PHONY: coverage
coverage:
	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
	@echo "Testing of coverage in the sources finished, look at the " \
	      "results in $(BUILDDIR)/coverage/python.txt."

.PHONY: xml
xml:
	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."

.PHONY: pseudoxml
pseudoxml:
	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."

.PHONY: dummy
dummy:
	$(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy
	@echo
	@echo "Build finished. Dummy builder generates no files."


================================================
FILE: docs/_static/custom.css
================================================
/* Background colour of the side-nav search area and the top nav bar. */
div.wy-side-nav-search, div.wy-nav-top {
  background: #5c6bc0;
}

/* Menu caption text uses the same colour as the bars above. */
.wy-menu > .caption > .caption-text {
  color: #5c6bc0;
}

/* Allow the content area to use the full available width. */
.wy-nav-content {
  max-width: 100%
}

================================================
FILE: docs/about/about.rst
================================================
About
=====

FlowCraft is developed by the `Molecular Microbiology and Infection Unit (UMMI) <http://darwin.phyloviz.net/wiki/doku.php>`_
at the `Instituto de Medicina Molecular Joao Antunes <https://imm.medicina.ulisboa.pt/en/>`_.

This project is licensed under the `GPLv3 license <https://github.com/assemblerflow/flowcraft/blob/master/LICENSE>`_.
The source code of FlowCraft is available at `<https://github.com/assemblerflow/flowcraft>`_ and the
webservice is available at `<https://github.com/assemblerflow/flowcraft-webapp>`_.

================================================
FILE: docs/conf.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Templates documentation build configuration file, created by
# sphinx-quickstart on Mon Feb  5 14:24:12 2018.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath(".."))
sys.path.insert(0, os.path.abspath("../flowcraft/templates"))
import flowcraft

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.todo',
    'sphinx.ext.viewcode',
    'sphinx.ext.githubpages',
    'numpydoc',
    'sphinx.ext.autosummary',
    'sphinx.ext.mathjax'
]

autodoc_member_order = 'bysource'

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'FlowCraft'
copyright = '2018, FlowCraft team'
author = 'Diogo N. Silva, Tiago F. Jesus, Ines Mendes, Bruno Ribeiro-Goncalves'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# Both values are taken from the package so the documentation never drifts
# from the installed FlowCraft version.
#
# The short X.Y version.
version = flowcraft.__version__
# The full version, including alpha/beta/rc tags. Previously hard-coded to
# the placeholder '1', which made |release| and the default HTML title
# report a wrong version.
release = flowcraft.__version__

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
html_theme_options = {"collapse_navigation": True}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'Templatesdoc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'Templates.tex', 'Templates Documentation',
     'Diogo N. Silva', 'manual'),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'templates', 'Templates Documentation',
     [author], 1)
]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'Templates', 'Templates Documentation',
     author, 'Templates', 'One line description of project.',
     'Miscellaneous'),
]

# -- Options for Epub output ----------------------------------------------

# Bibliographic Dublin Core info.
epub_title = project
epub_author = author
epub_publisher = author
epub_copyright = copyright

# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''

# A unique identification for the text.
#
# epub_uid = ''

# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']


def setup(app):
    app.add_stylesheet('custom.css')

================================================
FILE: docs/dev/containers.rst
================================================
Docker containers guidelines
============================

All FlowCraft components require a docker container in order to be executed,
thus if a new component is added, a docker image should be added as well and
uploaded to `docker hub <https://hub.docker.com/>`__ in order to be available
to pull in other machines. Although this can be done in any personal
repository, we recommend that these docker images are added to an already
existing `FlowCraft github repository <https://github.com/assemblerflow/docker-imgs>`__
(called here ``Official``) so that docker builds can be automated with github
integration. Also, the centralization of all images will allow other
contributors to easily access and edit these containers instead of forking from
one side to another every time a container needs to be changed/updated.

Official FlowCraft Docker images
--------------------------------

Writing docker images
:::::::::::::::::::::

Official FlowCraft Docker images are available in
`this github repository <https://github.com/assemblerflow/docker-imgs>`__.
If you want to add your image to this repository please fork it and make a
Pull Request (PR) with the requested new image or create an issue asking to be
added to the organization as a contributor.


Building docker images
::::::::::::::::::::::

Then, after the image has been added to the FlowCraft
`docker-imgs <https://github.com/assemblerflow/docker-imgs>`__
github repository, it can be built through
`FlowCraft docker hub <https://hub.docker.com/u/flowcraft/dashboard/>`__.

Tag naming
^^^^^^^^^^

Each time a docker image is built using the automated build of docker hub it
should follow this nomenclature: ``version-patch``.
This is used to avoid the override of previous builds for the same images,
allowing for instance users to use different version of the same software using
the same docker image but with different tags.

- ``Version``: Is a string with three numbers like this: ``1.1.1``. Versions should
  change every time a new software is added to the container.

- ``Patch``: Is a number that follows a ``-`` after the version. Patches should
  change every time a change does not affect
  the software inside it. For example, updates to database related files required
  by some of the software inside the container.

Unofficial FlowCraft Docker images
----------------------------------

Although we **strongly** recommend that all images are stored in the FlowCraft
`docker-imgs <https://github.com/assemblerflow/docker-imgs>`__ github repo, it is
not mandatory to do it. Images can be built in another github repo and
also use another docker hub repository to build the images.
However, do make sure that you define it correctly in the directives of the
process as explained in :ref:`DirectivesAnchor`.


================================================
FILE: docs/dev/create_process.rst
================================================
Process creation guidelines
===========================

Basic process creation
----------------------

The addition of a new process to FlowCraft requires two main steps:

#. `Create process template`_: Create a jinja2 template in ``flowcraft.generator.templates`` with the
   nextflow code.

#. `Create Process class`_: Create a :class:`~flowcraft.generator.process.Process` subclass in
   :mod:`flowcraft.generator.components` with
   information about the process (e.g., expected input/output, secondary inputs,
   etc.).

.. _create-process:

Create process template
:::::::::::::::::::::::

First, create the nextflow template that will be integrated into the pipeline
as a process. This file must be placed in ``flowcraft.generator.templates``
and have the ``.nf`` extension. In order to allow the template to be
dynamically added to a pipeline file, we use the jinja2_ template language to
substitute key variables in the process, such as input/output channels.

An example created as a ``my_process.nf`` file is as follows::

    some_channel_{{ pid }} = Channel.value(params.param1{{ param_id}})
    other_channel_{{ pid }} = Channel.fromPath(params.param2{{ param_id}})

    process myProcess_{{ pid }} {

        {% include "post.txt" ignore missing %}

        publishDir "results/myProcess_{{ pid }}", pattern: "*.tsv"

        input:
        set sample_id, <data> from {{ input_channel }}
        val x from some_channel_{{ pid }}
        file y from other_channel_{{ pid }}
        val direct_from_parms from Channel.value(params.param3{{param_id}})

        // The output is optional
        output:
        set sample_id, <data> into {{ output_channel }}
        {% with task_name="abricate" %}
        {%- include "compiler_channels.txt" ignore missing -%}
        {% endwith %}

        """
        <process code/commands>
        """
    }

    {{ forks }}

The fields surrounded by curly brackets are jinja placeholders that will be
dynamically substituted when building the pipeline. They will ensure that the
processes and potential forks correctly link with each other and that
channels are unique and correctly linked. This example contains all
placeholder variables that are currently supported by FlowCraft.

{{pid}}
^^^^^^^

Used as a unique process identifier that prevents issues
from process and channel duplication in the pipeline. Therefore, it should be
appended to each process and channel name as ``_{{ pid }}`` (note the underscore)::

    some_channel_{{ pid }}
    process myProcess_{{ pid }}

{{param_id}}
^^^^^^^^^^^^

Same as the **{{ pid }}**, but sets the identifier for nextflow ``params``. It should
be appended to each ``param`` as ``{{ param_id }}``. This will allow parameters
to be specific to each component in the pipeline::

    Channel.value(params.param1{{ param_id}})

Note that the parameters used in the template, should also be defined in the
Process class params attribute (see `Parameters`_).

{% include "post.txt" %}
^^^^^^^^^^^^^^^^^^^^^^^^

Inserts ``beforeScript`` and ``afterScript`` statements into the process that
set environmental variables and a series of *dotfiles* for the process to
log its status, warnings, fails and reports (see :ref:`dotfiles` for
more information). It also includes scripts for sending requests to
REST APIs (only when certain pipeline parameters are used).

{{input_channel}}
^^^^^^^^^^^^^^^^^

All processes must include **one and only one** input channel. In most cases,
this channel should be defined with a two element tuple that contains the
sample ID and then the actual data file/stream. We suggest the sample ID
variable to be named ``sample_id`` as a standard. If another variable name
is specified and you include the ``compiler_channels.txt`` in the process,
you'll need to change the sample ID variable (see `Sample ID variable`_).

{{output_channel}}
^^^^^^^^^^^^^^^^^^

Terminal processes may skip the output channel entirely. However, if you want
to link the main output of this process with subsequent ones, this placeholder
must be used **only once**. Like in the input channel, this channel should
be defined with a two element tuple with the sample ID and the data. The
sample ID must match the one specified in the ``input_channel``.

.. _compiler:

{% include "compiler_channels.txt %}
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This will include the special channels that will compile the status/logging
of the processes throughout the pipeline. **You must include the whole
block** (see `Status channels`_)::

    {% with task_name="abricate" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}


{{forks}}
^^^^^^^^^

Inserts potential forks of the main output channel. It is **mandatory** if
the ``output_channel`` is set.

Complete example
^^^^^^^^^^^^^^^^

As an example of a complete process, this is the template of ``spades.nf``::

    IN_spades_opts_{{ pid }} = Channel.value([params.spadesMinCoverage{{ param_id }},params.spadesMinKmerCoverage{{ param_id }}])
    IN_spades_kmers_{{pid}} = Channel.value(params.spadesKmers{{ param_id }})

    process spades_{{ pid }} {

        // Send POST request to platform
        {% include "post.txt" ignore missing %}

        tag { fastq_id + " getStats" }
        publishDir 'results/assembly/spades/', pattern: '*_spades.assembly.fasta', mode: 'copy'

        input:
        set fastq_id, file(fastq_pair), max_len from {{ input_channel }}.join(SIDE_max_len_{{ pid }})
        val opts from IN_spades_opts_{{ pid }}
        val kmers from IN_spades_kmers_{{ pid }}

        output:
        set fastq_id, file('*_spades.assembly.fasta') optional true into {{ output_channel }}
        set fastq_id, val("spades"), file(".status"), file(".warning"), file(".fail") into STATUS_{{ pid }}
        file ".report.json"

        script:
        template "spades.py"
    }

    {{ forks }}


Create Process class
::::::::::::::::::::

The process class will contain the information that FlowCraft
will use to build the pipeline and assess potential conflicts/dependencies
between processes. This class should be created in one of the category files in the
:mod:`flowcraft.generator.components` module (e.g.: ``assembly.py``). If
the new component does not fit in any of the existing categories, create a
new one that imports :class:`flowcraft.generator.process.Process` and add
your new class. This class should inherit from the
:class:`~flowcraft.generator.process.Process` base
class::

    class MyProcess(Process):

        def __init__(self, **kwargs):

            super().__init__(**kwargs)

            self.input_type = "fastq"
            self.output_type = "fasta"

This is the simplest working example of a process class, which basically needs
to inherit the parent class attributes (the ``super`` part).
Then we only need to define the expected input
and output types of the process. There are no limitations to the
input/output types.
However, a pipeline will only build successfully when all processes correctly
link the output with the input type.

Depending on the process, other attributes may be required:

    - `Parameters`_: Parameters provided by the user to be used in the process.
    - `Secondary inputs`_: Channels created from parameters provided by the
      user.
    - Secondary `Link start`_ and `Link end`_: Secondary links that connect
      secondary information between two processes.
    - `Dependencies`_: List of other processes that may be required for
      the current process.
    - `Directives`_: Default information for RAM/CPU/Container directives
      and more.

Add to available components
:::::::::::::::::::::::::::

Contrary to the previous implementation (versions <= 1.3.1), the available components
are now retrieved automatically by FlowCraft and there is no need to add the
process to any dictionary (previously ``process_map``). In order for the component
to be accessible to ``flowcraft build``, the process template name in
``snake_case`` must match the process class name in ``CamelCase``. For instance,
if the process template is named ``my_process.nf``, the process class must
be ``MyProcess``; FlowCraft will then be able to automatically add it to the
list of available components.

.. note::
    Note that the template string does not include the ``.nf`` extension.

Process attributes
------------------

This section describes the main attributes of the
:class:`~flowcraft.generator.process.Process` class: what they
do and how they impact the pipeline generation.

Input/Output types
::::::::::::::::::

The :attr:`~flowcraft.generator.process.Process.input_type` and
:attr:`~flowcraft.generator.process.Process.output_type` attributes
set the expected type of input and output of the process. There are no
limitations to the type of input/output that are provided. However, processes
will only link when the output of one process matches the input of the
subsequent process (unless the
:attr:`~flowcraft.generator.process.Process.ignore_type` attribute is set
to ``True``). Otherwise, FlowCraft will raise an exception stating that
two processes could not be linked.

.. note::

    The input/output types that are currently used are ``fastq``, ``fasta``.

Parameters
::::::::::

The :attr:`~flowcraft.generator.process.Process.params` attribute sets
the parameters that can be used by the process. For each parameter, a default
value and a description should be provided. The default value will be set
in the ``params.config`` file in the pipeline directory and the description
will be used to generate the custom help message of the pipeline::

    self.params = {
        "genomeSize": {
            "default": 2.1,
            "description": "Expected genome size (default: params.genomeSize)"
        },
        "minCoverage": {
            "default": 15,
            "description": "Minimum coverage to proceed (default: params.minCoverage)"
        }
    }

These parameters can be simple values that are not fed into
any channel, or can be automatically set to a secondary input channel via
`Secondary inputs`_ (see below).

They can be specified when running the pipeline like any nextflow parameter
(e.g.: ``--genomeSize 5``) and used in the nextflow process as usual
(e.g.: ``params.genomeSize``).

.. note::
    These pairs are then used to populate the ``params.config`` file that is
    generated in the pipeline directory. Note that the values are replaced
    literally in the config file. For instance, ``"genomeSize": 2.1,`` will appear
    as ``genomeSize = 2.1``, whereas ``"adapters": "'None'"`` will appear as
    ``adapters = 'None'``. If you want a value to appear as a string, the double
    and single quotes are necessary.


Secondary inputs
::::::::::::::::

.. warning::
    The ``secondary_inputs`` attribute has been deprecated since **v1.2.1.**
    Instead, specify the secondary channels directly in the nextflow template
    files.

Any process can receive one or more input channels in addition to the main
channel. These are particularly useful when the process needs to receive
additional options from the ``parameters`` scope of nextflow.
These additional inputs can be specified via the
:attr:`~flowcraft.generator.process.Process.secondary_inputs` attribute,
which should store a list of dictionaries (a dictionary for each input). Each dictionary should
contain a key:value pair with the name of the parameter (``params``) and the
definition of the nextflow channel (``channel``). Consider the example below::

    self.secondary_inputs = [
            {
                "params": "genomeSize",
                "channel": "IN_genome_size = Channel.value(params.genomeSize)"
            },
            {
                "params": "minCoverage",
                "channel": "IN_min_coverage = Channel.value(params.minCoverage)"
            }
        ]

This process will receive two secondary inputs that are given by the
``genomeSize`` and ``minCoverage`` parameters. These should be also specified
in the :attr:`~flowcraft.generator.process.Process.params` attribute
(See `Parameters`_ above).

For each of these parameters, the dictionary
also stores how the channel should be defined at the beginning of the pipeline
file. Note that this channel definition mentions the parameters (e.g.
``params.genomeSize``). An additional best practice for channel definition
is to include one or more sanity checks to ensure that the provided arguments
are correct. These checks can be added in the nextflow template file, or
literally in the ``channel`` string::

    self.secondary_inputs = [
        {
            "params": "genomeSize",
            "channel":
                "IN_genome_size = Channel.value(params.genomeSize)"
                ".map{it -> it.toString().isNumber() ? it : exit(1, \"The genomeSize parameter must be a number or a float. Provided value: '${params.genomeSize}'\")}"
        }
    ]

Extra input
:::::::::::

The :attr:`~flowcraft.generator.process.Process.extra_input` attribute
is mostly a user specified directive that allows the injection of additional
input data from a parameter into the main input channel of the process.
When a pipeline is defined as::

    process1 process2={'extra_input':'var'}

FlowCraft will expose a new ``var`` parameter, setup an extra input
channel and mix it with ``process2`` main input channel. A more detailed
explanation follows below.

First, FlowCraft will create a nextflow channel from the parameter name
provided via the ``extra_input`` directive. The channel string will depend
on the input type of the process (this string is fetched from the
:attr:`~flowcraft.generator.process.Process.RAW_MAPPING` attribute).
For instance, if the input type of
``process2`` is ``fastq``, the new extra channel will be::

    IN_var_extraInput = Channel.fromFilePairs(params.var)

Since the same extra input parameter may be used by more than one process,
the ``IN_var_extraInput`` channel will be automatically forked into the
final destination channels::

    // When there is a single destination channel
    IN_var_extraInput.set{ EXTRA_process2_1_2 }
    // When there are multiple destination channels for the same parameter
    IN_var_extraInput.into{ EXTRA_process2_1_2; EXTRA_process3_1_3 }

The destination channels are the ones that will be actually mixed with
the main input channels::

    process process2 {
        input:
        (...) main_channel.mix(EXTRA_process2_1_2)
    }

In these cases, the processes that receive the extra input will process the
data provided by the preceding channel **AND** by the parameter. The data
provided via the extra input parameter does not have to wait for the
``main_channel``, which means that they can run in parallel, if there are
enough resources.

Compiler
::::::::

The :attr:`~flowcraft.generator.process.Process.compiler` attribute
allows one or more channels of the process to be fed into a compiler process
(See `Compiler processes`_). These are special processes that collect
information from one or more processes to execute a given task. Therefore,
this parameter can only be used when there is an appropriate compiler process
available (the available compiler processes are set in the
:attr:`~flowcraft.generator.engine.NextflowGenerator.compilers` dictionary). In order to
provide one or more channels to a compiler process, simply add a key:value to the
attribute, where the key is the id of the compiler process present in the
:attr:`~flowcraft.generator.engine.NextflowGenerator.compilers` dictionary and the value
is the list of channels::

    self.compiler["patlas_consensus"] = ["mappingOutputChannel"]

Link start
::::::::::

The :attr:`~flowcraft.generator.process.Process.link_start` attribute
stores a list of strings of channel names that can be used as secondary
channels in the pipeline (See the `Secondary links between process`_ section).
By default, this attribute contains the main output channel, which means
that every process can fork the main channel to one or more receiving
processes.

Link end
::::::::

The :attr:`~flowcraft.generator.process.Process.link_end` attribute
stores a list of dictionaries with channel names that are meant to be
received by the process as secondary channel **if** the corresponding
`Link start`_ exists in the pipeline. Each dictionary in this list will define
one secondary channel and requires two key:value pairs::

    self.link_end.append({
        "link": "SomeChannel",
        "alias": "OtherChannel"
    })

If another process exists in the pipeline with
``self.link_start.extend(["SomeChannel"])``, FlowCraft will automatically
establish a secondary channel between the two processes. If there are multiple
processes receiving from a single one, the channel from the latter will
fork into any number of receiving processes.

Dependencies
::::::::::::

If a process depends on the presence of one or more processes upstream in the
pipeline, these can be specified via the
:attr:`~flowcraft.generator.process.Process.dependencies` attribute.
When building the pipeline, if at least one of the dependencies is absent,
FlowCraft will raise an exception informing of a missing dependency.

.. _DirectivesAnchor:

Directives
::::::::::

The :attr:`~flowcraft.generator.process.Process.directives` attribute
allows for information about cpu/RAM usage and container to be specified
for each nextflow process in the template file. For instance, considering
the case where a ``Process`` has a template with two nextflow processes::

    process proc_A_{{ pid }} {
        // stuff
    }

    process proc_B_{{ pid }} {
        // stuff
    }

Then, information about each process can be specified individually in the
:attr:`~flowcraft.generator.process.Process.directives` attribute::


    class myProcess(Process):
        (...)
        self.directives = {
            "proc_A": {
                "cpus": 1,
                "memory": "4GB"
            },
            "proc_B": {
                "cpus": 4,
                "container": "my/container",
                "version": "1.0.0"
            }
        }

The information in this attribute will then be used to build the
``resources.config`` (containing the information about cpu/RAM) and
``containers.config`` (containing the container images) files. Whenever a
directive is missing, such as the ``container`` and ``version`` from ``proc_A``
and ``memory`` from ``proc_B``, nothing about them will be written into the
config files and they will use the **default pipeline values**:

- ``cpus``: ``1``
- ``memory``: ``1GB``
- ``container``: `flowcraft_base`_ image

.. _flowcraft_base: https://hub.docker.com/r/ummidock/assemblerflow_base/~/dockerfile/

Ignore type
:::::::::::

The :attr:`~flowcraft.generator.process.Process.ignore_type` attribute
controls whether a match between the input of the current process and the
output of the previous one is enforced or not. When there are multiple
terminal processes that fork from the main channel, there is no need to
enforce the type match and in that case this attribute can be set to ``True``.

Process ID
::::::::::

The process ID, set via the
:attr:`~flowcraft.generator.process.Process.pid` attribute, is an
arbitrary, incremental number that is assigned to each process depending
on its position in the pipeline. It is mainly used to ensure that there are
no duplicated channels even when the same process is used multiple times
in the same pipeline.

Template
::::::::

The :attr:`~flowcraft.generator.process.Process.template` attribute
is used to fetch the jinja2 template file that corresponds to the current
process. The path to the template file is determined as follows::

    join(<template directory>, template + ".nf")


Status channels
:::::::::::::::

The status channels are special channels dedicated to passing information
regarding the status, warnings, fails and logging from each process
(see :ref:`dotfiles` for more information). They are used only when the
nextflow template file contains the appropriate jinja2 placeholder::

    output:
    {% with task_name="<nextflow_template_name>" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

By default,
every ``Process`` class contains a
:attr:`~flowcraft.generator.process.Process.status_channels` list
attribute that contains the
:attr:`~flowcraft.generator.process.Process.template` string::

    self.status_channels = ["STATUS_{}".format(template)]

If there is only one nextflow process in the template and the ``task_name``
variable in the template matches the
:attr:`~flowcraft.generator.process.Process.template` attribute, then
it's all automatically set up.

If the template file contains **more than one nextflow process**
definition, multiple placeholders can be provided in the template::

    process A {
        (...)
        output:
        {% with task_name="A" %}
        {%- include "compiler_channels.txt" ignore missing -%}
        {% endwith %}
    }

    process B {
        (...)
        output:
        {% with task_name="B" %}
        {%- include "compiler_channels.txt" ignore missing -%}
        {% endwith %}
    }

In this case, the
:attr:`~flowcraft.generator.process.Process.status_channels` attribute
would need to be changed to::

    self.status_channels = ["A", "B"]

Sample ID variable
^^^^^^^^^^^^^^^^^^

In case you change the standard nextflow variable that stores the sample ID
in the input of the process (``sample_id``), you also need to change it for
the ``compiler_channels`` placeholder::

    process A {

    input:
    set other_id, data from {{ input_channel }}

    output:
    {% with task_name="A", sample_id="other_id" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    }

Advanced use cases
------------------

Compiler processes
::::::::::::::::::

Compilers are special processes that collect data from one or more processes
and perform a given task with that compiled data. They are automatically
included in the pipeline when at least one of the source channels is present.
In the case there are multiple source channels, they are merged according
to a specified operator.

Creating a compiler process
^^^^^^^^^^^^^^^^^^^^^^^^^^^

The creation of the compiler process is simpler than that of a regular process
but follows the same three steps.

1. Create a nextflow template file in ``flowcraft.generator.templates``::

    process fullConsensus {

        input:
        set id, file(infile_list) from {{ compile_channels }}

        output:
        <output channels>

        script:
        """
        <commands/code/template>
        """

    }

The only requirement is the inclusion of a ``compile_channels`` jinja
placeholder in the main input channel.

2. Create a Compiler class in the :mod:`flowcraft.generator.process`
   module::

    class PatlasConsensus(Compiler):

        def __init__(self, **kwargs):

            super().__init__(**kwargs)

This class must inherit from
:class:`~flowcraft.generator.process.Compiler` and does not require any
more changes.

3. Map the compiler template file to the class in
   :attr:`~flowcraft.generator.engine.NextflowGenerator.compilers` attribute::

    self.compilers = {
        "patlas_consensus": {
            "cls": pc.PatlasConsensus,
            "template": "patlas_consensus",
            "operator": "join"
        }
    }

Each compiler should contain a key:value entry. The key is the compiler
id that is then specified in the :attr:`~flowcraft.generator.process.Process.compiler`
attribute of the component classes. The value is a json/dict object that
specifies the compiler class in the ``cls`` key, the template string in the
``template`` key and the operator used to join the channels into the
compiler via the ``operator`` key.

How a compiler process works
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Consider the case where you have a compiler process named ``compiler_1`` and
two processes, ``process_1`` and ``process_2``, both of which feed a single
channel to ``compiler_1``. This means that the class definition of these
processes include::

    class Process_1(Process):
        (...)
        self.compiler["compiler_1"] = ["channel1"]

    class Process_2(Process):
        (...)
        self.compiler["compiler_1"] = ["channel2"]

If a pipeline is built with at least one of these processes, the ``compiler_1``
process will be automatically included in the pipeline. If more than one
channel is provided to the compiler, they will be merged with the specified
operator::

    process compiler_1 {

        input:
        set sample_id, file(infile_list) from channel2.join(channel1)

    }

This will allow the output of multiple separate processes to be processed by
a single process in the pipeline, and it automatically adjusts according
to the channels provided to the compiler.

Secondary links between process
:::::::::::::::::::::::::::::::

In some cases, it might be necessary to perform additional links between
two or more processes.
For example, the maximum read length might be gathered in one process, and
that information may be required by a subsequent process. These secondary
channels allow this information to be passed between these processes.

These additional links are called secondary channels and
they may be explicitly or implicitly declared.

Explicit secondary channels
^^^^^^^^^^^^^^^^^^^^^^^^^^^

To create an explicit secondary channel, the origin or source of this channel
must be declared in the nextflow process that sends it::

    // secondary channels can be created inside the process
    output:
    <main output> into {{ output_channel }}
    <secondary output> into SIDE_max_read_len_{{ pid }}

    // or outside
    SIDE_phred_{{ pid }} = Channel.create()

Then, we add the information that this process has a secondary channel start
via the ``link_start`` list attribute in the corresponding
``flowcraft.generator.process.Process`` class::

    class MyProcess(Process):

        (...)

        self.link_start.extend(["SIDE_max_read_len", "SIDE_phred"])

Notice that we extend the ``link_start`` list, instead of simply assigning.
This is because all processes already have the main channel as an implicit
link start (See `Implicit secondary channels`_).

**Now, any process that is executed after this one can receive this secondary
channel.**

For another process to receive this channel, it will be necessary to add this
information to the process class(es) via the ``link_end`` list attribute::

    class OtherProcess(Process):

        (...)

        self.link_end.append({
            "link": "SIDE_phred",
            "alias": "OtherName"
        })

Notice that now we append a dictionary with two key:values. The first, ``link``,
must match a string from the ``link_start`` list (in this case, ``SIDE_phred``).
The second, ``alias``, will be the channel name in the receiving process nextflow
template (which can be the same as the ``link`` value).

Now, we only need to add the secondary channel to the nextflow template, as in
the example below::

    input:
    <main_input> from {{ input_channel }}.mix(OtherName_{{ pid}})

Implicit secondary channels
^^^^^^^^^^^^^^^^^^^^^^^^^^^

By default, the main output of each process is declared as a secondary channel
start. This means that any process can receive the main output channel as
a secondary channel of a subsequent process. This can be useful in situations
where a post-assembly process (has ``assembly`` as expected input and output)
needs to receive the last channel with fastq files::

    class AssemblyMapping(Process):

        (...)

        self.link_end.append({
            "link": "MAIN_fq",
            "alias": "_MAIN_assembly"
        })

In this example, the ``AssemblyMapping`` process will receive a secondary
channel from the last process that outputs fastq files, into a channel
called ``_MAIN_assembly``. Then, this channel is received in the nextflow
template like this::

    input:
    <main input> from {{ input_channel }}.join(_{{ input_channel }})

Implicit secondary channels can also be used to
fork the last output channel into multiple terminal processes::

    class Abricate(Process):

        (...)

        self.link_end.append({
            "link": "MAIN_assembly",
            "alias": "MAIN_assembly"
        })

In this case, since ``MAIN_assembly`` is already the prefix of the main
output channel of this process, there is no need for changes in the process
template::

    input:
    <main input> from {{ input_channel }}


.. _jinja2: http://jinja.pocoo.org/docs/2.10/


================================================
FILE: docs/dev/create_recipe.rst
================================================
Recipe creation guidelines
===========================

Recipes are pre-made pipeline strings that may be associated with specific
parameters and directives and are used to rapidly build a certain type of
pipeline.

Instead of building a pipeline like::

    -t "integrity_coverage fastqc_trimmomatic fastqc spades pilon"

The user can simply specify a recipe with that pipeline::

    -r assembly

Recipe creation
---------------

The creation of new recipes is a very simple and straightforward process.
You need to create a new file in the ``flowcraft/generator/recipes`` folder
with any name and create a basic class with three attributes::

    try:
        from generator.recipe import Recipe
    except ImportError:
        from flowcraft.generator.recipe import Recipe


    class Innuca(Recipe):

        def __init__(self):
            super().__init__()

            # Recipe name
            self.name = "innuca"

            # Recipe pipeline
            self.pipeline_str = <pipeline string>

            # Recipe parameters and directives
            self.directives = { <directives> }

And that's it! Now there is a new recipe available with the ``innuca`` name and
we can build this pipeline using the option ``-r innuca``.

Name
^^^^

This is the name of the recipe, which is used to make a match with the recipe
name provided by the user via the ``-r`` option.

Pipeline_str
^^^^^^^^^^^^

The pipeline string as if provided via the ``-t`` option.

Directives
^^^^^^^^^^

A dictionary containing the parameters and directives for each process in the
pipeline string. **Setting this attribute is optional and components
that are not specified here will assume their default values**. In general, each
element in this dictionary should have the following format::

    self.directives = {
        "component_name": {
            "params": {
                "paramA": "value"
            },
            "directives": {
                "directiveA": "value"
            }
        }
    }

This will set the provided parameters and directives to the component, but it is
possible to provide only one.

A more concrete example of a real component and directives follows::

    self.pipeline_str = "integrity_coverage fastqc"

    # Set parameters and directives only for integrity_coverage
    # and leave fastqc with the defaults
    self.directives = {
        "integrity_coverage": {
            "params": {
                "minCoverage": 20
            },
            "directives": {
                "memory": "1GB"
            }
        }
    }

Duplicate components
~~~~~~~~~~~~~~~~~~~~

In some cases, the same component may be present multiple times in the pipeline
string of a recipe. In these cases, directives can be assigned to each individual
component by adding a ``#<id>`` suffix to the component::

    self.pipeline_str = "integrity_coverage ( trimmomatic spades#1 | spades#2)"

    self.directives = {
        "spades#1": {
            "directives": {
                "memory": "10GB"
            }
        },
        "spades#2": {
            "directives": {
                "version": "3.7.0"
            }
        }
    }


================================================
FILE: docs/dev/create_recipes.rst
================================================
Recipe creation guidelines
==========================

Under construction.

================================================
FILE: docs/dev/create_template.rst
================================================
Template creation guidelines
============================

Though none of these guidelines are mandatory nor required, their usage is
highly recommended for several reasons:

- Consistency in the outputs of the templates throughout the pipeline,
  particularly the status and report dotfiles (see :ref:`dotfiles` section);
- Debugging purposes;
- Versioning;
- Proper documentation of the template scripts.

Preface header
--------------

After the script shebang, a header with a brief description of the purpose and
expected inputs and outputs should be provided. A complete example of such
description can be viewed in :mod:`flowcraft.templates.integrity_coverage`.

Purpose
^^^^^^^

Purpose section contains a brief description of the script's objective. E.g.::

    Purpose
    -------

    This module is intended to parse the results of FastQC for paired end FastQ \
    samples.

Expected input
^^^^^^^^^^^^^^

Expected input section contains a description of the variables that are
provided to the main function of the template script. These variables are
defined in the input channels of the process in which the template is supposed
to be executed. E.g.::

    Expected input
    --------------

    The following variables are expected whether using NextFlow or the
    :py:func:`main` executor.

    - ``mash_output`` : String with the name of the mash screen output file.
        - e.g.: ``'sortedMashScreenResults_SampleA.txt'``

This means that the process that will execute this template will have the input
defined as::

    input:
    file(mash_output) from <channel>

Generated output
^^^^^^^^^^^^^^^^

Generated output section contains a description of the output files that the
template script is intended to generate. E.g.::

    Generated output
    ----------------

    The generated output are output files that contain an object, usually a string.

    - ``fastqc_health`` : Stores the health check for the current sample. If it
        passes all checks, it contains only the string 'pass'. Otherwise, contains
        the summary categories and their respective results

These can then be passed to the output channel(s) in the nextflow process::

    output:
    file(fastqc_health) into <channel>

.. note ::

    Since templates can be re-used by multiple processes, not all generated
    outputs need to be passed to output channels. Depending on the job of
    the nextflow process, it may catch none or all of the output files
    generated by the template.


Versioning and logging
----------------------

FlowCraft has a specific ``logger``
(:func:`~flowcraft.templates.flowcraft_utils.flowcraft_base.get_logger`) and
versioning system that can be imported from
:mod:`flowcraft.templates.flowcraft_utils`: ::

    # the module that imports the logger and the decorator class for versioning
    # of the script itself and other software used in the script
    from flowcraft_utils.flowcraft_base import get_logger, MainWrapper


Logger
^^^^^^

A `logger` function is also required to add logs to the script. The logs
are written to the ``.command.log`` file in the work directory of each process.

First, the logger must be called, for example, after the **imports** as follows::

    logger = get_logger(__file__)

Then, it may be used at will, using the default `logging levels
<https://docs.python.org/3.6/library/logging.html#levels>`_ . E.g.::

    logger.debug("Information that may be important for debugging")
    logger.info("Information related to the normal execution steps")
    logger.warning("Events that may require the attention of the developer")
    logger.error("Module exited unexpectedly with error:\\n{}".format(
                traceback.format_exc()))

MainWrapper decorator
^^^^^^^^^^^^^^^^^^^^^

This :class:`~flowcraft.templates.flowcraft_utils.flowcraft_base.MainWrapper`
class decorator allows the program to fetch information on the script version,
build and template name. For example::

    # This can also be declared after the imports
    __version__ = "1.0.0"
    __build__ = "15012018"
    __template__ = "process_abricate-nf"

The :class:`~flowcraft.templates.flowcraft_utils.flowcraft_base.MainWrapper`
should decorate the main function of the script.
E.g.::

    @MainWrapper
    def main():
        #some awesome code
        ...

Besides searching for the script's version, build and template name this decorator
will also search for a specific set of functions that start with the
substring ``__get_version``. For example::

    def __get_version_fastqc():

        try:

            cli = ["fastqc", "--version"]
            p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE)
            stdout, _ = p.communicate()

            version = stdout.strip().split()[1][1:].decode("utf8")

        except Exception as e:
            logger.debug(e)
            version = "undefined"

        # Note that it returns a dictionary that will then be written to the .versions
        # dotfile
        return {
            "program": "FastQC",
            "version": version,
            # some programs may also contain build.
        }

These functions are used to fetch the version, name and other relevant
information from third-party software and the only requirement is that they
return a dictionary with **at least** two key:value pairs:

- ``program``: String with the name of the program.
- ``version``: String with the version of the program.

For more information, refer to the
:func:`~flowcraft.templates.flowcraft_utils.flowcraft_base.MainWrapper.build_versions`
method.

Nextflow `.command.sh`
----------------------

When these templates are used as a Nextflow `template <https://www.nextflow.io/docs/latest/process.html#template>`_
they are executed as a ``.command.sh`` file in the work directory of each
process. In this case, we recommend the inclusion of
an **if statement** to parse the arguments sent from nextflow to the python
template. For example, imagine we have a path to a file name to pass as
argument between nextflow and the required template::

    # code check for nextflow execution
    if __file__.endswith(".command.sh"):
        FILE_NAME = '$Nextflow_file_name'
        # logger output can also be included here, for example:
        logger.debug("Running {} with parameters:".format(
            os.path.basename(__file__)))
        logger.debug("FILE_NAME: {}".format(FILE_NAME))

Then, we could use this variable as the argument of a function, such as::

    def main(FILE_NAME):
        #some awesome code
        ...


This way, we can use this function with nextflow arguments or without them,
as is the case when the templates are used as standalone modules.

Use numpy docstrings
--------------------

``FlowCraft`` uses numpy docstrings to document code.
Use
`this link <http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html>`_
for reference.

================================================
FILE: docs/dev/general_orientation.rst
================================================
General orientation
===================

Codebase structure
------------------

The most important elements of FlowCraft's directory structure are:

- ``generator``:
    - ``components``: Contains the ``Process`` classes for each component
    - ``templates``: Contains the nextflow jinja template files for each component
    - ``engine.py``: The engine of FlowCraft that builds the pipeline
    - ``process.py``: Contains the abstract ``Process`` class that is inherited
      by all component classes
    - ``pipeline_parser.py``: Functions that parse and check the pipeline string
    - ``recipe.py``: Class responsible for creating recipes
- ``templates``: A git submodule of the `templates`_ repository that contains
  the template scripts for the components.

.. _templates: https://github.com/ODiogoSilva/templates


Code style
----------

- **Style**:  the code base of flowcraft should adhere (the best it can) to
  the `PEP8`_ style guidelines.
- **Docstrings**: code should be generally well documented following the
  `numpy docstring`_ style.
- **Quality**: there is also an integration with the `codacy`_ service to
  evaluate code quality, which is useful for detecting several coding
  issues that may appear.


Testing
-------

Tests are performed using `pytest`_ and the source files are stored in the
``flowcraft/tests`` directory. Tests must be executed from the root directory
of the repository.

Documentation
-------------

Documentation source files are stored in the ``docs`` directory. The general
configuration file is found in ``docs/conf.py`` and the entry
point to the documentation is ``docs/index.html``.


.. _pytest: https://docs.pytest.org/en/latest/
.. _PEP8: https://www.python.org/dev/peps/pep-0008/
.. _numpy docstring: https://numpydoc.readthedocs.io/en/latest/format.html
.. _codacy: https://app.codacy.com/app/o.diogosilva/assemblerflow/dashboard

================================================
FILE: docs/dev/pipeline_reporting.rst
================================================
Pipeline reporting
==================

This section describes how the reports of a FlowCraft pipeline are generated
and collected at the end of a run. These reports can then be sent to the
`FlowCraft web application <https://github.com/assemblerflow/flowcraft-webapp>`_
where the results are visualized.

.. important::
    Note that if the nextflow process reports add new types of data, one or
    more React components need to be added to the web application for them
    to be rendered.

Data collection
---------------

The data for the pipeline reports is collected from three dotfiles in each nextflow
process (they should be present in each work sub directory):

- **.report.json**: Contains report data (See :ref:`report-json` for more information).
- **.versions**: Contains information about the versions of the software used
  (See :ref:`versions` for more information).
- **.command.trace**: Contains resource usage information.

The **.command.trace** file is generated by nextflow when the **trace** scope
is active. The **.report.json** and **.versions** files are specific to
FlowCraft pipelines.

Generation of dotfiles
^^^^^^^^^^^^^^^^^^^^^^

Both **.report.json** and **.versions** empty dotfiles are automatically generated
by the ``{% include "post.txt" ignore missing %}`` placeholder, specified in the
:ref:`create-process` section. Using this placeholder in your processes is all
that is needed.

Collection of dotfiles
^^^^^^^^^^^^^^^^^^^^^^

The **.report.json**, **.versions** and **.command.trace** files are automatically
collected and sent to dedicated report channels in the pipeline by the
``{%- include "compiler_channels.txt" ignore missing -%}`` placeholder, specified
in the :ref:`process creation <compiler>` section. Placing this placeholder in your
processes will generate the following line in the output channel specification::

    set {{ sample_id|default("sample_id") }}, val("{{ task_name }}_{{ pid }}"), val("{{ pid }}"), file(".report.json"), file(".versions"), file(".command.trace") into REPORT_{{task_name}}_{{ pid }}

This line collects several metadata associated with the process along with the three
dotfiles.

Compilation of dotfiles
^^^^^^^^^^^^^^^^^^^^^^^

As mentioned in the previous section, the dotfiles and other relevant metadata
are sent through special report channels to a FlowCraft component that is
responsible for compiling all the information and generating a single report
file at the end of each pipeline run.

This component is specified in ``flowcraft.generator.templates.report_compiler.nf``
and it consists of two nextflow processes:

- First, the **report** process receives the data from each executed process that
  sends report data and runs the ``flowcraft/bin/prepare_reports.py`` script
  on that data. This script will simply merge metadata and dotfiles information
  in a single JSON file. This file contains the following keys:

    - ``reportJson``: The data in **.report.json** file.
    - ``versions``: The data in **.versions** file.
    - ``trace``: The data in **.command.trace** file.
    - ``processId``: The process ID
    - ``pipelineId``: The pipeline ID that defaults to one, unless specified in
      the parameters.
    - ``projectid``: The project ID that defaults to one, unless specified in
      the parameters.
    - ``userId``: The user ID that defaults to one, unless specified in
      the parameters.
    - ``username``: The user name that defaults to *user*, unless specified in
      the parameters
    - ``processName``: The name of the flowcraft component.
    - ``workdir``: The work directory where the process was executed.

- Second, all JSON files created in the process above are merged
  and a single reports JSON file is created. This file will contain the
  following structure::

    reportJSON = {
        "data": {
            "results": [<array of report JSONs>]
        }
    }


================================================
FILE: docs/dev/process_dotfiles.rst
================================================
.. _dotfiles:

Dotfiles
========

Several dotfiles (files prefixed by a single ``.``, as in ``.status``) are
created at the beginning of every nextflow process that has the following
placeholder (see :ref:`create-process`): ::

    process myProcess {
        {% include "post.txt" ignore missing %}
        (...)
    }

The actual script that creates the dotfiles is found in
``flowcraft/bin``, is called ``set_dotfiles.sh`` and executes the
following command::

    touch .status .warning .fail .report.json .versions

Status
------

The ``.status`` file simply stores a string with the run status of the process.
The supported statuses are:

- ``pass``: The process finished successfully
- ``fail``: The process ran without unexpected issues but failed due to some
  quality control check
- ``error``: The process exited with an unexpected error.

Warning
-------

The ``.warning`` file stores any warnings that may occur during the execution
of the process. There is no particular format for the warning messages other
than that each individual warning should be in a separate line.

Fail
----

The ``.fail`` file stores any fail messages that may occur during the
execution of the process. When this occurs, the ``.status`` file must have
the ``fail`` string as well. As in the warning dotfile, there is no
particular format for the fail message.

.. _report-json:

Report JSON
-----------

.. important::
    The general specification of the report JSON changed in version 1.2.2.
    See the `issue tracker <https://github.com/assemblerflow/flowcraft/issues/96>`_
    for details.

The ``.report.json`` file stores any information from a given process that is
deemed worthy of being reported and displayed at the end of the pipeline.
Any information can be stored in this file, as long as it is in JSON format,
but there are a couple of recommendations that are necessary to follow
for them to be processed by a reporting web app (Currently hosted at
`flowcraft-webapp <https://github.com/assemblerflow/flowcraft-webapp>`_). However, if
data processing will be performed with custom scripts, feel free to specify
your own format.

Information for tables
^^^^^^^^^^^^^^^^^^^^^^

Information meant to be displayed in tables should be in the following
format::

    json_dic = {
        "tableRow": [{
            "sample": "A",
            "data": [{
                "header": "Raw BP",
                "value": 123,
                "table": "qc"
            }, {
                "header": "Coverage",
                "value": 32,
                "table": "qc"
            }]
        }, {
            "sample": "B",
            "data": [{
                "header": "Coverage",
                "value": 35,
                "table": "qc"
            }]
        }]
    }

This provides table information for multiple samples in the same process. In
this case, data for two samples is provided. For each sample, values for
one or more headers can be provided. For instance, this report provides
information about the **Raw BP** and **Coverage** for sample **A** and this
information should go to the **qc** table. If any other information is relevant
to build the table, feel free to add more elements to the JSON.

Information for plots
^^^^^^^^^^^^^^^^^^^^^

Information meant to be displayed in plots should be in the following format::

    json_dic = {
        "plotData": [{
            "sample": "strainA",
            "data": {
                "sparkline": 23123,
                "otherplot": [1,2,3]
             }
        }],
    }

As in the table JSON, *plotData* should be an array with an entry for each
sample. The data for each sample should be another JSON where the keys are
the *plot signatures*, so that we know to which plot the data belongs. The
corresponding values are whatever data object you need.

Other information
^^^^^^^^^^^^^^^^^

Other than tables and plots, which have a somewhat predefined format, there
is no particular format for other information. They will simply store the
data of interest to report and it will be the job of a downstream report app
to process that data into an actual visual report.

.. _versions:

Versions
--------

The ``.versions`` dotfile should contain a list of JSON objects with the
version information of the programs used in any given process. There are
only two required key:value pairs:

- ``program``: String with the name of the software/script/template
- ``version``: String with the version of said software.

As an example::

    version = {
        "program": "abricate",
        "version": "0.3.7"
    }

Key:value pairs with other metadata can be included at will for downstream
processing.

================================================
FILE: docs/dev/reports.rst
================================================
Reports
=======

Report JSON specification
-------------------------

The report JSON is quite flexible in the information it can contain. Here are
some guidelines to promote consistency on the reports generated by each component.
In general, the reports file is an array of JSON objects that contain relevant
information for each executed process in the pipeline::

    reportFile = [{<processA/tagA reports>}, {<processB/tagB reports>}, ... ]


Nextflow metadata
^^^^^^^^^^^^^^^^^

The nextflow metadata is automatically added to the reportFile as a single JSON entry
with the ``nfMetadata`` key that contains the following information::

        "nfMetadata": {
            "scriptId": "${workflow.scriptId}",
            "scriptName": "${workflow.scriptId}",
            "profile": "${workflow.profile}",
            "container": "${workflow.container}",
            "containerEngine": "${workflow.containerEngine}",
            "commandLine": "${workflow.commandLine}",
            "runName": "${workflow.runName}",
            "sessionId": "${workflow.sessionId}",
            "projectDir": "${workflow.projectDir}",
            "launchDir": "${workflow.launchDir}",
            "start_time": "${workflow.start}"
        }

.. note::
    Unlike the remaining JSON entries in the report file, which are generated for
    each process execution, the ``nfMetadata`` entry is generated only once per
    project execution.

Root
^^^^

The reports contained in the ``reports.json`` file for each process execution
are added to the root object::

    {
        "pipelineId": 1,
        "processId": pid,
        "processName": task_name,
        "projectid": RUN_NAME,
        "reportJson": reports,
        "runName": RUN_NAME,
        "scriptId": SCRIPT_ID,
        "versions": versions,
        "trace": trace,
        "userId": 1,
        "username": "user",
        "workdir": dirname(abspath(report_json))
    }

The other key:values are added automatically when the reports are compiled for each
process execution.

Versions
^^^^^^^^

Inside the root, the signature key for software version information is ``versions``::

    "versions": [{
        "program": "progA",
        "version": "1.0.0",
        "build": "1"
    }, {
        "program": "progB",
        "version": "2.1"
    }]

Only the ``program`` and ``version`` keys are mandatory.

ReportJson
^^^^^^^^^^

Table data
~~~~~~~~~~

Inside ``reportJson``, the signature key for table data is ``tableRow``::

    "reportJson": {
        "tableRow": [{
            "sample": "strainA",
            "data": [{
                "header": "Raw BP",
                "value": 123,
                "table": "qc"
            }, {
                "header": "Coverage",
                "value": 32,
                "table": "qc"
            }]
        }, {
            "sample": "strainB",
            "data": [{
                "header": "Raw BP",
                "value": 321,
                "table": "qc"
            }, {
                "header": "Coverage",
                "value": 22,
                "table": "qc"
            }]
        }]
    }

``tableRow`` should contain an array of JSON for each sample with two key:value pairs:

    - ``sample``: Sample name
    - ``data``: Table data (see below).

``data`` should be an array of JSON with at least three key:value pairs:

    - ``header``: Column header
    - ``value``: The data value
    - ``table``: Informs to which table this data should go.

.. note::
    Available ``table`` keys: ``typing``, ``qc``, ``assembly``, ``abricate``,
    ``chewbbaca``.


Plot data
~~~~~~~~~

Inside ``reportJson``, the signature key for plot data is ``plotData``::

    "reportJson": {
        "plotData": [{
            "sample": "strainA",
            "data": {
                "sparkline": 23123,
                "otherplot": [1,2,3]
             }
        }],
    }

``plotData`` should contain an array of JSON for each sample with two key:value pairs:

    - ``sample``: Sample name
    - ``data``: Plot data (see below).

``data`` should contain a JSON object with the plot signatures as keys, and the relevant
plot data as value. This data can be any object (integer, float, array, JSON, etc).
**It will be up to the components in the flowcraft web application to parse this data
and generate the appropriate chart.**

Warnings and fails
~~~~~~~~~~~~~~~~~~

Inside ``reportJson``, the signature key for warnings is ``warnings`` and for
failures is ``fail``::

    "reportJson": {
        "warnings": [{
            "sample": "strainA",
            "table": "qc",
            "value": ["message 1", "message 2"]
        }],
        "fail": [{
            "sample": "strainA",
            "table": "assembly",
            "value": ["message 1"]
        }]
    }


``warnings``/``fail`` should contain an array of JSON for each sample with
the following key:value pairs:

    - ``sample``: Sample name
    - ``value``: An array with one or more string messages.
    - ``table`` **[optional]**: If a table signature is provided, the warning/fail
      messages information will appear on that table. Otherwise, it will appear as
      a general warning/error that is associated to the sample but not to any particular
      table.


================================================
FILE: docs/flowcraft.flowcraft.rst
================================================
flowcraft\.flowcraft module
===========================

.. automodule:: flowcraft.flowcraft
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.components.annotation.rst
================================================
flowcraft\.generator\.components\.annotation module
===================================================

.. automodule:: flowcraft.generator.components.annotation
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.components.assembly.rst
================================================
flowcraft\.generator\.components\.assembly module
=================================================

.. automodule:: flowcraft.generator.components.assembly
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.components.assembly_processing.rst
================================================
flowcraft\.generator\.components\.assembly\_processing module
=============================================================

.. automodule:: flowcraft.generator.components.assembly_processing
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.components.distance_estimation.rst
================================================
flowcraft\.generator\.components\.distance\_estimation module
=============================================================

.. automodule:: flowcraft.generator.components.distance_estimation
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.components.downloads.rst
================================================
flowcraft\.generator\.components\.downloads module
==================================================

.. automodule:: flowcraft.generator.components.downloads
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.components.metagenomics.rst
================================================
flowcraft\.generator\.components\.metagenomics module
=====================================================

.. automodule:: flowcraft.generator.components.metagenomics
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.components.mlst.rst
================================================
flowcraft\.generator\.components\.mlst module
=============================================

.. automodule:: flowcraft.generator.components.mlst
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.components.patlas_mapping.rst
================================================
flowcraft\.generator\.components\.patlas\_mapping module
========================================================

.. automodule:: flowcraft.generator.components.patlas_mapping
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.components.reads_quality_control.rst
================================================
flowcraft\.generator\.components\.reads\_quality\_control module
================================================================

.. automodule:: flowcraft.generator.components.reads_quality_control
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.components.rst
================================================
flowcraft\.generator\.components package
========================================

Submodules
----------

.. toctree::

   flowcraft.generator.components.annotation
   flowcraft.generator.components.assembly
   flowcraft.generator.components.assembly_processing
   flowcraft.generator.components.distance_estimation
   flowcraft.generator.components.downloads
   flowcraft.generator.components.metagenomics
   flowcraft.generator.components.mlst
   flowcraft.generator.components.patlas_mapping
   flowcraft.generator.components.reads_quality_control
   flowcraft.generator.components.typing

Module contents
---------------

.. automodule:: flowcraft.generator.components
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.components.typing.rst
================================================
flowcraft\.generator\.components\.typing module
===============================================

.. automodule:: flowcraft.generator.components.typing
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.engine.rst
================================================
flowcraft\.generator\.engine module
===================================

.. automodule:: flowcraft.generator.engine
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.error_handling.rst
================================================
flowcraft\.generator\.error\_handling module
============================================

.. automodule:: flowcraft.generator.error_handling
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.footer_skeleton.rst
================================================
flowcraft\.generator\.footer\_skeleton module
=============================================

.. automodule:: flowcraft.generator.footer_skeleton
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.header_skeleton.rst
================================================
flowcraft\.generator\.header\_skeleton module
=============================================

.. automodule:: flowcraft.generator.header_skeleton
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.inspect.rst
================================================
flowcraft\.generator\.inspect module
====================================

.. automodule:: flowcraft.generator.inspect
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.pipeline_parser.rst
================================================
flowcraft\.generator\.pipeline\_parser module
=============================================

.. automodule:: flowcraft.generator.pipeline_parser
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.process.rst
================================================
flowcraft\.generator\.process module
====================================

.. automodule:: flowcraft.generator.process
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.process_details.rst
================================================
flowcraft\.generator\.process\_details module
=============================================

.. automodule:: flowcraft.generator.process_details
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.recipe.rst
================================================
flowcraft\.generator\.recipe module
===================================

.. automodule:: flowcraft.generator.recipe
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.generator.rst
================================================
flowcraft\.generator package
============================

Subpackages
-----------

.. toctree::

    flowcraft.generator.components

Submodules
----------

.. toctree::

   flowcraft.generator.engine
   flowcraft.generator.error_handling
   flowcraft.generator.footer_skeleton
   flowcraft.generator.header_skeleton
   flowcraft.generator.inspect
   flowcraft.generator.pipeline_parser
   flowcraft.generator.process
   flowcraft.generator.process_details
   flowcraft.generator.recipe

Module contents
---------------

.. automodule:: flowcraft.generator
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.rst
================================================
flowcraft package
=================

Subpackages
-----------

.. toctree::

    flowcraft.generator
    flowcraft.templates
    flowcraft.tests

Submodules
----------

.. toctree::

   flowcraft.flowcraft

Module contents
---------------

.. automodule:: flowcraft
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.assembly_report.rst
================================================
flowcraft\.templates\.assembly\_report module
=============================================

.. automodule:: flowcraft.templates.assembly_report
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.fastqc.rst
================================================
flowcraft\.templates\.fastqc module
===================================

.. automodule:: flowcraft.templates.fastqc
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.fastqc_report.rst
================================================
flowcraft\.templates\.fastqc\_report module
===========================================

.. automodule:: flowcraft.templates.fastqc_report
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.flowcraft_utils.flowcraft_base.rst
================================================
flowcraft\.templates\.flowcraft\_utils\.flowcraft\_base module
==============================================================

.. automodule:: flowcraft.templates.flowcraft_utils.flowcraft_base
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.flowcraft_utils.rst
================================================
flowcraft\.templates\.flowcraft\_utils package
==============================================

Submodules
----------

.. toctree::

   flowcraft.templates.flowcraft_utils.flowcraft_base

Module contents
---------------

.. automodule:: flowcraft.templates.flowcraft_utils
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.integrity_coverage.rst
================================================
flowcraft\.templates\.integrity\_coverage module
================================================

.. automodule:: flowcraft.templates.integrity_coverage
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.mapping2json.rst
================================================
flowcraft\.templates\.mapping2json module
=========================================

.. automodule:: flowcraft.templates.mapping2json
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.mashdist2json.rst
================================================
flowcraft\.templates\.mashdist2json module
==========================================

.. automodule:: flowcraft.templates.mashdist2json
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.mashscreen2json.rst
================================================
flowcraft\.templates\.mashscreen2json module
============================================

.. automodule:: flowcraft.templates.mashscreen2json
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.megahit.rst
================================================
flowcraft\.templates\.megahit module
====================================

.. automodule:: flowcraft.templates.megahit
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.metaspades.rst
================================================
flowcraft\.templates\.metaspades module
=======================================

.. automodule:: flowcraft.templates.metaspades
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.pATLAS_consensus_json.rst
================================================
flowcraft\.templates\.pATLAS\_consensus\_json module
====================================================

.. automodule:: flowcraft.templates.pATLAS_consensus_json
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.pipeline_status.rst
================================================
flowcraft\.templates\.pipeline\_status module
=============================================

.. automodule:: flowcraft.templates.pipeline_status
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.process_abricate.rst
================================================
flowcraft\.templates\.process\_abricate module
==============================================

.. automodule:: flowcraft.templates.process_abricate
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.process_assembly.rst
================================================
flowcraft\.templates\.process\_assembly module
==============================================

.. automodule:: flowcraft.templates.process_assembly
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.process_assembly_mapping.rst
================================================
flowcraft\.templates\.process\_assembly\_mapping module
=======================================================

.. automodule:: flowcraft.templates.process_assembly_mapping
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.rst
================================================
flowcraft\.templates package
============================

Subpackages
-----------

.. toctree::

    flowcraft.templates.flowcraft_utils

Submodules
----------

.. toctree::

   flowcraft.templates.assembly_report
   flowcraft.templates.fastqc
   flowcraft.templates.fastqc_report
   flowcraft.templates.integrity_coverage
   flowcraft.templates.mapping2json
   flowcraft.templates.mashdist2json
   flowcraft.templates.mashscreen2json
   flowcraft.templates.megahit
   flowcraft.templates.metaspades
   flowcraft.templates.pATLAS_consensus_json
   flowcraft.templates.pipeline_status
   flowcraft.templates.process_abricate
   flowcraft.templates.process_assembly
   flowcraft.templates.process_assembly_mapping
   flowcraft.templates.skesa
   flowcraft.templates.spades
   flowcraft.templates.trimmomatic
   flowcraft.templates.trimmomatic_report

Module contents
---------------

.. automodule:: flowcraft.templates
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.skesa.rst
================================================
flowcraft\.templates\.skesa module
==================================

.. automodule:: flowcraft.templates.skesa
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.spades.rst
================================================
flowcraft\.templates\.spades module
===================================

.. automodule:: flowcraft.templates.spades
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.trimmomatic.rst
================================================
flowcraft\.templates\.trimmomatic module
========================================

.. automodule:: flowcraft.templates.trimmomatic
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.templates.trimmomatic_report.rst
================================================
flowcraft\.templates\.trimmomatic\_report module
================================================

.. automodule:: flowcraft.templates.trimmomatic_report
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.tests.data_pipelines.rst
================================================
flowcraft\.tests\.data\_pipelines module
========================================

.. automodule:: flowcraft.tests.data_pipelines
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.tests.rst
================================================
flowcraft\.tests package
========================

Submodules
----------

.. toctree::

   flowcraft.tests.data_pipelines
   flowcraft.tests.test_assemblerflow
   flowcraft.tests.test_engine
   flowcraft.tests.test_pipeline_parser
   flowcraft.tests.test_process_details
   flowcraft.tests.test_processes
   flowcraft.tests.test_sanity

Module contents
---------------

.. automodule:: flowcraft.tests
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.tests.test_assemblerflow.rst
================================================
flowcraft\.tests\.test\_assemblerflow module
============================================

.. automodule:: flowcraft.tests.test_assemblerflow
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.tests.test_engine.rst
================================================
flowcraft\.tests\.test\_engine module
=====================================

.. automodule:: flowcraft.tests.test_engine
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.tests.test_pipeline_parser.rst
================================================
flowcraft\.tests\.test\_pipeline\_parser module
===============================================

.. automodule:: flowcraft.tests.test_pipeline_parser
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.tests.test_process_details.rst
================================================
flowcraft\.tests\.test\_process\_details module
===============================================

.. automodule:: flowcraft.tests.test_process_details
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.tests.test_processes.rst
================================================
flowcraft\.tests\.test\_processes module
========================================

.. automodule:: flowcraft.tests.test_processes
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/flowcraft.tests.test_sanity.rst
================================================
flowcraft\.tests\.test\_sanity module
=====================================

.. automodule:: flowcraft.tests.test_sanity
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/getting_started/installation.rst
================================================
Installation
============

User installation
-----------------

FlowCraft is available as a bioconda package, which already comes with
nextflow::

    conda install flowcraft

Alternatively, you can install only FlowCraft, via pip::

    pip install flowcraft

You will also need a container engine (see `Container engine`_ below).

Container engine
----------------

All components of FlowCraft are executed in docker containers, which
means that you'll need to have a container engine installed. The container
engines available are the ones supported by Nextflow:

- `Docker`_,
- `Singularity`_
- Shifter (undocumented)

If you already have any one of these installed, you are good to go. If not,
you'll need to install one. We recommend singularity because it does not
require the processes to run on a separate root daemon.

Singularity
:::::::::::

Singularity is available to download and install `here <http://singularity.lbl.gov/install-linux>`_.
Make sure that you have singularity v2.5.x or higher.
Note that singularity should be installed as root and available on the machine(s) that
will be running the nextflow processes.

.. important::

    Singularity is available as a bioconda package. However, conda installs singularity
    in user space without root privileges, which may prevent singularity images from
    being correctly downloaded. **Therefore it is not recommended that you install
    singularity via bioconda**.

Docker
::::::

Docker can be installed following the instructions on the website:
https://www.docker.com/community-edition#/download.
To run docker as a non-root user, you'll need to follow the instructions
on the website: https://docs.docker.com/install/linux/linux-postinstall/#manage-docker-as-a-non-root-user


Developer installation
----------------------

If you are looking to contribute to FlowCraft or simply interested in
tweaking it, clone the github repository and its submodule and then run
setup.py::

    git clone https://github.com/assemblerflow/flowcraft.git
    cd flowcraft
    python3 setup.py install


.. _Docker: https://www.nextflow.io/docs/latest/docker.html
.. _Singularity: https://www.nextflow.io/docs/latest/singularity.html


================================================
FILE: docs/getting_started/overview.rst
================================================
..    include:: <isonum.txt>

Overview
========

FlowCraft is an assembler of pipelines written in nextflow_ for
analyses of genomic data. The premise is simple:

Software are container blocks |rarr| Build your lego-like pipeline |rarr| Execute it (almost) anywhere.

What is Nextflow
::::::::::::::::

If you do not know nextflow, be sure to check it out. It's an awesome
framework based on the dataflow programming model used for building
parallelized, scalable and reproducible workflows using software containers.
It provides an abstraction layer between the execution and the logic of the
pipeline, which means that the same pipeline code can be executed on
multiple platforms, from a local laptop to clusters managed with SLURM, SGE,
etc. These are quite attractive features since genomic pipelines are
increasingly executed on large computer clusters to handle large volumes
of data and/or tasks. Moreover, portability and reproducibility are becoming
central pillars in modern data science.

What FlowCraft does
:::::::::::::::::::

FlowCraft is a python engine that automatically builds nextflow pipelines
by assembling pre-made ready-to-use :ref:`components <components>`. These components are modular
pieces of software or scripts, such as ``fastqc``, ``trimmomatic``, ``spades``,
etc, that are written for nextflow and have a set of attributes, such as
input and output types, parameters, directives, etc. This modular nature
allows them to be freely connected as long as they respect some basic rules,
such as the requirement that the input type of a component matches the output
type of the preceding component. In this way, nextflow processes can be
written only once, and FlowCraft is the magic glue that connects them,
handling the linking and forking of channels automatically. Moreover, each
component is associated with a docker image, which means that there is no
need to install any dependencies at all and all software runs on a
transparent and reliable box. To illustrate:

- A linear genome assembly pipeline can be easily built using FlowCraft
  with the following pipeline string::

    trimmomatic fastqc spades

This will generate all the necessary files to run the nextflow
pipeline on any linux system that has nextflow and a container engine.

- You can easily add more components to perform assembly polishing, in this
  case, ``pilon``::

    trimmomatic fastqc spades pilon

- If a new assembler comes along and you want to switch that component in the
  pipeline, it's as easy as replacing ``spades`` (or any other component)::

    trimmomatic fastqc skesa pilon

- And you can also fork the output of a component into multiple ones. For
  instance, we could annotate the resulting assemblies with multiple software::

    trimmomatic fastqc spades pilon (abricate | prokka)

- Or fork the execution of a pipeline early on to compare different software::

    trimmomatic fastqc (spades pilon | skesa pilon)

This will fork the output of ``fastqc`` into ``spades`` and ``skesa``, and
the pipeline will proceed independently in these two new 'lanes'.

- Directives for each process can be dynamically set when building the pipeline,
  such as the cpu/RAM usage or the software version::

    trimmomatic={'cpus':'4'} fastqc={'version':'0.11.5'} skesa={'memory':'10GB'} pilon (abricate | prokka)

- And extra input can be directly inserted in any part of the pipeline. For
  example, it is possible to assemble genomes from both fastq files and SRR
  accessions (downloaded from public databases) in a single workflow::

    download_reads trimmomatic={'extra_input':'reads'} fastqc skesa pilon

This pipeline can be executed by providing a file with accession numbers
(``--accessions`` parameter by default) **and** fastq reads, using the
``--reads`` parameter defined with the ``extra_input`` directive.


Who is FlowCraft for
::::::::::::::::::::

FlowCraft can be useful for bioinformaticians with varied levels of expertise
that need to execute genomic pipelines often and potentially on different
platforms. Building and executing pipelines requires no programming knowledge,
but familiarization with nextflow is highly recommended to take full advantage
of the generated pipelines.

At the moment, the available pre-made processes are mainly focused on
bacterial genome assembly simply because that was how we started.
However, our goal is to expand the library of existing components to other
commonly used tools in the field of genomics and to widen the applicability
and usefulness of FlowCraft pipelines.

Why not just write a Nextflow pipeline?
:::::::::::::::::::::::::::::::::::::::

In many cases, building a static nextflow pipeline is sufficient for our goals.
However, when building our own pipelines, we often felt the need to add
dynamism to this process, particularly if we take into account how fast new
tools arise and existing ones change. Our biological goals also change over
time and we might need different pipelines to answer different questions.
FlowCraft makes this very easy by having a set of pre-made and ready-to-use
components that can be freely assembled. By using components (``fastqc``,
``trimmomatic``) as its atomic elements, very complex pipelines that take
full advantage of nextflow can be built with little effort. Moreover,
these components have explicit and standardized
input and output types, which means that the addition of new modules does not
require any changes in the existing code base. They just need to take into
account how data will be received by the process and how data may be emitted
from the process, to ensure that it can link with other components.

**However, why not both?**

FlowCraft generates a complete Nextflow pipeline file, which can be used
as a starting point for your customized processes!

.. _nextflow: https://www.nextflow.io/

================================================
FILE: docs/index.rst
================================================
.. Templates documentation master file, created by
   sphinx-quickstart on Thu Feb  8 09:51:21 2018.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

FlowCraft
=========

.. image:: resources/logo_large.png
   :scale: 20 %
   :align: center

A NextFlow pipeline assembler for genomics.

.. _Getting Started:

.. toctree::
   :maxdepth: 1
   :caption: Getting Started

   getting_started/overview
   getting_started/installation
   about/about

.. _User Guide:

.. toctree::
   :maxdepth: 1
   :caption: User Guide

   user/basic_usage
   user/pipeline_building
   user/pipeline_configuration
   user/pipeline_inspect
   user/pipeline_reports
   user/available_components

.. _Developer Guide:

.. toctree::
   :maxdepth: 1
   :caption: Developer Guide

   dev/general_orientation
   dev/create_process
   dev/create_template
   dev/create_recipe
   dev/containers
   dev/process_dotfiles
   dev/pipeline_reporting
   dev/reports

.. _Source API:

.. toctree::
   :maxdepth: 2
   :caption: Source API

   flowcraft

================================================
FILE: docs/make.bat
================================================
@ECHO OFF

REM Command file for Sphinx documentation

REM Work from the directory that contains this script; the end label pops it again.
pushd %~dp0

REM Use the sphinx-build command unless the caller already set SPHINXBUILD.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set BUILDDIR=_build
REM Option strings shared by the targets below. The gettext target uses
REM I18NSPHINXOPTS; every other target uses ALLSPHINXOPTS.
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
set I18NSPHINXOPTS=%SPHINXOPTS% .
REM When PAPER is set, pass it on as the latex_elements.papersize override.
if NOT "%PAPER%" == "" (
	set ALLSPHINXOPTS=-D latex_elements.papersize=%PAPER% %ALLSPHINXOPTS%
	set I18NSPHINXOPTS=-D latex_elements.papersize=%PAPER% %I18NSPHINXOPTS%
)

REM With no target argument, jump straight to the help text.
if "%1" == "" goto help

REM The help target lists every supported target and then exits via the end label.
if "%1" == "help" (
	:help
	echo.Please use `make ^<target^>` where ^<target^> is one of
	echo.  html       to make standalone HTML files
	echo.  dirhtml    to make HTML files named index.html in directories
	echo.  singlehtml to make a single large HTML file
	echo.  pickle     to make pickle files
	echo.  json       to make JSON files
	echo.  htmlhelp   to make HTML files and an HTML help project
	echo.  qthelp     to make HTML files and a qthelp project
	echo.  devhelp    to make HTML files and a Devhelp project
	echo.  epub       to make an epub
	echo.  epub3      to make an epub3
	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
	echo.  text       to make text files
	echo.  man        to make manual pages
	echo.  texinfo    to make Texinfo files
	echo.  gettext    to make PO message catalogs
	echo.  changes    to make an overview over all changed/added/deprecated items
	echo.  xml        to make Docutils-native XML files
	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
	echo.  linkcheck  to check all external links for integrity
	echo.  doctest    to run all doctests embedded in the documentation if enabled
	echo.  coverage   to run coverage check of the documentation if enabled
	echo.  dummy      to check syntax errors of document sources
	goto end
)

REM The clean target removes every subdirectory and file under the build directory.
REM It runs before the sphinx-build availability check further down.
if "%1" == "clean" (
	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
	del /q /s %BUILDDIR%\*
	goto end
)


REM Check if sphinx-build is available and fallback to Python version if any
REM The probe output is discarded; only the errorlevel matters. The script
REM treats errorlevel 9009 as meaning the command could not be found.
%SPHINXBUILD% 1>NUL 2>NUL
if errorlevel 9009 goto sphinx_python
goto sphinx_ok

:sphinx_python

REM Fall back to running Sphinx as a Python module. The supported module
REM entry point is "python -m sphinx"; running sphinx.__init__ directly does
REM not start a build on current Sphinx releases.
set SPHINXBUILD=python -m sphinx
%SPHINXBUILD% 2> nul
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

:sphinx_ok


REM Each target below runs the matching Sphinx builder into its own
REM subdirectory of the build directory, stops with exit code 1 when the
REM build reports an error, and otherwise prints a completion message.

REM html: standalone HTML files.
if "%1" == "html" (
	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
	goto end
)

REM dirhtml: HTML files named index.html in directories.
if "%1" == "dirhtml" (
	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
	goto end
)

REM singlehtml: a single large HTML file.
if "%1" == "singlehtml" (
	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
	goto end
)

REM pickle: pickle files for further processing.
if "%1" == "pickle" (
	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can process the pickle files.
	goto end
)

REM json: JSON files for further processing.
if "%1" == "json" (
	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can process the JSON files.
	goto end
)

REM htmlhelp: HTML files plus an HTML help project file.
if "%1" == "htmlhelp" (
	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
	goto end
)

REM qthelp: HTML files plus a qthelp project. The printed hints show how to
REM compile the .qhcp project and open the resulting .qhc collection file.
if "%1" == "qthelp" (
	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Templates.qhcp
	echo.To view the help file:
	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Templates.qhc
	goto end
)

REM Each target below runs the matching Sphinx builder into its own
REM subdirectory of the build directory and stops with exit code 1 when the
REM build reports an error.

REM devhelp: HTML files plus a Devhelp project.
if "%1" == "devhelp" (
	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished.
	goto end
)

REM epub: an epub file.
if "%1" == "epub" (
	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The epub file is in %BUILDDIR%/epub.
	goto end
)

REM epub3: an epub3 file.
if "%1" == "epub3" (
	%SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The epub3 file is in %BUILDDIR%/epub3.
	goto end
)

REM latex: LaTeX sources only; the paper size follows PAPER when it is set.
if "%1" == "latex" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
	goto end
)

REM latexpdf: build the LaTeX sources, then run make all-pdf inside the
REM latex output directory and return to the script directory afterwards.
REM The build result is checked first, as in the other targets, so make is
REM not run after a failed Sphinx build.
if "%1" == "latexpdf" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	if errorlevel 1 exit /b 1
	cd %BUILDDIR%/latex
	make all-pdf
	cd %~dp0
	echo.
	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
	goto end
)

REM latexpdfja: same as latexpdf but runs the all-pdf-ja make target.
if "%1" == "latexpdfja" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	if errorlevel 1 exit /b 1
	cd %BUILDDIR%/latex
	make all-pdf-ja
	cd %~dp0
	echo.
	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
	goto end
)

REM Each target below runs the matching Sphinx builder into its own
REM subdirectory of the build directory, stops with exit code 1 when the
REM build reports an error, and otherwise prints a completion message.

REM text: plain text files.
if "%1" == "text" (
	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The text files are in %BUILDDIR%/text.
	goto end
)

REM man: manual pages.
if "%1" == "man" (
	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The manual pages are in %BUILDDIR%/man.
	goto end
)

REM texinfo: Texinfo files.
if "%1" == "texinfo" (
	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
	goto end
)

REM gettext: message catalogs; the only target that uses I18NSPHINXOPTS
REM and writes to the locale subdirectory.
if "%1" == "gettext" (
	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
	goto end
)

REM changes: overview of changed, added and deprecated items.
if "%1" == "changes" (
	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
	if errorlevel 1 exit /b 1
	echo.
	echo.The overview file is in %BUILDDIR%/changes.
	goto end
)

REM linkcheck: checks external links; results go to linkcheck/output.txt.
if "%1" == "linkcheck" (
	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
	if errorlevel 1 exit /b 1
	echo.
	echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
	goto end
)

REM doctest: runs doctests in the sources; results go to doctest/output.txt.
if "%1" == "doctest" (
	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
	if errorlevel 1 exit /b 1
	echo.
	echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
	goto end
)

REM coverage: documentation coverage check; results go to coverage/python.txt.
if "%1" == "coverage" (
	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
	if errorlevel 1 exit /b 1
	echo.
	echo.Testing of coverage in the sources finished, look at the ^
results in %BUILDDIR%/coverage/python.txt.
	goto end
)

REM xml: Docutils-native XML files.
if "%1" == "xml" (
	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The XML files are in %BUILDDIR%/xml.
	goto end
)

REM pseudoxml: pseudo-XML files for display purposes.
if "%1" == "pseudoxml" (
	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
	goto end
)

REM dummy: runs the dummy builder, which generates no files.
if "%1" == "dummy" (
	%SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. Dummy builder generates no files.
	goto end
)

REM Common exit point: restore the directory saved by pushd at the top of the script.
:end
popd


================================================
FILE: docs/setup.rst
================================================
setup module
============

.. automodule:: setup
    :members:
    :undoc-members:
    :show-inheritance:


================================================
FILE: docs/user/available_components.rst
================================================
.. _components:

Components
==========

These are the currently available FlowCraft components with a short
description of their tasks. For more detailed information, follow the
links of each component.


Download
--------

- :doc:`components/reads_download`: Downloads reads from the SRA/ENA public
  databases from a list of accessions.

- :doc:`components/fasterq_dump`: Downloads reads from the SRA public databases
  from a list of accessions, using ``fasterq-dump``.

Reads Quality Control
---------------------

- :doc:`components/check_coverage`: Estimates the coverage for each sample and
  filters FastQ files according to a specified minimum coverage threshold.

- :doc:`components/fastqc`: Runs FastQC on paired-end FastQ files.

- :doc:`components/fastqc_trimmomatic`: Runs Trimmomatic on
  paired-end FastQ files informed by the FastQC report.

- :doc:`components/filter_poly`:  Runs PrinSeq on paired-end
  FastQ files to remove low complexity sequences.

- :doc:`components/integrity_coverage`: Tests the integrity
  of the provided FastQ files, provides the option to filter FastQ files
  based on the expected assembly coverage and provides information about
  the maximum read length and sequence encoding.

- :doc:`components/trimmomatic`: Runs Trimmomatic on paired-end FastQ files.

- :doc:`components/downsample_fastq`: Subsamples fastq files up to a target coverage
  depth.


Assembly
--------

- :doc:`components/megahit`: Assembles metagenomic paired-end FastQ files
  using megahit.

- :doc:`components/metaspades`: Assembles metagenomic paired-end FastQ files
  using metaSPAdes.

- :doc:`components/skesa`: Assembles paired-end FastQ files using
  skesa.

- :doc:`components/spades`: Assembles paired-end FastQ files
  using SPAdes.

Post-assembly
-------------

- :doc:`components/pilon`: Corrects and filters assemblies using Pilon.

- :doc:`components/process_skesa`: Processes the assembly output
  from Skesa and performs filtering based on quality criteria of GC content,
  k-mer coverage and read length.

- :doc:`components/process_spades`: Processes the assembly output
  from Spades and performs filtering based on quality criteria of GC content,
  k-mer coverage and read length.

Binning
-------

- :doc:`components/maxbin2`: An automatic tool for binning metagenomic sequences

Annotation
----------

- :doc:`components/abricate`: Performs anti-microbial gene screening using
  abricate.

- :doc:`components/card_rgi`: Performs anti-microbial resistance gene screening using
  CARD rgi (with contigs as input).

- :doc:`components/prokka`: Performs assembly annotation using prokka.

Distance Estimation
-------------------

- :doc:`components/mash_dist`: Executes mash distance against a reference index
  plasmid database and generates a `JSON` for pATLAS. This component calculates
  pairwise distances between sequences (one from the database and the query
  sequence). However if a different database is provided it can use mash dist
  for other purposes.

- :doc:`components/mash_screen`: Performs mash screen against a reference index
  plasmid database and generates a JSON input file for pATLAS. This component
  searches for containment of a given sequence in read sequencing data.
  However if a different database is provided it can use mash screen for other
  purposes.

- :doc:`components/fast_ani`: Performs pairwise comparisons between fastas,
  given a multifasta as input for fastANI. It will split the multifasta into
  single fastas that will then be provided as a matrix. The output will be
  all the pairwise comparisons that pass the minimum of 50 aligned sequences
  with a default length of 200 bp.

- :doc:`components/mash_sketch_fasta`: Performs mash sketch for fasta files.

- :doc:`components/mash_sketch_fastq`: Performs mash sketch for fastq files.

Mapping
-------

- :doc:`components/assembly_mapping`: Performs a mapping
  procedure of FastQ files onto their assembly and performs filtering
  based on quality criteria of read coverage and genome size.

- :doc:`components/bowtie`: Align short paired-end sequencing reads to long reference sequences

- :doc:`components/mapping_patlas`: Performs read mapping and generates a JSON
  input file for pATLAS.

- :doc:`components/remove_host`: Performs read mapping with bowtie2
  against the target host genome (default hg19) and removes the mapping reads

- :doc:`components/retrieve_mapped`: Retrieves the mapped reads of a previous
  bowtie2 mapping process.

Taxonomic Profiling
---------------------

- :doc:`components/kraken`: Performs taxonomic identification with kraken on FastQ files
  (minikrakenDB2017 as default database)

- :doc:`components/kraken2`: Performs taxonomic identification with kraken2 on FastQ files
  (minikraken2_v1_8GB as default database)

- :doc:`components/midas_species`: Performs taxonomic identification on FastQ files at the
  species level with midas (requires database)

Typing
------

- :doc:`components/chewbbaca`: Performs a core-genome/whole-genome Multilocus
  Sequence Typing analysis on an assembly using ChewBBACA.

- :doc:`components/metamlst`: Checks the Sequence Type of metagenomic reads using
  Multilocus Sequence Typing.

- :doc:`components/mlst`: Checks the Sequence Type of an assembly using
  Multilocus Sequence Typing.

- :doc:`components/patho_typing`: *In silico* pathogenic typing from raw
  illumina reads.

- :doc:`components/seq_typing`: Determines the type of a given sample from a set
  of reference sequences.

- :doc:`components/sistr`: Serovar predictions from whole-genome sequence assemblies
  by determination of antigen gene and cgMLST gene alleles.

- :doc:`components/momps`: Multi-locus sequence typing for Legionella pneumophila
  from assemblies and reads.


================================================
FILE: docs/user/basic_usage.rst
================================================
Basic Usage
===========

FlowCraft currently has two execution modes, ``build`` and ``inspect``, that are
used to build and inspect the nextflow pipeline, respectively. However, a
``report`` mode is also being developed.

Build
-----

Assembling a pipeline
:::::::::::::::::::::

Pipelines are generated using the ``build`` mode of FlowCraft
and the ``-t`` parameter to specify the :ref:`components <components>` inside quotes::

    flowcraft build -t "trimmomatic fastqc spades" -o my_pipe.nf

All components should be written inside quotes and be space separated.
This command will generate a linear pipeline with three components on the
current working directory (for more features and tips on how pipelines can be
built, see the :doc:`pipeline building <pipeline_building>` section). **A linear pipeline means that
there are no bifurcations between components, and the input data will flow
linearly.**

The rationale of how the data flows across the pipeline is simple and intuitive.
Data enters a component and is processed in some way, which may result in the
creation of result files (stored in the ``results`` directory) and report
files (stored in the ``reports`` directory) (see `Results and reports`_ below). If that
component has an ``output_type``, it will feed the processed data into the
next component (or components) and this will be repeated until the end of the
pipeline.

If you are interested in checking the pipeline DAG tree, open the
``my_pipe.html`` file (same name as the pipeline with the html extension)
in any browser.

.. image:: ../resources/fork_4.png
   :scale: 80 %
   :align: center

The ``integrity_coverage`` component is a dependency of ``trimmomatic``, so
it was automatically added to the pipeline.

.. important::
    Not all pipeline configurations will work. **You always need to ensure
    that the output type of a component matches the input type of the next
    component**, otherwise FlowCraft will exit with an error.

Pipeline directory
::::::::::::::::::

In addition to the main nextflow pipeline file (``my_pipe.nf``),
FlowCraft will write several auxiliary files that are necessary for
the pipeline to run. The contents of the directory should look something like
this::

    $ ls
    bin                lib           my_pipe.nf       params.config     templates
    containers.config  my_pipe.html  nextflow.config  profiles.config   resources.config  user.config

You do not have to worry about most of these files. However, the
``*.config`` files can be modified to change several aspects of the pipeline run
(see :doc:`pipeline_configuration` for more details). Briefly:

- ``params.config``: Contains all the available parameters of the pipeline (see
  `Parameters`_ below). These can be changed here, or provided directly on
  run-time (e.g.: ``nextflow run --fastq value``).
- ``resources.config``: Contains the resource directives of the pipeline processes,
  such as cpus, allocated RAM and other nextflow process directives.
- ``containers.config``: Specifies the container and version tag of each process
  in the pipeline.
- ``profiles.config``: Contains a number of predefined profiles of executor and
  container engine.
- ``user.config``: Empty configuration file that is not over-written if you build
  another pipeline in the same directory. Used to set persistent configurations
  across different pipelines.

Parameters
::::::::::

The parameters of the pipeline can be viewed by running the pipeline file
with ``nextflow`` and using the ``--help`` option::

    $ nextflow run my_pipe.nf --help
    N E X T F L O W  ~  version 0.30.1
    Launching `my_pipe.nf` [kickass_mcclintock] - revision: 480b3455ba

    ============================================================
                    F L O W C R A F T
    ============================================================
    Built using flowcraft v1.2.1.dev1


    Usage:
        nextflow run my_pipe.nf

           --fastq                     Path expression to paired-end fastq files. (default: fastq/*_{1,2}.*) (default: 'fastq/*_{1,2}.*')

           Component 'INTEGRITY_COVERAGE_1_1'
           ----------------------------------
           --genomeSize_1_1            Genome size estimate for the samples in Mb. It is used to estimate the coverage and other assembly parameters andchecks (default: 1)
           --minCoverage_1_1           Minimum coverage for a sample to proceed. By default it's setto 0 to allow any coverage (default: 0)

           Component 'TRIMMOMATIC_1_2'
           ---------------------------
           --adapters_1_2              Path to adapters files, if any. (default: 'None')
           --trimSlidingWindow_1_2     Perform sliding window trimming, cutting once the average quality within the window falls below a threshold (default: '5:20')
           --trimLeading_1_2           Cut bases off the start of a read, if below a threshold quality (default: 3)
           --trimTrailing_1_2          Cut bases of the end of a read, if below a threshold quality (default: 3)
           --trimMinLength_1_2         Drop the read if it is below a specified length  (default: 55)

           Component 'FASTQC_1_3'
           ----------------------
           --adapters_1_3              Path to adapters files, if any. (default: 'None')

           Component 'SPADES_1_4'
           ----------------------
           --spadesMinCoverage_1_4     The minimum number of reads to consider an edge in the de Bruijn graph during the assembly (default: 2)
           --spadesMinKmerCoverage_1_4 Minimum contigs K-mer coverage. After assembly only keep contigs with reported k-mer coverage equal or above this value (default: 2)
           --spadesKmers_1_4           If 'auto' the SPAdes k-mer lengths will be determined from the maximum read length of each assembly. If 'default', SPAdes will use the default k-mer lengths.  (default: 'auto')

All these parameters are specific to the components of the pipeline. However,
the main input parameter (or parameters) of the pipeline is always available.
**In this case, since the pipeline started with fastq paired-end files as the
main input, the** ``--fastq`` **parameter is available.** If the pipeline started
with any other input type or with more than one input type, the appropriate
parameters will appear (more information in the :ref:`raw input types<rawInput>` section).

The parameters are composed of their name (``adapters``) followed by the ID of
the process it refers to (``_1_2``). The IDs can be consulted in the DAG tree
(See `Assembling a pipeline`_). This is done to prevent issues when duplicating
components and, as such, **all parameters will be independent between different
components**. This
behaviour can be changed when building the pipeline by using the
``--merge-params`` option (See :ref:`mergeParams`).

.. note::
    The ``--merge-params`` option of the ``build`` mode will merge all parameters
    with identical names (`e.g.:` ``--genomeSize_1_1`` and ``--genomeSize_1_5``
    become simply ``--genomeSize``). This is usually more appropriate and useful
    in linear pipelines without component duplication.


Providing/modifying parameters
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

These parameters can be provided on run-time::

    nextflow run my_pipe.nf --genomeSize_1_1 5 --adapters_1_2 "/path/to/adapters"

or edited in the ``params.config`` file::

    params {
        genomeSize_1_1 = 5
        adapters_1_2 = "path/to/adapters"
    }

Most parameters in FlowCraft's components already come with sensible
defaults, which means that usually you'll only need to provide a small number
of arguments. In the example above, the ``--fastq`` is the only parameter
required. I have placed fastq files on the ``data`` directory::

    $ ls data
    sample_1.fastq.gz  sample_2.fastq.gz

We'll need to provide the pattern to the fastq files. This pattern is perhaps
a bit confusing at first, but it's necessary for the correct inference of the
pairs::

    --fastq "data/*_{1,2}.*"

In this case, the pairs are separated by the "_1." or "_2." substring, which leads
to the pattern ``*_{1,2}.*``. Another common nomenclature for paired fastq
files is something like ``sample_R1_L001.fastq.gz``. In this case, an
acceptable pattern would be ``*_R{1,2}_*``.

.. important::

    Note the quotes around the fastq path pattern. These quotes are necessary
    to allow nextflow to resolve the pattern, otherwise your shell might try
    to resolve it and provide the wrong input to nextflow.

Execution
---------

Once you build your pipeline with Flowcraft you have a standard nextflow pipeline
ready to run. Therefore, all you need to do is::

    nextflow run my_pipe.nf --fastq "data/*_{1,2}.*"

Changing executor and container engine
::::::::::::::::::::::::::::::::::::::

The default run mode of a FlowCraft pipeline is to be executed locally
and using the singularity container engine. In nextflow terms, this is
equivalent to having ``executor = "local"`` and ``singularity.enabled = true``.
If you want to change these settings, you can modify the
``nextflow.config`` file, or use one of the available profiles in the
``profiles.config`` file. These profiles provide a combination of common
``<executor>_<container_engine>`` that are `supported by nextflow`_. Therefore,
if you want to run the pipeline on a cluster with SLURM and shifter, you'll
just need to specify the ``slurm_shifter`` profile::

    nextflow run my_pipe.nf --fastq "data/*_{1,2}.*" -profile slurm_shifter

Common executors include:

- ``slurm``
- ``sge``
- ``lsf``
- ``pbs``

Other container engines are:

- ``docker``
- ``singularity``
- ``shifter``

.. _supported by nextflow: https://www.nextflow.io/docs/latest/executor.html

Docker images
:::::::::::::

All components of FlowCraft are executed in containers, which means that
the first time they are executed in a machine, **the corresponding image will have
to be downloaded**. In the case of docker, images are pulled and stored in
``/var/lib/docker`` by default. In the case of singularity, the
``nextflow.config`` generated by FlowCraft sets the cache dir for the
images at ``$HOME/.singularity_cache``. Note that when an image is downloading,
nextflow does not display any informative message, except for singularity where you'll
get something like::

    Pulling Singularity image docker://ummidock/trimmomatic:0.36-2 [cache /home/diogosilva/.singularity_cache/ummidock-trimmomatic-0.36-2.img]

So, if a process seems to take too long to run the first time, it's probably
because the image is being downloaded.

Results and reports
:::::::::::::::::::

As the pipeline runs, processes may write result and report files to the
``results`` and ``reports`` directories, respectively. For example, the
reports of the pipeline above, would look something like this::

    reports
    ├── coverage_1_1
    │   └── estimated_coverage_initial.csv
    ├── fastqc_1_3
    │   ├── FastQC_2run_report.csv
    │   ├── run_2
    │   │   ├── sample_1_0_summary.txt
    │   │   └── sample_1_1_summary.txt
    │   ├── sample_1_1_trim_fastqc.html
    │   └── sample_1_2_trim_fastqc.html
    └── status
        ├── master_fail.csv
        ├── master_status.csv
        └── master_warning.csv

The ``estimated_coverage_initial.csv`` file contains a very rough coverage
estimation for each sample, the ``fastqc*`` directory contains the html
reports and summary files of FastQC for each sample, and the ``status``
directory contains a log of the status, warnings and fails of each process for
each sample.

The actual results for each process that produces them, are stored in the
``results`` directory::

    results
    ├── assembly
    │   └── spades_1_4
    │       └── sample_1_trim_spades3111.fasta
    └── trimmomatic_1_2
        ├── sample_1_1_trim.fastq.gz
        └── sample_1_2_trim.fastq.gz

If you are interested in checking the actual environment where the execution
of a particular process occurred for any given sample, you can inspect the
``pipeline_stats.txt`` file in the root of the pipeline directory. This file
contains rich information about the execution of each process, including
the working directory::

    task_id hash        process         tag         status      exit    start                   container                           cpus    duration    realtime    queue   %cpu    %mem    rss     vmem
    5       7c/cae270   trimmomatic_1_2 sample_1    COMPLETED   0       2018-04-12 11:42:29.599 docker:ummidock/trimmomatic:0.36-2  2       1m 25s      1m 17s      -       329.3%  1.1%    1.5 GB  33.3 GB

The ``hash`` column contains the start of the current working directory of that
process. In the example above, the directory would be::

    work/7c/cae270*

Inspect
-------

FlowCraft has two options (``overview`` and ``broadcast``) for inspecting the
progress of a pipeline that is running locally, either in a personal computer
or a server machine. In both cases, the progress of the pipeline will be
continuously updated in real-time.

In a terminal
:::::::::::::

To open inspect in the terminal just write the following command **on the folder
that the pipeline is running**::

    flowcraft inspect

.. image:: ../resources/flowcraft_inspect_terminal.png
   :align: center

``overview`` is the default behavior of this module, but it can also be called
like this::

    flowcraft inspect -m overview

.. note::
    To exit the inspection just type ``q`` or ``ctrl+c``.

In a browser
::::::::::::

It is also possible to track the pipeline progress in a browser in any
device using the flowcraft web application. **To do so, the following command
should be run in the folder where the pipeline is running**::

    flowcraft inspect -m broadcast


This will output an URL to the terminal that can be opened in a browser.
This is an example of the screen that is displayed once the url is opened:

.. image:: ../resources/flowcraft_inspect_broadcast.png
   :align: center

.. important::
    This pipeline inspection will be available for **anyone** via the provided URL,
    which means that the URL can be shared with anyone and/or any device with
    a browser. **However, the inspection section will only be available while
    the** ``flowcraft inspect -m broadcast`` **command is running. Once this command
    is cancelled, the data will be erased from the service and the URL will
    no longer be available**.

Want to know more?
::::::::::::::::::

:doc:`pipeline_inspect` is the full documentation of the ``inspect`` mode.


Reports
-------

The reporting of a FlowCraft pipeline is saved on a JSON file that is stored
in ``pipeline_reports/pipeline_report.json``. To visualize the reports you'll just
need to execute the following command in the folder where the pipeline was executed::

    flowcraft report

This will output an URL to the terminal that can be opened in a browser.
This is an example of the screen that is displayed once the url is opened:

.. image:: ../resources/flowcraft_report.png
   :align: center

**The actual layout and content of the reports will depend on the pipeline you
build and it will only provide the information that is directly related to
your pipeline components.**

.. important::
    This pipeline report will be available for **anyone** via the provided URL,
    which means that the URL can be shared with anyone and/or any device with
    a browser. **However, the report section will only be available while
    the** ``flowcraft report`` **command is running. Once this command
    is cancelled, the data will be erased from the service and the URL will
    no longer be available**.

Real time reports
:::::::::::::::::

The reports of any FlowCraft pipeline can be monitored in real-time using the
``--watch`` option::

    flowcraft report --watch

This will output an URL exactly as in the previous section and will render the
same reports page with a small addition. In the top right of the screen in the
navigation bar, there will be a new icon that informs the user when new
reports are available:

.. image:: ../resources/flowcraft_report_watch.png
   :align: center

Local visualization
:::::::::::::::::::

The FlowCraft report JSON file can also be visualized locally by dragging and dropping
it into the FlowCraft web application page, currently hosted at http://www.flowcraft.live/reports

Offline visualization
:::::::::::::::::::::

The complete FlowCraft report is also available as a standalone HTML file that
can be visualized offline. This HTML file, stored in
``pipeline_reports/pipeline_report.html``, can be opened in any modern browser.

================================================
FILE: docs/user/components/abricate.rst
================================================
abricate
========

Purpose
-------

This component performs anti-microbial gene screening using abricate. It
includes the default databases plus the ``virulencefinder`` database.

.. note::
    Software page: https://github.com/tseemann/abricate

Input/Output type
------------------

- Input type: ``Fasta``
- Output type: None

.. note::
    The default input parameter for fasta data is ``--fasta``.

Parameters
----------

- ``abricateDatabases``: Specify the databases for abricate.

Published results
-----------------

- ``results/annotation/abricate``: Stores the results of the abricate screening
  for each sample and for each specified database.

Published reports
-----------------

None.

Default directives
------------------

- ``abricate``:
    - ``container``: ummidock/abricate
    - ``version``: 0.8.0-1
- ``process_abricate``:
    - ``container``: ummidock/abricate
    - ``version``: 0.8.0-1

Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.process_abricate`


Reports JSON
^^^^^^^^^^^^

``tableRow``:
    - ``<database>``: List of gene names
``plotData``:
    - ``<database>``:
        - ``contig``: Contig ID
        - ``seqRange``: Genomic range of the contig
        - ``gene``: Gene name
        - ``accession``: Accession number
        - ``coverage``: Coverage of the match
        - ``identity``: Identity of the match

================================================
FILE: docs/user/components/assembly_mapping.rst
================================================
assembly_mapping
================

Purpose
-------

This component performs a mapping procedure of FastQ files using their assembly
as reference. The procedure is carried out with bowtie2 and samtools and aims
to filter the assembly based on quality criteria of read coverage
and expected genome size.

.. note::
    - bowtie2 documentation can be found `here <http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml>`_.
    - samtools documentation can be found `here <http://www.htslib.org/doc/samtools-1.2.html>`_.

Input/Output type
------------------

- Input type: ``Fasta`` and ``FastQ``
- Output type: ``Fasta``

.. note::
    The default input parameter for fasta data is ``--fasta``.

Parameters
----------

- ``minAssemblyCoverage``: In auto, the default minimum coverage for each
  assembled contig is 1/3 of the assembly mean coverage or 10x, if the mean
  coverage is below 10x.
- ``AMaxContigs``: A warning is issued if the number of contigs is over
  this threshold.
- ``genomeSize``: Genome size estimate for the samples. It is used to check
  the ratio of contig number per genome MB.

Published results
-----------------

None.

Published reports
-----------------

None.

Default directives
------------------

- ``assembly_mapping``:
    - ``cpus``: 4
    - ``memory``: 5GB (dynamically increased on retry)
    - ``container``: ummidock/bowtie2_samtools
    - ``version``: 1.0.0-2
- ``process_assembly_mapping``:
    - ``cpus``: 1
    - ``memory``: 5GB (dynamically increased on retry)
    - ``container``: ummidock/bowtie2_samtools
    - ``version``: 1.0.0-2

Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.process_assembly_mapping`

Reports JSON
^^^^^^^^^^^^

``plotData``:
    - ``sparkline``: Total number of base pairs.
``warnings``:
    - When the number of contigs exceeds a provided threshold.
``fail``:
    - When the genome size is below 80% or above 150% of the expected genome size.

================================================
FILE: docs/user/components/bowtie.rst
================================================
bowtie
======

Purpose
-------

This component performs a mapping procedure of FastQ files with a given reference.
The procedure is carried out with Bowtie2.
The reference can be a set of Bowtie2 index files or a Fasta file. In the latter, the
necessary index will be created with Bowtie2-build and passed through to Bowtie2.

.. note::
    - Bowtie2 documentation can be found `here <http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml>`_.
    - Software page: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: ``Bam``

.. note::
    The default input parameter for Fastq data is ``--fastq``.

Parameters
----------

- ``reference``: Specifies the reference genome to be provided to bowtie2-build.
- ``index``: Specifies the reference indexes to be provided to bowtie2.

.. note::
    An ``index`` OR a ``reference`` fasta file must be provided

Published results
-----------------

- ``results/mapping/bowtie``: Stores the results of the mapping for each sample.

Published reports
-----------------

None.

Default directives
------------------

- ``bowtie_build``:
    - ``cpus``: 1
    - ``memory``: 5GB (dynamically increased on retry)
    - ``container``: flowcraft/bowtie2_samtools
    - ``version``: 1.0.0-1
- ``bowtie``:
    - ``cpus``: 4
    - ``memory``: 5GB (dynamically increased on retry)
    - ``container``: flowcraft/bowtie2_samtools
    - ``version``: 1.0.0-1


================================================
FILE: docs/user/components/card_rgi.rst
================================================
card_rgi
========

Purpose
-------

This component performs anti-microbial gene screening using CARD rgi.
It uses data from CARD database.

.. note::
    Software page: https://github.com/arpcard/rgi

Input/Output type
------------------

- Input type: ``Fasta``
- Output type: None

.. note::
    The default input parameter for fasta data is ``--fasta``.

Parameters
----------

- ``alignmentTool``: Specifies the alignment tool. Options: DIAMOND or BLAST

Published results
-----------------

- ``results/annotation/card_rgi``: Stores the results of the screening
  for each sample.

Published reports
-----------------

None.

Default directives
------------------

- ``container``: flowcraft/card_rgi
- ``version``: 4.0.2-0.1


================================================
FILE: docs/user/components/check_coverage.rst
================================================
check_coverage
==============

Purpose
-------

This component estimates the coverage of a given sample based on the number
of base pairs in the FastQ files of a sample and on the expected genome size:

.. math::
    \frac{\text{number of base pairs}}{(\text{genome size} \times 10^{6})}

If the estimated coverage of a given sample falls below the provided
minimum coverage threshold, the sample is filtered and does not proceed in the
pipeline.

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: ``FastQ``

.. note::
    The default input parameter for FastQ data is ``--fastq``. You can change
    the ``--fastq`` parameter default pattern (``fastq/*_{1,2}.*``) according
    to input file names (e.g.: ``--fastq "path/to/fastq/*R{1,2}.*"``).

Parameters
----------

- ``genomeSize``: Genome size estimate for the samples. It is used to
  estimate the coverage and other assembly parameters and
  checks.
- ``minCoverage``: Minimum coverage for a sample to proceed. Can be set to
  0 to allow any coverage.

Published results
-----------------

None.

Published reports
-----------------

- ``reports/coverage``: CSV table with estimated sequencing coverage for
  each sample.

Default directives
------------------

None.

Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.integrity_coverage`

Reports JSON
^^^^^^^^^^^^

``tableRow``:
    - ``Coverage``: Estimated coverage.
``fail``:
    - When estimated coverage is below the provided threshold.

================================================
FILE: docs/user/components/chewbbaca.rst
================================================
chewbbaca
=========

Purpose
-------

This component runs the allele calling operation of ChewBBACA on a set
of fasta samples to perform a cg/wgMLST analysis.

.. note::
    Software page: https://github.com/B-UMMI/chewBBACA

Input/Output type
------------------

- Input type: ``Fasta``
- Output type: None

.. note::
    The default input parameter for fasta data is ``--fasta``.

Parameters
----------

- ``chewbbacaQueue``: Specify a queue/partition for chewbbaca. This option
  is only used for grid schedulers.
- ``chewbbacaTraining``: Specify the full path to the prodigal training file
  of the corresponding species.
- ``schemaPath``: The path to the chewbbaca schema directory.
- ``schemaSelectedLoci``: The path to the selection of loci in the schema
  directory to be used. If not specified, all loci in the schema will be used.
- ``chewbbacaJson``: If set to True, chewbbaca's allele call output will be
  set to JSON format.
- ``chewbbacaToPhyloviz``: If set to True, the ExtractCgMLST module of
  chewbbaca will be executed after the allele calling.
- ``chewbbacaProfilePercentage``: Specifies the proportion of samples that
  must be present in a locus to save the profile.
- ``chewbbacaBatch``: Specifies whether a chewbbaca run will be performed on
  the complete input batch (all at the same time) or one by one.

Published results
-----------------

- ``results/chewbbaca_alleleCall``: The results of the allelecall for each
  sample.
- ``results/chewbbaca``: The cg/wgMLST schema prepared for phyloviz.

Published reports
-----------------

None.

Default directives
------------------

- ``chewbbaca``:
    - ``cpus``: 4
    - ``container``: mickaelsilva/chewbbaca_py3
    - ``version``: latest
- ``chewbbaca_batch``:
    - ``cpus``: 4
    - ``container``: mickaelsilva/chewbbaca_py3
    - ``version``: latest
- ``chewbbacaExtractMLST``:
    - ``container``: mickaelsilva/chewbbaca_py3
    - ``version``: latest


================================================
FILE: docs/user/components/diamond.rst
================================================
diamond
=======

Purpose
-------

This component performs ``blastx`` or ``blastp`` with diamond. The database
used by diamond can be provided from the local disk or generated in the process.
This component uses the same output type as abricate with the same blast output
fields.

.. note::
    Software page: https://github.com/bbuchfink/diamond


Input/Output type
-----------------

- Input type: ``Fasta``
- Output type: None

.. note::
    The default input parameter for fasta data is ``--fasta``.

Parameters
----------

- ``pathToDb``: Provide full path for the diamond database. If none is provided
  then will try to fetch from the previous process. Default: None

- ``fastaToDb``: Provide the full path for the fasta to construct a diamond
  database. Default: None

- ``blastType``: Defines the type of blast that diamond will do. Can either be
  blastx or blastp. Default: blastx

Published results
-----------------

- ``results/annotation/diamond*``: Stores the results of the diamond screening
  for each sample and for each specified database.

Published reports
-----------------

None.

Default directives
------------------

- ``diamond``:
    - ``container``: flowcraft/diamond
    - ``version``: 0.9.22-1
    - ``memory``: { 4.GB * task.attempt }
    - ``cpus``: 2

================================================
FILE: docs/user/components/downsample_fastq.rst
================================================
downsample_fastq
================

Purpose
-------

downsample_fastq uses seqtk to subsample fastq read data to a target coverage depth
if the estimated coverage is higher than the provided target depth. When
no subsample is required, it outputs the original FastQ files.

.. note::
    Software page: https://github.com/lh3/seqtk

Input/Output type
------------------

- Input type: ``fastq``
- Output type: ``fastq``

Parameters
----------

- ``genomeSize``: Genome size estimate for the samples. It is used to
  estimate the coverage.
- ``depth``: The target depth to which the reads should be subsampled.
- ``seed``: The seed number for seqtk. By default it is 100.

Published results
-----------------

- ``results/sample_fastq``: Stores the subsampled FastQ files

Published reports
-----------------

None.

Default directives
------------------

- ``cpus``: 1
- ``memory``: 4GB
- ``container``: flowcraft/seqtk
- ``version``: 1.3.0-3

Advanced
--------

Reports JSON
^^^^^^^^^^^^

``tableRow``:
    - ``Coverage``: Estimated coverage.

================================================
FILE: docs/user/components/fast_ani.rst
================================================
fast_ani
========

Purpose
-------

This component performs pairwise comparisons between fastas,
given a multifasta as input for fastANI. It will split the multifasta into
single fastas that will then be provided as a matrix. The output will be
all the pairwise comparisons that pass the minimum of 50 aligned sequences with a
default length of 200 bp.

Input/Output type
------------------

- Input type: ``fasta``
- Output type: ``None``


Parameters
----------

- ``fragLen``: Sets the minimum size of the fragment to be passed to the
  ``--fragLen`` argument of fastANI.


Published results
-----------------

- ``results/fast_ani/``: A text file with the extension ``.out``, which has all
  the pairwise comparisons between sequences, reporting ANI.


Published reports
-----------------

None.


Default directives
------------------

- ``fastAniMatrix``:
    - ``container``: flowcraft/fast_ani
    - ``version``: 1.1.0-2
    - ``cpus``: 20
    - ``memory``: { 30.GB * task.attempt }

================================================
FILE: docs/user/components/fasterq_dump.rst
================================================
fasterq_dump
============

Purpose
-------

This component downloads reads from the SRA public databases from a
list of accessions. This component uses ``fasterq-dump`` from
`NCBI sra-tools <https://github.com/ncbi/sra-tools>`_. ``fasterq-dump``
increases the download speed in comparison to ``fastq-dump`` by
**multi-threading** the extraction of FASTQ from SRA-accessions.
The reads for each accession are then emitted through
the main output of this component to any other component (or components) that
receive FastQ data.

Input/Output type
------------------

- Input type: ``accessions``
- Output type: ``fastq``

.. note::
    The default input parameter for Accessions data is ``--accessions``.

Parameters
----------

- ``option_file``: This option enables the *option-file* parameter of
  ``fasterq-dump``, allowing parameters to be passed.
- ``compress_fastq``: This option allows users to disable the compression of
  the fastq files resulting from this component. The default (``true``) behavior
  compresses the fastq files to *fastq.gz*.

Published results
-----------------

- ``reads/<accession>``: Stores the reads for each provided accession.

Published reports
-----------------

None.

Default directives
------------------

- ``cpus``: 1
- ``memory``: 1GB
- ``container``: flowcraft/sra-tools
- ``version``: 2.9.1-1


================================================
FILE: docs/user/components/fastqc.rst
================================================
fastqc
======

Purpose
-------

This component runs FastQC on paired-end FastQ files.

.. note::
    Software page: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: ``FastQ``

.. note::
    The default input parameter for FastQ data is ``--fastq``. You can change
    the ``--fastq`` parameter default pattern (``fastq/*_{1,2}.*``) according
    to input file names (e.g.: ``--fastq "path/to/fastq/*R{1,2}.*"``).

Parameters
----------

- ``adapters``: Provide a non-default fasta file containing the adapter
  sequences to screen overrepresented sequences against.

Published results
-----------------

None.

Published reports
-----------------

- ``reports/fastqc``: Stores the FastQC HTML reports for each sample.
- ``reports/fastqc/run_2/``: Stores the summary text files with the category
  results of FastQC for each sample.

Default directives
------------------

- ``cpus``: 2
- ``memory``: 4GB
- ``container``: ummidock/fastqc
- ``version``: 0.11.7-1

Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.fastqc_report`

Reports JSON
^^^^^^^^^^^^

``plotData``:
    - ``base_sequence_quality``: Per base sequence quality data
        - (This structure is repeated for the other entries)
        - ``status``: Status of the category (PASS, WARN, etc)
        - ``data``: Plot data
    - ``sequence_quality``: Per sequence quality data
    - ``base_gc_content``: GC content distribution
    - ``base_n_content``: Per base N content
    - ``sequence_length_dist``: Distribution of sequence read length
    - ``per_base_sequence_content``: Per base sequence content
``warnings``:
    - List of failures or warnings for some non-sensitive FastQC categories
``fail``:
    - Failure message when sensitive FastQC categories fail or do not pass.


================================================
FILE: docs/user/components/fastqc_trimmomatic.rst
================================================
fastqc_trimmomatic
==================

Purpose
-------

This component runs Trimmomatic on paired-end FastQ files but uses information
on the per-base GC content variation reported by FastQC to guide the trimming
of the FastQ reads.

.. note::
    Software pages: FastQC (https://www.bioinformatics.babraham.ac.uk/projects/fastqc/);
    Trimmomatic (http://www.usadellab.org/cms/?page=trimmomatic)

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: ``FastQ``

.. note::
    The default input parameter for FastQ data is ``--fastq``. You can change
    the ``--fastq`` parameter default pattern (``fastq/*_{1,2}.*``) according
    to input file names (e.g.: ``--fastq "path/to/fastq/*R{1,2}.*"``).

Parameters
----------

- ``adapters``: Provide a non-default fasta file containing the adapter
  sequences used to screen overrepresented sequences against and to filter
  the FastQ files.
- ``trimSlidingWindow``: Perform sliding window trimming, cutting once the
  average quality within the window falls below a threshold.
- ``trimLeading``: Cut bases off the start of a read, if below a threshold
  quality.
- ``trimTrailing``: Cut bases off the end of a read, if below a threshold
  quality.
- ``trimMinLength``: Drop the read if it is below a specified length.

Published results
-----------------

- ``results/trimmomatic``: The trimmed FastQ files for each sample.

Published reports
-----------------

- ``reports/fastqc``: Stores the FastQC HTML reports for each sample and a
  ``FastQC_trim_report.csv`` file containing the trimming values suggested
  by the analysis of the FastQC report.
- ``reports/fastqc/run_1/``: Stores the summary text files with the category
  results of FastQC for each sample.

Default directives
------------------

- ``fastqc``:
    - ``cpus``: 2
    - ``memory``: 4GB
    - ``container``: ummidock/fastqc
    - ``version``: 0.11.7-1

- ``trimmomatic``:
    - ``cpus``: 2
    - ``memory``: 4GB (dynamically increased on retry)
    - ``container``: ummidock/trimmomatic
    - ``version``: 0.36-2


Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.fastqc_report`
:mod:`flowcraft.templates.trimmomatic`
:mod:`flowcraft.templates.trimmomatic_report`

Reports JSON
^^^^^^^^^^^^

``tableRow``:
    ``Trimmed (%)``: Percentage of trimmed nucleotides
``plotData``:
    ``sparkline``: Number of nucleotides after trimming
``badReads``: Number of discarded reads


================================================
FILE: docs/user/components/filter_poly.rst
================================================
filter_poly
===========

Purpose
-------

This component removes low complexity sequence from read data
using PrinSeq.

.. note::
    Software page: http://prinseq.sourceforge.net/

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: ``FastQ``

.. note::
    The default input parameter for fastq data is ``--fastq``.

Parameters
----------

- ``adapter``: Pattern to filter the reads. Please separate parameter values with a space
  and separate new parameter sets with semicolon (;). Parameters are defined by two values:
  the pattern (any combination of the letters ATCGN), and the number of repeats or percentage
  of occurrence. Default: A 50%; T 50%; N 50%

Published results
-----------------

None.

Published reports
-----------------

None.

Default directives
------------------

- ``container``: flowcraft/prinseq
- ``version``: 0.20.4-1
- ``memory``: 4.GB * task.attempt
- ``cpus``: 1


================================================
FILE: docs/user/components/integrity_coverage.rst
================================================
integrity_coverage
==================

Purpose
-------

This component is intended to test the integrity of the provided FastQ files.
It does so by attempting to parse uncompressed or compressed (``gz``, ``bz2``
or ``zip``) FastQ files (paired-end or single-end). During this parse, if the
FastQ files are not corrupt, it retrieves the following information:

- **sequence encoding**: Estimates the sequence encoding based on the quality
  scores. This information can then be passed to other components that might
  require it.
- **estimated coverage**: Provides a rough coverage estimation for each sample
  based on a user-provided genome size (see `Parameters`_). This estimation
  is essentially

  .. math::
      \frac{\text{number of base pairs}}{(\text{genome size} \times 10^{6})}

  This information is written to the ``reports`` directory (See
  `Published reports`_)
- **maximum read length**: Retrieves the maximum read length for each sample.

.. important::
    If the ``minCoverage`` parameter value is set to higher than 0, this
    component will filter samples with an estimated coverage below that
    threshold.

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: ``FastQ``

.. note::
    The default input parameter for FastQ data is ``--fastq``. You can change
    the ``--fastq`` parameter default pattern (``fastq/*_{1,2}.*``) according
    to input file names (e.g.: ``--fastq "path/to/fastq/*R{1,2}.*"``).

Parameters
----------

- ``genomeSize``: Genome size estimate for the samples. It is used to
  estimate the coverage and other assembly parameters and
  checks.
- ``minCoverage``: Minimum coverage for a sample to proceed. Can be set to
  0 to allow any coverage.

.. note::
    You can use these parameters as in the following example:
    ``--genomeSize 3``.

Published results
-----------------

None.

Published reports
-----------------

- ``reports/coverage``: CSV table with estimated sequencing coverage for
  each sample.
- ``reports/corrupted``: Text file with list of corrupted samples.

Default directives
------------------

None.


Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.integrity_coverage`

Reports JSON
^^^^^^^^^^^^

``tableRow``:
    - ``Raw BP``: Number of nucleotides.
    - ``Reads``: Number of reads.
    - ``Coverage``: Estimated coverage.
``plotData``:
    - ``sparkline``: Number of nucleotides.
``warnings``:
    - When the encoding and/or phred score cannot be inferred from FastQ files.
``fail``:
    - When estimated coverage is below the provided threshold.

================================================
FILE: docs/user/components/kraken.rst
================================================
kraken
======

Purpose
-------

This component performs Kraken to assign taxonomic labels to short DNA
sequences, usually obtained through metagenomic studies.

.. note::
    Software page: https://ccb.jhu.edu/software/kraken/

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: None

.. note::
    The default input parameter for fastq data is ``--fastq``.

Parameters
----------

- ``krakenDB``: Specifies kraken database. Default: minikraken_20171013_4GB (in path)

Published results
-----------------

- ``results/taxonomy/kraken``: Stores the results of the screening
  for each sample.

Published reports
-----------------

None.

Default directives
------------------

- ``container``: flowcraft/kraken
- ``version``: 1.0-0.1

================================================
FILE: docs/user/components/kraken2.rst
================================================
kraken2
=======

Purpose
-------

This component performs Kraken2 to assign taxonomic labels to short DNA
sequences, usually obtained through metagenomic studies.

.. note::
    Software page: https://ccb.jhu.edu/software/kraken2/

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: txt

.. note::
    The default input parameter for fastq data is ``--fastq``.

Parameters
----------

- ``kraken2DB``: Specifies kraken2 database. Default: minikraken2_v1_8GB (in path inside the
  default container)

Published results
-----------------

- ``results/taxonomy/kraken2``: Stores the results of the screening
  for each sample.

Published reports
-----------------

None.

Default directives
------------------

- ``container``: flowcraft/kraken2
- ``version``: 2.0.7-1
- ``cpus``: 3
- ``memory``: 5GB (dynamically increased on retry)


================================================
FILE: docs/user/components/mapping_patlas.rst
================================================
mapping_patlas
==============

Purpose
-------

This component performs mapping (using `bowtie2` and `samtools`) against a
plasmid database in order to find
plasmids contained in high throughput sequencing data. Then, the resulting file
can be imported into `pATLAS <http://www.patlas.site/>`_.

.. note::
    - pATLAS documentation can be found `here <https://tiagofilipe12.gitbooks.io/patlas/content/>`_.
    - bowtie2 documentation can be found `here <http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml>`_.
    - samtools documentation can be found `here <http://www.htslib.org/doc/samtools-1.2.html>`_.

Input/Output type
------------------

- Input type: ``fastq``
- Output type: ``json``


Parameters
----------

- ``max_k``: Sets the k parameter for bowtie2 allowing to make multiple mappings
  of the same read against several hits on the query sequence or sequences.
  Default: 10949.

- ``trim5``: Sets trim5 option for bowtie. This will become legacy with QC
  integration, but it enables to trim 5' end of reads to be mapped with bowtie2.
  Default: 0

- ``lengthJson``: A dictionary of all the lengths of reference sequences.
  Default: 'jsons/*_length.json' (from docker image).

- ``refIndex``: Specifies the reference indexes to be provided to bowtie2.
  Default: '/ngstools/data/indexes/bowtie2idx/bowtie2.idx' (from docker image).

- ``samtoolsIndex``: Specifies the reference indexes to be provided to samtools.
  Default: '/ngstools/data/indexes/fasta/samtools.fasta.fai' (from docker image).


Published results
-----------------

- ``results/mapping/``: A `JSON` file that can be imported to `pATLAS <http://www.patlas.site/>`_
  with the results from mapping.


Published reports
-----------------

None.


Default directives
------------------

- ``mappingBowtie``:
    - ``container``: flowcraft/mapping-patlas
    - ``version``: 1.6.0-1
- ``samtoolsView``:
    - ``container``: flowcraft/mapping-patlas
    - ``version``: 1.6.0-1
- ``jsonDumpingMapping``:
    - ``container``: flowcraft/mapping-patlas
    - ``version``: 1.6.0-1


================================================
FILE: docs/user/components/mash_dist.rst
================================================
mash_dist
=========

Purpose
-------

This component executes mash dist to find plasmids
within high throughput sequencing data, using as inputs fasta files
(e.g. contigs). Then, the resulting file can
be imported into `pATLAS <http://www.patlas.site/>`_.
This component calculates pairwise distances between sequences
(one from the database and the query sequence).
However, this process can be used for other purposes, by providing a different
database than the default that is intended for plasmid searches.

.. note::
    - pATLAS documentation can be found `here <https://tiagofilipe12.gitbooks.io/patlas/content/>`_.
    - MASH documentation can be found `here <https://mash.readthedocs.io/en/latest/>`_.


Input/Output type
------------------

- Input type: ``fasta``
- Output type: ``json``


Parameters
----------

- ``mash_distance``: Sets the maximum distance between two sequences to be
  included in the output. Default: 0.1.

.. note::
    The subtraction of 1 - `mash_distance` can be used as an approximation to
    Average Nucleotide Identity (ANI). For instance, a mash distance of 0.1
    correlates well with ANI at 0.9 (90%).

- ``pValue``: P-value cutoff for the distance estimation between two sequences
  to be included in the output. Default: 0.05.

- ``shared_hashes``: Sets a minimum percentage of hashes shared between two
  sequences in order to include its result in the output. Default: 0.8.

- ``refFile``: Specifies the reference file to be provided to mash. It can either
  be a fasta or a .msh reference sketch generated by mash.
  Default: '/ngstools/data/patlas.msh'. If the component ``mash_sketch_fasta``
  is executed before this component, this parameter will be ignored and instead
  the secondary link between the two processes will be used to feed this
  component with the reference sketch.


Published results
-----------------

- ``results/mashdist/``: A `JSON` file that can be imported to `pATLAS <http://www.patlas.site/>`_
  with the results from mash dist.


Published reports
-----------------

None.


Default directives
------------------

- ``runMashDist``:
    - ``container``: flowcraft/mash-patlas
    - ``version``: 1.6.0-1
- ``mashDistOutputJson``:
    - ``container``: flowcraft/mash-patlas
    - ``version``: 1.6.0-1


================================================
FILE: docs/user/components/mash_screen.rst
================================================
mash_screen
===========

Purpose
-------

This component performs mash screen to find plasmids
contained in high throughput sequencing data, using as inputs read files
(FastQ files). Then, the resulting file can
be imported into `pATLAS <http://www.patlas.site/>`_.
This component searches for containment of a given sequence in read sequencing
data.
However, this process can be used for other purposes, by providing a different
database than the default that is intended for plasmid searches.

.. note::
    - pATLAS documentation can be found `here <https://tiagofilipe12.gitbooks.io/patlas/content/>`_.
    - MASH documentation can be found `here <https://mash.readthedocs.io/en/latest/>`_.


Input/Output type
------------------

- Input type: ``fastq``
- Output type: ``json``


Parameters
----------

- ``noWinner``: A variable that enables the use of -w option for mash screen.
  Default: false.

- ``pValue``: P-value cutoff for the distance estimation between two sequences to
  be included in the output. Default: 0.05.

- ``identity``: The percentage of identity between the reads input and the
  reference sequence. Default: 0.9.

- ``refFile``: Specifies the reference file to be provided to mash. It can
  either be a fastq or a .msh reference sketch generated by mash.
  Default: '/ngstools/data/patlas.msh'. If the component ``mash_sketch_fastq``
  is executed before this component, this parameter will be ignored and instead
  the secondary link between the two processes will be used to feed this
  component with the reference sketch.


Published results
-----------------

- ``results/mashscreen/``: A `JSON` file that can be imported to `pATLAS <http://www.patlas.site/>`_
  with the results from mash screen.


Published reports
-----------------

None.


Default directives
------------------

- ``mashScreen``:
    - ``container``: flowcraft/mash-patlas
    - ``version``: 1.6.0-1
- ``mashOutputJson``:
    - ``container``: flowcraft/mash-patlas
    - ``version``: 1.6.0-1


================================================
FILE: docs/user/components/mash_sketch_fasta.rst
================================================
mash_sketch_fasta
=================

Purpose
-------

This component performs mash sketch for fasta input files.

.. note::
    - MASH documentation can be found `here <https://mash.readthedocs.io/en/latest/>`_.


Input/Output type
------------------

- Input type: ``fasta``
- Output type: ``msh``


Parameters
----------

- ``kmerSize``: Parameter to set the kmer size for hashing. Default: 21.

- ``sketchSize``: Parameter to set the number of hashes per sketch.
  Default: 1000.


Published results
-----------------

None.


Published reports
-----------------

None.


Default directives
------------------

- ``mashSketchFasta``:
    - ``container``: flowcraft/mash-patlas
    - ``version``: 1.6.0-1


================================================
FILE: docs/user/components/mash_sketch_fastq.rst
================================================
mash_sketch_fastq
=================

Purpose
-------

This component performs mash sketch for fastq input files. These sketches can
be used by ``mash_dist`` and ``mash_screen`` components to fetch the
reference file for mash.

.. note::
    - MASH documentation can be found `here <https://mash.readthedocs.io/en/latest/>`_.


Input/Output type
------------------

- Input type: ``fastq``
- Output type: ``msh``


Parameters
----------

- ``kmerSize``: Parameter to set the kmer size for hashing. Default: 21.

- ``sketchSize``: Parameter to set the number of hashes per sketch.
  Default: 1000.

- ``minKmer``: Minimum copies of each k-mer required to pass noise filter for
  reads. Default: 1.

- ``genomeSize``: Genome size (raw bases or with K/M/G/T). If specified, will
  be used for p-value calculation instead of an estimated size from k-mer
  content. Default: *false*, meaning that it won't be used. If you want to use
  it pass a number to this parameter.


Published results
-----------------

None.


Published reports
-----------------

None.


Default directives
------------------

- ``mashSketchFastq``:
    - ``container``: flowcraft/mash-patlas
    - ``version``: 1.6.0-1


================================================
FILE: docs/user/components/maxbin2.rst
================================================
maxbin2
=======

Purpose
-------

This component is an automated binning algorithm to recover genomes from multiple metagenomic datasets.

.. note::
    Software page: https://sourceforge.net/projects/maxbin2/

Input/Output type
------------------

- Input type: ``Fasta``  and ``FastQ``
- Output type: ``Fasta``

.. note::
    The default input parameter for fasta is ``--fasta``. This process also requires FastQ files.
    If the FastQ files are input to any upstream process, those will be provided to maxbin2 automatically,
    if not, they can be provided with the parameter ``--fastq``.

Parameters
----------

- ``min_contig_lenght``: Minimum contig length. Default: 1000

- ``max_iteration``: Maximum Expectation-Maximization algorithm iteration number. Default: 50

- ``prob_threshold``: Probability threshold for EM final classification. Default: 0.9

Published results
-----------------

- ``results/maxbin2/``: Stores the results of the binning in a folder
  for each sample.

Published reports
-----------------

None.

Default directives
------------------

- ``container``: flowcraft/maxbin2
- ``version``: 2.2.4-1
- ``cpus``: 4
- ``memory``: 8.GB (dynamically increased on retry)


Template
^^^^^^^^

:mod:`flowcraft.templates.maxbin2`

================================================
FILE: docs/user/components/megahit.rst
================================================
megahit
=======

Purpose
-------

This component assembles metagenomic paired-end FastQ files using the megahit assembler.

.. note::
    Software page: https://github.com/voutcn/megahit

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: ``Fasta``

.. note::
    The default input parameter for FastQ data is ``--fastq``. You can change
    the ``--fastq`` parameter default pattern (``fastq/*_{1,2}.*``) according
    to input file names (e.g.: ``--fastq "path/to/fastq/*R{1,2}.*"``).

Parameters
----------

- ``megahitKmers``: If 'auto' the megahit k-mer lengths will be determined
  from the maximum read length of each assembly. If 'default', megahit will
  use the default k-mer lengths.

- ``fastg``: When true, it converts megahit intermediate contigs into fastg.
  Default: False


Published results
-----------------

- ``results/assembly/megahit``: Stores the fasta assemblies for each sample.

Published reports
-----------------

None.

Default directives
------------------

- ``cpus``: 4
- ``memory``: 5GB (dynamically increased on retry)
- ``container``: cimendes/megahit
- ``version``: v1.1.3-0.1
- ``scratch``: true

Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.megahit`

================================================
FILE: docs/user/components/metamlst.rst
================================================
metamlst
========

Purpose
-------

Checks the ST of metagenomic reads using metamlst.

.. note::
    Software page: https://bitbucket.org/CibioCM/metamlst

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: None

.. note::
    The default input parameter for fastq data is ``--fastq``.

Parameters
----------

- ``metamlstDB``: Specify the metamlst database (full path) for MLST checking

- ``metamlstDB_index``: Specify the Bowtie2 metamlst database index (full path) for MLST checking

Published results
-----------------

- ``results/annotation/metamlst``: Stores the results of the ST for each sample.

Published reports
-----------------

None.

Default directives
------------------

- ``container``: flowcraft/metamlst
- ``version``: 1.1-1
- ``memory``: 4.Gb * task.attempt


================================================
FILE: docs/user/components/metaspades.rst
================================================
metaspades
==========

Purpose
-------

This component assembles metagenomic paired-end FastQ files using the metaSPAdes assembler.

.. note::
    Software page: http://bioinf.spbau.ru/spades

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: ``Fasta``

.. note::
    The default input parameter for FastQ data is ``--fastq``. You can change
    the ``--fastq`` parameter default pattern (``fastq/*_{1,2}.*``) according
    to input file names (e.g.: ``--fastq "path/to/fastq/*R{1,2}.*"``).

Parameters
----------

- ``metaspadesKmers``: If 'auto' the metaSPAdes k-mer lengths will be determined
  from the maximum read length of each assembly. If 'default', metaSPAdes will
  use the default k-mer lengths.

Published results
-----------------

- ``results/assembly/metaspades``: Stores the fasta assemblies for each sample.

Published reports
-----------------

None.

Default directives
------------------

- ``cpus``: 4
- ``memory``: 5GB (dynamically increased on retry)
- ``container``: ummidock/spades
- ``version``: 3.11.1-1
- ``scratch``: true

Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.metaspades`

================================================
FILE: docs/user/components/midas_species.rst
================================================
midas_species
=============

Purpose
-------

This component performs MIDAS to assign taxonomic labels for species to short DNA
sequences, usually obtained through metagenomic studies.

.. note::
    Software page: https://github.com/snayfach/MIDAS

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: None

.. note::
    The default input parameter for fastq data is ``--fastq``.

Parameters
----------

- ``midasDB``: Specifies MIDAS database. Default: /MidasDB/midas_db_v1.2

Published results
-----------------

- ``results/taxonomy/midas``: Stores the results of the screening
  for each sample.

Published reports
-----------------

None.

Default directives
------------------

- ``container``: flowcraft/midas
- ``version``: 1.3.2-0.1
- ``memory``: 2.Gb*task.attempt
- ``cpus``: 3

================================================
FILE: docs/user/components/mlst.rst
================================================
mlst
====

Purpose
-------

Checks the ST of an assembly using mlst.

.. note::
    Software page: https://github.com/tseemann/mlst

Input/Output type
------------------

- Input type: ``Fasta``
- Output type: None

.. note::
    The default input parameter for fasta data is ``--fasta``.

Parameters
----------

- ``mlstSpecies``: Specify the expected species for MLST.

Published results
-----------------

- ``results/annotation/mlst``: Stores the results of the ST for each sample.

Published reports
-----------------

None.

Default directives
------------------

- ``container``: ummidock/mlst


Advanced
--------

Reports JSON
^^^^^^^^^^^^

``tableRow``:
    - ``mlst``: Predicted species.
``expectedSpecies``: Name of the expected species.

``species``: Name of inferred species.


================================================
FILE: docs/user/components/momps.rst
================================================
momps
========

Purpose
-------

This component performs Multi-Locus Sequence Typing (MLST) on Legionella pneumophila
from reads and assemblies.

.. note::
    Software page: https://github.com/bioinfo-core-BGU/mompS

Input/Output type
------------------

- Input type: ``fasta``
- Output type: None

.. note::
    The default input parameter for fasta data is ``--fasta``. This process
    also requires FastQ reads provided via the ``--fastq`` parameter.

Parameters
----------

None.

Published results
-----------------

- ``results/typing/momps``: Stores TSV files with the ST and allelic profiles
  for each strain.

Published reports
-----------------

None.

Default directives
------------------

- ``momps``:
    - ``container``: flowcraft/momps
    - ``version``: 0.1.0-4

Advanced
--------

Reports JSON
^^^^^^^^^^^^

``typing``:
    - ``momps``: <typing result>

================================================
FILE: docs/user/components/patho_typing.rst
================================================
patho_typing
============

Purpose
-------

Patho_typing is a software for *in silico* pathogenic typing
directly from raw Illumina reads.

.. note::
    Software page: https://github.com/B-UMMI/patho_typing

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: None

Parameters
----------

- ``species``: Species name. Must be the complete species name with genus
  and species, e.g.: 'Yersinia enterocolitica'.

Published results
-----------------

- ``results/pathotyping/<sample id>``: Stores the results of patho_typing in
  text and tabular format.

Published reports
-----------------

None.

Default directives
------------------

- ``cpus``: 4
- ``memory``: 4GB
- ``container``: ummidock/patho_typing
- ``version``: 0.3.0-1

Advanced
--------

Reports JSON
^^^^^^^^^^^^

``typing``:
    - ``pathotyping``: <typing result>

================================================
FILE: docs/user/components/pilon.rst
================================================
pilon
=====

Purpose
-------

This component performs a mapping procedure of FastQ files onto their
assembly and performs filtering based on quality criteria of read coverage
and genome size.

.. note::
    Software page: https://github.com/broadinstitute/pilon

Input/Output type
------------------

- Input type: ``Fasta`` and ``FastQ``
- Output type: ``Fasta``

.. note::
    The default input parameter for fasta data is ``--fasta``.

Parameters
----------

None.

Published results
-----------------

- ``results/assembly/pilon``: Stores the polished fasta assemblies for each
  sample.

Published reports
-----------------

- ``reports/assembly/pilon``: Table with several summary statistics about the
  assembly for each sample.

Default directives
------------------

- ``pilon``:
    - ``cpus``: 4
    - ``memory``: 7GB (dynamically increased on retry)
    - ``container``: ummidock/pilon
    - ``version``: 1.22.0-2
- ``process_assembly_mapping``:
    - ``cpus``: 1
    - ``memory``: 7GB (dynamically increased on retry)
    - ``container``: ummidock/pilon
    - ``version``: 1.22.0-2

Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.assembly_report`

Reports JSON
^^^^^^^^^^^^
``tableRow``:
    - ``Contigs``: Number of contigs.
    - ``Assembled BP``: Number of assembled base pairs.
``plotData``:
    - ``size_dist``: Distribution of contig size.
    - ``sparkline``: Number of assembled base pairs.
    - ``genomeSliding``:
        - ``gcData``: Genome sliding window of GC content.
        - ``covData``: Genome sliding window of read coverage depth.
        - ``window``: Size of sliding window
        - ``xbars``: Position of contigs along the genome sliding window.
        - ``assemblyFile``: Name of the input assembly file.
``warnings``:
    - When the number of contigs exceeds a given threshold.
``fail``:
    - When the genome size is below 80% or above 150% of the expected genome size.


================================================
FILE: docs/user/components/process_skesa.rst
================================================
process_skesa
==============

Purpose
-------

This component processes the assembly resulting from the Skesa software and,
optionally, filters contigs based on user-provided parameters.

Input/Output type
------------------

- Input type: ``Fasta``
- Output type: ``Fasta``

.. note::
    The default input parameter for fasta data is ``--fasta``.

Parameters
----------

- ``skesaMinKmerCoverage``: Minimum contigs K-mer coverage. After assembly
  only keep contigs with reported k-mer coverage equal or above this value.
- ``skesaMinContigLen``: Filter contigs for length greater or equal than
  this value.
- ``skesaMaxContigs``: Maximum number of contigs per 1.5 Mb of expected
  genome size.

Published results
-----------------

None.

Published reports
-----------------

- ``reports/assembly/skesa_filter``: The filter status for each contig and
  each sample. If any contig does not pass the filters, it reports which 
  filter type it failed and the corresponding value.

Default directives
------------------

- ``container``: ummidock/skesa
- ``version``: 0.2.0-3

Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.process_assembly`

Reports JSON
^^^^^^^^^^^^

``tableRow``:
    - ``Contigs (<assembler>)``: Number of contigs.
    - ``Assembled BP (<assembler>)``: Number of assembled base pairs.
``warnings``:
    - When the number of contigs exceeds a given threshold.
``fail``:
    - When the genome size is below 80% or above 150% of the expected genome size.


================================================
FILE: docs/user/components/process_spades.rst
================================================
process_spades
==============


Purpose
-------

This component processes the assembly resulting from the Spades software and,
optionally, filters contigs based on user-provided parameters.

Input/Output type
------------------

- Input type: ``Fasta``
- Output type: ``Fasta``

.. note::
    The default input parameter for fasta data is ``--fasta``.

Parameters
----------

- ``spadesMinKmerCoverage``: Minimum contigs K-mer coverage. After assembly
  only keep contigs with reported k-mer coverage equal or above this value.
- ``spadesMinContigLen``: Filter contigs for length greater or equal than
  this value.
- ``spadesMaxContigs``: Maximum number of contigs per 1.5 Mb of expected
  genome size.

Published results
-----------------

None.

Published reports
-----------------

- ``reports/assembly/spades_filter``: The filter status for each contig and
  each sample. If any contig does not pass the filters, it reports which
  filter type it failed and the corresponding value.

Default directives
------------------

- ``container``: ummidock/spades
- ``version``: 3.11.1-1

Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.process_assembly`

Reports JSON
^^^^^^^^^^^^

``tableRow``:
    - ``Contigs (<assembler>)``: Number of contigs.
    - ``Assembled BP (<assembler>)``: Number of assembled base pairs.
``warnings``:
    - When the number of contigs exceeds a given threshold.
``fail``:
    - When the genome size is below 80% or above 150% of the expected genome size.
 ``process_assembly``: Failure messages

================================================
FILE: docs/user/components/prokka.rst
================================================
prokka
======


Purpose
-------

This component performs annotations using the annotations available in
`prokka <https://github.com/tseemann/prokka>`_.


Input/Output type
-----------------

- Input type: ``fasta``
- Output type: ``None``

.. note::
    - Although the component doesn't have an output channel it writes the results into the ``publishDir``.


Parameters
----------

- ``centre``: Sets the sequencing centre ID.
  Default: 'UMMI'.

- ``kingdom``: Selects the annotation mode between Archaea, Bacteria,
  Mitochondria, Viruses. Default: Bacteria.

- ``genus``: Allows user to select a genus name. Default: 'Genus' (same
  as prokka). This also adds the use of the --usegenus flag to prokka.


Published results
-----------------

- ``results/annotation/prokka_<pid>/<sample_id>``: All the outputs from prokka
  will be available in these directories.


Published reports
-----------------

None.


Default directives
------------------

- ``prokka``:
    - ``cpus``: 2
    - ``container``: ummidock/prokka
    - ``version``: 1.12


================================================
FILE: docs/user/components/reads_download.rst
================================================
reads_download
==============

Purpose
-------

This component downloads reads from the SRA/ENA public databases from a
list of accessions. First, it tries to use `aspera connect`_ to download
reads, if a valid aspera key is provided. Otherwise it uses curl, which is
substantially slower. The reads for each accession are then emitted through
the main output of this component to any other component (or components) that
receive FastQ data.

.. _aspera connect: http://asperasoft.com/download_connect/

Input/Output type
------------------

- Input type: ``accessions``
- Output type: ``fastq``

.. note::
    The default input parameter for Accessions data is ``--accessions``.

Parameters
----------

- ``asperaKey``: Downloads fastq accessions using Aspera Connect
  by providing the private-key file 'asperaweb_id_dsa.openssh' normally found
  in ~/.aspera/connect/etc/asperaweb_id_dsa.openssh after the installation.

Published results
-----------------

- ``reads/<accession>``: Stores the reads for each provided accession.

Published reports
-----------------

None.

Default directives
------------------

- ``cpus``: 1
- ``memory``: 1GB
- ``container``: flowcraft/getseqena
- ``version``: 0.4.0-2


================================================
FILE: docs/user/components/remove_host.rst
================================================
remove_host
===========

Purpose
-------

This component performs a mapping procedure of FastQ files using a host
genome as reference (default: hg19). The procedure is carried out with
bowtie2 and samtools and aims to filter the reads that map to the host genome.

.. note::
    - bowtie2 documentation can be found `here <http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml>`_.
    - samtools documentation can be found `here <http://www.htslib.org/doc/samtools-1.2.html>`_.

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: ``FastQ``

.. note::
    The default input parameter for fastq data is ``--fastq``.

Parameters
----------

- ``refIndex``: Specifies the reference indexes to be provided to bowtie2.
  Default: '/index_hg19/hg19' (from docker image).


Published results
-----------------

- ``results/mapping/``: A `txt` file from bowtie2 with the mapping statistics.

Published reports
-----------------

None.

Default directives
------------------

- ``remove_host``:
    - ``cpus``: 3
    - ``memory``: 5GB (dynamically increased on retry)
    - ``container``: flowcraft/remove_host
    - ``version``: 2-0.1


Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.remove_host`


================================================
FILE: docs/user/components/retrieve_mapped.rst
================================================
retrieve_mapped
===============

Purpose
-------

This component retrieves the mapped reads from a previous bowtie mapping process.
The procedure is carried out with samtools and aims to retrieve the reads that map to the target reference.

.. note::
    - samtools documentation can be found `here <http://www.htslib.org/doc/samtools-1.2.html>`_.

Input/Output type
------------------

- Input type: ``bam``
- Output type: ``FastQ``

.. note::
    This process has the ``bowtie2`` process as a dependency.

Parameters
----------

None

Published results
-----------------

- ``results/mapping/retrieve_mapped``: Contains the resulting ``FastQ`` files.

Published reports
-----------------

None.

Default directives
------------------

- ``retrieve_mapped``:
    - ``cpus``: 2
    - ``memory``: 5GB (dynamically increased on retry)
    - ``container``: flowcraft/bowtie2_samtools
    - ``version``: 1.0.0-1


================================================
FILE: docs/user/components/seq_typing.rst
================================================
seq_typing
==========

Purpose
-------

Seq_typing is a software that determines the type of a given sample using a
read mapping approach against a set of reference sequences. Sample's reads
are mapped to the given reference sequences and, based on the length of the
sequence covered and its depth of coverage, seq_typing decides which reference
sequence is more likely to be present and returns the type associated with
such sequences.

.. note::
    Software page: https://github.com/B-UMMI/seq_typing

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: None

Parameters
----------

- ``referenceFileO``: Fasta file containing reference sequences. If more
  than one file is passed via the ``referenceFileH`` parameter, a reference
  sequence for each file will be determined.
- ``referenceFileH``: Fasta file containing reference sequences. If more
  than one file is passed via the ``referenceFileO`` parameter, a reference
  sequence for each file will be determined.

Published results
-----------------

- ``results/seqtyping/<sample id>``: Stores the results of seq_typing in
  text and tabular format.

Published reports
-----------------

None.

Default directives
------------------

- ``cpus``: 4
- ``memory``: 4GB
- ``container``: ummidock/seq_typing
- ``version``: 0.1.0-1

Advanced
--------

Reports JSON
^^^^^^^^^^^^

``typing``:
    - ``seqtyping``: <typing result>

================================================
FILE: docs/user/components/sistr.rst
================================================
sistr
=====

Purpose
-------

Sistr (Salmonella In Silico Typing Resource) is a software for Serovar
predictions from whole-genome sequence assemblies by determination
of antigen gene and cgMLST gene alleles using BLAST. Mash MinHash can also be
used for serovar prediction.

.. note::
    Software page: https://github.com/peterk87/sistr_cmd

Input/Output type
------------------

- Input type: ``Fasta``
- Output type: None

.. note::
    The default input parameter for fasta data is ``--fasta``.

Parameters
----------

None

Published results
-----------------

- ``results/typing/sistr``: Stores the results of sistr in a tab file

Published reports
-----------------

None.

Default directives
------------------

- ``cpus``: 4
- ``memory``: 4GB
- ``container``: ummidock/sistr_cmd
- ``version``: 1.0.2


================================================
FILE: docs/user/components/skesa.rst
================================================
skesa
=====

Purpose
-------

This component assembles paired-end FastQ files using the Skesa assembler.

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: ``Fasta``

.. note::
    The default input parameter for FastQ data is ``--fastq``. You can change
    the ``--fastq`` parameter default pattern (``fastq/*_{1,2}.*``) according
    to input file names (e.g.: ``--fastq "path/to/fastq/*R{1,2}.*"``).

Parameters
----------

None.

Published results
-----------------

- ``results/assembly/skesa``: Stores the fasta assemblies for each sample.

Published reports
-----------------

None.

Default directives
------------------

- ``cpus``: 4
- ``memory``: 5GB (dynamically increased on retry)
- ``container``: flowcraft/skesa
- ``version``: 2.3.0-1
- ``scratch``: true

Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.skesa`

================================================
FILE: docs/user/components/spades.rst
================================================
spades
======

Purpose
-------

This component assembles paired-end FastQ files using the Spades assembler.

.. note::
    Software page: http://bioinf.spbau.ru/spades

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: ``Fasta``

.. note::
    The default input parameter for FastQ data is ``--fastq``. You can change
    the ``--fastq`` parameter default pattern (``fastq/*_{1,2}.*``) according
    to input file names (e.g.: ``--fastq "path/to/fastq/*R{1,2}.*"``).

Parameters
----------

- ``spadesMinCoverage``: The minimum number of reads to consider an edge in
  the de Bruijn graph during the assembly
- ``spadesMinKmerCoverage``: Minimum contigs K-mer coverage. After assembly
  only keep contigs with reported k-mer coverage equal or above this value
- ``spadesKmers``: If 'auto' the SPAdes k-mer lengths will be determined
  from the maximum read length of each assembly. If 'default', SPAdes will
  use the default k-mer lengths.

Published results
-----------------

- ``results/assembly/spades``: Stores the fasta assemblies for each sample.

Published reports
-----------------

None.

Default directives
------------------

- ``cpus``: 4
- ``memory``: 5GB (dynamically increased on retry)
- ``container``: ummidock/spades
- ``version``: 3.13.0-1
- ``scratch``: true

Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.spades`


================================================
FILE: docs/user/components/trimmomatic.rst
================================================
trimmomatic
===========

Purpose
-------

This component runs Trimmomatic on paired-end FastQ files.

.. note::
    Software page: http://www.usadellab.org/cms/?page=trimmomatic

Input/Output type
------------------

- Input type: ``FastQ``
- Output type: ``FastQ``

.. note::
    The default input parameter for FastQ data is ``--fastq``. You can change
    the ``--fastq`` parameter default pattern (``fastq/*_{1,2}.*``) according
    to input file names (e.g.: ``--fastq "path/to/fastq/*R{1,2}.*"``).

Parameters
----------

- ``adapters``: Provide a non-default fasta file containing the adapter
  sequences used to filter the FastQ files.
- ``trimSlidingWindow``: Perform sliding window trimming, cutting once the
  average quality within the window falls below a threshold.
- ``trimLeading``: Cut bases off the start of a read, if below a threshold
  quality.
- ``trimTrailing``: Cut bases off the end of a read, if below a threshold
  quality.
- ``trimMinLength``: Drop the read if it is below a specified length.

Published results
-----------------

- ``results/trimmomatic``: The trimmed FastQ files for each sample.

Published reports
-----------------

- ``reports/fastqc``: Stores the FastQC HTML reports for each sample.
- ``reports/fastqc/run_2/``: Stores the summary text files with the category
  results of FastQC for each sample.

Default directives
------------------

- ``cpus``: 2
- ``memory``: 4GB (dynamically increased on retry)
- ``container``: ummidock/trimmomatic
- ``version``: 0.36-2


Advanced
--------

Template
^^^^^^^^

:mod:`flowcraft.templates.trimmomatic`
:mod:`flowcraft.templates.trimmomatic_report`

Reports JSON
^^^^^^^^^^^^

``tableRow``:
    ``Trimmed (%)``: Percentage of trimmed nucleotides
``plotData``:
    ``sparkline``: Number of nucleotides after trimming
``badReads``: Number of discarded reads

================================================
FILE: docs/user/pipeline_building.rst
================================================
Pipeline building
=================

FlowCraft offers a few extra features when building pipelines using the
``build`` execution mode.

.. _rawInput:

Raw input types
---------------

The first component (or components) you place at the start of the pipeline
determine the raw input type, and the parameter for providing input data.
The input type information is provided in the documentation page of each
component. For instance, if the first component is FastQC, which has an input
type of ``FastQ``, the parameter for providing the raw input data will be
``--fastq``. Here are the currently supported input types and their
respective parameters:

- ``FastQ``: ``--fastq``
- ``Fasta``: ``--fasta``
- ``Accessions``: ``--accessions``


.. _mergeParams:

Merge parameters
----------------

By default, parameters in a FlowCraft pipeline are unique and independent
between different components, even if the parameters have the same name and/or
the components are the same. This allows for the execution of the same software
using different parameters in a single workflow. The ``params.config`` of these
pipelines will look something like::

    params {
        /*
        Component 'trimmomatic_1_2'
        --------------------------
        */
        adapters_1_2 = 'None'
        trimSlidingWindow_1_2 = '5:20'
        trimLeading_1_2 = 3
        trimTrailing_1_2 = 3
        trimMinLength_1_2 = 55

        /*
        Component 'fastqc_1_3'
        ---------------------
        */
        adapters_1_3 = 'None'
    }

Notice that the ``adapters`` parameter occurs twice and can be independently set
in each component.

If you want to override this behaviour, FlowCraft has a ``--merge-params`` option
that merges all parameters with the same name in a single parameter, which is then
equally applied to all components. So, if we generate the pipeline above
with this option::

    flowcraft build -t "trimmomatic fastqc" -o pipe.nf --merge-params

Then, the ``params.config`` will become::

    params {
        adapters = 'None'
        trimSlidingWindow = '5:20'
        trimLeading = 3
        trimTrailing = 3
        trimMinLength = 55
    }

Forks
-----

The output of any component in a FlowCraft pipeline can be forked into
two or more components, using the following fork syntax::

    trimmomatic fastqc (spades | skesa)

.. image:: ../resources/fork_1.png
   :scale: 80 %
   :align: center

In this example, the output of ``fastqc`` will be forked into two new *lanes*,
which will proceed independently from each other. In this syntax, a fork is
triggered by the ``(`` symbol (and the corresponding closing ``)``) and each
lane will be separated by a ``|`` symbol. There is no limitation to the number
of forks or lanes that a pipeline has. For instance, we could add more
components after the ``skesa`` module, including another fork::

    trimmomatic fastqc (spades | skesa pilon (abricate | prokka | chewbbaca))

.. image:: ../resources/fork_2.png
   :scale: 80 %
   :align: center

In this example, data will be forked after ``fastqc`` into two new lanes,
processed by ``spades`` and ``skesa``. In the skesa lane, data will continue
to flow into the ``pilon`` component and its output will fork into three new
lanes.

It is also possible to start a fork at the beginning of the pipeline, which
basically means that the pipeline will have multiple starting points. If we
want to provide the raw input to multiple processes, the fork syntax can start
at the beginning of the pipeline::

    (seq_typing | trimmomatic fastqc (spades | skesa))

.. image:: ../resources/fork_3.png
   :scale: 80 %
   :align: center

In this case, since both initial components (``seq_typing`` and
``integrity_coverage``) receive fastq files as input, the data provided
via the ``--fastq`` parameter will be forked and provided to both processes.

.. note::
    Some components have dependencies which need to be included previously
    in the pipeline. For instance, ``trimmomatic`` requires
    ``integrity_coverage`` and ``pilon`` requires ``assembly_mapping``. By
    default, FlowCraft will insert any missing dependencies right before
    the process, which is why these components appear in the figures above.

.. warning::
    Pay special attention to the syntax of the pipeline string when using
    forks. However, when unable to parse it, FlowCraft will do its best
    to inform you where the parsing error occurred.

Directives
----------

Several directives with information on cpu usage, RAM, version, etc. can be
specified for each individual component when building the pipeline using the
``={}`` notation. These
directives are written to the ``resources.config`` and
``containers.config`` files that are generated in the pipeline directory. You
can pass any of the directives already supported by nextflow (https://www.nextflow.io/docs/latest/process.html#directives),
but the most commonly used include:

    - ``cpus``
    - ``memory``
    - ``queue``

In addition, you can also pass the ``container`` and ``version`` directives
which are parsed by FlowCraft to dynamically change the container and/or
version tag of any process.

Here is an example where we specify cpu usage, allocated memory and container
version in the pipeline string::

    flowcraft build -t "fastqc={'version':'0.11.5'} \
                            trimmomatic={'cpus':'2'} \
                            spades={'memory':'\'10GB\''}" -o my_pipeline.nf

When a directive is not specified, it will assume the default value of the
nextflow directive.

.. warning::
    Take special care not to include any white space characters inside the
    directives field. Common mistakes occur when specifying directives like
    ``fastqc={'version': '0.11.5'}``.

.. note::
    The values specified in these directives are placed in the
    respective config files exactly as they are. For instance,
    ``spades={'memory':'10GB'}`` will appear in the config as
    ``spades.memory = 10GB``, which will raise an error in nextflow because
    ``10GB`` should be a string. Therefore, if you want a string you'll need to add
    the ``'`` as in this example: ``spades={'memory':'\'10GB\''}``. The
    reason why these directives are not automatically converted is to allow
    the specification of dynamic computing resources, such as
    ``spades={'memory':'{10.Gb*task.attempt}'}``.

Extra inputs
------------

By default, only the first process (or processes) in a pipeline will receive
the raw input data provided by the user. However, the ``extra_input`` special
directive allows one or more processes to receive input from an additional parameter
that is provided by the user::

    reads_download integrity_coverage={'extra_input':'local'} trimmomatic spades

The default main input of this pipeline is a text file with accession numbers
for the ``reads_download`` component. The ``extra_input`` creates
a new parameter, named ``local`` in this example, that allows us to provide
additional input data to the ``integrity_coverage`` component directly::

    nextflow run pipe.nf --accessions accession_list.txt --local "fastq/*_{1,2}.*"

What will happen in this pipeline, is that the fastq files provided to the
``integrity_coverage`` component will be mixed with the ones provided by the
``reads_download`` component. Therefore, if we provide 10 accessions and 10
fastq samples, we'll end up with 20 samples being processed by the end of the
pipeline.

**It is important to note that the extra input parameter expects data
compliant with the input type of the process.** If files other than fastq files
were provided in the pipeline above, this would result in a pipeline error.

If the ``extra_input`` directive is used on a component that has a different
input type from the first component in the pipeline, it is possible to use
the ``default`` value::

    trimmomatic spades abricate={'extra_input':'default'}

In this case, the input type of the first component is fastq and the input
type of ``abricate`` is fasta. The ``default`` value will make available the
default parameter for fasta raw input, which is ``fasta``::

    nextflow run pipe.nf --fastq "fastq/*_{1,2}.*" --fasta "fasta/*.fasta"

Pipeline file
-------------

Instead of providing the pipeline components via the command line, you can
specify them in a text file::

    # my_pipe.txt
    trimmomatic fastqc spades

And then provide the pipeline file to the ``-t`` parameter::

    flowcraft build -t my_pipe.txt -o my_pipe.nf

Pipeline files are usually more readable, particularly when they become more
complex. Consider the following example::

    integrity_coverage (
        spades={'memory':'\'50GB\''} |
        skesa={'memory':'\'40GB\'','cpus':'4'} |
        trimmomatic fastqc (
            spades pilon (abricate={'extra_input':'default'} | prokka) |
            skesa pilon (abricate | prokka)
        )
    )

In addition to being more readable, it is also easier to edit, re-use and share.


================================================
FILE: docs/user/pipeline_configuration.rst
================================================
Pipeline configuration
======================

When a nextflow pipeline is built with FlowCraft, a number of configuration
files are automatically generated in the same directory. They are all imported
at the end of the ``nextflow.config`` file and are sorted by their configuration
role. All configuration files are overwritten if you build another pipeline
in the same directory, with the exception of the ``user.config`` file, which
is meant to be a persistent configuration file.

Parameters
----------

The ``params.config`` file includes all available parameters for the pipeline
and their respective default values. Most of these parameters already contain
sensible defaults.

Resources
---------

The ``resources.config`` file includes the majority of the directives provided
for each process, including ``cpus`` and ``memory``. You'll note that each
process name has a suffix like ``_1_1``, which is a unique process identifier
composed of ``<lane>_<process_number>``. This ensures that even when the same
component is specified multiple times in a pipeline, you'll still be able to
set directives for each one individually.

Containers
----------

The ``containers.config`` file includes the container directive for each
process in the pipeline. These containers are retrieved from dockerhub, if they
do not exist locally yet. You can change the container string to any other
value, but it should point to an image that exists on dockerhub or locally.

Profiles
--------

The ``profiles.config`` file includes a set of pre-made profiles with all
possible combinations of executors and container engines. You can add new ones
or modify existing ones.

User configurations
-------------------

The ``user.config`` file is a configuration file that is not overwritten when a
new pipeline is built in the same directory. It can contain any configuration
that is supported by nextflow and will overwrite all other configuration files.

================================================
FILE: docs/user/pipeline_inspect.rst
================================================
Pipeline inspection
===================

FlowCraft offers an ``inspect`` mode for tracking the progress of a nextflow
pipeline either directly in a terminal (``overview``) or by broadcasting information to
the `flowcraft web application <https://github.com/assemblerflow/flowcraft-webapp>`_
(``broadcast``).

.. note::
    This mode was designed for nextflow pipelines generated by FlowCraft. It should
    be possible to inspect any nextflow pipeline, provided that the requirements
    below are met, but compatibility is not guaranteed.

**How it works:** Simply run ``flowcraft inspect -m <mode>`` in the directory
where the pipeline is running. In either run mode, FlowCraft will keep running
(until you cancel it) and continuously update the progress of a pipeline. If
the pipeline is interrupted or fails for some reason, FlowCraft should be able
to correctly reset the inspection automatically when resuming its execution.

Requirements for inspect
------------------------

While the ``inspect`` mode is running, it will parse the information written
into two files that are generated by nextflow:

- ``.nextflow.log``: The log file that is automatically generated by nextflow.
- ``trace file``: The trace file that is generated by nextflow when using the
  ``-with-trace`` option. By default, it searches for the ``pipeline_stats.txt`` file,
  but this can be changed using the ``-i`` option.

Trace fields
------------

FlowCraft parses several fields of the trace file, but only a few are mandatory
for its execution. If the trace file does not contain any of the optional fields,
that information will simply not appear on the terminal or web app. Nevertheless, to take
full advantage of the inspect mode, the following trace fields should be present:

- **Mandatory:**
    - ``tag``: The tag of the nextflow process. Flowcraft assumes that this is a string
      with only the sample name (e.g.: *SampleA*). While this is not strictly required,
      providing strings with other information (e.g.: *Running bowtie for sampleA*)
      may result in some inconsistencies in the inspection.
    - ``task_id``: The task ID is used to skip entries that have already been parsed.
- **Optional:**
    - ``hash``: Used to get the work directory of the process execution.
    - ``cpus``, ``%cpu``, ``memory``, ``rss``, ``rchar`` and ``wchar``: Used for statistics
      of computational resources.

.. note::
    Any additional fields present in the trace file are ignored.

Usage
-----

::

    flowcraft inspect --help
    usage: flowcraft inspect [-h] [-i TRACE_FILE] [-r REFRESH_RATE]
                             [-m {overview,broadcast}] [-u URL] [--pretty]

    optional arguments:
      -h, --help            show this help message and exit
      -i TRACE_FILE         Specify the nextflow trace file.
      -r REFRESH_RATE       Set the refresh frequency for the continuous inspect
                            functions
      -m {overview,broadcast}, --mode {overview,broadcast}
                            Specify the inspection run mode.
      -u URL, --url URL     Specify the URL to where the data should be broadcast
      --pretty              Pretty inspection mode that removes usual reporting
                            processes.

- ``-i``: Used to specify the path to the trace file that should be parsed. By
  default, FlowCraft will try to parse the ``pipeline_stats.txt`` file in current
  working directory.
- ``-r``: Sets the time interval in seconds between each parsing of the
  relevant nextflow files. By default it is set to ``0.01``.
- ``-m``: The inspection mode. ``overview`` is the terminal display while
  ``broadcast`` sends the data to FlowCraft's web service.
- ``-u``: The URL of FlowCraft's web service. By default it is already set to the
  main service and you do not need to specify it. It is only useful when the service
  is running on local host or in other custom instance.
- ``--pretty``: By default the inspection shows the progress of all processes in
  the pipeline. Using this option filters the processes to the most relevant ones
  of FlowCraft's pipelines.


================================================
FILE: docs/user/pipeline_reports.rst
================================================
Pipeline reports
================

.. include:: reports/abricate.rst
.. include:: reports/assembly_mapping.rst
.. include:: reports/check_coverage.rst
.. include:: reports/chewbbaca.rst
.. include:: reports/dengue_typing.rst
.. include:: reports/fastqc.rst
.. include:: reports/fastqc_trimmomatic.rst
.. include:: reports/integrity_coverage.rst
.. include:: reports/mash_dist.rst
.. include:: reports/mlst.rst
.. include:: reports/patho_typing.rst
.. include:: reports/pilon.rst
.. include:: reports/process_mapping.rst
.. include:: reports/process_newick.rst
.. include:: reports/process_skesa.rst
.. include:: reports/process_spades.rst
.. include:: reports/process_viral_assembly.rst
.. include:: reports/seq_typing.rst
.. include:: reports/sistr.rst
.. include:: reports/trimmomatic.rst
.. include:: reports/true_coverage.rst


================================================
FILE: docs/user/reports/abricate.rst
================================================
abricate
--------

Table data
^^^^^^^^^^

AMR table:
    - **<abricate database>**: Number of hits for a particular given database

.. image:: ../resources/reports/abricate_table.png
    :align: center

Plot data
^^^^^^^^^

- **Sliding window AMR annotation**: Provides annotation of Abricate hits for
  each database along the genome. This report component is only available when
  the ``pilon`` component was used downstream of ``abricate``.

.. image:: ../resources/reports/sliding_window_amr.png

================================================
FILE: docs/user/reports/assembly_mapping.rst
================================================
assembly_mapping
----------------

Plot data
^^^^^^^^^

- **Data loss chart**: Gives a trend of the data loss
  (in total number of base pairs) across components that may filter this data.

.. image:: ../resources/reports/sparkline.png

Warnings
^^^^^^^^

Assembly table:
    - When the number of contigs exceeds the threshold of 100 contigs per 1.5Mb.

Fails
^^^^^

Assembly table:
    - When the assembly size is smaller than 80% or larger than 150% of the
      expected genome size.

================================================
FILE: docs/user/reports/check_coverage.rst
================================================
check_coverage
--------------

Table data
^^^^^^^^^^

Quality control table:
    - **Coverage**: Estimated coverage based on the number of base pairs and the expected
      genome size.

.. image:: ../resources/reports/quality_control_table.png
    :align: center

Warnings
^^^^^^^^

Quality control table:
    - When the encoding and phred score cannot be guessed from the FastQ file(s).

Fails
^^^^^

Quality control table:
    - When the sample has lower estimated coverage than the provided coverage threshold.

================================================
FILE: docs/user/reports/chewbbaca.rst
================================================
chewbbaca
---------

Table data
^^^^^^^^^^

Chewbbaca table:
    - Table with the summary statistics of ChewBBACA allele calling, including
      the number of exact matches, inferred loci, loci not found, etc.

.. image:: ../resources/reports/chewbbaca_table.png
    :align: center

================================================
FILE: docs/user/reports/dengue_typing.rst
================================================
dengue_typing
-------------

Table data
^^^^^^^^^^

Typing table:
    - **seqtyping**: The sequence typing result (serotype-genotype).

.. image:: ../resources/reports/typing_table_dengue.png
    :align: center

================================================
FILE: docs/user/reports/fastqc.rst
================================================
fastqc
------

Plot data
^^^^^^^^^

- **Base sequence quality**: The average quality score across the read length.

.. image:: ../resources/reports/fastqc_base_sequence_quality.png

- **Sequence quality**: Distribution of the mean sequence quality score.

.. image:: ../resources/reports/fastqc_per_base_sequence_quality.png

- **Base GC content**: Distribution of the GC content of each sequence.

.. image:: ../resources/reports/fastqc_base_gc_content.png

- **Sequence length**: Distribution of the read sequence length.

.. image:: ../resources/reports/fastqc_sequence_length.png

- **Missing data**: Normalized count of missing data across the read length.

.. image:: ../resources/reports/fastqc_missing_data.png


Warnings
^^^^^^^^

The following FastQC categories will issue a warning when they have a ``WARN`` flag:
    - Per base sequence quality.
    - Overrepresented sequences.

The following FastQC categories will issue a warning when they do not have a ``PASS`` flag:
    - Per base sequence content.

Fails
^^^^^

The following FastQC categories will issue a fail when they have a ``FAIL`` flag:
    - Per base sequence quality.
    - Overrepresented sequences.
    - Sequence length distribution.
    - Per sequence GC content.

The following FastQC categories will issue a fail when they do not have a ``PASS`` flag:
    - Per base N content.
    - Adapter content.


================================================
FILE: docs/user/reports/fastqc_trimmomatic.rst
================================================
fastqc_trimmomatic
------------------

Table data
^^^^^^^^^^

Quality control table:
    - **Trimmed (%)**: Percentage of trimmed base pairs.

.. image:: ../resources/reports/quality_control_table.png
    :scale: 80 %
    :align: center

Plot data
^^^^^^^^^

- **Data loss chart**: Gives a trend of the data loss
  (in total number of base pairs) across components that may filter this data.

.. image:: ../resources/reports/sparkline.png


================================================
FILE: docs/user/reports/integrity_coverage.rst
================================================
integrity_coverage
------------------

Table data
^^^^^^^^^^

Quality control table:
    - **Raw BP**: Number of raw base pairs from the FastQ file(s).
    - **Reads**: Number of reads in the FastQ file(s)
    - **Coverage**: Estimated coverage based on the number of base pairs and the expected
      genome size.

.. image:: ../resources/reports/quality_control_table.png
    :align: center

Plot data
^^^^^^^^^

- **Data loss chart**: Gives a trend of the data loss
  (in total number of base pairs) across components that may filter this data.

.. image:: ../resources/reports/sparkline.png

Warnings
^^^^^^^^

Quality control table:
    - When the encoding and phred score cannot be guessed from the FastQ file(s).

Fails
^^^^^

Quality control table:
    - When the sample has lower estimated coverage than the provided coverage threshold.

================================================
FILE: docs/user/reports/mash_dist.rst
================================================
mash_dist
---------

Table data
^^^^^^^^^^

Plasmids table:
    - **Mash Dist**: Number of plasmid hits

.. image:: ../resources/reports/mash_dist_table.png
    :align: center

Plot data
^^^^^^^^^

- **Sliding window Plasmid annotation**: Provides annotation of plasmid
  hits along the genome assembly. This report component is only available
  when the ``mash_dist`` component is used.

.. image:: ../resources/reports/sliding_window_mash_dist.png

================================================
FILE: docs/user/reports/maxbin2.rst
================================================
maxbin2
-------

Table data
^^^^^^^^^^

Metagenomic Binning (sample specific):
    - **Bin name**: The number of bin.
    - **Completness**: Estimation of completion of genome in bin (% of Single copy genes present)
    - **Genome size**: Total size of the bin
    - **GC content**: Percentage of GC in the bin

.. image:: ../resources/reports/binning.png
    :scale: 80 %
    :align: center

================================================
FILE: docs/user/reports/mlst.rst
================================================
mlst
----

Table data
^^^^^^^^^^

Typing table:
    - **MLST species**: The inferred species name.
    - **MLST ST**: The inferred sequence type.

.. image:: ../resources/reports/typing_table.png
    :scale: 80 %
    :align: center

================================================
FILE: docs/user/reports/patho_typing.rst
================================================
patho_typing
------------

Table data
^^^^^^^^^^

Typing table:
    - **Patho_typing**: The pathotyping result.

.. image:: ../resources/reports/typing_table.png
    :scale: 80 %
    :align: center

================================================
FILE: docs/user/reports/pilon.rst
================================================
pilon
-----

Table data
^^^^^^^^^^

Quality control table:
    - **Contigs**: Number of assembled contigs.
    - **Assembled BP**: Total number of assembled base pairs.

.. image:: ../resources/reports/assembly_table_skesa.png
    :scale: 80 %
    :align: center

Plot data
^^^^^^^^^

- **Contig size distribution**: Distribution of the size of each assembled contig.

.. image:: ../resources/reports/contig_size_distribution.png

- **Sliding window coverage and GC content**: Provides coverage and GC content
  metrics along the genome using a sliding window approach and two synchronised
  charts.

.. image:: ../resources/reports/sliding_window_amr.png

Warnings
^^^^^^^^

Quality control table:
    - When the encoding and phred score cannot be guessed from the FastQ file(s).

Fails
^^^^^

Quality control table:
    - When the sample has lower estimated coverage than the provided coverage threshold.

================================================
FILE: docs/user/reports/process_mapping.rst
================================================
process_mapping
---------------

Table data
^^^^^^^^^^

Read mapping table:
    - **Reads**: Number of reads in the FastQ file(s).
    - **Unmapped**: Number of unmapped reads
    - **Mapped 1x**: Number of reads that aligned, concordantly and discordantly, exactly 1 time
    - **Mapped >1x**: Number of reads that aligned, concordantly or discordantly, more than 1 time
    - **Overall alignment rate (%)**: Overall alignment rate

.. image:: ../resources/reports/read_mapping_remove_host.png
    :align: center


================================================
FILE: docs/user/reports/process_newick.rst
================================================
process_newick
--------------

Tree data
^^^^^^^^^^

Phylogenetic reconstruction with bootstrap values for the provided tree.


.. image:: ../resources/reports/phylogenetic_tree.png
    :align: center

================================================
FILE: docs/user/reports/process_skesa.rst
================================================
process_skesa
-------------

Table data
^^^^^^^^^^

Quality control table:
    - **Contigs (skesa)**: Number of assembled contigs.
    - **Assembled BP**: Total number of assembled base pairs.

.. image:: ../resources/reports/assembly_table_skesa.png
    :scale: 80 %
    :align: center

Warnings
^^^^^^^^

Assembly table:
    - When the number of contigs exceeds the threshold of 100 contigs per 1.5Mb.

Fails
^^^^^

Assembly table:
    - When the assembly size is smaller than 80% or larger than 150% of the
      expected genome size.


================================================
FILE: docs/user/reports/process_spades.rst
================================================
process_spades
--------------

Table data
^^^^^^^^^^

Quality control table:
    - **Contigs (spades)**: Number of assembled contigs.
    - **Assembled BP**: Total number of assembled base pairs.

.. image:: ../resources/reports/assembly_table_spades.png
    :scale: 80 %
    :align: center

Warnings
^^^^^^^^

Assembly table:
    - When the number of contigs exceeds the threshold of 100 contigs per 1.5Mb.

Fails
^^^^^

Assembly table:
    - When the assembly size is smaller than 80% or larger than 150% of the
      expected genome size.


================================================
FILE: docs/user/reports/process_viral_assembly.rst
================================================
process_viral_assembly
----------------------

Table data
^^^^^^^^^^

Quality control table:
    - **Contigs (SPAdes)**: Number of assembled contigs.
    - **Assembled BP (SPAdes)**: Total number of assembled base pairs.
    - **ORFs**: Number of complete ORFs in the assembly.
    - **Contigs (MEGAHIT)**: Number of assembled contigs.
    - **Assembled BP (MEGAHIT)**: Total number of assembled base pairs.


.. image:: ../resources/reports/assembly_table_viral_assembly.png
    :align: center

Fails
^^^^^

Assembly table:
    - When the assembly size is smaller than 80% or larger than 150% of the
      expected genome size.


================================================
FILE: docs/user/reports/seq_typing.rst
================================================
seq_typing
----------

Table data
^^^^^^^^^^

Typing table:
    - **seqtyping**: The sequence typing result.

.. image:: ../resources/reports/typing_table.png
    :align: center

================================================
FILE: docs/user/reports/sistr.rst
================================================
sistr
-----

Table data
^^^^^^^^^^

Typing table:
    - **sistr**: The sequence typing result.

.. image:: ../resources/reports/typing_table.png
    :align: center

================================================
FILE: docs/user/reports/trimmomatic.rst
================================================
trimmomatic
-----------

Table data
^^^^^^^^^^

Quality control table:
    - **Trimmed (%)**: Percentage of trimmed base pairs.

.. image:: ../resources/reports/quality_control_table.png
    :align: center

Plot data
^^^^^^^^^

- **Data loss chart**: Gives a trend of the data loss
  (in total number of base pairs) across components that may filter this data.

.. image:: ../resources/reports/sparkline.png


================================================
FILE: docs/user/reports/true_coverage.rst
================================================
true_coverage
-------------

Table data
^^^^^^^^^^

Quality control table:
    - **True Coverage**: Estimated coverage based on read mapping on MLST genes.

.. image:: ../resources/reports/quality_control_table.png
    :align: center

Fails
^^^^^

Quality control table:
    - When the sample has lower estimated coverage than the provided coverage threshold.

================================================
FILE: flowcraft/__init__.py
================================================

# Package metadata exposed as module-level dunder attributes.
__version__ = "1.4.2"
# NOTE(review): looks like a DDMMYYYY build date -- confirm.
__build__ = "18062019"
__author__ = "Diogo N. Silva, Tiago F. Jesus, Ines Mendes, Bruno Ribeiro-Goncalves"
__copyright__ = "Diogo N. Silva"
__license__ = "GPL3"
__maintainer__ = "Diogo N. Silva"
__email__ = "o.diogosilva@gmail.com"

================================================
FILE: flowcraft/bin/final_POST.sh
================================================
#!/usr/bin/env sh

# Positional arguments used below:
#   $1 project id, $2 pipeline id, $3 process id, $4 URL to POST to.

# Status string read from the .status file in the current directory.
st=$(cat $(pwd)/.status)

# Payload built with single-quoted keys/values; the warning and log entries
# carry file paths in the current directory, not the file contents.
json="{'project_id':'$1','pipeline_id':'$2','process_id':'$3','run_info':'None','run_output':'None','warnings':'$(pwd)/.warning','log_file':'$(pwd)/.command.log','status':'$st','type':'output'}"

# Send the payload; a curl failure only prints a message, so the script
# itself does not fail because of it.
# NOTE(review): \"$json\" adds literal double quotes around the payload and
# leaves the expansion unquoted for the shell -- confirm the receiving
# service expects this form.
{
    curl -H  "Content-Type: application/json" -L -X POST -d \"$json\" $4 > /dev/null
} || {
    echo Curl request failed
}

================================================
FILE: flowcraft/bin/merge_json.py
================================================
#!/usr/bin/env python3

import sys
import json

# Positional command-line arguments: the core genes file followed by the two
# JSON files to merge. Fewer than three arguments makes this unpacking raise
# ValueError.
core_file, f1, f2 = sys.argv[1:4]

# Optional fourth argument: a sample identifier; None when it is absent.
try:
    sample_id = sys.argv[4]
except IndexError:
    sample_id = None


def get_core_genes(core_file):
    """Read the entries listed in ``core_file``.

    The first line of the file is skipped. Every remaining line is stripped
    of surrounding whitespace and kept only if something is left.

    Parameters
    ----------
    core_file : str
        Path to a text file with one entry per line.

    Returns
    -------
    list
        Stripped, non-empty lines after the first one, in file order.
    """

    with open(core_file) as handle:
        lines = handle.readlines()

    genes = []
    # lines[0] is skipped on purpose; only the following lines carry entries
    for raw_line in lines[1:]:
        name = raw_line.strip()
        if name != "":
            genes.append(name)

    return genes


def filter_core_genes(locus_array, info_array, core_genes):
    """Keep the ``locus_array`` entries whose paired name is a core gene.

    ``info_array`` and ``locus_array`` are walked in lockstep (stopping at
    the shorter of the two). The element taken from ``info_array`` is the
    name tested against ``core_genes``; the element taken from
    ``locus_array`` is what gets kept.

    Parameters
    ----------
    locus_array : list
        Values to filter, aligned position by position with ``info_array``.
    info_array : list
        Names used for the membership test. They must be hashable.
    core_genes : iterable
        Names to keep. They must be hashable.

    Returns
    -------
    list
        Entries of ``locus_array`` whose name is in ``core_genes``, in their
        original order.
    """

    # Build the lookup once so each membership test is O(1) instead of a
    # scan over the whole core gene list.
    core_lookup = frozenset(core_genes)

    return [info for gene, info in zip(info_array, locus_array)
            if gene in core_lookup]


def assess_quality(core_array, core_genes):
    """Classify a sample by its fraction of missing loci.

    The status is also written to a ``.status`` file in the current working
    directory.

    Parameters
    ----------
    core_array : list
        Per-locus results; occurrences of the missing-locus symbols are
        counted in it.
    core_genes : list
        Core gene list; its length is the denominator of the fraction.

    Returns
    -------
    tuple
        ``(status, perc)`` where ``status`` is ``"fail"``, ``"warning"`` or
        ``"pass"`` and ``perc`` is the fraction of missing loci.
    """

    # Every symbol below marks a locus as missing; their counts are
    # aggregated into a single total.
    missing_symbols = ("LNF", "PLOT3", "PLOT5", "NIPH", "ALM", "ASM")
    n_missing = 0
    for symbol in missing_symbols:
        n_missing += core_array.count(symbol)

    perc = float(n_missing) / float(len(core_genes))

    # More than 2% missing loci fails the sample; more than 0.3% warns.
    if perc > 0.02:
        status = "fail"
    elif perc > 0.003:
        status = "warning"
    else:
        status = "pass"

    with open(".status", "w") as status_fh:
        status_fh.write(status)

    return status, perc


def get_table_data(data_obj, sample_id=None):
    """Convert a header/rows mapping into a list of table row entries.

    Parameters
    ----------
    data_obj : dict
        Mapping with a ``"header"`` key holding the column names; every
        other key is a sample name mapped to its list of values.
    sample_id : str, optional
        When truthy, it replaces the sample name of every row.

    Returns
    -------
    list
        One dict per sample with the keys ``"sample"`` and ``"data"``, where
        ``"data"`` is a list of ``{"header", "value", "table"}`` dicts.
    """

    # Column position -> column name
    column_names = dict(enumerate(data_obj["header"]))
    rows = []

    for sample, values in data_obj.items():

        # The header entry only describes the columns; it is not a sample
        if sample != "header":
            cells = [
                {
                    "header": column_names[idx],
                    "value": val,
                    "table": "chewbbaca"
                }
                for idx, val in enumerate(values)
            ]
            rows.append({
                "sample": sample_id if sample_id else sample,
                "data": cells
            })

    return rows


def main():
    """Merge the two input JSON files into a ``.report.json`` file.

    Relies on the module-level ``core_file``, ``f1``, ``f2`` and
    ``sample_id`` values taken from the command line. For every non-header
    entry of the first JSON, the results are restricted to the core genes
    and a status is computed with ``assess_quality``. The second JSON is
    turned into table rows with ``get_table_data``. Both raw JSON objects,
    the status list and the table rows are written, compactly serialized,
    to ``.report.json`` in the current working directory.
    """
    core_genes = get_core_genes(core_file)

    with open(f1) as f1h, open(f2) as f2h:
        j1 = json.load(f1h)
        j2 = json.load(f2h)

    sample_info = [(k, v) for k, v in j1.items() if "header" not in k]
    current_array = j1["header"]
    status_info = []

    # Bound before the loop so the table step below still works when the
    # first JSON has no sample entries (previously an UnboundLocalError).
    sample_name = sample_id

    for sample, info in sample_info:

        sample_name = sample_id if sample_id else sample

        core_results = filter_core_genes(info, current_array, core_genes)
        status, perc = assess_quality(core_results, core_genes)
        status_info.append({
            "sample": sample_name,
            "status": status,
            "lnfPercentage": perc
        })

    # The table rows are labelled with the name of the last processed sample
    table_data = get_table_data(j2, sample_name)
    res = {"cagao": [j1, j2], "status": status_info,
           "tableRow": table_data}

    with open(".report.json", "w") as fh:
        fh.write(json.dumps(res, separators=(",", ":")))


# Runs unconditionally: there is no ``__main__`` guard, so importing this
# module executes the merge as well.
main()


================================================
FILE: flowcraft/bin/metadata_POST.sh
================================================
#!/usr/bin/env sh

# -e: abort on the first failing command; -x: trace every command.
set -ex

# Positional arguments. Only sample, url, username, userid and species are
# used in the payload below; processid and task are assigned but not
# referenced again in this script.
projectid=$1
pipelineid=$2
processid=$3
sample=$4
url=$5
username=$6
userid=$7
task=$8
species=$9

# Default that means "no metadata available".
metadata_str="{}"

# If the .metadata.json file exists and is not empty, load it into
# metadata_str with spaces replaced by %20 and double quotes replaced by
# single quotes.
if [ -s .metadata.json ];
then
    metadata_str=$(cat $(pwd)/.metadata.json | sed 's/ /%20/g' | sed s/\"/\'/g)
fi

# Only send the request when metadata was actually loaded.
if [ ! "$metadata_str" = "{}" ];
then
    workdir=$(pwd)
    json="{'projectid':'$projectid','pipelineId':'$pipelineid','processId':'nfMetadata','sample_name':'$sample','nfMetadata':$metadata_str,'username':'$username','userId':'$userid','workdir':'$workdir','task':'nfMetadata','processName':'nfMetadata','species':'$species','overwrite':'false'}"
    # The payload is written to .final.json wrapped in literal double
    # quotes and then streamed to curl through stdin (-d @-).
    echo \"${json}\" > .final.json
    # A curl failure only prints a message; the || branch keeps "set -e"
    # from aborting the script.
    {
        cat .final.json | curl -H  "Content-Type: application/json" -k -L -X POST -d @- $url > /dev/null
    } || {
        echo Curl request failed
    }

fi


================================================
FILE: flowcraft/bin/parse_fasta.py
================================================
#!/usr/bin/env python3


import argparse
from itertools import groupby
import os


def replace_char(text):
    """Return ``text`` with a fixed set of special characters set to ``_``.

    The replaced characters are ``/ ` * { } [ ] ( ) # + - . ! $ :``; every
    other character is left untouched.
    """
    special = "/`*{}[]()#+-.!$:"
    # One translation table maps each special character to an underscore
    table = str.maketrans(special, "_" * len(special))
    return text.translate(table)

def getSequence(ref, fasta):
    """Write the FASTA record whose header equals ``ref`` to disk.

    ``fasta`` is consumed as alternating runs of header lines (first
    character ``>``) and sequence lines. For each header whose text, with
    the leading character dropped, surrounding whitespace stripped and any
    ``>`` removed, equals ``ref``, two files are written in the current
    working directory:

    - ``<name>.fa``, where ``<name>`` is ``ref`` with ``/`` replaced by
      ``_`` and cut at the first ``|``. It holds the header cleaned with
      ``replace_char`` and the upper-cased sequence.
    - ``header.txt``, holding the cleaned header only.

    Parameters
    ----------
    ref : str
        Header text of the record to extract.
    fasta : iterable of str
        Lines of the FASTA input, e.g. an open file handle.
    """

    runs = (group for _, group in groupby(fasta, lambda line: line[0] == ">"))

    for header_run in runs:
        header_text = next(header_run)[1:].strip()
        # The run right after a header run holds that record's sequence lines
        sequence = "".join(chunk.strip() for chunk in next(runs))

        if ref != header_text.replace('>', ''):
            continue

        out_base = os.path.join(os.getcwd(),
                                ref.replace('/', '_').split('|')[0])
        clean_header = replace_char(header_text)

        with open(out_base + '.fa', "w") as fasta_out:
            fasta_out.write(">" + clean_header + "\n" + sequence.upper() + "\n")

        with open("header.txt", "w") as header_out:
            header_out.write(clean_header)

def main():
    """Command-line entry point: extract one record from a FASTA file.

    Parses the ``-t`` (header of the sequence to retrieve) and ``-f``
    (FASTA file) options and hands them to ``getSequence``.
    """

    parser = argparse.ArgumentParser(
        prog='parse_fasta.py',
        description="Parse FASTA files for a specific header",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information',
                        action='version', version=str('%(prog)s v0.1'))

    parser_required = parser.add_argument_group('Required options')
    # The help text used to describe a fastq reads file, which is not what
    # this option takes.
    parser_required.add_argument(
        '-t', type=str, metavar='header of sequence to be retrieved',
        help='Header of the sequence to retrieve from the FASTA file',
        required=True)
    parser_required.add_argument(
        '-f', type=argparse.FileType('r'),
        metavar='/path/to/input/file.fasta',
        help='Fasta with the sequences', required=True)

    args = parser.parse_args()

    # argparse opened the FASTA handle; close it once it has been consumed
    try:
        getSequence(args.t, args.f)
    finally:
        args.f.close()


if __name__ == "__main__":
    main()

================================================
FILE: flowcraft/bin/parse_true_coverage.py
================================================
#!/usr/bin/env python

import sys
import json


def parse_true_coverage(report_json, fail_json=None):
    """Write a ``.report.json`` file from a true coverage report.

    Parameters
    ----------
    report_json : str
        Path to a JSON file that has a ``mean_sample_coverage`` key.
    fail_json : str, optional
        Path to a JSON object whose values are added as the fail reasons
        of the ``true_coverage`` process.
    """

    with open(report_json) as report_in:
        res = json.load(report_in)
    print("Report JSON: {}".format(res))

    with open(".report.json", "w") as out_fh:

        coverage_row = {
            "header": "True Coverage",
            "value": res["mean_sample_coverage"],
            "table": "assembly",
            "columnBar": True
        }
        payload = {"tableRow": [coverage_row]}

        if fail_json:
            with open(fail_json) as fail_in:
                fail = json.load(fail_in)
            print("Fail JSON: {}".format(fail))

            # Only the values of the fail object are reported
            payload["fail"] = {
                "process": "true_coverage",
                "value": list(fail.values())
            }

        out_fh.write(json.dumps(payload, separators=(",", ":")))


def main():
    """Read the report (and optional fail) JSON paths from the command line.

    The first argument is the report JSON path and is mandatory; the
    second one, when given, is the fail JSON path.
    """

    cli_args = sys.argv[1:]
    report_json = cli_args[0]
    fail_json = cli_args[1] if len(cli_args) > 1 else None

    print("Parsing report {} and fail {}".format(report_json, fail_json))

    parse_true_coverage(report_json, fail_json)


# Runs unconditionally: there is no ``__main__`` guard, so importing this
# module parses the command line and writes the report as well.
main()


================================================
FILE: flowcraft/bin/prepare_reports.py
================================================
#!/usr/bin/env python3

import sys
import json
import logging

from os.path import dirname, abspath

# Module logger, named under the "main" hierarchy. The functions in this
# script call the module-level ``logging`` functions (root logger) instead.
logger = logging.getLogger("main.{}".format(__name__))


def write_json(report_json, version_json, trace_file, task_name,
               project_name, sample_name, pid, script_id, run_name):
    """Compile the report, versions and trace of a task into one JSON file.

    The result is written, compactly serialized, to
    ``<task_name>_<sample_name>_report.json`` in the current working
    directory.

    Parameters
    ----------
    report_json : str
        Path to the report JSON. Single quotes are turned into double
        quotes before parsing and a ``"task"`` entry is dropped. An empty
        dict is used when the content cannot be parsed.
    version_json : str
        Path to the versions JSON, read with the same quote replacement.
        An empty list is used when the content cannot be parsed.
    trace_file : str
        Path to the trace file; its lines are stored as they are.
    task_name : str
        Stored as ``processName`` and used in the output file name.
    project_name : str
        Accepted but not used by this function.
    sample_name : str
        Stored as ``sampleName`` and used in the output file name.
    pid : str
        Stored as ``processId``.
    script_id : str
        Stored as ``scriptId``.
    run_name : str
        Stored as ``pipelineId``, ``projectid`` and ``runName``.
    """

    logging.info("Parsing report JSON")
    try:
        with open(report_json) as report_in:
            reports = json.loads(report_in.read().replace("'", '"'))
            if "task" in reports:
                del reports["task"]
    except json.JSONDecodeError:
        logging.warning("Could not parse report JSON: {}".format(report_json))
        reports = {}

    logging.info("Parsing versions JSON")
    try:
        with open(version_json) as version_in:
            versions = json.loads(version_in.read().replace("'", '"'))
    except json.JSONDecodeError:
        logging.warning("Could not parse versions JSON: {}".format(
            version_json))
        versions = []

    logging.info("Parsing trace file")
    with open(trace_file) as trace_in:
        trace = list(trace_in)

    # The report directory doubles as the work directory of the task
    workdir = dirname(abspath(report_json))

    report = {
        "pipelineId": run_name,
        "processId": pid,
        "processName": task_name,
        "projectid": run_name,
        "reportJson": reports,
        "runName": run_name,
        "scriptId": script_id,
        "versions": versions,
        "sampleName": sample_name,
        "trace": trace,
        "userId": 1,
        "username": "user",
        "workdir": workdir
    }

    logging.info("Dumping final report JSON file")
    logging.debug("Final JSON file: {}".format(report))
    out_name = "{}_{}_report.json".format(task_name, sample_name)
    with open(out_name, "w") as report_fh:
        report_fh.write(json.dumps(report, separators=(",", ":")))


def main():
    """Read the nine positional command line arguments and write the report.

    Expected order: report JSON, versions JSON, trace file, sample name,
    task name, project name, process ID, script ID and run name.
    """

    # Fetch arguments (indexing keeps the IndexError on missing arguments)
    cli = sys.argv[1:]
    (report_json, version_json, trace, sample_name, task_name,
     project_name, pid, script_id, run_name) = [cli[idx] for idx in range(9)]

    labelled = (
        ("Report JSON", report_json),
        ("Version JSON", version_json),
        ("Trace file", trace),
        ("Sample name", sample_name),
        ("Task name", task_name),
        ("Project name", project_name),
        ("Process ID", pid),
        ("Script ID", script_id),
        ("Run name", run_name),
    )
    for label, value in labelled:
        logging.debug("{}: {}".format(label, value))

    # Write the final report JSON that compiles all information
    write_json(report_json, version_json, trace, task_name,
               project_name, sample_name, pid, script_id, run_name)


# Invoked unconditionally at module level (there is no ``__main__`` guard),
# so the report is also written when this module is merely imported.
main()


================================================
FILE: flowcraft/bin/renamePE_samtoolsFASTQ.py
================================================
#!/usr/bin/env python2

#TODO - change to py3
# -*- coding: utf-8 -*-

"""
renamePE_samtoolsFASTQ.py - Rename the fastq headers with PE terminations
that were not include in samtools fastq command
<https://github.com/miguelpmachado/pythonScripts>
Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt>
Last modified: January 10, 2017
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import os
import sys
import time
import argparse
import itertools


version = '0.1'


def formartFastqHeaders(in_fastq_1, in_fastq_2, outdir):
	"""Append ``/1`` and ``/2`` to the headers of two paired fastq files.

	Both inputs are read in lockstep and copied to
	``<basename>.headersRenamed_1.fq`` / ``<basename>.headersRenamed_2.fq``
	inside *outdir*. The '+' separator line of mate 1 is written to both
	outputs. The run is aborted with ``sys.exit`` when two header lines differ.

	Parameters
	----------
	in_fastq_1 : str
		Path to the uncompressed fastq file with mate 1 reads.
	in_fastq_2 : str
		Path to the uncompressed fastq file with mate 2 reads.
	outdir : str
		Existing directory where the renamed files are written.

	Returns
	-------
	number_reads : int
		Number of read pairs written.
	outfiles : list of str
		Paths of the two output files (mate 1, mate 2).
	"""
	out_fastq_1 = os.path.join(outdir, os.path.splitext(os.path.basename(in_fastq_1))[0] + '.headersRenamed_1.fq')
	out_fastq_2 = os.path.join(outdir, os.path.splitext(os.path.basename(in_fastq_2))[0] + '.headersRenamed_2.fq')
	outfiles = [out_fastq_1, out_fastq_2]

	# itertools.izip only exists on Python 2; the builtin zip is already lazy on Python 3
	lazy_zip = getattr(itertools, 'izip', zip)
	# The 'U' flag is only needed (and only accepted by every version) on Python 2;
	# Python 3 text mode translates newlines by default
	read_mode = 'rtU' if sys.version_info[0] < 3 else 'rt'

	number_reads = 0
	# State of the 4-line fastq record: both flags are True between records
	plus_line = True
	quality_line = True

	# All four handles are managed by the same with-statement so the outputs are
	# flushed and closed on every exit path, including the sys.exit below
	with open(in_fastq_1, read_mode) as reader_in_fastq_1, \
			open(in_fastq_2, read_mode) as reader_in_fastq_2, \
			open(out_fastq_1, 'wt') as writer_in_fastq_1, \
			open(out_fastq_2, 'wt') as writer_in_fastq_2:
		for in_1, in_2 in lazy_zip(reader_in_fastq_1, reader_in_fastq_2):
			# Drop the line terminator, whatever its flavour
			in_1 = in_1.splitlines()[0]
			in_2 = in_2.splitlines()[0]
			if in_1.startswith('@') and plus_line and quality_line:
				# Header line: only recognised when the previous record is complete,
				# so quality strings starting with '@' are not mistaken for headers
				if in_1 != in_2:
					sys.exit('The PE fastq files are not aligned properly!')
				writer_in_fastq_1.write(in_1 + '/1' + '\n')
				writer_in_fastq_2.write(in_2 + '/2' + '\n')
				plus_line = False
				quality_line = False
			elif in_1.startswith('+') and not plus_line:
				# Separator line: mate 1's line is written to both files
				writer_in_fastq_1.write(in_1 + '\n')
				writer_in_fastq_2.write(in_1 + '\n')
				plus_line = True
			elif plus_line and not quality_line:
				# Quality line: closes the record
				writer_in_fastq_1.write(in_1 + '\n')
				writer_in_fastq_2.write(in_2 + '\n')
				number_reads += 1
				quality_line = True
			else:
				# Sequence line
				writer_in_fastq_1.write(in_1 + '\n')
				writer_in_fastq_2.write(in_2 + '\n')
	return number_reads, outfiles


def compressionType(file_to_test):
	"""Detect gzip or bzip2 compression from the first bytes of a file.

	Parameters
	----------
	file_to_test : str
		Path of the file to inspect.

	Returns
	-------
	list of str or None
		``['gzip', 'gunzip']`` or ``['bzip2', 'bunzip2']`` when the file starts
		with the corresponding magic number, ``None`` otherwise.
	"""
	# Magic numbers are raw bytes, so they are compared as bytes
	magic_dict = {b'\x1f\x8b\x08': ['gzip', 'gunzip'], b'\x42\x5a\x68': ['bzip2', 'bunzip2']}

	max_len = max(len(x) for x in magic_dict)

	# Binary mode: decoding compressed data as text fails on Python 3
	with open(file_to_test, 'rb') as reader:
		file_start = reader.read(max_len)

	for magic, filetype in magic_dict.items():
		if file_start.startswith(magic):
			return filetype
	return None


def runTime(start_time):
	"""Print the time elapsed since *start_time* and return it in seconds.

	Parameters
	----------
	start_time : float
		Reference timestamp, as returned by ``time.time()``.

	Returns
	-------
	float
		Elapsed time in seconds.
	"""
	end_time = time.time()
	time_taken = end_time - start_time
	hours, rest = divmod(time_taken, 3600)
	minutes, seconds = divmod(rest, 60)
	# Call form with a single argument prints the same text on Python 2 and 3
	print('Runtime :' + str(hours) + 'h:' + str(minutes) + 'm:' + str(round(seconds, 2)) + 's')
	return time_taken


def main():
	"""Command line entry point: rename the headers of a pair of fastq files.

	Parses ``-1/--fastq_1``, ``-2/--fastq_2`` and ``-o/--outdir``, aborts when
	either input looks compressed, creates the output directory when missing and
	writes the renamed fastq files into it.
	"""
	parser = argparse.ArgumentParser(prog='renamePE_samtoolsFASTQ.py', description='Rename the fastq headers with PE terminations that were not include in samtools fastq command', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))

	parser_required = parser.add_argument_group('Required options')
	parser_required.add_argument('-1', '--fastq_1', type=argparse.FileType('r'), metavar='/path/to/input/file_1.fq', help='Uncompressed fastq file containing mate 1 reads', required=True)
	parser_required.add_argument('-2', '--fastq_2', type=argparse.FileType('r'), metavar='/path/to/input/file_2.fq', help='Uncompressed fastq file containing mate 2 reads', required=True)

	parser_optional_general = parser.add_argument_group('General facultative options')
	parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/output/directory/', help='Path for output directory', required=False, default='.')

	args = parser.parse_args()

	# Single-argument print calls behave identically on Python 2 and Python 3
	print('\n' + 'STARTING renamePE_samtoolsFASTQ.py' + '\n')
	start_time = time.time()

	fastq_files = [os.path.abspath(args.fastq_1.name), os.path.abspath(args.fastq_2.name)]
	# argparse.FileType already opened both inputs; only their names are used,
	# so the handles are released right away
	args.fastq_1.close()
	args.fastq_2.close()

	print('Check if files are compressed' + '\n')
	for fastq in fastq_files:
		if compressionType(fastq) is not None:
			sys.exit('Compressed fastq files found')

	outdir = os.path.abspath(args.outdir)
	if not os.path.isdir(outdir):
		os.makedirs(outdir)

	print('Renaming fastq headers' + '\n')
	number_reads, outfiles = formartFastqHeaders(fastq_files[0], fastq_files[1], outdir)

	print('It was written ' + str(number_reads) + ' read pairs in ' + str(outfiles) + ' files' + '\n')

	print('\n' + 'END renamePE_samtoolsFASTQ.py')
	runTime(start_time)


if __name__ == "__main__":
	main()

================================================
FILE: flowcraft/bin/report_POST.sh
================================================
#!/usr/bin/env sh

# Collects the .report.json, .versions and .command.trace files of the current
# directory into a single-quoted JSON-like payload, stores it in .final.json
# and POSTs it to the URL given as fifth argument.

set -ex

# Positional arguments
projectid=$1
pipelineid=$2
processid=$3
sample=$4
url=$5
username=$6
userid=$7
task=$8
species=$9
overwrite=${10}

# Defaults used when the corresponding dotfile is missing or empty
json_str="{}"
version_str="[]"
trace_str=""

# If a .report.json file was populated, set the json_str variable
if [ -s .report.json ];
then

    # Modification of the JSON string should be different for chewbbaca
    # output: spaces are removed instead of being replaced by %20.
    # "$task" is quoted so that an empty or multi-word value is a plain
    # string comparison instead of a test syntax error.
    if [ "$task" = "chewbbaca" ];
    then
        json_str=$(< "$(pwd)/.report.json" sed 's/ //g' | sed s/\"/\'/g)
    else
        json_str=$(< "$(pwd)/.report.json" sed 's/ /%20/g' | sed s/\"/\'/g)
    fi
fi

# If a .versions file was populated, set the version_str variable
if [ -s .versions ];
then
    version_str=$(< "$(pwd)/.versions" sed 's/ /%20/g' | sed s/\"/\'/g)
fi

# If a .command.trace file was populated, join its lines with ";"
if [ -s .command.trace ];
then
    trace_str="$(< "$(pwd)/.command.trace" tr "\n" ";")"
fi

# If a .versions, .report.json OR .command.trace file was populated send the
# request
if [ ! "$json_str" = "{}" ] || [ ! "$version_str" = "[]" ] || [ ! "$trace_str" = "" ];
then
    workdir=$(pwd)
    json="{'projectid':'$projectid','pipelineId':'$pipelineid','processId':'$processid','sample_name':'$sample','reportJson':$json_str,'username':'$username','userId':'$userid','workdir':'$workdir','task':'$task','processName':'$task','species':'$species','versions':$version_str,'trace':'$trace_str', 'overwrite': '$overwrite'}"
    # NOTE(review): ${json} is expanded unquoted here, so runs of whitespace
    # in the payload are collapsed by the shell. Left untouched on purpose to
    # keep the payload sent to the server unchanged.
    echo \"${json}\" > .final.json
    # The braces keep a failed request from aborting the script under set -e.
    # -k disables TLS certificate verification.
    {
        cat .final.json | curl -H  "Content-Type: application/json" -k -L -X POST -d @- "$url" > /dev/null
    } || {
        echo Curl request failed
    }

fi


================================================
FILE: flowcraft/bin/set_dotfiles.sh
================================================
#!/usr/bin/env bash

# Create the dotfiles below in the current directory when they do not exist
# yet (existing files only get their timestamps updated by touch).
touch .status .warning .fail .report.json .versions

================================================
FILE: flowcraft/bin/startup_POST.sh
================================================
#!/usr/bin/env bash

# Sends a PUT request flagging a process as running.
#   $1 project id, $2 pipeline id, $3 process id, $4 target URL
# The payload also carries the path of the .command.log file of the current
# directory.

json="{'project_id':'$1','pipeline_id':'$2','process_id':'$3','run_property':'log_file,status','run_property_value':'$(pwd)/.command.log,running','type':'output'}"

# The payload keeps its literal surrounding double quotes, but is now passed
# as a single word even when the working directory or an id contains
# whitespace. The URL is quoted for the same reason and to avoid globbing.
# The braces keep a failed request from aborting the caller.
{
    curl -H  "Content-Type: application/json" -L -X PUT -d "\"$json\"" "$4" > /dev/null
} || {
    echo Curl request failed
}

================================================
FILE: flowcraft/flowcraft.py
================================================
#!/usr/bin/env python3

import os
import sys
import shutil
import logging
import argparse
import logging.config

from distutils.dir_util import copy_tree

from os.path import join, dirname

try:
    from __init__ import __version__, __build__
    from generator.engine import NextflowGenerator
    from generator.inspect import NextflowInspector
    from generator.report import FlowcraftReport
    from generator.process_collector import collect_process_map
    from generator.recipe import brew_innuendo, brew_recipe, list_recipes
    from generator.pipeline_parser import parse_pipeline, SanityError
    from generator.process_details import proc_collector, colored_print
    import generator.error_handling as eh
except ImportError as e:
    from flowcraft import __version__, __build__
    from flowcraft.generator.engine import NextflowGenerator
    from flowcraft.generator.inspect import NextflowInspector
    from flowcraft.generator.report import FlowcraftReport
    from flowcraft.generator.recipe import brew_innuendo, \
        brew_recipe, list_recipes
    from flowcraft.generator.pipeline_parser import parse_pipeline, \
        SanityError
    from flowcraft.generator.process_details import proc_collector, \
        colored_print
    import flowcraft.generator.error_handling as eh
    from flowcraft.generator.process_collector import collect_process_map

logger = logging.getLogger("main")


def get_args(args=None):
    """Build the flowcraft command line parser and parse the arguments.

    Three sub-commands are registered (``build``, ``inspect`` and
    ``report``) next to the general ``--debug`` and ``-v/--version`` options.

    Parameters
    ----------
    args : list of str, optional
        Argument vector to parse. When ``None`` (default) the process
        arguments (``sys.argv[1:]``) are parsed, and the help is printed
        before exiting with status 1 if none were given.

    Returns
    -------
    argparse.Namespace
        Parsed arguments. ``main_op`` holds the selected sub-command name.
    """

    parser = argparse.ArgumentParser(
        description="A Nextflow pipeline generator")

    subparsers = parser.add_subparsers(help="Select which mode to run",
                                       dest="main_op")

    # BUILD MODE
    build_parser = subparsers.add_parser("build",
                                         help="Build a nextflow pipeline")

    # The listing options cannot be combined with each other
    group_lists = build_parser.add_mutually_exclusive_group()

    build_parser.add_argument(
        "-t", "--tasks", type=str, dest="tasks",
        help="Space separated tasks of the pipeline")
    build_parser.add_argument(
        "-r", "--recipe", dest="recipe",
        help="Use one of the available recipes")
    build_parser.add_argument(
        "-o", dest="output_nf", help="Name of the pipeline file")
    build_parser.add_argument(
        "-n", dest="pipeline_name", default="flowcraft",
        help="Provide a name for your pipeline.")
    build_parser.add_argument(
        "--merge-params", dest="merge_params", action="store_true",
        help="Merges identical parameters from multiple components into the "
             "same one. Otherwise, the parameters will be separated and unique"
             " to each component.")
    build_parser.add_argument(
        "--pipeline-only", dest="pipeline_only", action="store_true",
        help="Write only the pipeline files and not the templates, bin, and"
             " lib folders.")
    # store_false: ``no_dep`` is True unless the flag is given
    build_parser.add_argument(
        "-nd", "--no-dependecy", dest="no_dep", action="store_false",
        help="Do not automatically add dependencies to the pipeline.")
    build_parser.add_argument(
        "-c", "--check-pipeline", dest="check_only", action="store_const",
        const=True, help="Check only the validity of the pipeline "
                         "string and exit.")
    group_lists.add_argument(
        "-L", "--component-list", action="store_const", dest="detailed_list",
        const=True, help="Print a detailed description for all the "
                         "currently available processes.")
    group_lists.add_argument(
        "-l", "--component-list-short", action="store_const", dest="short_list",
        const=True, help="Print a short list of the currently "
                         "available processes.")
    group_lists.add_argument(
        "--recipe-list", dest="recipe_list", action="store_const", const=True,
        help="Print a short list of the currently available recipes."
    )
    group_lists.add_argument(
        "--recipe-list-short", dest="recipe_list_short", action="store_const",
        const=True, help="Print a condensed list of the currently available "
                         "recipes"
    )
    build_parser.add_argument(
        "-cr", "--check-recipe", dest="check_recipe",
        action="store_const", const=True,
        help="Check tasks that the recipe contain and "
             "their flow. This option might be useful "
             "if a user wants to change some components "
             "of a given recipe, by using the -t option.")
    build_parser.add_argument(
        "--export-params", dest="export_params", action="store_const",
        const=True, help="Only export the parameters for the provided "
                         "components (via -t option) in JSON format to stdout. "
                         "No pipeline will be generated with this option."
    )
    build_parser.add_argument(
        "--export-directives", dest="export_directives", action="store_const",
        const=True, help="Only export the directives for the provided "
                         "components (via -t option) in JSON format to stdout. "
                         "No pipeline will be generated with this option."
    )
    build_parser.add_argument(
        "-ft", "--fetch-tags", dest="fetch_docker_tags",
        action="store_const", const=True, help="Allows to fetch all docker tags"
                                               " for the components listed with"
                                               " the -t flag."
    )

    # GENERAL OPTIONS
    parser.add_argument(
        "--debug", dest="debug", action="store_const", const=True,
        help="Set log to debug mode")
    parser.add_argument(
        "-v", "--version", dest="version", action="store_const", const=True,
        help="Show version and exit.")

    # INSPECT MODE
    inspect_parser = subparsers.add_parser("inspect",
                                           help="Inspect the progress of a "
                                                "pipeline execution")
    inspect_parser.add_argument(
        "-i", dest="trace_file", default="pipeline_stats.txt",
        help="Specify the nextflow trace file."
    )
    # type=float keeps user supplied values consistent with the float default
    # (without it, command line values would be passed on as strings)
    inspect_parser.add_argument(
        "-r", dest="refresh_rate", default=0.02, type=float,
        help="Set the refresh frequency for the continuous inspect functions"
    )
    inspect_parser.add_argument(
        "-m", "--mode", dest="mode", default="overview",
        choices=["overview", "broadcast"],
        help="Specify the inspection run mode."
    )
    inspect_parser.add_argument(
        "-u", "--url", dest="url", default="http://www.flowcraft.live:80/",
        help="Specify the URL to where the data should be broadcast"
    )
    inspect_parser.add_argument(
        "--pretty", dest="pretty", action="store_const", const=True,
        help="Pretty inspection mode that removes usual reporting processes."
    )

    # REPORT MODE
    reports_parser = subparsers.add_parser("report",
                                           help="Broadcast the report of "
                                                "a pipeline")
    reports_parser.add_argument(
        "-i", dest="report_file",
        default="pipeline_report/pipeline_report.json",
        help="Specify the path to the pipeline report JSON file."
    )
    reports_parser.add_argument(
        "-u", "--url", dest="url", default="http://www.flowcraft.live:80/",
        help="Specify the URL to where the data should be broadcast"
    )
    reports_parser.add_argument(
        "--trace-file", dest="trace_file", default="pipeline_stats.txt",
        help="Specify the nextflow trace file. Only applicable in combination "
             "with --watch option."
    )
    reports_parser.add_argument(
        "--log-file", dest="log_file", default=".nextflow.log",
        help="Specify the nextflow log file. Only applicable in combination "
             "with --watch option."
    )
    reports_parser.add_argument(
        "-w", "--watch", dest="watch",  action="store_const", const=True,
        help="Run the report in watch mode. This option will track the "
             "generation of reports during the execution of the pipeline, "
             "allowing for the visualization of the reports in real-time"
    )

    # The "no arguments -> print help" shortcut only applies when the process
    # arguments are the ones being parsed; an explicit ``args`` list must not
    # be discarded because of the state of sys.argv.
    if args is None and len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    return parser.parse_args(args)


def validate_build_arguments(args):
    """Validate the ``build`` mode arguments and resolve the output path.

    The process is terminated with exit status 1 (after logging an error)
    when a required option is missing or the output path is unusable.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed arguments of the ``build`` sub-command.

    Returns
    -------
    str or None
        Output pipeline path with surrounding whitespace removed and the
        ``.nf`` extension enforced, or ``None`` when no output file is needed
        or provided.
    """

    # Skip all checks when listing the processes
    if args.detailed_list or args.short_list:
        return

    # Skip all checks when exporting parameters AND providing at least one
    # component
    if args.export_params or args.export_directives or args.fetch_docker_tags:
        # Check if components provided
        if not args.tasks:
            logger.error(colored_print(
                "At least one component needs to be provided via the -t option"
                " when exporting parameters in JSON format", "red_bold"
            ))
            sys.exit(1)
        return

    # When none of the main run options is specified (the listing options
    # were already handled by the first check)
    if not (args.tasks or args.recipe or args.check_only):
        logger.error(colored_print(
            "At least one of these options is required: -t, -r, -c, "
            "-l, -L", "red_bold"))
        sys.exit(1)

    # When the build mode is active via tasks or recipe, but no output file
    # option has been provided
    if (args.tasks or args.recipe) and not args.check_recipe \
            and not args.output_nf:
        logger.error(colored_print(
            "Please provide the path and name of the pipeline file using the"
            " -o option.", "red_bold"))
        sys.exit(1)

    if not args.output_nf:
        return None

    # Normalize once, so the extension check and the final path agree
    # (a trailing blank used to turn "out.nf " into "out.nf.nf")
    output_nf = args.output_nf.strip()

    if not os.path.basename(output_nf):
        logger.error(colored_print(
            "Output pipeline path '{}' missing a name (only the directory "
            "path was provided)".format(args.output_nf), "red_bold"))
        sys.exit(1)

    if output_nf.endswith(".nf"):
        parsed_output_nf = output_nf
    else:
        parsed_output_nf = "{}.nf".format(output_nf)

    # A relative file name without directory part is written to the cwd
    parent_dir = os.path.dirname(parsed_output_nf)
    if parent_dir and not os.path.exists(parent_dir):
        logger.error(colored_print(
            "The provided directory '{}' does not exist.".format(
                parent_dir), "red_bold"))
        sys.exit(1)

    return parsed_output_nf


def copy_project(path):
    """Copy the files required to run a pipeline next to the pipeline file.

    The ``templates``, ``lib``, ``resources`` and ``bin`` folders and the
    ``profiles.config`` file located beside this module are copied into the
    directory that contains ``path``.

    Parameters
    ----------
    path : str
        Path of the pipeline file; only its directory part is used.

    Returns
    -------
    None
    """

    # Directory holding this module, which is where the sources live
    source_root = dirname(os.path.abspath(__file__))

    # Directory of the pipeline file, which receives the copies
    destination_root = dirname(path)

    # Templates, helper scripts, resources and bin scripts, in that order
    for folder in ("templates", "lib", "resources", "bin"):
        copy_tree(join(source_root, folder), join(destination_root, folder))

    # Static profiles file
    profiles = "profiles.config"
    shutil.copy(join(source_root, profiles),
                join(destination_root, profiles))


def build(args):
    """Run the ``build`` mode with the parsed command line arguments.

    Validates the arguments, resolves the pipeline string (from a recipe or
    from the ``-t`` option), parses it and then either exports the requested
    information or writes the pipeline file, copying the project files next
    to it unless ``--pipeline-only`` was given.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed arguments of the ``build`` sub-command.
    """

    # Disable standard logging for stdout when the following modes are
    #  executed:
    if args.export_params or args.export_directives or args.fetch_docker_tags:
        logger.setLevel(logging.ERROR)

    if args.recipe_list_short:
        list_recipes()

    if args.recipe_list:
        list_recipes(full=True)

    banner = "\n".join([
        "========= F L O W C R A F T =========",
        "Build mode",
        "version: {}".format(__version__),
        "build: {}".format(__build__),
        "=====================================",
    ])

    parsed_output_nf = validate_build_arguments(args)

    logger.info(colored_print(banner, "green_bold"))

    recipe_name = args.recipe
    if not recipe_name:
        # No recipe: the pipeline string comes straight from the -t option
        pipeline_string = args.tasks
    else:
        # Build the pipeline string based on the appropriate recipe
        if recipe_name == "innuendo":
            pipeline_string = brew_innuendo(args)
        else:
            pipeline_string = brew_recipe(recipe_name)
            if args.tasks:
                logger.warning(colored_print(
                    "-t parameter will be ignored for recipe: {}\n".format(
                        recipe_name), "yellow_bold")
                )

        if args.check_recipe:
            logger.info(colored_print("Pipeline string for recipe: {}"
                                      .format(recipe_name), "purple_bold"))
            logger.info(pipeline_string)
            sys.exit(0)

    process_map = collect_process_map()

    # used for lists print
    proc_collector(process_map, args, pipeline_string)

    try:
        logger.info(colored_print("Checking pipeline for errors..."))
        pipeline_list = parse_pipeline(pipeline_string)
    except SanityError as sanity_err:
        logger.error(colored_print(sanity_err.value, "red_bold"))
        sys.exit(1)
    logger.debug("Pipeline successfully parsed: {}".format(pipeline_list))

    # Exit if only the pipeline parser needs to be checked
    if args.check_only:
        sys.exit()

    generator = NextflowGenerator(process_connections=pipeline_list,
                                  nextflow_file=parsed_output_nf,
                                  process_map=process_map,
                                  pipeline_name=args.pipeline_name,
                                  auto_dependency=args.no_dep,
                                  merge_params=args.merge_params,
                                  export_params=args.export_params)

    logger.info(colored_print("Building your awesome pipeline..."))

    # Export modes print their information and stop before any file is built
    if args.export_params:
        generator.export_params()
        sys.exit(0)
    if args.export_directives:
        generator.export_directives()
        sys.exit(0)
    if args.fetch_docker_tags:
        generator.fetch_docker_tags()
        sys.exit(0)

    # building the actual pipeline nf file
    generator.build()

    # copy template to cwd, to allow for immediate execution
    if not args.pipeline_only:
        copy_project(parsed_output_nf)

    logger.info(colored_print("DONE!", "green_bold"))


def inspect(args):
    """Run the ``inspect`` mode with the parsed command line arguments.

    Displays the overview or broadcasts the status, depending on
    ``args.mode``. Inspection and log errors are logged before exiting with
    status 1.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed arguments of the ``inspect`` sub-command.
    """

    try:
        inspector = NextflowInspector(args.trace_file, args.refresh_rate,
                                      args.pretty, args.url)
        run_mode = args.mode
        if run_mode == "overview":
            inspector.display_overview()
        elif run_mode == "broadcast":
            inspector.broadcast_status()

    # Both error types are reported in the same way
    except (eh.InspectionError, eh.LogError) as err:
        logger.error(colored_print(err.value, "red_bold"))
        sys.exit(1)


def report(args):
    """Run the ``report`` mode with the parsed command line arguments.

    Broadcasts the pipeline report. Report and log errors are logged before
    exiting with status 1.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed arguments of the ``report`` sub-command.
    """

    report_options = {
        "report_file": args.report_file,
        "trace_file": args.trace_file,
        "log_file": args.log_file,
        "watch": args.watch,
        "ip_addr": args.url,
    }

    try:
        FlowcraftReport(**report_options).broadcast_report()

    # Both error types are reported in the same way
    except (eh.ReportError, eh.LogError) as err:
        logger.error(colored_print(err.value, "red_bold"))
        sys.exit(1)


def main():
    """Command line entry point of flowcraft.

    Parses the arguments, configures the ``main`` logger (verbose format in
    debug mode, message-only otherwise, both written to stdout) and
    dispatches to the selected mode (``build``, ``inspect`` or ``report``).
    """

    args = get_args()

    if args.version:
        print(__version__)
        # The option is documented as "Show version and exit.", so no mode
        # is run after printing the version
        return

    if args.debug:
        log_level = logging.DEBUG
        log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    else:
        log_level = logging.INFO
        # special formatter for info logs: only the message is shown
        log_format = '%(message)s'

    logger.setLevel(log_level)

    # create console handler and set level to debug; the effective level is
    # the one set on the logger above
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.DEBUG)

    # add formatter to ch
    ch.setFormatter(logging.Formatter(log_format))
    logger.addHandler(ch)

    if args.main_op == "build":
        build(args)
    elif args.main_op == "inspect":
        inspect(args)
    elif args.main_op == "report":
        report(args)


if __name__ == '__main__':

    main()


================================================
FILE: flowcraft/generator/__init__.py
================================================
"""
Placeholder for Process creation docs
"""

================================================
FILE: flowcraft/generator/components/__init__.py
================================================


================================================
FILE: flowcraft/generator/components/alignment.py
================================================
try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class Mafft(Process):
    """mafft process interface, used to align sequences

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: align
        - ``ptype``: sequence alignment

    It registers one **secondary channel link end**:

        - ``_ref_seqTyping`` (alias: ``_ref_seqTyping``)
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fasta"
        self.output_type = "align"

        # No component specific parameters are declared
        self.params = {
        }

        # Extra link end appended to the list provided by the base class
        self.link_end.append({"link": "_ref_seqTyping", "alias": "_ref_seqTyping"})


        # Settings keyed by "mafft"
        # NOTE(review): presumably the nextflow process name - confirm
        # against Process
        self.directives = {
            "mafft": {
                "container": "flowcraft/mafft",
                "version": "7.402-1",
                "cpus": 4,
                "memory": "{ 4.GB * task.attempt }"
            }
        }

        self.status_channels = [
            "mafft"
        ]


class ProgressiveMauve(Process):
    """progressiveMauve process interface, used to align sequences

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: align
        - ``ptype``: sequence alignment
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fasta"
        self.output_type = "align"

        # No component specific parameters are declared
        self.params = {
        }

        # Settings keyed by "progressive_mauve"
        # NOTE(review): presumably the nextflow process name - confirm
        # against Process
        self.directives = {
            "progressive_mauve": {
                "container": "flowcraft/mauve",
                "version": "2015.02.13-1",
                "cpus": 4,
                "memory": "{ 4.GB * task.attempt }"
            }
        }

        self.status_channels = [
            "progressive_mauve"
        ]

================================================
FILE: flowcraft/generator/components/annotation.py
================================================

try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class Abricate(Process):
    """Abricate mapping process template interface

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: None
        - ``ptype``: post_assembly

    It contains one **secondary channel link end**:

        - ``MAIN_assembly`` (alias: ``MAIN_assembly``): Receives the last
          assembly.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fasta"
        self.output_type = None

        # NOTE(review): presumably relaxes the input/output type check when
        # chaining components - confirm against Process
        self.ignore_type = True

        self.status_channels = ["STATUS_abricate", "STATUS_process_abricate"]

        # Parameter defaults are stored as strings
        self.params = {
            "abricateDatabases": {
                "default": '["resfinder", "card", "vfdb", "plasmidfinder", '
                           '"virulencefinder", "bacmet"]',
                "description": "Specify the databases for abricate."
            },
            "abricateDataDir": {
                "default": 'null',
                "description": "Specify the full path location of the database "
                               "folders."
            },
            "abricateMinId": {
                "default": '75',
                "description": "Minimum DNA %identity."
            },
            "abricateMinCov": {
                "default": '0',
                "description": "Minimum DNA %coverage."
            }
        }

        # No link start is set; one link end is added for the assembly
        self.link_start = None
        self.link_end.append({"link": "MAIN_assembly",
                              "alias": "MAIN_assembly"})

        # Both entries use the same container image and version
        self.directives = {
            "abricate": {
                "container": "flowcraft/abricate",
                "version": "0.8.0-3"
            },
            "process_abricate": {
                "container": "flowcraft/abricate",
                "version": "0.8.0-3"
            }
        }


class CardRgi(Process):
    """card's rgi process template interface

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: txt
        - ``ptype``: resistance gene detection (assembly)
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fasta"
        self.output_type = "txt"

        # The description pieces are joined by implicit string concatenation,
        # so the separating blank has to be part of one of the literals
        self.params = {
            "alignmentTool": {
                "default": "'DIAMOND'",
                "description": "Specifies the alignment tool. "
                               "Options: DIAMOND or BLAST"
            }
        }

        # Settings keyed by "card_rgi"
        # NOTE(review): presumably the nextflow process name - confirm
        # against Process
        self.directives = {
            "card_rgi": {
                "container": "flowcraft/card_rgi",
                "version": "4.0.2-0.1",
                "memory": "{10.Gb*task.attempt}"
            }
        }

        self.status_channels = [
            "card_rgi"
        ]


class Prokka(Process):
    """Prokka mapping process template interface

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: None
        - ``ptype``: post_assembly

    It contains one **secondary channel link end**:

        - ``MAIN_assembly`` (alias: ``MAIN_assembly``): Receives the last
          assembly.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fasta"
        self.output_type = None

        # NOTE(review): presumably relaxes the input/output type check when
        # chaining components - confirm against Process
        self.ignore_type = True

        # No link start is set; one link end is added for the assembly
        self.link_start = None
        self.link_end.append({"link": "MAIN_assembly",
                              "alias": "MAIN_assembly"})

        # The description pieces are joined by implicit string concatenation,
        # so the separating blank has to be part of one of the literals
        self.params = {
            "centre": {
                "default": "'UMMI'",
                "description": "sequencing centre ID"
            },
            "kingdom": {
                "default": "'Bacteria'",
                "description": "Annotation mode: Archaea|Bacteria|Mitochondria"
                               "|Viruses (default 'Bacteria')"
            },
            "genus": {
                "default": "false",
                "description": "Genus name (default 'Genus'). This also adds "
                               "the --usegenus flag to prokka"
            },
        }

        self.directives = {
            "prokka": {
                "cpus": 2,
                "container": "ummidock/prokka",
                "version": "1.12"
            }
        }


class Diamond(Process):
    """diamond process for protein database queries

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: None

    It exposes the ``pathToDb``, ``fastaToDb`` and ``blastType`` parameters
    and a single ``diamond`` directive set.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fasta"
        self.output_type = None

        self.params = {
            "pathToDb": {
                "default": 'null',
                "description": "Provide full path for the diamond database. "
                               "If none is provided then will try to fetch from"
                               " the previous process. Default: None"
            },
            "fastaToDb": {
                "default": 'null',
                "description": "Provide the full path for the fasta to "
                               "construct a diamond database. Default: None"
            },
            "blastType": {
                "default": "'blastx'",
                # Adjacent string literals are concatenated verbatim, so the
                # space after the full stop must be written explicitly.
                "description": "Defines the type of blast that diamond will "
                               "do. Can either be blastx or blastp. "
                               "Default: blastx"
            }
        }

        self.directives = {
            "diamond": {
                "container": "flowcraft/diamond",
                "version": "0.9.22-1",
                "memory": "{ 4.GB * task.attempt }",
                "cpus": 2
            }
        }


================================================
FILE: flowcraft/generator/components/assembly.py
================================================

try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class Bcalm(Process):
    """Template interface for the Bcalm process

    Channel types set by this component:

        - ``input_type``: fastq
        - ``output_type``: fasta

    Exposes the ``bcalmKmerSize`` and ``clearInput`` parameters and a single
    ``bcalm`` directive set.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "fasta"

        # Parameter entries, declared in the order they are exposed.
        kmer_size = dict(default=31, description="size of a kmer")
        clear_input = dict(
            default="false",
            description=(
                "Permanently removes temporary input files. This option "
                "is only useful to remove temporary files in large "
                "workflows and prevents nextflow's resume functionality. "
                "Use with caution."
            ),
        )
        self.params = dict(bcalmKmerSize=kmer_size, clearInput=clear_input)

        # Directive values for the "bcalm" entry.
        bcalm_resources = dict(
            cpus=4,
            memory="{ 5.GB * task.attempt }",
            container="quay.io/biocontainers/bcalm",
            version="2.2.0--hd28b015_2",
            scratch="true",
        )
        self.directives = dict(bcalm=bcalm_resources)


class Spades(Process):
    """Template interface for the SPAdes process

    Channel types set by this component:

        - ``input_type``: fastq
        - ``output_type``: fasta

    Secondary channels:

        - link end ``SIDE_max_len`` (alias: ``SIDE_max_len``): Receives max
          read length
        - link start ``gfa1``

    The ``integrity_coverage`` process is declared as a dependency.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "fasta"

        max_len_link = dict(link="SIDE_max_len", alias="SIDE_max_len")
        self.link_end.append(max_len_link)
        self.link_start.append("gfa1")

        self.dependencies = ["integrity_coverage"]

        # Help texts are kept as named locals so the params table below
        # stays compact.
        min_coverage_help = (
            "The minimum number of reads to consider an edge in the"
            " de Bruijn graph during the assembly"
        )
        min_kmer_coverage_help = (
            "Minimum contigs K-mer coverage. After assembly only "
            "keep contigs with reported k-mer coverage equal or "
            "above this value"
        )
        kmers_help = (
            "If 'auto' the SPAdes k-mer lengths will be determined "
            "from the maximum read length of each assembly. If "
            "'default', SPAdes will use the default k-mer lengths. "
        )
        clear_input_help = (
            "Permanently removes temporary input files. This option "
            "is only useful to remove temporary files in large "
            "workflows and prevents nextflow's resume functionality. "
            "Use with caution."
        )
        disable_rr_help = "disables repeat resolution stage of assembling."

        self.params = {
            "spadesMinCoverage": dict(
                default=2, description=min_coverage_help),
            "spadesMinKmerCoverage": dict(
                default=2, description=min_kmer_coverage_help),
            "spadesKmers": dict(
                default="'auto'", description=kmers_help),
            "clearInput": dict(
                default="false", description=clear_input_help),
            "disableRR": dict(
                default="false", description=disable_rr_help),
        }

        spades_resources = dict(
            cpus=4,
            memory="{ 5.GB * task.attempt }",
            container="flowcraft/spades",
            version="3.13.0-1",
            scratch="true",
        )
        self.directives = dict(spades=spades_resources)


class Skesa(Process):
    """Template interface for the Skesa process

    Channel types set by this component:

        - ``input_type``: fastq
        - ``output_type``: fasta

    Exposes the ``clearInput`` parameter and a single ``skesa`` directive
    set.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "fasta"

        skesa_resources = dict(
            cpus=4,
            memory="{ 5.GB * task.attempt }",
            container="flowcraft/skesa",
            version="2.3.0-1",
            scratch="true",
        )
        self.directives = dict(skesa=skesa_resources)

        clear_input_help = (
            "Permanently removes temporary input files. This option "
            "is only useful to remove temporary files in large "
            "workflows and prevents nextflow's resume functionality. "
            "Use with caution."
        )
        self.params = dict(
            clearInput=dict(default="false", description=clear_input_help),
        )


class ViralAssembly(Process):
    """
    Process to assemble viral genomes, based on SPAdes and megahit

    Channel types set by this component:

        - ``input_type``: fastq
        - ``output_type``: fasta

    The ``integrity_coverage`` process is declared as a dependency, and a
    ``SIDE_max_len`` secondary channel link end is registered.

    Status channels: ``va_spades``, ``va_megahit`` and
    ``report_viral_assembly``.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "fasta"

        self.dependencies = ["integrity_coverage"]

        self.status_channels = [
            "va_spades",
            "va_megahit",
            "report_viral_assembly",
        ]

        self.link_end.append(dict(link="SIDE_max_len", alias="SIDE_max_len"))

        # Both assembler steps use the same directive values; each one gets
        # its own dict so the two entries remain independent objects.
        shared_resources = dict(
            cpus=4,
            memory="{ 5.GB * task.attempt }",
            container="flowcraft/viral_assembly",
            version="0.1-1",
            scratch="true",
        )
        self.directives = {
            step: dict(shared_resources)
            for step in ("va_spades", "va_megahit")
        }

        min_coverage_help = (
            "The minimum number of reads to consider an edge in the"
            " de Bruijn graph during the assembly"
        )
        min_kmer_coverage_help = (
            "Minimum contigs K-mer coverage. After assembly only "
            "keep contigs with reported k-mer coverage equal or "
            "above this value"
        )
        spades_kmers_help = (
            "If 'auto' the SPAdes k-mer lengths will be determined "
            "from the maximum read length of each assembly. If "
            "'default', SPAdes will use the default k-mer lengths. "
        )
        megahit_kmers_help = (
            "If 'auto' the megahit k-mer lengths will be determined "
            "from the maximum read length of each assembly. If "
            "'default', megahit will use the default k-mer lengths. "
            "(default: $params.megahitKmers)"
        )
        clear_input_help = (
            "Permanently removes temporary input files. This option "
            "is only useful to remove temporary files in large "
            "workflows and prevents nextflow's resume functionality. "
            "Use with caution."
        )

        self.params = {
            "minimumContigSize": dict(
                default=10000,
                description="Expected genome size in bases"),
            "spadesMinCoverage": dict(
                default=2, description=min_coverage_help),
            "spadesMinKmerCoverage": dict(
                default=2, description=min_kmer_coverage_help),
            "spadesKmers": dict(
                default="'auto'", description=spades_kmers_help),
            "megahitKmers": dict(
                default="'auto'", description=megahit_kmers_help),
            "clearInput": dict(
                default="false", description=clear_input_help),
        }


class Abyss(Process):
    """Template interface for the ABySS process

    Channel types set by this component:

        - ``input_type``: fastq
        - ``output_type``: fasta

    A ``gfa1`` secondary channel link start is registered.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "fasta"
        self.link_start.append("gfa1")

        # Note that the default is the string "96", not an integer.
        kmer_entry = dict(default="96", description="kmer size for assembly.")
        self.params = dict(abyssKmer=kmer_entry)

        abyss_resources = dict(
            cpus=4,
            memory="{ 5.GB * task.attempt }",
            container="flowcraft/abyss",
            version="2.1.1",
            scratch="true",
        )
        self.directives = dict(abyss=abyss_resources)


class Unicycler(Process):
    """Template interface for the Unicycler process

    Channel types set by this component:

        - ``input_type``: fastq
        - ``output_type``: fasta

    A ``gfa1`` secondary channel link start is registered.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "fasta"
        self.link_start.append("gfa1")

        # No "memory" directive is declared for this component.
        unicycler_resources = dict(
            cpus=4,
            container="quay.io/biocontainers/unicycler",
            version="0.4.7--py36hdbcaa40_0",
            scratch="true",
        )
        self.directives = dict(unicycler=unicycler_resources)


================================================
FILE: flowcraft/generator/components/assembly_processing.py
================================================

try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class ProcessSkesa(Process):
    """Template interface for the Skesa assembly processing step

    Channel types set by this component:

        - ``input_type``: fasta
        - ``output_type``: fasta

    Exposes the ``genomeSize``, ``skesaMinKmerCoverage``,
    ``skesaMinContigLen`` and ``skesaMaxContigs`` parameters.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = self.output_type = "fasta"

        genome_size_help = (
            "Genome size estimate for the samples in Mb. It is used "
            "to assess whether an assembly is much larger or smaller "
            "than expected"
        )
        min_kmer_coverage_help = (
            "Minimum contigs K-mer coverage. After assembly only keep"
            " contigs with reported k-mer coverage equal or above "
            "this value"
        )
        min_contig_len_help = (
            "Filter contigs for length greater or equal than this "
            "value"
        )
        max_contigs_help = (
            "Maximum number of contigs per 1.5 Mb of expected "
            "genome size"
        )

        self.params = {
            "genomeSize": dict(
                default=1, description=genome_size_help),
            "skesaMinKmerCoverage": dict(
                default=2, description=min_kmer_coverage_help),
            "skesaMinContigLen": dict(
                default=200, description=min_contig_len_help),
            "skesaMaxContigs": dict(
                default=100, description=max_contigs_help),
        }

        # NOTE(review): the directive key is "skesa", whereas the sibling
        # ProcessSpades component keys its directives as "process_spades".
        # Confirm this matches the process name used by the template.
        skesa_resources = dict(
            cpus=1,
            memory="'2GB'",
            container="flowcraft/skesa",
            version="2.1-1",
        )
        self.directives = dict(skesa=skesa_resources)


class ProcessSpades(Process):
    """Template interface for the SPAdes assembly processing step

    Channel types set by this component:

        - ``input_type``: fasta
        - ``output_type``: fasta

    Exposes the ``genomeSize``, ``spadesMinKmerCoverage``,
    ``spadesMinContigLen`` and ``spadesMaxContigs`` parameters.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = self.output_type = "fasta"

        genome_size_help = (
            "Genome size estimate for the samples in Mb. It is used "
            "to assess whether an assembly is much larger or smaller "
            "than expected"
        )
        min_kmer_coverage_help = (
            "Minimum contigs K-mer coverage. After assembly only keep"
            " contigs with reported k-mer coverage equal or above "
            "this value"
        )
        min_contig_len_help = (
            "Filter contigs for length greater or equal than this "
            "value"
        )
        max_contigs_help = (
            "Maximum number of contigs per 1.5 Mb of expected "
            "genome size"
        )

        self.params = {
            "genomeSize": dict(
                default=1, description=genome_size_help),
            "spadesMinKmerCoverage": dict(
                default=2, description=min_kmer_coverage_help),
            "spadesMinContigLen": dict(
                default=200, description=min_contig_len_help),
            "spadesMaxContigs": dict(
                default=100, description=max_contigs_help),
        }

        # Only the container image is pinned; no cpus/memory are declared.
        self.directives = dict(
            process_spades=dict(
                container="flowcraft/spades",
                version="3.11.1-1",
            ),
        )


class AssemblyMapping(Process):
    """Assembly mapping process template interface

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: fasta

    It contains one **secondary channel link start**:

        - ``SIDE_BpCoverage``

    It contains one **secondary channel link end**:

        - ``__fastq`` (alias: ``_LAST_fastq``): Receives the FastQ files
          from the last process with ``fastq`` output type.

    It contains two **status channels**:

        - ``STATUS_assembly_mapping``: Status for the assembly_mapping
          process
        - ``STATUS_process_am``: Status for the process_assembly_mapping
          process
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fasta"
        self.output_type = "fasta"

        self.status_channels = ["STATUS_assembly_mapping",
                                "STATUS_process_am"]

        self.link_start.append("SIDE_BpCoverage")
        self.link_end.append({"link": "__fastq", "alias": "_LAST_fastq"})

        self.params = {
            "minAssemblyCoverage": {
                "default": "'auto'",
                "description":
                    "In auto, the default minimum coverage for each "
                    "assembled contig is 1/3 of the assembly mean coverage or"
                    " 10x, if the mean coverage is below 10x"
            },
            "AMaxContigs": {
                "default": 100,
                # Adjacent string literals are concatenated verbatim, so the
                # separating space has to be part of one of the fragments.
                "description":
                    "A warning is issued if the number of contigs is over "
                    "this threshold."
            },
            "genomeSize": {
                "default": 2.1,
                "description":
                    "Genome size estimate for the samples. It is used to "
                    "check the ratio of contig number per genome MB"
            }
        }

        self.directives = {
            "assembly_mapping": {
                "cpus": 4,
                "memory": "{ 5.GB * task.attempt }",
                "container": "flowcraft/bowtie2_samtools",
                "version": "1.0.0-1"
            },
            "process_assembly_mapping": {
                "cpus": 1,
                "memory": "{ 5.GB * task.attempt }",
                "container": "flowcraft/bowtie2_samtools",
                "version": "1.0.0-1"
            }
        }


class Pilon(Process):
    """Template interface for the Pilon process

    Channel types set by this component:

        - ``input_type``: fasta
        - ``output_type``: fasta

    It declares one **dependency process**:

        - ``assembly_mapping``: Requires the BAM file generated by the
          assembly mapping process

    A ``SIDE_BpCoverage`` secondary channel link end is registered, and the
    status channels are ``STATUS_pilon`` and ``STATUS_pilon_report``.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = self.output_type = "fasta"

        self.dependencies = ["assembly_mapping"]
        self.status_channels = ["STATUS_pilon", "STATUS_pilon_report"]

        coverage_link = dict(link="SIDE_BpCoverage", alias="SIDE_BpCoverage")
        self.link_end.append(coverage_link)

        clear_input_help = (
            "Permanently removes temporary input files. This option "
            "is only useful to remove temporary files in large "
            "workflows and prevents nextflow's resume functionality. "
            "Use with caution."
        )
        self.params = dict(
            clearInput=dict(default="false", description=clear_input_help),
        )

        def pilon_resources(cpus):
            # Both steps differ only in the number of cpus requested.
            return dict(
                cpus=cpus,
                memory="{ 7.GB * task.attempt }",
                container="flowcraft/pilon",
                version="1.22.0-1",
            )

        self.directives = dict(
            pilon=pilon_resources(4),
            pilon_report=pilon_resources(1),
        )

class Bandage(Process):
    """Template interface for assembly visualization with Bandage

    Channel types set by this component:

        - ``input_type``: fasta
        - ``output_type``: None

    A ``gfa1`` (alias: ``gfa1``) secondary channel link end is registered.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.input_type, self.output_type = "fasta", None

        self.link_end.append(dict(link="gfa1", alias="gfa1"))

        reference_entry = dict(
            default="null",
            description=(
                "Align the assembly to this reference genome using BLAST"
            ),
        )
        self.params = dict(reference=reference_entry)

        bandage_image = dict(container="flowcraft/bandage", version="0.8.1")
        self.directives = dict(bandage=bandage_image)

class Quast(Process):
    """Template interface for assembly quality assessment with QUAST

    Channel types set by this component:

        - ``input_type``: fasta
        - ``output_type``: tsv

    Exposes the ``reference`` and ``genomeSizeBp`` parameters, both
    defaulting to ``null``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.input_type, self.output_type = "fasta", "tsv"

        reference_entry = dict(
            default="null",
            description="Compare the assembly to this reference genome",
        )
        genome_size_entry = dict(
            default="null",
            description="Expected genome size (bp)",
        )
        self.params = dict(
            reference=reference_entry,
            genomeSizeBp=genome_size_entry,
        )

        quast_image = dict(
            container="quay.io/biocontainers/quast",
            version="5.0.0--py27pl526ha92aebf_1",
        )
        self.directives = dict(quast=quast_image)


================================================
FILE: flowcraft/generator/components/distance_estimation.py
================================================

try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class MashDist(Process):
    """Template interface for the mash dist process

    Channel types set by this component:

        - ``input_type``: fasta
        - ``output_type``: json

    Status channels: ``runMashDist`` and ``mashDistOutputJson``. A
    ``SIDE_mashSketchOutChannel`` secondary channel link end is registered.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fasta", "json"

        p_value_help = (
            "P-value cutoff for the distance estimation "
            "between two sequences to be included in the "
            "output."
        )
        distance_help = (
            "Sets the maximum distance between two "
            "sequences to be included in the output."
        )
        shared_hashes_help = (
            "Sets a minimum percentage of hashes shared "
            "between two sequences in order to include its "
            "result in the output."
        )
        ref_file_help = (
            "Specifies the reference file to be provided "
            "to mash. It can either be a fasta or a .msh "
            "reference sketch generated by mash."
        )

        self.params = dict(
            pValue=dict(default=0.05, description=p_value_help),
            mash_distance=dict(default=0.1, description=distance_help),
            shared_hashes=dict(default=0.8, description=shared_hashes_help),
            refFile=dict(
                default="'/ngstools/data/plasmid_db_reference.msh'",
                description=ref_file_help,
            ),
        )

        def mash_resources(memory):
            # The two steps share image and cpus; only memory differs.
            return dict(
                container="flowcraft/mash-patlas",
                version="1.6.0-1",
                cpus=1,
                memory=memory,
            )

        self.directives = dict(
            runMashDist=mash_resources("{ 4.GB * task.attempt }"),
            mashDistOutputJson=mash_resources("'4GB'"),
        )

        self.status_channels = ["runMashDist", "mashDistOutputJson"]

        sketch_link = dict(
            link="SIDE_mashSketchOutChannel",
            alias="SIDE_mashSketchOutChannel",
        )
        self.link_end.append(sketch_link)


class MashScreen(Process):
    """Template interface for the mash screen process

    Channel types set by this component:

        - ``input_type``: fastq
        - ``output_type``: json

    Status channels: ``mashScreen`` and ``mashOutputJson``. The
    ``mashScreenOutputChannel`` channel is registered under the
    ``patlas_consensus`` compiler entry, and a
    ``SIDE_mashSketchOutChannel`` secondary channel link end is registered.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "json"

        no_winner_help = (
            "A variable that enables the use of -w option"
            " for mash screen."
        )
        p_value_help = (
            "P-value cutoff for the distance estimation "
            "between two sequences to be included in the "
            "output."
        )
        identity_help = (
            "The percentage of identity between the reads "
            "input and the reference sequence"
        )
        ref_file_help = (
            "Specifies the reference file to be provided "
            "to mash. It can either be a fasta or a .msh "
            "reference sketch generated by mash."
        )

        self.params = dict(
            noWinner=dict(default="false", description=no_winner_help),
            pValue=dict(default=0.05, description=p_value_help),
            identity=dict(default=0.9, description=identity_help),
            refFile=dict(
                default="'/ngstools/data/plasmid_db_reference.msh'",
                description=ref_file_help,
            ),
        )

        def mash_resources(memory):
            # The two steps share image and cpus; only memory differs.
            return dict(
                container="flowcraft/mash-patlas",
                version="1.6.0-1",
                cpus=1,
                memory=memory,
            )

        self.directives = dict(
            mashScreen=mash_resources("{ 4.GB * task.attempt }"),
            mashOutputJson=mash_resources("'4GB'"),
        )

        self.status_channels = ["mashScreen", "mashOutputJson"]

        self.compiler["patlas_consensus"] = ["mashScreenOutputChannel"]

        sketch_link = dict(
            link="SIDE_mashSketchOutChannel",
            alias="SIDE_mashSketchOutChannel",
        )
        self.link_end.append(sketch_link)


class MashSketchFasta(Process):
    """Template interface for the mash sketch process on fasta input

    Channel types set by this component:

        - ``input_type``: fasta
        - ``output_type``: msh

    ``ignore_type`` is set to True and a ``SIDE_mashSketchOutChannel``
    secondary channel link start is registered.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fasta", "msh"

        self.ignore_type = True

        kmer_size_entry = dict(
            default=21,
            description="Set the kmer size for hashing. Default: 21.",
        )
        sketch_size_entry = dict(
            default=1000,
            description=(
                "Set the number of hashes per sketch. Default: "
                "1000"
            ),
        )
        self.params = dict(
            kmerSize=kmer_size_entry,
            sketchSize=sketch_size_entry,
        )

        sketch_resources = dict(
            container="flowcraft/mash-patlas",
            version="1.6.0-1",
            cpus=1,
            memory="{ 4.GB * task.attempt }",
        )
        self.directives = dict(mashSketchFasta=sketch_resources)

        self.status_channels = ["mashSketchFasta"]

        self.link_start.extend(["SIDE_mashSketchOutChannel"])


class MashSketchFastq(MashSketchFasta):
    """Mash sketch process template interface for fastq input

    Extends :class:`MashSketchFasta`, changing the following:

        - ``input_type``: fastq
        - the ``minKmer`` and ``genomeSize`` parameters are added to the
          inherited ones
        - the inherited ``mashSketchFasta`` directives are re-keyed as
          ``mashSketchFastq``
        - the only status channel is ``mashSketchFastq``
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fastq"

        # add more params to dict
        self.params.update({
            "minKmer": {
                "default": 1,
                "description": "Minimum copies of each k-mer required to pass "
                               "noise filter for reads. Implies -r. Default: 1"
            },
            "genomeSize": {
                "default": "false",
                # Adjacent string literals are concatenated verbatim, so
                # every fragment boundary needs an explicit space.
                "description": "Genome size (raw bases or with K/M/G/T). If "
                               "specified, will be used for p-value calculation"
                               " instead of an estimated size from k-mer "
                               "content. Default: false, meaning that it won't"
                               " be used. If you want to use it pass a number"
                               " to this parameter."
            }
        })

        # Reuse the directive values set by the parent class under the name
        # used by this component.
        self.directives = {
            "mashSketchFastq": self.directives["mashSketchFasta"]
        }

        self.status_channels = [
            "mashSketchFastq",
        ]


class FastAni(Process):
    """Template interface for the FastANI process

    Channel types set by this component:

        - ``input_type``: fasta

    No ``output_type`` is assigned here. The single status channel is
    ``fastAniMatrix``.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fasta"

        fragment_entry = dict(
            default=3000,
            description="Set size of fragment. Default: 3000.",
        )
        self.params = dict(fragLen=fragment_entry)

        matrix_resources = dict(
            container="flowcraft/fast_ani",
            version="1.1.0-2",
            cpus=20,
            memory="{ 30.GB * task.attempt }",
        )
        self.directives = dict(fastAniMatrix=matrix_resources)

        self.status_channels = ["fastAniMatrix"]


================================================
FILE: flowcraft/generator/components/downloads.py
================================================

try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class ReadsDownload(Process):
    """Template interface for the reads download process

    Channel types set by this component:

        - ``input_type``: accessions
        - ``output_type``: fastq

    Exposes the ``asperaKey`` parameter and a single ``reads_download``
    directive set.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "accessions", "fastq"

        aspera_key_help = (
            "Downloads fastq accessions from ENA using Aspera Connect "
            "by providing the private-key file "
            "'asperaweb_id_dsa.openssh' normally found in "
            "~/.aspera/connect/etc/asperaweb_id_dsa.openssh "
        )
        self.params = dict(
            asperaKey=dict(default="null", description=aspera_key_help),
        )

        download_resources = dict(
            cpus=1,
            memory="'1GB'",
            container="flowcraft/getseqena",
            version="0.4.0-1",
        )
        self.directives = dict(reads_download=download_resources)


class FasterqDump(Process):
    """Process template for fasterq-dump

    This process is set with:

        - ``input_type``: accessions
        - ``output_type``: fastq

    It exposes the ``option_file`` and ``compress_fastq`` parameters, a
    single ``fasterqDump`` directive set and the ``fasterqDump`` status
    channel.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "accessions"
        self.output_type = "fastq"

        # Adjacent string literals are concatenated verbatim, so every
        # fragment boundary in the descriptions needs an explicit space.
        self.params = {
            "option_file": {
                "default": "false",
                "description": "Read more options and parameters from the "
                               "file. Use to provide parameters to "
                               "fasterq-dump"
            },
            "compress_fastq": {
                "default": "true",
                "description": "This option allows the users to define if "
                               "they want to compress the downloaded fastq "
                               "files, saving disk space. Default behavior "
                               "is set to compress the fastq files. If the "
                               "user wants to change this, set the variable "
                               "to 'no'"
            }
        }

        self.directives = {"fasterqDump": {
            "cpus": 1,
            "memory": "'1GB'",
            "container": "flowcraft/sra-tools",
            "version": "2.9.1-1"
        }}

        self.status_channels = [
            "fasterqDump"
        ]


================================================
FILE: flowcraft/generator/components/mapping.py
================================================
try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class Bowtie(Process):
    """Template interface for read alignment with bowtie2

    Channel types set by this component:

        - ``input_type``: fastq
        - ``output_type``: bam

    Directive sets are declared for ``bowtie`` and ``bowtie_build``; the
    status channels are ``bowtie`` and ``report_bowtie``.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "bam"

        reference_help = (
            "Specifies the reference genome to be provided "
            "to bowtie2-build."
        )
        index_help = (
            "Specifies the reference indexes to be provided "
            "to bowtie2."
        )
        clear_input_help = (
            "Permanently removes temporary input files. This option "
            "is only useful to remove temporary files in large "
            "workflows and prevents nextflow's resume functionality. "
            "Use with caution."
        )
        self.params = dict(
            reference=dict(default="null", description=reference_help),
            index=dict(default="null", description=index_help),
            clearInput=dict(default="false", description=clear_input_help),
        )

        def bowtie_resources(cpus):
            # Both steps differ only in the number of cpus requested.
            return dict(
                container="flowcraft/bowtie2_samtools",
                version="1.0.0-1",
                memory="{5.Gb*task.attempt}",
                cpus=cpus,
            )

        self.directives = dict(
            bowtie=bowtie_resources(4),
            bowtie_build=bowtie_resources(1),
        )

        self.status_channels = ["bowtie", "report_bowtie"]


class RetrieveMapped(Process):
    """Template interface for the retrieve_mapped process

    Channel types set by this component:

        - ``input_type``: bam
        - ``output_type``: fastq

    The ``bowtie`` process is declared as a dependency and the single
    status channel is ``retrieve_mapped``.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "bam", "fastq"

        clear_input_help = (
            "Permanently removes temporary input files. This option "
            "is only useful to remove temporary files in large "
            "workflows and prevents nextflow's resume functionality. "
            "Use with caution."
        )
        self.params = dict(
            clearInput=dict(default="false", description=clear_input_help),
        )

        self.dependencies = ["bowtie"]

        retrieve_resources = dict(
            container="flowcraft/bowtie2_samtools",
            version="1.0.0-1",
            memory="{5.Gb*task.attempt}",
            cpus=2,
        )
        self.directives = dict(retrieve_mapped=retrieve_resources)

        self.status_channels = ["retrieve_mapped"]


class Bwa(Process):
    """Bwa process template interface.

    Aligns short paired-end sequencing reads to long reference sequences.

        This process is set with:

            - ``input_type``: fastq
            - ``output_type``: bam

    The reference index is configured through the ``bwaIndex`` parameter.
        """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "bam"

        # The default is a quoted string literal (quotes are part of the value).
        default_index = (
            "'s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37/Sequence/"
            "BWAIndex/human_g1k_v37_decoy.fasta'"
        )
        self.params = {
            "bwaIndex": dict(
                default=default_index,
                description="Specifies the reference indexes to be provided to bwa.",
            )
        }

        bwa_directives = dict(
            container="flowcraft/bwa_samtools",
            version="0.7.17-1",
            memory="{5.Gb*task.attempt}",
            cpus=4,
        )
        self.directives = {"bwa": bwa_directives}

        self.status_channels = ["bwa"]


class MarkDuplicates(Process):
    """Process template interface for identifying duplicate reads.

        This process is set with:

            - ``input_type``: bam
            - ``output_type``: bam

    It also registers the ``markDupMultiQC`` channel under the ``multiqc``
    compiler entry.
        """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "bam", "bam"

        # Channel handed over to the ``multiqc`` compiler.
        multiqc_channels = ["markDupMultiQC"]
        self.compiler["multiqc"] = multiqc_channels

        mark_duplicates_directives = dict(
            container="broadinstitute/gatk",
            memory="{5.Gb*task.attempt}",
            cpus=4,
        )
        self.directives = {"mark_duplicates": mark_duplicates_directives}

        self.status_channels = ["mark_duplicates"]


class BaseRecalibrator(Process):
    """Process template interface for base quality score recalibration.

    Detects systematic errors in base quality scores.

        This process is set with:

            - ``input_type``: bam
            - ``output_type``: bam

    Two directive entries are declared: ``base_recalibrator`` and
    ``apply_bqsr``. Both are also reported as status channels.
        """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "bam", "bam"

        # (parameter name, what the parameter points to). Every parameter
        # defaults to "null" and shares the same description template.
        resources = (
            ("reference",
             "name of the FASTA reference genome and index files"),
            ("dbsnp", "dbSNP VCF file"),
            ("dbsnpIdx", "dbSNP VCF index file"),
            ("goldenIndel", "Gold standard INDELs VCF file"),
            ("goldenIndelIdx", "Gold standard INDELs VCF index file"),
        )
        help_template = "Specifies the {} to be provided to BaseRecalibrator."
        self.params = {
            name: {
                "default": "null",
                "description": help_template.format(subject),
            }
            for name, subject in resources
        }

        # Both entries use identical settings; a fresh dict is built per
        # entry so that they never share state.
        gatk_steps = ("base_recalibrator", "apply_bqsr")
        self.directives = {
            step: {
                "container": "broadinstitute/gatk",
                "memory": "{5.Gb*task.attempt}",
                "cpus": 4,
            }
            for step in gatk_steps
        }

        self.status_channels = list(gatk_steps)

================================================
FILE: flowcraft/generator/components/metagenomics.py
================================================

try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class Concoct(Process):
    """
    CONCOCT process template interface for the
    taxonomic independent binning of metagenomic
    assemblies.

    This process is set with:
        - ``input_type``: fasta
        - ``output_type``: fasta

        It contains one **secondary channel link end**:

            - ``__fastq`` (alias: ``_LAST_fastq``): presumably receives the
            FastQ files from the last process with ``fastq`` output type
            (TODO confirm against the engine's link resolution).
    """
    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fasta"
        self.output_type = "fasta"

        self.link_end.append({"link": "__fastq", "alias": "_LAST_fastq"})

        self.params = {
            "clusters": {
                "default": 400,
                "description": "Maximum number of clusters for VGMM. Default: 400"
            },
            "lengthThreshold": {
                "default": 1000,
                "description": "Contigs shorter than this value will not be included. Default: 1000."
            },
            "readLength": {
                "default": 100,
                # The previous text was built from two adjacent literals with
                # no separating space and advertised "Default: 0.9", which
                # does not match the actual default of 100.
                "description": "Specify read length for coverage. "
                               "Default: 100"
            },
            "iterations": {
                "default": 500,
                "description": "Number of iterations for the VBGMM. Default: 500"
            },
            "clearInput": {
                "default": "false",
                "description":
                    "Permanently removes temporary input files. This option "
                    "is only useful to remove temporary files in large "
                    "workflows and prevents nextflow's resume functionality. "
                    "Use with caution."
            }
        }

        self.directives = {
            "concoct": {
                "container": "flowcraft/concoct",
                "version": "1.0.0-1",
                "cpus": 4,
                "memory": "{ 5.GB * task.attempt }"
            }
        }

        self.status_channels = [
            "concoct",
            "report_concoct"
        ]


class Kraken(Process):
    """Kraken process template interface.

            This process is set with:

                - ``input_type``: fastq
                - ``output_type``: txt

    The database is selected through the ``krakenDB`` parameter.
    """
    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "txt"

        # The default is a quoted string literal (quotes are part of the value).
        kraken_db = dict(
            default="'minikraken_20171013_4GB'",
            description="Specifies kraken database.",
        )
        self.params = {"krakenDB": kraken_db}

        kraken_directives = dict(
            container="flowcraft/kraken",
            version="1.0-0.1",
            memory="{5.Gb*task.attempt}",
            cpus=3,
        )
        self.directives = {"kraken": kraken_directives}

        self.status_channels = ["kraken"]


class Kraken2(Process):
    """Kraken2 process template interface.

            This process is set with:

                - ``input_type``: fastq
                - ``output_type``: None

    The database is selected through the ``kraken2DB`` parameter.
    """
    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fastq"
        # No main output type is declared for this component.
        self.output_type = None

        kraken2_db = dict(
            default="'minikraken2_v1_8GB'",
            description=(
                "Specifies kraken2 database. Requires full path if database "
                "not on KRAKEN2_DB_PATH."
            ),
        )
        self.params = {"kraken2DB": kraken2_db}

        kraken2_directives = dict(
            container="flowcraft/kraken2",
            version="2.0.7-1",
            memory="{8.Gb*task.attempt}",
            cpus=4,
        )
        self.directives = {"kraken2": kraken2_directives}

        self.status_channels = ["kraken2"]


class Maxbin2(Process):
    """MaxBin2, a metagenomics binning software

            This process is set with:

                - ``input_type``: fasta
                - ``output_type``: fasta

            It contains one **secondary channel link end**:

                - ``__fastq`` (alias: ``_LAST_fastq``): presumably receives
                the FastQ files from the last process with ``fastq`` output
                type (TODO confirm against the engine's link resolution).

            """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fasta"
        self.output_type = "fasta"

        self.link_end.append({"link": "__fastq", "alias": "_LAST_fastq"})

        self.params = {
            # NOTE: the key is misspelled ("lenght"), but it is kept as-is
            # because it is the externally visible parameter name.
            "min_contig_lenght": {
                "default": 1000,
                "description": "minimum contig length. Default: 1000"
            },
            "max_iteration": {
                "default": 50,
                # Adjacent literals are concatenated verbatim, so the
                # separating space must be part of one of them.
                "description": "maximum Expectation-Maximization algorithm "
                               "iteration number. Default: 50"
            },
            "prob_threshold": {
                "default": 0.9,
                "description": "probability threshold for EM final classification. "
                               "Default: 0.9"
            },
            "clearInput": {
                "default": "false",
                "description":
                    "Permanently removes temporary input files. This option "
                    "is only useful to remove temporary files in large "
                    "workflows and prevents nextflow's resume functionality. "
                    "Use with caution."
            }
        }

        self.directives = {
            "maxbin2": {
                "container": "flowcraft/maxbin2",
                "version": "2.2.4-1",
                "cpus": 3,
                "memory": "{ 5.GB * task.attempt }"
            }
        }

        self.status_channels = [
            "maxbin2",
            "report_maxbin2"
        ]


class Megahit(Process):
    """Megahit process template interface.

        This process is set with:

            - ``input_type``: fastq
            - ``output_type``: fasta

        It contains one **secondary channel link end**:

            - ``SIDE_max_len`` (alias: ``SIDE_max_len``): Receives max read length

        It declares ``integrity_coverage`` as a dependency.
        """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "fasta"

        max_len_link = dict(link="SIDE_max_len", alias="SIDE_max_len")
        self.link_end.append(max_len_link)

        self.dependencies = ["integrity_coverage"]

        kmers_help = (
            "If 'auto' the megahit k-mer lengths will be determined from the "
            "maximum read length of each assembly. If 'default', megahit "
            "will use the default k-mer lengths. "
            "(default: $params.megahitKmers)"
        )
        clear_input_help = (
            "Permanently removes temporary input files. This option is only "
            "useful to remove temporary files in large workflows and prevents "
            "nextflow's resume functionality. Use with caution."
        )
        self.params = {
            "megahitKmers": dict(default="'auto'", description=kmers_help),
            "fastg": dict(
                default="false",
                description="Converts megahit intermediate contigs to fastg",
            ),
            "clearInput": dict(default="false", description=clear_input_help),
        }

        # Both entries run from the same container image and tag.
        image, tag = "flowcraft/megahit", "1.1.3-0.1"
        self.directives = {
            "megahit": dict(
                cpus=4,
                memory="{ 5.GB * task.attempt }",
                container=image,
                version=tag,
                scratch="true",
            ),
            "megahit_fastg": dict(container=image, version=tag),
        }

        self.status_channels = ["megahit", "megahit_fastg"]


class Metabat2(Process):
    """
    MetaBat2 process template interface for the
    taxonomic independent binning of metagenomic
    assemblies.

    This process is set with:
        - ``input_type``: fasta
        - ``output_type``: fasta

    It contains one **dependency process**:

        - ``assembly_mapping``

    """
    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fasta", "fasta"

        self.dependencies = ["assembly_mapping"]

        clear_input_help = (
            "Permanently removes temporary input files. This option is only "
            "useful to remove temporary files in large workflows and prevents "
            "nextflow's resume functionality. Use with caution."
        )
        self.params = {
            "maxPercentage": dict(
                default=95,
                description=(
                    "Percentage of 'good' contigs considered for binning "
                    "decided by connection. Default: 95."
                ),
            ),
            "minContig": dict(
                default=2500,
                description=(
                    "Minimum size of a contig for binning (should be >=1500). "
                    "Default: 2500."
                ),
            ),
            "clearInput": dict(default="false", description=clear_input_help),
        }

        metabat2_directives = dict(
            container="flowcraft/metabat",
            version="2.13-1",
            cpus=4,
            memory="{ 5.GB * task.attempt }",
        )
        self.directives = {"metabat2": metabat2_directives}

        self.status_channels = ["metabat2", "report_metabat2"]


class Metaspades(Process):
    """Metaspades process template interface.

        This process is set with:

            - ``input_type``: fastq
            - ``output_type``: fasta

        It contains one **secondary channel link end**:

            - ``SIDE_max_len`` (alias: ``SIDE_max_len``): Receives max read length

        It declares ``integrity_coverage`` as a dependency.
        """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "fasta"

        max_len_link = dict(link="SIDE_max_len", alias="SIDE_max_len")
        self.link_end.append(max_len_link)

        self.dependencies = ["integrity_coverage"]

        kmers_help = (
            "If 'auto' the metaSPAdes k-mer lengths will be determined from "
            "the maximum read length of each assembly. If 'default', "
            "metaSPAdes will use the default k-mer lengths. "
            "(default: $params.metaspadesKmers)"
        )
        clear_input_help = (
            "Permanently removes temporary input files. This option is only "
            "useful to remove temporary files in large workflows and prevents "
            "nextflow's resume functionality. Use with caution."
        )
        self.params = {
            "metaspadesKmers": dict(default="'auto'", description=kmers_help),
            "clearInput": dict(default="false", description=clear_input_help),
        }

        metaspades_directives = dict(
            cpus=4,
            memory="{ 5.GB * task.attempt }",
            container="flowcraft/spades",
            version="3.11.1-1",
            scratch="true",
        )
        self.directives = {"metaspades": metaspades_directives}


class Midas_species(Process):
    """Midas species process template interface.

            This process is set with:

                - ``input_type``: fastq
                - ``output_type``: txt

    The database is selected through the ``midasDB`` parameter, which
    defaults to ``null``.
    """
    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "txt"

        midas_db = dict(default="null", description="Specifies Midas database.")
        self.params = {"midasDB": midas_db}

        midas_directives = dict(
            container="flowcraft/midas",
            version="1.3.2-0.1",
            memory="{2.Gb*task.attempt}",
            cpus=3,
        )
        self.directives = {"midas_species": midas_directives}

        self.status_channels = ["midas_species"]


class RemoveHost(Process):
    """Process template interface for removing host reads with bowtie2.

        This process is set with:

            - ``input_type``: fastq
            - ``output_type``: fastq

    The bowtie2 reference index is configured through ``refIndex``.
        """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "fastq"

        clear_input_help = (
            "Permanently removes temporary input files. This option is only "
            "useful to remove temporary files in large workflows and prevents "
            "nextflow's resume functionality. Use with caution."
        )
        self.params = {
            "refIndex": dict(
                default="'/index_hg19/hg19'",
                description=(
                    "Specifies the reference indexes to be provided to bowtie2."
                ),
            ),
            "clearInput": dict(default="false", description=clear_input_help),
        }

        remove_host_directives = dict(
            container="flowcraft/remove_host",
            version="2-0.1",
            memory="{5.Gb*task.attempt}",
            cpus=3,
        )
        self.directives = {"remove_host": remove_host_directives}

        self.status_channels = ["remove_host", "report_remove_host"]


class Metaprob(Process):
    """MetaProb process template interface for binning metagenomic reads.

            This process is set with:

                - ``input_type``: fastq
                - ``output_type``: csv

            """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "csv"

        self.params = {
            "feature": dict(
                default=1,
                description="Feature used to compute. Default: 1",
            ),
            "metaProbQMer": dict(
                default=5,
                description=(
                    "Threshold of shared q-mer to create graph adiacences. "
                    "Default: 5"
                ),
            ),
        }

        metaprob_directives = dict(
            container="flowcraft/metaprob",
            version="2-1",
            cpus=1,
            memory="{ 30.GB * task.attempt }",
        )
        self.directives = {"metaProb": metaprob_directives}

        self.status_channels = ["metaProb"]


class SplitAssembly(Process):
    """Component to filter metagenomic assemblies by contig size.

    If the contig is larger than $param.size, it gets separated
    from the original assembly to continue the processes downstream
    of the pipeline.

            This process is set with:

                - ``input_type``: fasta
                - ``output_type``: fasta

            """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fasta", "fasta"

        # No usable default is provided for the size threshold ("null").
        size_param = dict(default="null", description="Minimum contig size")
        self.params = {"size": size_param}

        split_directives = dict(cpus=1, memory="{ 1.GB * task.attempt }")
        self.directives = {"split_assembly": split_directives}

        self.status_channels = ["split_assembly"]


================================================
FILE: flowcraft/generator/components/mlst.py
================================================
try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class Mlst(Process):
    """Mlst process template interface.

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: fasta

    The expected species can be given through the ``mlstSpecies``
    parameter, which defaults to ``null``.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fasta", "fasta"

        # Only the container image is configured for this component.
        mlst_directives = dict(container="ummidock/mlst")
        self.directives = {"mlst": mlst_directives}

        species_param = dict(
            default="null",
            description="Specify the expected species for MLST checking.",
        )
        self.params = {"mlstSpecies": species_param}


class Chewbbaca(Process):
    """Chewbbaca process template interface.

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: None
        - ``ignore_type``: True

    It contains one **secondary channel link end**:

        - ``MAIN_assembly`` (alias: ``MAIN_assembly``)

    ``link_start`` is explicitly reset to ``None``.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fasta"
        self.output_type = None

        self.ignore_type = True

        self.link_start = None
        assembly_link = dict(link="MAIN_assembly", alias="MAIN_assembly")
        self.link_end.append(assembly_link)

        # All entries run from the same container image and tag.
        image, tag = "mickaelsilva/chewbbaca_py3", "latest"
        self.directives = {
            "chewbbaca": dict(cpus=4, container=image, version=tag),
            "chewbbaca_batch": dict(cpus=4, container=image, version=tag),
            "chewbbacaExtractMLST": dict(container=image, version=tag),
        }

        # (parameter name, default value, help text), in declaration order.
        param_table = (
            ("chewbbacaQueue", "null",
             "Specifiy a queue/partition for chewbbaca. This option is only "
             "used for grid schedulers."),
            ("chewbbacaTraining", "null",
             "Specify the full path to the prodigal training file of the "
             "corresponding species."),
            ("schemaPath", "null",
             "The path to the chewbbaca schema directory."),
            ("schemaSelectedLoci", "null",
             "The path to the selection of loci in the schema directory to "
             "be used. If not specified, all loci in the schema will be "
             "used."),
            ("schemaCore", "null", ""),
            ("chewbbacaJson", "false",
             "If set to True, chewbbaca's allele call output will be set to "
             "JSON format."),
            ("chewbbacaToPhyloviz", "false",
             "If set to True, the ExtractCgMLST module of chewbbaca will be "
             "executed after the allele calling."),
            ("chewbbacaProfilePercentage", 0.95,
             "Specifies the proportion of samples that must be present in a "
             "locus to save the profile."),
            ("chewbbacaBatch", "false",
             "Specifies whether a chewbbaca run will be performed on the "
             "complete input batch (all at the same time) or one by one."),
        )
        self.params = {
            name: {"default": default, "description": help_text}
            for name, default, help_text in param_table
        }


class Metamlst(Process):
    """MetaMlst process template interface.

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: None

    The database and its Bowtie2 index are configured through the
    ``metamlstDB`` and ``metamlstDB_index`` parameters.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fastq"
        # No main output type is declared for this component.
        self.output_type = None

        metamlst_directives = dict(
            container="flowcraft/metamlst",
            version="1.1-1",
            memory="{4.Gb*task.attempt}",
        )
        self.directives = {"metamlst": metamlst_directives}

        # Defaults are quoted string literals (quotes are part of the value).
        self.params = {
            "metamlstDB": dict(
                default="'/NGStools/metamlst/metamlstDB_2017.db'",
                description=(
                    "Specify the metamlstDB (full path) for MLST checking."
                ),
            ),
            "metamlstDB_index": dict(
                default="'/NGStools/index/metamlstDB_2017'",
                description=(
                    "Specify the Bowtie2 metamlstDB index (full path) for "
                    "MLST checking."
                ),
            ),
        }


================================================
FILE: flowcraft/generator/components/patlas_mapping.py
================================================
try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class MappingPatlas(Process):
    """Process template interface for the pATLAS mapping component.

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: json

    Two directive entries are declared (``mappingBowtie`` and
    ``jsonDumpingMapping``), both reported as status channels. The
    ``mappingOutputChannel`` channel is registered under the
    ``patlas_consensus`` compiler entry.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", "json"

        # (parameter name, default value, help text), in declaration order.
        param_table = (
            ("trim5", 0,
             "Sets trim5 option for bowtie. This will become legacy with QC "
             "integration, but it enables to trim 5' end of reads to be "
             "mapped with bowtie2."),
            ("cov_cutoff", 0.6,
             "This variable sets a cutoff for the percentage of the query "
             "reference sequence that is covered by reads (in absolute "
             "lenght)."),
            ("refIndex", "'/ngstools/data/indexes/patlas_bowtie2_index'",
             "Specifies the reference indexes to be provided to bowtie2."),
            ("samtoolsIndex",
             "'/ngstools/data/indexes/master_fasta_plasmid_db.fas.fai'",
             "Specifies the reference indexes to be provided to samtools."),
            ("lengthJson", "'/ngstools/data/length_plasmid_db.json'",
             "A dictionary of all the lengths of reference sequences."),
        )
        self.params = {
            name: {"default": default, "description": help_text}
            for name, default, help_text in param_table
        }

        # Both entries run from the same container image and tag.
        image, tag = "flowcraft/mapping-patlas", "1.6.0-1"
        self.directives = {
            "mappingBowtie": dict(
                container=image,
                version=tag,
                cpus=1,
                memory="{ 4.GB * task.attempt }",
                scratch="true",
            ),
            "jsonDumpingMapping": dict(
                container=image,
                version=tag,
                cpus=1,
                memory="'4GB'",
            ),
        }

        self.status_channels = ["mappingBowtie", "jsonDumpingMapping"]

        consensus_channels = ["mappingOutputChannel"]
        self.compiler["patlas_consensus"] = consensus_channels


================================================
FILE: flowcraft/generator/components/phylogeny.py
================================================
try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class Raxml(Process):
    """RAxML process template interface.

            This process is set with:

                - ``input_type``: align
                - ``output_type``: .tree

    Two directive entries are declared (``raxml`` and ``report_raxml``),
    both reported as status channels.
            """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "align", ".tree"

        # (parameter name, default value, help text), in declaration order.
        # All defaults are strings, including the numeric-looking ones.
        param_table = (
            ("substitutionModel", "'GTRGAMMA'",
             "Substitution model. Option: GTRCAT, GTRCATI, ASC_GTRCAT, "
             "GTRGAMMA, ASC_GTRGAMMA etc "),
            ("seedNumber", "12345",
             "Specify an integer number (random seed) and turn on rapid "
             "bootstrapping"),
            ("bootstrap", "500",
             "Specify the number of alternative runs on distinct starting "
             "trees"),
            ("simpleLabel", "true",
             "Simplify the labels in the newick tree (for interactive "
             "report only)"),
        )
        self.params = {
            name: {"default": default, "description": help_text}
            for name, default, help_text in param_table
        }

        # Both entries run from the same container image and tag.
        image, tag = "flowcraft/raxml", "8.2.11-2"
        self.directives = {
            "raxml": dict(
                container=image,
                version=tag,
                cpus=4,
                memory="{ 4.GB * task.attempt }",
            ),
            "report_raxml": dict(container=image, version=tag),
        }

        self.status_channels = ["raxml", "report_raxml"]


================================================
FILE: flowcraft/generator/components/reads_quality_control.py
================================================

try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class IntegrityCoverage(Process):
    """Process template interface for first integrity_coverage process

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: fastq

    It contains two **secondary channel link starts**:

        - ``SIDE_phred``: Phred score of the FastQ files
        - ``SIDE_max_len``: Maximum read length
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fastq"
        self.output_type = "fastq"

        # Adjacent string literals are concatenated verbatim, so every
        # fragment except the last must end with a space.
        self.params = {
            "genomeSize": {
                "default": 1,
                "description":
                    "Genome size estimate for the samples in Mb. It is used to "
                    "estimate the coverage and other assembly parameters and "
                    "checks"
            },
            "minCoverage": {
                "default": 0,
                "description":
                    "Minimum coverage for a sample to proceed. By default it's set "
                    "to 0 to allow any coverage"
            }
        }

        # Secondary channels made available by this process.
        self.link_start.extend(["SIDE_phred", "SIDE_max_len"])


class CheckCoverage(Process):
    """Process template interface for additional integrity_coverage process

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: fastq

    It contains one **secondary channel link start**:

        - ``SIDE_max_len``: Maximum read length

    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fastq"
        self.output_type = "fastq"

        # Adjacent string literals are concatenated verbatim, so every
        # fragment except the last must end with a space.
        self.params = {
            "genomeSize": {
                "default": 2.1,
                "description":
                    "Genome size estimate for the samples. It is used to "
                    "estimate the coverage and other assembly parameters and "
                    "checks"
            },
            "minCoverage": {
                "default": 15,
                "description":
                    "Minimum coverage for a sample to proceed. Can be set to "
                    "0 to allow any coverage"
            }
        }

        # Secondary channel made available by this process.
        self.link_start.extend(["SIDE_max_len"])


class TrueCoverage(Process):
    """TrueCoverage process template interface

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: fastq

    It exposes one parameter:

        - ``species``: Complete species name (genus and species);
          defaults to the string ``null``

    Resource and container directives are declared for the
    ``true_coverage`` nextflow process.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fastq"
        self.output_type = "fastq"

        # Adjacent string literals are concatenated verbatim, so the first
        # fragment needs its own trailing space.
        self.params = {
            "species": {
                "default": "null",
                "description":
                    "Species name. Must be the complete species name with "
                    "genus and species, e.g.: 'Yersinia enterocolitica'. "
            }
        }

        self.directives = {
            "true_coverage": {
                "cpus": 4,
                "memory": "'1GB'",
                "container": "flowcraft/true_coverage",
                "version": "3.2-1"
            }
        }


class Fastqc(Process):
    """FastQC process template interface

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: fastq

    It contains two **status channels**:

        - ``STATUS_fastqc2``: Status for the FastQC execution
        - ``STATUS_fastqc2_report``: Status for the FastQC report

    It exposes one parameter:

        - ``adapters``: Path to adapters files, if any

    Resource and container directives are declared for the ``fastqc2``
    nextflow process.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = self.output_type = "fastq"

        #: list: Status channels for FastQC execution and FastQC report
        self.status_channels = [
            "STATUS_fastqc2",
            "STATUS_fastqc2_report",
        ]

        adapters_param = {
            "default": "'None'",
            "description": "Path to adapters files, if any.",
        }
        self.params = {"adapters": adapters_param}

        fastqc_resources = dict(
            cpus=2,
            memory="'4GB'",
            container="flowcraft/fastqc",
            version="0.11.7-1",
        )
        self.directives = {"fastqc2": fastqc_resources}


class Trimmomatic(Process):
    """Trimmomatic process template interface

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: fastq

    It contains one **secondary channel link end**:

        - ``SIDE_phred`` (alias: ``SIDE_phred``): Receives FastQ phred score

    It declares one **dependency**:

        - ``integrity_coverage``

    Resource and container directives are declared for the ``trimmomatic``
    nextflow process.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fastq"
        self.output_type = "fastq"

        self.link_end.append({"link": "SIDE_phred", "alias": "SIDE_phred"})

        self.dependencies = ["integrity_coverage"]

        self.params = {
            "adapters": {
                "default": "'None'",
                "description":
                    "Path to adapters files, if any."
            },
            "trimSlidingWindow": {
                "default": "'5:20'",
                "description":
                    "Perform sliding window trimming, cutting once the "
                    "average quality within the window falls below a "
                    "threshold"
            },
            "trimLeading": {
                "default": "3",
                "description":
                    "Cut bases off the start of a read, if below a threshold "
                    "quality"
            },
            "trimTrailing": {
                "default": "3",
                "description":
                    "Cut bases off the end of a read, if below a "
                    "threshold quality"
            },
            "trimMinLength": {
                "default": "55",
                "description":
                    "Drop the read if it is below a specified length "
            },
            "clearInput": {
                "default": "false",
                "description":
                    "Permanently removes temporary input files. This option "
                    "is only useful to remove temporary files in large "
                    "workflows and prevents nextflow's resume functionality. "
                    "Use with caution."
            }
        }

        self.directives = {"trimmomatic": {
            "cpus": 2,
            "memory": "{ 4.GB * task.attempt }",
            "container": "flowcraft/trimmomatic",
            "version": "0.36-1"
        }}


class FastqcTrimmomatic(Process):
    """Fastqc + Trimmomatic process template interface

    This process executes FastQC only to inform the trim range for trimmomatic,
    not for QC checks.

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: fastq

    It contains one **secondary channel link end**:

        - ``SIDE_phred`` (alias: ``SIDE_phred``): Receives FastQ phred score

    It contains three **status channels**:

        - ``STATUS_fastqc``: Status for the fastqc process
        - ``STATUS_fastqc_report``: Status for the fastqc_report process
        - ``STATUS_trimmomatic``: Status for the trimmomatic process

    It declares one **dependency**:

        - ``integrity_coverage``

    Resource and container directives are declared for the ``fastqc`` and
    ``trimmomatic`` nextflow processes.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fastq"
        self.output_type = "fastq"

        self.link_end.append({"link": "SIDE_phred", "alias": "SIDE_phred"})

        self.status_channels = ["STATUS_fastqc", "STATUS_fastqc_report",
                                "STATUS_trimmomatic"]

        self.dependencies = ["integrity_coverage"]

        self.params = {
            "adapters": {
                "default": "'None'",
                "description":
                    "Path to adapters files, if any."
            },
            "trimSlidingWindow": {
                "default": "'5:20'",
                "description":
                    "Perform sliding window trimming, cutting once the "
                    "average quality within the window falls below a "
                    "threshold."
            },
            "trimLeading": {
                "default": "3",
                "description":
                    "Cut bases off the start of a read, if below a threshold "
                    "quality."
            },
            "trimTrailing": {
                "default": "3",
                "description":
                    "Cut bases off the end of a read, if below a "
                    "threshold quality."
            },
            "trimMinLength": {
                "default": "55",
                "description":
                    "Drop the read if it is below a specified length."
            },
            "clearInput": {
                "default": "false",
                "description":
                    "Permanently removes temporary input files. This option "
                    "is only useful to remove temporary files in large "
                    "workflows and prevents nextflow's resume functionality. "
                    "Use with caution."
            }
        }

        self.directives = {
            "fastqc": {
                "cpus": 2,
                "memory": "'4GB'",
                "container": "flowcraft/fastqc",
                "version": "0.11.7-1"
            },
            "trimmomatic": {
                "cpus": 2,
                "memory": "{ 4.GB * task.attempt }",
                "container": "flowcraft/trimmomatic",
                "version": "0.36-1"
            }
        }


class FilterPoly(Process):
    """PrinSeq process to filter non-informative sequences from reads

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: fastq

    It contains one **status channel**:

        - ``filter_poly``

    It exposes two parameters:

        - ``adapter``: Pattern sets used to filter the reads
        - ``clearInput``: Whether temporary input files are permanently
          removed

    Resource and container directives are declared for the ``filter_poly``
    nextflow process.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fastq"
        self.output_type = "fastq"

        # Adjacent string literals are concatenated verbatim, so each
        # fragment carries the space that separates it from the next one.
        self.params = {
            "adapter": {
                "default": "'A 50%; T 50%; N 50%'",
                "description":
                    "Pattern to filter the reads. Please separate parameter "
                    "values with a space and separate new parameter sets with"
                    " semicolon (;). Parameters are defined by two values: "
                    "the pattern (any combination of the letters ATCGN), and "
                    "the number of repeats or percentage of occurrence."
            },
            "clearInput": {
                "default": "false",
                "description":
                    "Permanently removes temporary input files. This option "
                    "is only useful to remove temporary files in large "
                    "workflows and prevents nextflow's resume functionality. "
                    "Use with caution."
            }
        }

        self.directives = {"filter_poly": {
            "cpus": 1,
            "memory": "{ 4.GB * task.attempt }",
            "container": "flowcraft/prinseq",
            "version": "0.20.4-1"
        }}

        self.status_channels = [
            "filter_poly"
        ]


class DownsampleFastq(Process):
    """Downsamples FastQ file based on depth using seqtk

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: fastq

    It contains one **status channel**:

        - ``downsample_fastq``

    It exposes four parameters:

        - ``genomeSize``: Genome size estimate in Mb (default: 1)
        - ``depth``: Maximum estimated depth coverage allowed (default: 100)
        - ``seed``: Seed number for seqtk (default: 100)
        - ``clearInput``: Whether temporary input files are permanently
          removed

    Resource and container directives are declared for the
    ``downsample_fastq`` nextflow process.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fastq"
        self.output_type = "fastq"

        # Adjacent string literals are concatenated verbatim, so each
        # fragment carries the space that separates it from the next one.
        self.params = {
            "genomeSize": {
                "default": 1,
                "description":
                    "Genome size estimate for the samples in Mb. It is used to"
                    " estimate the coverage"
            },
            "depth": {
                "default": 100,
                "description":
                    "Maximum estimated depth coverage allowed. FastQ with "
                    "higher estimated depth will be subsampled to this value."
            },
            "seed": {
                "default": 100,
                "description":
                    "The seed number for seqtk. By default it is 100 "
                    "and should be equal for both pairs of "
                    "reads."
            },
            "clearInput": {
                "default": "false",
                "description":
                    "Permanently removes temporary input files. This option "
                    "is only useful to remove temporary files in large "
                    "workflows and prevents nextflow's resume functionality. "
                    "Use with caution."
            }
        }

        self.directives = {"downsample_fastq": {
            "cpus": 1,
            "memory": "{ 4.GB * task.attempt }",
            "container": "flowcraft/seqtk",
            "version": "1.3.0-3"
        }}

        self.status_channels = [
            "downsample_fastq"
        ]


================================================
FILE: flowcraft/generator/components/typing.py
================================================
try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class SeqTyping(Process):
    """seq_typing process template interface

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: None
        - ``link_start``: None

    It exposes two parameters, both defaulting to the string ``null``:

        - ``referenceFileO``: Fasta file containing reference sequences
        - ``referenceFileH``: Fasta file containing reference sequences

    Resource and container directives are declared for the ``seq_typing``
    nextflow process.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fastq"
        self.output_type = None

        self.link_start = None

        self.directives = {"seq_typing": {
            "cpus": 4,
            "memory": "'4GB'",
            "container": "flowcraft/seq_typing",
            "version": "2.0-1"
        }}

        # Adjacent string literals are concatenated verbatim, so each
        # fragment carries the space that separates it from the next one.
        self.params = {
            "referenceFileO": {
                "default": "null",
                "description":
                    "Fasta file containing reference sequences. If more "
                    "than one file is passed via the 'referenceFileH' "
                    "parameter, a reference sequence for each file will be "
                    "determined. "
            },
            "referenceFileH": {
                "default": "null",
                "description":
                    "Fasta file containing reference sequences. If more "
                    "than one file is passed via the 'referenceFileO' "
                    "parameter, a reference sequence for each file will be "
                    "determined. "
            }
        }


class PathoTyping(Process):
    """patho_typing process template interface

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: None
        - ``ignore_type``: True
        - ``link_start``: None

    It contains one **secondary channel link end**:

        - ``MAIN_raw`` (alias: ``SIDE_PathoType_raw``)

    It exposes one parameter:

        - ``species``: Complete species name (genus and species);
          defaults to the string ``null``

    Resource and container directives are declared for the ``patho_typing``
    nextflow process.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fastq"
        self.output_type = None

        self.ignore_type = True

        # Adjacent string literals are concatenated verbatim, so the first
        # fragment needs its own trailing space.
        self.params = {
            "species": {
                "default": "null",
                "description":
                    "Species name. Must be the complete species name with "
                    "genus and species, e.g.: 'Yersinia enterocolitica'. "
            }
        }

        self.link_start = None
        self.link_end.append({"link": "MAIN_raw",
                              "alias": "SIDE_PathoType_raw"})

        self.directives = {"patho_typing": {
            "cpus": 4,
            "memory": "'4GB'",
            "container": "flowcraft/patho_typing",
            "version": "0.3.0-1"
        }}


class Sistr(Process):
    """sistr process template interface

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: None

    Resource and container directives are declared for the ``sistr``
    nextflow process.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fasta", None

        sistr_resources = dict(
            cpus=4,
            memory="'4GB'",
            container="ummidock/sistr_cmd",
            version="1.0.2",
        )
        self.directives = {"sistr": sistr_resources}


class Momps(Process):
    """momps process template interface

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: None

    It contains one **secondary channel link end**:

        - ``__fastq`` (alias: ``_LAST_fastq``)

    It exposes one parameter:

        - ``clearInput``: Whether temporary input files are permanently
          removed

    Resource and container directives are declared for the ``momps``
    nextflow process.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fasta", None

        fastq_link = {"link": "__fastq", "alias": "_LAST_fastq"}
        self.link_end.append(fastq_link)

        clear_input_help = (
            "Permanently removes temporary input files. This option "
            "is only useful to remove temporary files in large "
            "workflows and prevents nextflow's resume functionality. "
            "Use with caution."
        )
        self.params = {
            "clearInput": {
                "default": "false",
                "description": clear_input_help,
            },
        }

        momps_resources = dict(
            cpus=3,
            memory="'4GB'",
            container="flowcraft/momps",
            version="0.1.1-1",
        )
        self.directives = {"momps": momps_resources}


class DengueTyping(Process):
    """Dengue typing process template interface

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: fasta

    It contains one **secondary channel link end**:

        - ``__fastq`` (alias: ``_LAST_fastq``)

    It contains one **secondary channel link start**:

        - ``_ref_seqTyping``

    It contains two **status channels**:

        - ``dengue_typing_assembly``
        - ``dengue_typing_reads``

    It exposes two parameters:

        - ``reference``: Typing database
        - ``get_genome``: Retrieves the sequence of the closest reference

    Resource and container directives are declared for the
    ``dengue_typing_assembly`` and ``dengue_typing_reads`` nextflow
    processes.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = self.output_type = "fasta"

        fastq_link = {"link": "__fastq", "alias": "_LAST_fastq"}
        self.link_end.append(fastq_link)

        self.link_start.append("_ref_seqTyping")

        reference_param = {
            "default": "ref/DENV_TYPING_DB_V2.fasta",
            "description": "Typing database.",
        }
        get_genome_param = {
            "default": "true",
            "description": "Retrieves the sequence of the closest reference.",
        }
        self.params = {
            "reference": reference_param,
            "get_genome": get_genome_param,
        }

        # The two nextflow processes use different container images
        assembly_resources = dict(
            cpus=4,
            memory="'1GB'",
            container="flowcraft/seq_typing",
            version="2.0-1",
        )
        reads_resources = dict(
            cpus=4,
            memory="{ 5.GB * task.attempt }",
            container="ummidock/seq_typing",
            version="2.2-02",
        )
        self.directives = {
            "dengue_typing_assembly": assembly_resources,
            "dengue_typing_reads": reads_resources,
        }

        self.status_channels = list(self.directives)


class Seroba(Process):
    """Serotyping of Streptococcus pneumoniae sequencing data

    This process is set with:

        - ``input_type``: fastq
        - ``output_type``: None

    It exposes one parameter:

        - ``coverage``: Threshold for k-mer coverage of the reference
          sequence (default: "20")

    Resource and container directives are declared for the ``seroba``
    nextflow process.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type, self.output_type = "fastq", None

        coverage_param = {
            "default": "20",
            "description":
                "Threshold for k-mer coverage of the reference sequence "
                "(default = 20)",
        }
        self.params = {"coverage": coverage_param}

        seroba_resources = dict(
            cpus=3,
            memory="'4GB'",
            container="sangerpathogens/seroba",
            version="latest",
        )
        self.directives = {"seroba": seroba_resources}

================================================
FILE: flowcraft/generator/components/variant_calling.py
================================================
try:
    from generator.process import Process
except ImportError:
    from flowcraft.generator.process import Process


class Haplotypecaller(Process):
    """Call germline SNPs and indels via local re-assembly of haplotypes

    This process is set with:

        - ``input_type``: bam

    The ``output_type`` attribute is not assigned by this class.

    It contains two **status channels**:

        - ``haplotypecaller``
        - ``merge_vcfs``

    It exposes two parameters, both defaulting to the string ``null``:

        - ``reference``: Reference genome provided to GATK HaplotypeCaller
        - ``intervals``: Interval list file with the regions to call
          variants

    Resource and container directives are declared for the
    ``haplotypecaller`` and ``merge_vcfs`` nextflow processes.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "bam"

        reference_param = {
            "default": "null",
            "description":
                "Specifies the reference genome to be provided "
                "to GATK HaplotypeCaller.",
        }
        intervals_param = {
            "default": "null",
            "description":
                "Interval list file to specify the regions to call "
                "variants.",
        }
        self.params = {
            "reference": reference_param,
            "intervals": intervals_param,
        }

        gatk_image = "broadinstitute/gatk"
        self.directives = {
            "haplotypecaller": dict(
                container=gatk_image,
                memory="{2.Gb*task.attempt}",
                cpus=4,
            ),
            "merge_vcfs": dict(
                container=gatk_image,
                memory="{5.Gb*task.attempt}",
                cpus=4,
            ),
        }

        self.status_channels = ["haplotypecaller", "merge_vcfs"]

================================================
FILE: flowcraft/generator/engine.py
================================================
import os
import sys
import json
import jinja2
import shutil
import logging
import requests

from collections import defaultdict
from os.path import dirname, join, abspath, split, splitext, exists, basename


logger = logging.getLogger("main.{}".format(__name__))

try:
    import generator.process as pc
    import generator.error_handling as eh
    from __init__ import __version__
    from generator import header_skeleton as hs
    from generator import footer_skeleton as fs
    from generator.process_details import colored_print
    from generator.pipeline_parser import guess_process
except ImportError:
    import flowcraft.generator.process as pc
    import flowcraft.generator.error_handling as eh
    from flowcraft import __version__
    from flowcraft.generator import header_skeleton as hs
    from flowcraft.generator import footer_skeleton as fs
    from flowcraft.generator.process_details import colored_print
    from flowcraft.generator.pipeline_parser import guess_process


class NextflowGenerator:

    def __init__(self, process_connections, nextflow_file, process_map,
                 pipeline_name="flowcraft", ignore_dependencies=False,
                 auto_dependency=True, merge_params=True, export_params=False):
        """Initialise the generator state and build the process connections

        Parameters
        ----------
        process_connections : list
            Connection dictionaries between processes. They are forwarded
            to :func:`NextflowGenerator._build_connections`.
        nextflow_file : str
            Path to file where the pipeline will be generated.
        process_map : dict
            Maps the nextflow template name to the corresponding Process
            class of the component.
        pipeline_name : str
            Name of the pipeline, for customization and help purposes.
        ignore_dependencies : bool
            Forwarded to :func:`NextflowGenerator._build_connections`.
        auto_dependency : bool
            Forwarded to :func:`NextflowGenerator._build_connections`,
            unless ``export_params`` is set, in which case ``False`` is
            forwarded instead.
        merge_params : bool
            Determines whether the params of the pipeline should be merged
            or kept unique and specific to each component.
        export_params : bool
            Determines whether the build mode is only for the export of
            parameters in JSON format.
        """

        #: dict: Maps the nextflow template name to the corresponding
        #: Process class of the component.
        self.process_map = process_map

        #: list: Stores the process interfaces in the specified order. It
        #: starts with the special init process, which will handle the forks
        #: of the raw input channels and secondary inputs.
        self.processes = [pc.Init(template="init")]

        #: dict: A dictionary with the fork tree of the pipeline, which
        #: consists of the paths of each lane. For instance, a single fork
        #: with two sinks is represented as: {1: [2,3]}. Subsequent forks
        #: are then added sequentially: {1:[2,3], 2:[3,4,5]}. This allows
        #: the path upstream of a process in a given lane to be traversed
        #: until the start of the pipeline.
        self._fork_tree = defaultdict(list)

        #: int: Stores the number of lanes in the pipelines
        self.lanes = 0

        #: bool: Determines whether the build mode is only for the export of
        #: parameters in JSON format. Setting to True will disable some
        #: checks, such as component dependency requirements.
        self.export_parameters = export_params

        # The auto dependency feature is switched off automatically when
        # only exporting parameters
        auto_deps = False if self.export_parameters else auto_dependency

        # Parse the process_connections dictionaries into the
        # self.processes attribute list. This call relies on the attributes
        # assigned above, so it must stay after them.
        self._build_connections(
            process_connections, ignore_dependencies, auto_deps)

        #: str: Path to file where the pipeline will be generated
        self.nf_file = nextflow_file

        #: str: Name of the pipeline, for customization and help purposes.
        self.pipeline_name = pipeline_name

        #: str: String that will harbour the pipeline code
        self.template = ""

        #: dict: Stores secondary channel links
        self.secondary_channels = {}

        #: dict: Stores the main raw inputs from the user parameters into
        #: the first process(es).
        self.main_raw_inputs = {}

        #: bool: Determines whether the params of the pipeline should be
        #: merged (i.e., the same param name in multiple components is
        #: merged into one) or if they should be unique and specific to
        #: each component.
        self.merge_params = merge_params

        #: dict: Initialised empty (no description was provided for this
        #: attribute).
        self.extra_inputs = {}

        #: list: Stores the status channels from each process
        self.status_channels = []

        #: list: Stores the Process classes that should be skipped when
        #: iterating over the :attr:`~NextflowGenerator.processes` list.
        self.skip_class = [pc.Compiler]

        #: str: Stores the resource directives string for each nextflow
        #: process. See :func:`NextflowGenerator._get_resources_string`.
        self.resources = ""

        #: str: Stores the container directives string for each nextflow
        #: process. See :func:`NextflowGenerator._get_container_string`.
        self.containers = ""

        #: str: Stores the params directives string for the nextflow
        #: pipeline. See :func:`NextflowGenerator._get_params_string`
        self.params = ""

        #: str: Stores the configuration for the nextflow pipeline.
        #: See :func:`NextflowGenerator._get_config_string`
        self.config = ""

        #: str: Stores the user configuration file placeholder. This is an
        #: empty configuration file that is only added the first time to a
        #: project directory. If the file already exists, it will not
        #: overwrite it.
        self.user_config = ""

        #: dict: Maps the information about each available compiler process
        #: in flowcraft. The key of each entry is the name/signature of the
        #: compiler process. The value is a json/dict object that contains
        #: two key:pair values:
        #:
        #:     - ``cls``: The reference to the compiler class object.
        #:     - ``template``: The nextflow template file of the process.
        self.compilers = {
            "patlas_consensus": {
                "cls": pc.PatlasConsensus,
                "template": "patlas_consensus",
            },
        }


    @staticmethod
    def _parse_process_name(name_str):
        """Parses the process string and returns the process name and its
        directives

        Process strings my contain directive information with the following
        syntax::

            proc_name={'directive':'val'}

        This method parses this string and returns the process name as a
        string and the directives information as a dictionary.

        Parameters
        ----------
        name_str : str
            Raw string with process name and, potentially, directive
            information

        Returns
        -------
        str
            Process name
        dict or None
            Process directives
        """

        directives = None

        fields = name_str.split("=")
        process_name = fields[0]

        if len(fields) == 2:
            _directives = fields[1].replace("'", '"')
            try:
                directives = json.loads(_directives)
            except json.decoder.JSONDecodeError:
                raise eh.ProcessError(
                    "Could not parse directives for process '{}'. The raw"
                    " string is: {}\n"
                    "Possible causes include:\n"
                    "\t1. Spaces inside directives\n"
                    "\t2. Missing '=' symbol before directives\n"
                    "\t3. Missing quotes (' or \") around directives\n"
                    "A valid example: process_name={{'cpus':'2'}}".format(
                        process_name, name_str))

        return process_name, directives

    def _build_connections(self, process_list, ignore_dependencies,
                           auto_dependency):
        """Parses the process connections dictionaries into a process list

        This method is called upon instantiation of the NextflowGenerator
        class. Essentially, it sets the main input/output channel names of the
        processes so that they can be linked correctly. Each output process
        is instantiated from ``self.process_map`` and appended to
        ``self.processes``; forks are recorded in ``self._fork_tree`` and
        ``self.lanes`` is raised to the highest output lane seen.

        The connection between two consecutive processes is checked with
        ``_test_connection`` (presumably rejecting mismatched input/output
        types; see that method for the exact behaviour).

        The program exits with status 1 when an output process name is not
        in ``self.process_map``, or when a dependency is missing and cannot
        be added automatically (unless ``self.export_parameters`` is set).

        Parameters
        ----------
        process_list : list
            Connection dictionaries. Each one is read through the
            ``["input"]["lane"]`` and ``["output"]["lane"]`` keys here, and
            is also handed to ``_get_process_names``.
        ignore_dependencies : bool
            When True, the dependencies declared by each process are not
            checked.
        auto_dependency : bool
            When True, a missing dependency is added through
            ``_add_dependency`` instead of being reported as an error.
        """

        logger.debug("=============================")
        logger.debug("Building pipeline connections")
        logger.debug("=============================")

        logger.debug("Processing connections: {}".format(process_list))

        # 'p' is the position of the connection in the list; it doubles as
        # the unique process id used in the channel suffixes below
        for p, con in enumerate(process_list):

            logger.debug("Processing connection '{}': {}".format(p, con))

            # Get lanes
            in_lane = con["input"]["lane"]
            out_lane = con["output"]["lane"]
            logger.debug("[{}] Input lane: {}".format(p, in_lane))
            logger.debug("[{}] Output lane: {}".format(p, out_lane))

            # Update the total number of lanes of the pipeline
            if out_lane > self.lanes:
                self.lanes = out_lane

            # Get process names and directives for the output process
            p_in_name, p_out_name, out_directives = self._get_process_names(
                con, p)

            # Check if process is available or correctly named
            if p_out_name not in self.process_map:
                logger.error(colored_print(
                    "\nThe process '{}' is not available."
                        .format(p_out_name), "red_bold"))
                guess_process(p_out_name, self.process_map)
                sys.exit(1)

            # Instance output process
            out_process = self.process_map[p_out_name](template=p_out_name)

            # Update directives, if provided
            if out_directives:
                out_process.update_attributes(out_directives)

            # Set suffix strings for main input/output channels. Suffixes are
            # based on the lane and the arbitrary and unique process id
            # e.g.: 'process_1_1'
            input_suf = "{}_{}".format(in_lane, p)
            output_suf = "{}_{}".format(out_lane, p)
            logger.debug("[{}] Setting main channels with input suffix '{}'"
                         " and output suffix '{}'".format(
                            p, input_suf, output_suf))
            out_process.set_main_channel_names(input_suf, output_suf, out_lane)

            # Instance input process, if it exists. In case of init, the
            # output process forks from the raw input user data
            if p_in_name != "__init__":
                # Create instance of input process
                in_process = self.process_map[p_in_name](template=p_in_name)
                # Test if two processes can be connected by input/output types
                logger.debug("[{}] Testing connection between input and "
                             "output processes".format(p))
                self._test_connection(in_process, out_process)
                out_process.parent_lane = in_lane
            else:
                # When the input process is __init__, set the parent_lane
                # to None. This will tell the engine that this process
                # will receive the main input from the raw user input.
                out_process.parent_lane = None
            logger.debug("[{}] Parent lane: {}".format(
                p, out_process.parent_lane))

            # If the current connection is a fork, add it to the fork tree
            if in_lane != out_lane:
                logger.debug("[{}] Connection is a fork. Adding lanes to "
                             "fork list".format(p))
                self._fork_tree[in_lane].append(out_lane)
                # Update main output fork of parent process
                try:
                    parent_process = [
                        x for x in self.processes if x.lane == in_lane and
                        x.template == p_in_name
                    ][0]
                    logger.debug(
                        "[{}] Updating main forks of parent fork '{}' with"
                        " '{}'".format(p, parent_process,
                                       out_process.input_channel))
                    parent_process.update_main_forks(out_process.input_channel)
                except IndexError:
                    # No already-registered process matches the input lane
                    # and template, so there is no parent fork to update
                    pass
            else:
                # Get parent process, naive version
                parent_process = self.processes[-1]

                # Check if the last process' lane matches the lane of the
                # current output process. If not, get the last process
                # in the same lane
                if parent_process.lane and parent_process.lane != out_lane:
                    parent_process = [x for x in self.processes[::-1]
                                      if x.lane == out_lane][0]

                if parent_process.output_channel:
                    logger.debug(
                        "[{}] Updating input channel of output process"
                        " with '{}'".format(
                            p, parent_process.output_channel))
                    out_process.input_channel = parent_process.output_channel

            # Check for process dependencies
            if out_process.dependencies and not ignore_dependencies:
                logger.debug("[{}] Dependencies found for process '{}': "
                             "{}".format(p, p_out_name,
                                         out_process.dependencies))
                parent_lanes = self._get_fork_tree(out_lane)
                for dep in out_process.dependencies:
                    if not self._search_tree_backwards(dep, parent_lanes):
                        if auto_dependency:
                            self._add_dependency(
                                out_process, dep, in_lane, out_lane, p)
                        elif not self.export_parameters:
                            logger.error(colored_print(
                                "\nThe following dependency of the process"
                                " '{}' is missing: {}".format(p_out_name, dep),
                                "red_bold"))
                            sys.exit(1)

            # The process is registered only after its dependency check
            self.processes.append(out_process)

        logger.debug("Completed connections: {}".format(self.processes))
        logger.debug("Fork tree: {}".format(self._fork_tree))

    def _get_process_names(self, con, pid):
        """Extracts the process names and output directives of a connection

        The raw ``process`` strings of the ``input`` and ``output`` entries
        of the connection are parsed with
        :func:`NextflowGenerator._parse_process_name`. Directives attached
        to the input process string are discarded. If either string cannot
        be parsed, the error is logged and the program exits with status 1.

        Parameters
        ----------
        con : dict
            Dictionary with the connection information between two processes.
        pid : int
            Process ID, used only to prefix the debug messages.

        Returns
        -------
        input_name : str
            Name of the input process
        output_name : str
            Name of the output process
        output_directives : dict
            Parsed directives from the output process
        """

        parse = self._parse_process_name

        try:
            p_in_name, _ = parse(con["input"]["process"])
            logger.debug("[{}] Input channel: {}".format(pid, p_in_name))
            p_out_name, out_directives = parse(con["output"]["process"])
            logger.debug("[{}] Output channel: {}".format(pid, p_out_name))
        except eh.ProcessError as ex:
            # Raised when the process name/directives cannot be parsed
            logger.error(colored_print(ex.value, "red_bold"))
            sys.exit(1)
        else:
            return p_in_name, p_out_name, out_directives

    def _add_dependency(self, p, template, inlane, outlane, pid):
        """Automatically Adds a dependency of a process.

        This method adds a template to the process list attribute as a
        dependency. It will adapt the input lane, output lane and process
        id of the process that depends on it.

        Parameters
        ----------
        p : Process
            Process class that contains the dependency.
        template : str
            Template name of the dependency.
        inlane : int
            Input lane.
        outlane : int
            Output lane.
        pid : int
            Process ID.
        """

        dependency_proc = self.process_map[template](template=template)

        if dependency_proc.input_type != p.input_type:
            logger.error("Cannot automatically add dependency with different"
                         " input type. Input type of process '{}' is '{}."
                         " Input type of dependency '{}' is '{}'".format(
                            p.template, p.input_type, template,
                            dependency_proc.input_type))

        input_suf = "{}_{}_dep".format(inlane, pid)
        output_suf = "{}_{}_dep".format(outlane, pid)
        dependency_proc.set_main_channel_names(input_suf, output_suf, outlane)

        # To insert the dependency process before the current process, we'll
        # need to move the input channel name of the later to the former, and
        # set a new connection between the dependency and the process.
        dependency_proc.input_channel = p.input_channel
        p.input_channel = dependency_proc.output_channel

        # If the current process was the first in the pipeline, change the
        # lanes so that the dependency becomes the first process
        if not p.parent_lane:
            p.parent_lane = outlane
            dependency_proc.parent_lane = None
        else:
            dependency_proc.parent_lane = inlane
            p.parent_lane = outlane

        self.processes.append(dependency_proc)

    def _search_tree_backwards(self, template, parent_lanes):
        """Searches the process tree backwards in search of a provided process

        The search takes into consideration the provided parent lanes and
        searches only those

        Parameters
        ----------
        template : str
            Name of the process template attribute being searched
        parent_lanes : list
            List of integers with the parent lanes to be searched

        Returns
        -------
        bool
            Returns True when the template is found. Otherwise returns False.
        """

        for p in self.processes[::-1]:

            # Ignore process in different lanes
            if p.lane not in parent_lanes:
                continue

            # template found
            if p.template == template:
                return True

        return False

    @staticmethod
    def _test_connection(parent_process, child_process):
        """Tests if two processes can be connected by input/output type

        Parameters
        ----------
        parent_process : flowcraft.Process.Process
            Process that will be sending output.
        child_process : flowcraft.Process.Process
            Process that will receive output.

        """

        # If any of the processes has an ignore type attribute set to True,
        # don't perform the check
        if parent_process.ignore_type or child_process.ignore_type:
            return

        if parent_process.output_type != child_process.input_type:
            logger.error(
                "The output of the '{}' process ({}) cannot link with the "
                "input of the '{}' process ({}). Please check the order of "
                "the processes".format(parent_process.template,
                                       parent_process.output_type,
                                       child_process.template,
                                       child_process.input_type))
            sys.exit(1)

    def _build_header(self):
        """Appends the header skeleton (``hs.header``) to the master
        template string.
        """

        for message in ("===============",
                        "Building header",
                        "==============="):
            logger.debug(message)

        self.template += hs.header

    def _build_footer(self):
        """Appends the footer skeleton (``fs.footer``) to the master
        template string.
        """

        logger.debug("===============")
        logger.debug("Building footer")
        logger.debug("===============")
        self.template += fs.footer

    def _update_raw_input(self, p, sink_channel=None, input_type=None):
        """Registers the raw input channel of a process in the
        ``main_raw_inputs`` attribute.

        The raw input information is fetched from the process with
        ``p.get_user_channel`` and stored under the input type. The first
        process registering a given input type creates the entry; the
        following ones only add their input channel to the ``raw_forks``
        list of that entry. The channel and input type used can be overridden
        with the `sink_channel` and `input_type` arguments.

        Parameters
        ----------
        p : flowcraft.Process.Process
            Process instance whose raw input will be registered
        sink_channel: str
            Sets the channel where the raw input will fork into. It overrides
            the process's `input_channel` attribute.
        input_type: str
            Sets the type of the raw input. It overrides the process's
            `input_type` attribute.
        """

        raw_type = input_type or p.input_type
        raw_sink = sink_channel or p.input_channel

        logger.debug("[{}] Setting raw input channel "
                     "with input type '{}'".format(p.template, raw_type))
        # Dictionary with the raw forking information for this input
        raw_in = p.get_user_channel(raw_sink, raw_type)
        logger.debug("[{}] Fetched process raw user: {}".format(p.template,
                                                                raw_in))

        if raw_type not in self.main_raw_inputs:
            # First time this input type is seen: create the entry with the
            # channel definition and this process as the only fork
            channel_name = raw_in["channel"]
            checks = raw_in["checks"].format(raw_in["params"])
            definition = raw_in["channel_str"].format(raw_in["params"])
            self.main_raw_inputs[raw_type] = {
                "channel": channel_name,
                "channel_str": "{}\n{} = {}".format(
                    checks, channel_name, definition),
                "raw_forks": [raw_in["input_channel"]]
            }
        else:
            self.main_raw_inputs[raw_type]["raw_forks"].append(
                raw_in["input_channel"])

        logger.debug("[{}] Updated main raw inputs: {}".format(
            p.template, self.main_raw_inputs))

    def _update_extra_inputs(self, p):
        """Given a process, this method updates the ``extra_inputs``
        attribute with the extra input of that process, if it has one.

        The extra input parameter name is the process ``extra_input``
        attribute, or the process ``input_type`` when ``extra_input`` is set
        to ``"default"``. A destination channel named
        ``EXTRA_<template>_<pid>`` is registered under that parameter and
        mixed into the main input of the process.

        The program exits with status 1 when:

            - ``extra_input`` is ``"default"`` but the input type is already
              a main raw input of the pipeline.
            - The parameter was already registered with a different input
              type.

        Parameters
        ----------
        p : flowcraft.Process.Process
        """

        if not p.extra_input:
            return

        logger.debug("[{}] Found extra input: {}".format(
            p.template, p.extra_input))

        if p.extra_input == "default":
            # The default param can only be used when the input type is not
            # already present in the main raw inputs
            if p.input_type in self.main_raw_inputs:
                logger.error(colored_print(
                    "\nThe default input param '{}' of the process '{}'"
                    " is already specified as a main input parameter of"
                    " the pipeline. Please choose a different extra_input"
                    " name.".format(p.input_type, p.template), "red_bold"))
                sys.exit(1)
            param = p.input_type
        else:
            param = p.extra_input

        dest_channel = "EXTRA_{}_{}".format(p.template, p.pid)

        if param not in self.extra_inputs:
            self.extra_inputs[param] = {
                "input_type": p.input_type,
                "channels": [dest_channel]
            }
        else:
            if self.extra_inputs[param]["input_type"] != p.input_type:
                # Report the conflicting parameter name, which is not
                # necessarily the input type of the process
                logger.error(colored_print(
                    "\nThe extra_input parameter '{}' for process"
                    " '{}' was already defined with a different "
                    "input type '{}'. Please choose a different "
                    "extra_input name.".format(
                        param, p.template,
                        self.extra_inputs[param]["input_type"]),
                    "red_bold"))
                sys.exit(1)
            self.extra_inputs[param]["channels"].append(dest_channel)

        logger.debug("[{}] Added extra channel '{}' linked to param: '{}' "
                     "".format(p.template, dest_channel, param))
        p.update_main_input(
            "{}.mix({})".format(p.input_channel, dest_channel)
        )

    def _get_fork_tree(self, lane):
        """

        Parameters
        ----------
        p

        Returns
        -------
        """

        parent_lanes = [lane]

        while True:
            original_lane = lane
            for fork_in, fork_out in self._fork_tree.items():
                if lane in fork_out:
                    lane = fork_in
                    parent_lanes.append(fork_in)
            if lane == original_lane:
                break

        return parent_lanes

    def _set_implicit_link(self, p, link):
        """

        Parameters
        ----------
        p
        link

        Returns
        -------

        """

        output_type = link["link"].lstrip("_")
        parent_forks = self._get_fork_tree(p.lane)
        fork_sink = "{}_{}".format(link["alias"], p.pid)

        for proc in self.processes[::-1]:
            if proc.lane not in parent_forks:
                continue
            if proc.output_type == output_type:
                proc.update_main_forks(fork_sink)
                logger.debug("[{}] Found special implicit link '{}' with "
                             "output type '{}'. Linked '{}' with process "
                             "{}".format(
                                     p.template, link["link"], output_type,
                                     link["alias"], proc))
                return

        self._update_raw_input(p, fork_sink, output_type)

    def _update_secondary_channels(self, p):
        """Given a process, this method updates the ``secondary_channels``
        attribute with the secondary links started and received by it.

        The rationale of the secondary channels is the following:

            - Secondary emitting channels are stored first, by checking the
              `link_start` list attribute of the process. Each link name is
              recorded in the secondary channels dictionary under the lane
              of the process, replacing any previous start of the same link
              in that lane.
            - Secondary receiving channels are handled next, by checking the
              `link_end` list attribute. If the link name starts with the
              `__` signature, an implicit link is created (see
              ``_set_implicit_link``). Otherwise, if a link start with the
              same name was recorded in the lane of the process or in one of
              the lanes returned by ``_get_fork_tree``, the
              ``<alias>_<pid>`` channel is added to the ends of that link.
              Link ends without a recorded link start are skipped.

        Parameters
        ----------
        p : flowcraft.Process.Process
        """

        # Register the secondary side channels started by this process
        if p.link_start:
            logger.debug("[{}] Found secondary link start: {}".format(
                p.template, p.link_start))
            for started in p.link_start:
                # If there are multiple link starts in the same lane, the
                # last one is the only one saved.
                self.secondary_channels.setdefault(started, {})[p.lane] = {
                    "p": p, "end": []}

        # Register this process as a receiver of the secondary side channels
        # it consumes
        if p.link_end:
            logger.debug("[{}] Found secondary link end: {}".format(
                p.template, p.link_end))

            # Lanes from the parent forks, where link starts are searched
            parent_forks = self._get_fork_tree(p.lane)

            for received in p.link_end:
                link_name = received["link"]

                # Special case where the secondary channel links with the
                # main output of the specified type
                if link_name.startswith("__"):
                    self._set_implicit_link(p, received)
                    continue

                # No link start was recorded for this link
                if link_name not in self.secondary_channels:
                    continue

                starts_by_lane = self.secondary_channels[link_name]
                for lane in parent_forks:
                    if lane in starts_by_lane:
                        starts_by_lane[lane]["end"].append(
                            "{}_{}".format(received["alias"], p.pid))

        logger.debug("[{}] Secondary links updated: {}".format(
            p.template, self.secondary_channels))

    def _set_channels(self):
        """Sets the main channels for the pipeline

        This method iterates over the ``processes`` attribute and performs
        the following tasks for each process, using its position in the list
        as the process id:

            - Sets the main channels of the process by calling its
              ``set_channels`` method with that id.
            - When the process has no parent lane and has an input type,
              registers its raw input so that it forks from the user
              provided parameters (See
              :func:`~NextflowGenerator._update_raw_input`).
            - Registers the extra inputs of the process (See
              :func:`~NextflowGenerator._update_extra_inputs`).
            - Registers the secondary channels started or received by the
              process (See
              :func:`~NextflowGenerator._update_secondary_channels`).

        Notes
        -----
        **On the secondary channel setup**: Processes are handled in list
        order, so a secondary link end can only be connected to link starts
        that were registered by processes handled before it. Only one link
        start is kept for each link name and lane; a later start in the same
        lane replaces the previous one.
        """

        for banner_line in ("=====================",
                            "Setting main channels",
                            "====================="):
            logger.debug(banner_line)

        for pid, proc in enumerate(self.processes):

            logger.debug("[{}] Setting main channels with pid: {}".format(
                proc.template, pid))
            proc.set_channels(pid=pid)

            logger.debug("{} {} {}".format(proc.parent_lane, proc.input_type, proc.template))
            # Processes without a parent lane read from the user input
            starts_lane = not proc.parent_lane
            if starts_lane and proc.input_type:
                self._update_raw_input(proc)

            self._update_extra_inputs(proc)
            self._update_secondary_channels(proc)

            logger.info(colored_print(
                "\tChannels set for {} \u2713".format(proc.template)))

    def _set_init_process(self):
        """Sets the main raw inputs and extra inputs on the init process

        The first element of the ``processes`` attribute is taken as the
        init process. It receives the ``main_raw_inputs`` attribute through
        its ``set_raw_inputs`` method and the ``extra_inputs`` attribute
        through its ``set_extra_inputs`` method.
        """

        for banner_line in ("========================",
                            "Setting secondary inputs",
                            "========================"):
            logger.debug(banner_line)

        # The init process is the first one in the process list
        init_process = self.processes[0]

        logger.debug("Setting main raw inputs: "
                     "{}".format(self.main_raw_inputs))
        init_process.set_raw_inputs(self.main_raw_inputs)

        logger.debug("Setting extra inputs: {}".format(self.extra_inputs))
        init_process.set_extra_inputs(self.extra_inputs)

    def _set_secondary_channels(self):
        """Sets the secondary channels for the pipeline

        Iterates over the ``secondary_channels`` dictionary, which maps each
        link name to the link starts recorded per lane. For every link start
        that has at least one receiving channel, the ``set_secondary_channel``
        method of the emitting process is called with the link name and the
        list of receiving channels. Link starts without receivers are only
        reported in the debug log.
        """

        for banner_line in ("==========================",
                            "Setting secondary channels",
                            "=========================="):
            logger.debug(banner_line)

        logger.debug("Setting secondary channels: {}".format(
            self.secondary_channels))

        for source, lanes in self.secondary_channels.items():

            for link_info in lanes.values():

                sinks = link_info["end"]
                emitter = link_info["p"]

                if sinks:
                    logger.debug("[{}] Setting secondary links for "
                                 "source {}: {}".format(emitter.template,
                                                        source,
                                                        sinks))
                    emitter.set_secondary_channel(source, sinks)
                else:
                    logger.debug("[{}] No secondary links to setup".format(
                        emitter.template))

    def _set_compiler_channels(self):
        """Wrapper method that calls functions related to compiler channels
        """

        self._set_status_channels()
        self._set_general_compilers()

    def _set_general_compilers(self):
        """Adds compiler channels to the :attr:`processes` attribute.

        This method will iterate over the pipeline's processes and check
        if any process is feeding channels to a compiler process. If so, that
        compiler process is added to the pipeline and those channels are
        linked to the compiler via some operator.
        """

        for c, c_info in self.compilers.items():

            # Instantiate compiler class object and set empty channel list
            compiler_cls = c_info["cls"](template=c_info["template"])
            c_info["channels"] = []

            for p in self.processes:
                if not any([isinstance(p, x) for x in self.skip_class]):
                    # Check if process has channels to feed to a compiler
                    if c in p.compiler:
                        # Correct channel names according to the pid of the
                        # process
                        channels = ["{}_{}".format(i, p.pid) for i in
                                    p.compiler[c]]
                        c_info["channels"].extend(channels)

            # If one ore more channels were detected, establish connections
            # and append compiler to the process list.
            if c_info["channels"]:
                compiler_cls.set_compiler_channels(c_info["channels"],
                                                   operator="join")
                self.processes.append(compiler_cls)

    def _set_status_channels(self):
        """Compiles all status channels for the status compiler process

        The ``status_strs`` of every process that is not an instance of a
        class in ``skip_class`` are collected. When there are none, nothing
        is added to the pipeline. Otherwise, the status channels are set on a
        status compiler, the matching report channels (same names with the
        ``STATUS_`` prefix replaced by ``REPORT_``) are set on a report
        compiler, and both compilers are appended to the process list.

        Raises
        ------
        eh.ProcessError
            When the same status channel is provided more than once.
        """

        status_inst = pc.StatusCompiler(template="status_compiler")
        report_inst = pc.ReportCompiler(template="report_compiler")

        # Compile status channels from pipeline process
        status_channels = []
        for p in self.processes:
            if not any(isinstance(p, x) for x in self.skip_class):
                status_channels.extend(p.status_strs)

        if not status_channels:
            logger.debug("No status channels found. Skipping status compiler"
                         " process")
            return

        logger.debug("Setting status channels: {}".format(status_channels))

        # Check for duplicate channels. Raise exception if found.
        if len(status_channels) != len(set(status_channels)):
            raise eh.ProcessError(
                "Duplicate status channels detected. Please ensure that "
                "the 'status_channels' attributes of each process are "
                "unique. Here are the status channels:\n\n{}".format(
                    ", ".join(status_channels)
                ))

        status_inst.set_compiler_channels(status_channels)

        # Remove the prefix explicitly. str.lstrip("STATUS_") would strip any
        # leading run of the characters S, T, A, U and _, which also eats the
        # start of names such as "STATUS_TRIM_1".
        prefix = "STATUS_"
        report_channels = [
            "REPORT_{}".format(x[len(prefix):] if x.startswith(prefix) else x)
            for x in status_channels]

        report_inst.set_compiler_channels(report_channels)

        self.processes.extend([status_inst, report_inst])

    @staticmethod
    def _get_resources_string(res_dict, pid):
        """ Returns the nextflow resources string from a dictionary object

        If the dictionary has at least on of the resource directives, these
        will be compiled for each process in the dictionary and returned
        as a string read for injection in the nextflow config file template.

        This dictionary should be::

            dict = {"processA": {"cpus": 1, "memory": "4GB"},
                    "processB": {"cpus": 2}}

        Parameters
        ----------
        res_dict : dict
            Dictionary with the resources for processes.
        pid : int
            Unique identified of the process

        Returns
        -------
        str
            nextflow config string
        """

        config_str = ""
        ignore_directives = ["container", "version"]

        for p, directives in res_dict.items():

            for d, val in directives.items():

                if d in ignore_directives:
                    continue

                config_str += '\n\t${}_{}.{} = {}'.format(p, pid, d, val)

        return config_str

    @staticmethod
    def _get_container_string(cont_dict, pid):
        """ Returns the nextflow containers string from a dictionary object

        If the dictionary has at least on of the container directives, these
        will be compiled for each process in the dictionary and returned
        as a string read for injection in the nextflow config file template.

        This dictionary should be::

            dict = {"processA": {"container": "asd", "version": "1.0.0"},
                    "processB": {"container": "dsd"}}

        Parameters
        ----------
        cont_dict : dict
            Dictionary with the containers for processes.
        pid : int
            Unique identified of the process

        Returns
        -------
        str
            nextflow config string
        """

        config_str = ""

        for p, directives in cont_dict.items():

            container = ""

            if "container" in directives:
                container += directives["container"]

                if "version" in directives:
                    container += ":{}".format(directives["version"])
                else:
                    container += ":latest"

            if container:
                config_str += '\n\t${}_{}.container = "{}"'.format(p, pid, container)

        return config_str

    def _get_params_string(self):
        """Builds the nextflow params string with one entry per process
        parameter.

        The ``params`` attribute of each process is expected to map the
        parameter name to a dictionary with, at least, a ``default`` key::

            p.params = {
                "genomeSize": {"default": 2.1, ...},
                "minCoverage": {"default": 15, ...}
            }

        The default values are added to the string as they are. For
        instance, a ``2.1`` float will appear as ``param = 2.1`` and a
        ``"'teste'"`` string will appear as ``param = 'teste'`` (Note the
        quotes inside the string).

        The parameters of the ``init`` process keep their names. For the
        remaining processes, the parameter names are suffixed with the
        process pid, ``set_param_id`` is called on the process with that
        suffix, and the parameters are preceded by a comment header with the
        component name.

        Returns
        -------
        str
            Nextflow params configuration string
        """

        pieces = []

        for proc in self.processes:

            logger.debug("[{}] Adding parameters: {}\n".format(
                proc.template, proc.params)
            )

            # Components other than init get a comment header that
            # structures the params configuration
            if proc.params and proc.template != "init":

                proc.set_param_id("_{}".format(proc.pid))
                underline = "-" * (len(proc.template) + len(proc.pid) + 12)
                pieces.extend([
                    "\n\t/*",
                    "\n\tComponent '{}_{}'\n".format(proc.template, proc.pid),
                    "\t{}\n".format(underline),
                    "\t*/\n",
                ])

            for param, val in proc.params.items():

                param_id = (param if proc.template == "init"
                            else "{}_{}".format(param, proc.pid))
                pieces.append(
                    "\t{} = {}\n".format(param_id, val["default"]))

        return "".join(pieces)

    def _get_merged_params_string(self):
        """Returns the merged nextflow params string for all processes.

        The ``params`` attribute of each process is expected to map the
        parameter name to a dictionary with, at least, a ``default`` key::

            p.params = {
                "genomeSize": {"default": 2.1, ...},
                "minCoverage": {"default": 15, ...}
            }

        The default values are added to the string as they are. For
        instance, a ``2.1`` float will appear as ``param = 2.1`` and a
        ``"'teste'"`` string will appear as ``param = 'teste'`` (Note the
        quotes inside the string).

        Identical parameters in multiple processes will be merged into the
        same param. When they have different defaults, the default of the
        last process in the pipeline is the one that is kept.

        Returns
        -------
        str
            Nextflow params configuration string
        """

        # NOTE: this method used to be defined twice in a row. The first
        # definition was a copy of the non-merged implementation and was
        # always replaced by the second one, so only the latter is kept.
        params_temp = {}

        for p in self.processes:

            logger.debug("[{}] Adding parameters: {}".format(p.template,
                                                             p.params))
            for param, val in p.params.items():

                params_temp[param] = val["default"]

        config_str = "\n\t" + "\n\t".join([
            "{} = {}".format(param, val) for param, val in params_temp.items()
        ])

        return config_str

    def _get_params_help(self):

        help_list = []

        for p in self.processes:

            # Skip init process
            if p.template == "init":
                for param, val in p.params.items():
                    help_list.append("--{:25} {} (default: {})".format(
                        param, val["description"],
                        str(val["default"]).replace('"', "'")))
                continue

            # Add component header and a line break
            if p.params:
                help_list.extend(
                    ["",
                     "Component '{}_{}'".format(p.template.upper(), p.pid),
                     "-" * (len(p.template) + len(p.pid) + 13)])

            for param, val in p.params.items():
                help_list.append("--{:<25} {} (default: {})".format(
                    param + "_" + p.pid, val["description"],
                    str(val["default"]).replace('"', "'")))

        return help_list

    def _get_merged_params_help(self):
        """

        Returns
        -------

        """

        help_dict = {}
        help_list = []

        for p in self.processes:

            for param, val in p.params.items():

                if param in help_dict:
                    help_dict[param]["process"].append(p.template)
                else:
                    tpl = [p.template] if p.template != "init" else []
                    help_dict[param] = {"process": tpl,
                                        "description": val["description"]}

        # Transform process list into final template string
        for p, val in help_dict.items():
            if not val["process"]:
                val["process"] = ""
            else:
                val["process"] = "({})".format(";".join(val["process"]))
            help_list.append("--{:<25} {} {}".format(
                p, val["description"], val["process"]))

        return help_list

    @staticmethod
    def _render_config(template, context):
        """Renders a template from the ``templates`` directory with jinja2.

        The ``templates`` directory is the one located next to this file.

        Parameters
        ----------
        template : str
            Path of the template file, relative to the ``templates``
            directory.
        context : dict
            Variables made available to the template.

        Returns
        -------
        str
            Rendered template.
        """

        templates_dir = join(dirname(abspath(__file__)), "templates")
        search_dir, tpl_name = split(join(templates_dir, template))

        environment = jinja2.Environment(
            loader=jinja2.FileSystemLoader(search_dir or "./"))

        return environment.get_template(tpl_name).render(context)

    def _set_configurations(self):
        """Populates the nextflow configuration attributes of the pipeline.

        The params string and the help lines are built in their merged or
        per-process versions, according to the ``merge_params`` attribute.
        The resources and containers strings are compiled from the directives
        of each process. These are then rendered into the ``resources``,
        ``containers``, ``params``, ``config``, ``help`` and ``user_config``
        attributes using the corresponding templates.
        """

        for banner_line in ("======================",
                            "Setting configurations",
                            "======================"):
            logger.debug(banner_line)

        if self.merge_params:
            params = self._get_merged_params_string()
            help_list = self._get_merged_params_help()
        else:
            params = self._get_params_string()
            help_list = self._get_params_help()

        resource_parts = []
        container_parts = []

        for p in self.processes:

            # Only processes with directives contribute to the
            # resources and containers configuration
            if p.directives:
                logger.debug("[{}] Adding directives: {}".format(
                    p.template, p.directives))
                resource_parts.append(
                    self._get_resources_string(p.directives, p.pid))
                container_parts.append(
                    self._get_container_string(p.directives, p.pid))

        resources = "".join(resource_parts)
        containers = "".join(container_parts)

        self.resources = self._render_config(
            "resources.config", {"process_info": resources})
        self.containers = self._render_config(
            "containers.config", {"container_info": containers})
        self.params = self._render_config(
            "params.config", {"params_info": params})
        self.config = self._render_config(
            "nextflow.config", {"pipeline_name": self.pipeline_name,
                                "nf_file": self.nf_file})

        help_context = {
            "nf_file": basename(self.nf_file),
            "help_list": help_list,
            "version": __version__,
            "pipeline_name": " ".join([x.upper() for x in self.pipeline_name])
        }
        self.help = self._render_config("Helper.groovy", help_context)
        self.user_config = self._render_config("user.config", {})

    def dag_to_file(self, dict_viz, output_file=".treeDag.json"):
        """Writes dag to output file

        Parameters
        ----------
        dict_viz: dict
            Tree like dictionary that is used to export tree data of processes
            to html file and here for the dotfile .treeDag.json
        output_file: str
            Name of the JSON file, which is created in the directory of the
            pipeline file (:attr:`nf_file`). Defaults to ``.treeDag.json``.

        """

        dag_path = os.path.join(dirname(self.nf_file), output_file)

        # The context manager closes the handle even when the serialization
        # or the write operation fails
        with open(dag_path, "w") as outfile_dag:
            outfile_dag.write(json.dumps(dict_viz))

    def render_pipeline(self):
        """Builds the tree representation of the pipeline and renders it.

        A tree-like dictionary with every process (except the first one)
        is assembled, written to the DAG dotfile through
        :func:`dag_to_file` and rendered with the ``pipeline_graph.html``
        template. The fork tree is also written to the ``.forkTree.json``
        dotfile in the directory of the pipeline file.

        Returns
        -------
        str
            The rendered ``pipeline_graph.html`` template.
        """

        tree = {
            "name": "root",
            "children": []
        }
        # Maps each lane to the children list of the last node added to it
        lane_tails = {}

        fork_tree = self._fork_tree or {1: [1]}

        for idx, (source_lane, sink_lanes) in enumerate(fork_tree.items()):
            # The first fork also covers its source lane, while the
            # remaining forks only cover their sink lanes
            lanes = [source_lane] + sink_lanes if idx == 0 else sink_lanes

            for proc in self.processes[1:]:

                if proc.lane not in lanes:
                    continue

                # Processes without a parent lane hang from the root
                siblings = lane_tails[proc.parent_lane] \
                    if proc.parent_lane else tree["children"]

                # Concatenate the directives into a single tooltip string
                pieces = []
                for name, settings in proc.directives.items():
                    pieces.append(name)
                    for key in settings:
                        try:
                            value = settings[key]
                            # Remove quotes from string directives
                            if isinstance(value, str):
                                value = value.replace("'", "").replace(
                                    '"', '')
                            pieces.append("{}: {}".format(key, value))
                        except KeyError:
                            pass

                node = {
                    "name": "{}_{}".format(proc.template, proc.pid),
                    "process": {
                        "pid": proc.pid,
                        "input": proc.input_type,
                        "output": proc.output_type or "None",
                        "lane": proc.lane,
                        "directives": "".join(pieces) or "N/A",
                    },
                    "children": []
                }

                siblings.append(node)
                lane_tails[proc.lane] = node["children"]

        # Write the tree to the DAG dotfile
        self.dag_to_file(tree)

        # Write tree forking information for dotfile
        fork_path = os.path.join(dirname(self.nf_file), ".forkTree.json")
        with open(fork_path, "w") as fh:
            fh.write(json.dumps(self._fork_tree))

        # Send the tree to the html resource through jinja
        return self._render_config("pipeline_graph.html", {"data": tree})

    def write_configs(self, project_root):
        """Writes the configuration files and the DAG of the pipeline.

        The resources, containers, params and nextflow configuration
        files are always (re)written in the project directory. The user
        configuration file is only written when it does not exist yet.
        The help file is written to the ``lib`` sub-directory and the
        rendered pipeline DAG is written next to the pipeline file, with
        the ``.html`` extension.

        Parameters
        ----------
        project_root : str
            Path to the directory where the files will be written.
        """

        def dump(path, content):
            with open(path, "w") as fh:
                fh.write(content)

        # Configuration files that are overwritten on every build
        regenerated = (
            ("resources.config", self.resources),
            ("containers.config", self.containers),
            ("params.config", self.params),
            ("nextflow.config", self.config),
        )
        for config_name, content in regenerated:
            dump(join(project_root, config_name), content)

        # An existing user config is never overwritten
        user_config_path = join(project_root, "user.config")
        if not exists(user_config_path):
            dump(user_config_path, self.user_config)

        lib_dir = join(project_root, "lib")
        if not exists(lib_dir):
            os.makedirs(lib_dir)
        dump(join(lib_dir, "Helper.groovy"), self.help)

        # Generate the pipeline DAG
        dump(splitext(self.nf_file)[0] + ".html", self.render_pipeline())

    def export_params(self):
        """Writes the params of every component to stdout as JSON.

        The first (init) process is skipped. The remaining processes are
        mapped from their template name to their params dictionary, and
        the resulting mapping is serialized to stdout.
        """

        params_json = {
            proc.template: proc.params for proc in self.processes[1:]
        }

        # Flush params json to stdout
        sys.stdout.write(json.dumps(params_json))

    def export_directives(self):
        """Writes the directives of every component to stdout as JSON.

        The first (init) process is skipped. The remaining processes are
        mapped from their template name to their directives dictionary,
        and the resulting mapping is serialized to stdout.
        """

        directives_json = {
            proc.template: proc.directives for proc in self.processes[1:]
        }

        # Flush directives json to stdout
        sys.stdout.write(json.dumps(directives_json))

    def fetch_docker_tags(self):
        """
        Export all dockerhub tags associated with each component given by
        the -t flag.

        For each component, the container repository and default version
        are read from its directives and DockerHub is queried for the tags
        of that repository. The result is written to stdout as a table with
        three columns (component, container and tag), in which the tag that
        matches the default version is marked with an asterisk.
        """

        # dict to store the already parsed components (useful when forks are
        # given to the pipeline string via -t flag
        dict_of_parsed = {}

        # fetches terminal width and subtracts 3 because we always add a
        # new line character and we want a space at the beginning and at the
        # end of each line
        terminal_width = shutil.get_terminal_size().columns - 3

        # width of each of the three columns of the table. It is the same
        # for every row, so it is computed only once
        final_width = [
            int(terminal_width / 4),
            int(terminal_width / 2),
            int(terminal_width / 4)
        ]

        # first header
        center_string = " Selected container tags "

        # starts a list with the headers
        tags_list = [
            [
                "=" * final_width[0],
                "{0}{1}{0}".format(
                    "=" * int(((terminal_width / 2 - len(center_string)) / 2)),
                    center_string),
                "{}\n".format("=" * final_width[2])
            ],
            ["component", "container", "tags"],
            ["=" * width for width in final_width]
        ]

        # Skip first init process and iterate through the others
        for p in self.processes[1:]:
            template = p.template
            # if component has already been printed then skip and don't print
            # again
            if template in dict_of_parsed:
                continue

            # starts a list of containers for the current process in
            # dict_of_parsed, in which each container will be added to this
            # list once it gets parsed
            dict_of_parsed[template] = {
                "container": []
            }
            parsed_containers = dict_of_parsed[template]["container"]

            # fetch repo name from directives of each component.
            for directives in p.directives.values():
                try:
                    repo = directives["container"]
                    default_version = directives["version"]
                except KeyError:
                    # adds the default container if container key isn't present
                    # this happens for instance in integrity_coverage
                    repo = "flowcraft/flowcraft_base"
                    default_version = "1.0.0-1"

                # skips containers that were already queried for the
                # current component
                repo_version = repo + default_version
                if repo_version in parsed_containers:
                    continue
                parsed_containers.append(repo_version)

                # make the request to docker hub
                r = requests.get(
                    "https://hub.docker.com/v2/repositories/{}/tags/"
                    .format(repo)
                )
                # only a successful (200) response carries the tag listing.
                # Any other status (missing repository, rate limiting,
                # server errors) is reported as having no tags instead of
                # attempting to parse an error payload
                if r.status_code == 200:
                    # parse response content to dict and fetch results key
                    r_content = json.loads(r.content)["results"]
                    for version in r_content:
                        printed_version = (version["name"] + "*") \
                            if version["name"] == default_version \
                            else version["name"]
                        tags_list.append([template, repo, printed_version])
                else:
                    tags_list.append([template, repo, "No DockerHub tags"])

        # iterate through each entry in tags_list and print the list of tags
        # for each component. Each entry (excluding the headers) contains
        # 3 elements (component name, container and tag version)
        for x, entry in enumerate(tags_list):
            # adds a different color to the three header rows and alternates
            # between two colors for odd and even rows (different
            # background)
            color = "blue_bold" if x < 3 else \
                ("white" if x % 2 != 0 else "0;37;40m")
            # writes the string to the stdout. The column widths are given
            # to the string formatting as the 3, 4 and 5 element
            sys.stdout.write(
                colored_print("\n {0: <{3}} {1: ^{4}} {2: >{5}}".format(
                    *entry, *final_width), color)
            )
        # assures that the entire line gets the same color
        sys.stdout.write("\n{0: >{1}}\n".format("(* = default)",
                                                terminal_width + 3))

    def build(self):
        """Main pipeline builder

        Assembles the nextflow code of the pipeline in the
        :py:attr:`NextflowGenerator.template` attribute and writes it to
        the pipeline file, together with the configuration files.

        The header is built first, followed by the main channels, the
        init process, the secondary channels, the compiler channels and
        the configurations. The template of each process is then appended,
        followed by the footer, and the result is written to disk.
        """

        connected_msg = (
            "\tSuccessfully connected {} process(es) with {} "
            "fork(s) across {} lane(s) \u2713")
        logger.info(colored_print(connected_msg.format(
            len(self.processes[1:]), len(self._fork_tree), self.lanes)))

        # Generate regular nextflow header that sets up the shebang, imports
        # and all possible initial channels
        self._build_header()
        self._set_channels()
        self._set_init_process()
        self._set_secondary_channels()

        secondary_msg = "\tSuccessfully set {} secondary channel(s) \u2713"
        logger.info(colored_print(
            secondary_msg.format(len(self.secondary_channels))))

        self._set_compiler_channels()
        self._set_configurations()

        logger.info(colored_print("\tFinished configurations \u2713"))

        # Append the code of every process, each preceded by a line break
        self.template += "".join(
            "\n{}".format(proc.template_str) for proc in self.processes)

        self._build_footer()

        # Write configs in the directory of the pipeline file
        self.write_configs(dirname(self.nf_file))

        # Write pipeline file
        with open(self.nf_file, "w") as fh:
            fh.write(self.template)

        written_msg = "\tPipeline written into {} \u2713"
        logger.info(colored_print(written_msg.format(self.nf_file)))


================================================
FILE: flowcraft/generator/error_handling.py
================================================
class ProcessError(Exception):
    """Exception that keeps the provided value in the ``value`` attribute.

    The string form of the exception is the ``repr`` of that value.
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return "{!r}".format(self.value)


class SanityError(Exception):
    """
    Custom error for sanity checks. The provided value, prefixed with
    "inSANITY ERROR: ", is stored in the ``value`` attribute.
    """
    def __init__(self, value):
        self.value = "inSANITY ERROR: " + format(value)


class InspectionError(Exception):
    """Custom error whose ``value`` attribute stores the provided value
    prefixed with "Inspection ERROR: ".
    """
    def __init__(self, value):
        self.value = "Inspection ERROR: " + format(value)


class ReportError(Exception):
    """Custom error whose ``value`` attribute stores the provided value
    prefixed with "Reports ERROR: ".
    """
    def __init__(self, value):
        self.value = "Reports ERROR: " + format(value)


class RecipeError(Exception):
    """Custom error whose ``value`` attribute stores the provided value
    prefixed with "Recipe ERROR: ".
    """
    def __init__(self, value):
        self.value = "Recipe ERROR: " + format(value)

class LogError(Exception):
    """Custom error whose ``value`` attribute stores the provided value
    prefixed with "Log ERROR: ".
    """
    def __init__(self, value):
        self.value = "Log ERROR: " + format(value)


================================================
FILE: flowcraft/generator/footer_skeleton.py
================================================
# Nextflow (Groovy) snippet with the workflow event handlers: onComplete logs
# the completion time, duration, success flag and exit status of the run,
# and onError logs the error message that stopped the execution.
footer = """
workflow.onComplete {
  // Display complete message
  log.info "Completed at: " + workflow.complete
  log.info "Duration    : " + workflow.duration
  log.info "Success     : " + workflow.success
  log.info "Exit status : " + workflow.exitStatus
}

workflow.onError {
  // Display error message
  log.info "Workflow execution stopped with the following message:"
  log.info "  " + workflow.errorMessage
}
"""

================================================
FILE: flowcraft/generator/header_skeleton.py
================================================
# Nextflow (Groovy) snippet with the nextflow shebang and the helper imports.
# It sets the version string, handles the help parameter (prints the help
# and exits), counts the inputs given through the fastq, fasta and
# accessions parameters, and prints the start-up information and metadata.
header = """#!/usr/bin/env nextflow

import Helper
import CollectInitialMetadata

// Pipeline version
if (workflow.commitId){
    version = "0.1 $workflow.revision"
} else {
    version = "0.1 (local version)"
}

params.help = false
if (params.help){
    Help.print_help(params)
    exit 0
}

def infoMap = [:]
if (params.containsKey("fastq")){
    infoMap.put("fastq", file(params.fastq).size())
}
if (params.containsKey("fasta")){
    if (file(params.fasta) instanceof LinkedList){
        infoMap.put("fasta", file(params.fasta).size())
    } else {
        infoMap.put("fasta", 1) 
    }
}
if (params.containsKey("accessions")){
    // checks if params.accessions is different from null
    if (params.accessions) {
        BufferedReader reader = new BufferedReader(new FileReader(params.accessions));
        int lines = 0;
        while (reader.readLine() != null) lines++;
        reader.close();
        infoMap.put("accessions", lines)
    }
}

Help.start_info(infoMap, "$workflow.start", "$workflow.profile")
CollectInitialMetadata.print_metadata(workflow)
    """

================================================
FILE: flowcraft/generator/inspect.py
================================================
import re
import os
import sys
import uuid
import time
import curses
import signal
import locale
import socket
import logging
import hashlib
import requests
import json

from pympler import asizeof
from os.path import join, abspath
from time import gmtime, strftime, sleep
from collections import defaultdict, OrderedDict

try:
    import generator.error_handling as eh
    from generator.process_details import colored_print
    from generator.utils import get_nextflow_filepath
except ImportError:
    import flowcraft.generator.error_handling as eh
    from flowcraft.generator.process_details import colored_print
    from flowcraft.generator.utils import get_nextflow_filepath

# Set the locale of all categories to the user's default setting
locale.setlocale(locale.LC_ALL, "")
# Preferred text encoding reported by the locale module
# NOTE(review): presumably used when writing to the curses screen - confirm
code = locale.getpreferredencoding()

# Module logger, registered as a child of the "main" logger
logger = logging.getLogger("main.{}".format(__name__))


def signal_handler(screen):
    """Leaves the inspection cleanly when the SIGINT signal is received.

    When a curses screen is active, it is cleared and the terminal
    settings changed by curses are restored. A goodbye message is then
    printed and the program exits with code 0.

    Parameters
    ----------
    screen : curses window or None
        The active curses screen, if any.
    """

    if screen:
        # Wipe the contents of the curses window
        screen.clear()
        screen.refresh()

        # Restore the terminal settings and end the curses session
        curses.nocbreak()
        screen.keypad(0)
        curses.echo()
        curses.endwin()

    print("Exiting flowcraft inspection... Bye")
    raise SystemExit(0)


class NextflowInspector:

    MAX_RETRIES = 1000
    """
    int: Number of retries for parsing trace and log files. Only exit with non-0
    error code after these retries.
    """

    def __init__(self, trace_file, refresh_rate, pretty=False, ip_addr=None):

        self.trace_file = trace_file
        """
        str: Path to nextflow trace file.
        """

        self.trace_sizestamp = None
        """
        str: Stores the sizestamp of the last modification of the trace file.
        This is used to parse the file only when it has changed.
        """

        self.refresh_rate = refresh_rate
        """
        float: Frequency (in seconds) that the curses screen will be refreshed.
        """

        self.stored_ids = []
        """
        list: Stores the task_ids that have already been parsed. It is used
        to skip them when parsing the trace files multiple times.
        """

        self.stored_log_ids = []
        """
        list: Stores the time stamps of the log file lines that were already
        parsed. It is used to skip parsing the log files multilpe times
        """

        self.trace_info = defaultdict(list)
        """
        dict: Main object that stores the status information for each process
        name in the trace file.
        """

        self.process_stats = {}
        """
        dict: Contains some statistics for each process.
        """

        self.processes = OrderedDict()
        """
        dict: Dictionary of processes from the pipeline with the status of the
        channel as the value. This information is retrieved from the
        .nextflow.log file in the :func:`_parser_pipeline_processes` method
        and updated in the :func:`_update_barrier_status` and
        :func:`_update_process_stats` and :func:`_update_submission_status`.
        """

        self.process_tags = {}
        """
        dict: Dictionary of processes with summary information for each tag
        it processes
        """

        self.samples = []
        """
        list: List of samples inferred from the pipeline.
        """

        self.skip_processes = ["status", "compile_status", "report",
                               "compile_reports", "fullConsensus",
                               "compile_status_buffer"]
        """
        list: List of special processes that should be skipped for inspection
        purposes.
        """

        self.log_file = ".nextflow.log"
        """
        str: Name of the nextflow log file.
        """

        self.log_sizestamp = None
        """
        str: Stores the sizestamp of the last modification of the nextflow
        log file. This is used to parse the file only when it has changed.
        """

        self.pipeline_tag = ""
        """
        str: Tag of the pipeline, parsed from .nextflow.log
        """

        self.log_retry = 0
        """
        int: Each time the log file is not found, this counter is
        increased. Only when it matches the :attr:`MAX_RETRIES` attribute
        does it raises a FileNotFoundError.
        """

        self.trace_retry = 0
        """
        int: Each time the log file is not found, this counter is 
        increased. Only when it matches the :attr:`MAX_RETRIES` attribute
        does it raises a FileNotFoundError.
        """

        self.pipeline_name = ""
        """
        str: Name of the nextflow pipeline file.
        """

        self.time_start = None
        """
        datetime.time object with the starting time of the pipeline.
        """

        self.time_stop = None
        """
        datetime.time object with the finish time of the pipeline. This
        attribute is only set when the pipeline is not running.
        """

        self.workdir = os.getcwd()
        """
        str: Path to the pipeline work directory
        """

        self.execution_command = None
        """
        str: The command used to execute the pipeline
        """

        self.nextflow_version = None
        """
        str: Nextflow's version string, as retrieved from the log file.
        """

        self.run_status = ""
        """
        str: Status of the pipeline. Can be either 'running', 'aborted',
        'error', 'complete'.
        """

        self.abort_cause = None
        """
        str or None: When :attr:`run_status` is "aborted", this attribute
        will contain the reason provided in the nextflow log. When this
        attribute is not None, it will also trigger the sending of the
        final lines of the nextflow log to broadcast.
        """

        if not ip_addr:
            self.app_address = "http://www.flowcraft.live:80/"
        else:
            self.app_address = ip_addr
            """
            str: Address of flowcraft web app
            """

        self.broadcast_address = "{}inspect/api/status".format(
            self.app_address)
        """
        str: Address of the REST api where the information will be sent
        """

        self._c = 0
        """
        Counter of payloads sent, for debug purposes
        """

        self.send = True
        """
        boolean: This attribute will be set to False after sending a request
        and set to True when there is a change in the inspection attributes.
        """

        # Skip these process names (they are check with the startswith()
        # method) when using the --pretty option
        if pretty:
            self._blacklist = [
                "report_coverage_", "fastqc2_report", "compile_fastqc_status2",
                "fastqc_report", "trim_report", "compile_fastqc_status",
                "report_corrupt_", "jsonDumpingMapping", "compile_mlst_",
                "mashOutputJson_", "mashDistOutputJson_", "pilon_report_",
                "compile_pilon_report"
            ]
        else:
            self._blacklist = []

        # CURSES ATTRIBUTES
        # Init curses screen
        self.screen = None
        self.top_line = 0
        self.padding = 0
        self.screen_lines = None
        self.max_width = 0
        self.content_lines = 0

        # Checks if nextflow log and trace files are available
        self._check_required_files()
        # Gathers the complete list of processes from the nextflow log
        self._get_pipeline_processes()
        # Fetches the pipeline status from the nextflow log
        self._update_pipeline_status()

        # Bind SIGINT to singal_handler function. This makes a clean exit
        # from the curses interface when exiting through ctrl+c.
        signal.signal(signal.SIGINT, lambda *x: signal_handler(self.screen))

    #################
    # UTILITY METHODS
    #################

    def _check_required_files(self):
        """Checks whetner the trace and log files are available
        """

        if not os.path.exists(self.trace_file):
            raise eh.InspectionError("The provided trace file could not be "
                                     "opened: {}".format(self.trace_file))

        if not os.path.exists(self.log_file):
            raise eh.InspectionError("The .nextflow.log files could not be "
                                     "opened. Are you sure you are in a "
                                     "nextflow project directory?")

    @staticmethod
    def _header_mapping(header):
        """Parses the trace file header and retrieves the positions of each
        column key.

        Parameters
        ----------
        header : str
            The header line of nextflow's trace file

        Returns
        -------
        dict
            Mapping the column ID to its position (e.g.: {"tag":2})
        """

        return dict(
            (x.strip(), pos) for pos, x in enumerate(header.split("\t"))
        )

    @staticmethod
    def _expand_path(hash_str):
        """Expands the hash string of a process (ae/1dasjdm) into a full
        working directory

        Parameters
        ----------
        hash_str : str
            Nextflow process hash with the beggining of the work directory

        Returns
        -------
        str
            Path to working directory of the hash string
        """

        try:
            first_hash, second_hash = hash_str.split("/")
            first_hash_path = join(abspath("work"), first_hash)

            for l in os.listdir(first_hash_path):
                if l.startswith(second_hash):
                    return join(first_hash_path, l)
        except FileNotFoundError:
            return None

    @staticmethod
    def _hms(s):
        """Converts a hms string into seconds.

        Parameters
        ----------
        s : str
            The hms string can be something like '20s', '1m30s' or '300ms'.

        Returns
        -------
        float
            Time in seconds.

        """

        if s == "-":
            return 0

        if s.endswith("ms"):
            return float(s.rstrip("ms")) / 1000

        fields = list(map(float, re.split("[dhms]", s)[:-1]))
        if len(fields) == 4:
            return fields[0] * 24 * 3600 + fields[1] * 3600 + fields[2] * 60 +\
                fields[3]
        if len(fields) == 3:
            return fields[0] * 3600 + fields[1] * 60 + fields[2]
        elif len(fields) == 2:
            return fields[0] * 60 + fields[1]
        else:
            return fields[0]

    @staticmethod
    def _size_coverter(s):
        """Converts size string into megabytes

        Parameters
        ----------
        s : str
            The size string can be '30KB', '20MB' or '1GB'

        Returns
        -------
        float
            With the size in bytes

        """

        if s.upper().endswith("KB"):
            return float(s.rstrip("KB")) / 1024

        elif s.upper().endswith(" B"):
            return float(s.rstrip("B")) / 1024 / 1024

        elif s.upper().endswith("MB"):
            return float(s.rstrip("MB"))

        elif s.upper().endswith("GB"):
            return float(s.rstrip("GB")) * 1024

        elif s.upper().endswith("TB"):
            return float(s.rstrip("TB")) * 1024 * 1024

        else:
            return float(s)

    @staticmethod
    def _size_compress(s):
        """Shortens a megabytes string.
        """

        if s / 1024 > 1:
            return "{}GB".format(round(s / 1024, 1))
        else:
            return "{}MB".format(s)

    #########################
    # AUXILIARY PARSE METHODS
    #########################

    def _get_pipeline_processes(self):
        """Parses the .nextflow.log file and retrieves the complete list
        of processes

        This method searches for specific signatures at the beginning of the
        .nextflow.log file::

             Apr-19 19:07:32.660 [main] DEBUG nextflow.processor
             TaskProcessor - Creating operator > report_corrupt_1_1 --
             maxForks: 4

        When a line with the .*Creating operator.* signature is found, the
        process name is retrieved and populates the :attr:`processes` attribute
        """

        with open(self.log_file) as fh:

            for line in fh:
                if re.match(".*Creating operator.*", line):
                    # Retrieves the process name from the string
                    match = re.match(".*Creating operator > (.*) --", line)
                    process = match.group(1)

                    if any([process.startswith(x) for x in self._blacklist]):
                        continue

                    if process not in self.skip_processes:
                        self.processes[match.group(1)] = {
                            "barrier": "W",
                            "submitted": set(),
                            "finished": set(),
                            "failed": set(),
                            "retry": set(),
                            "cpus": None,
                            "memory": None
                        }
                        self.process_tags[process] = {}

                # Retrieves the pipeline name from the string
                if re.match(".*Launching `.*` \[.*\] ", line):
                    tag_match = re.match(".*Launching `.*` \[(.*)\] ", line)
                    self.pipeline_tag = tag_match.group(1) if tag_match else \
                        "?"
                    name_match = re.match(".*Launching `(.*)` \[.*\] ", line)
                    self.pipeline_name = name_match.group(1) if name_match \
                        else "?"

        self.content_lines = len(self.processes)

    def _clear_inspect(self):
        """Clears inspect attributes when re-executing a pipeline"""

        self.trace_info = defaultdict(list)
        self.process_tags = {}
        self.process_stats = {}
        self.samples = []
        self.stored_ids = []
        self.stored_log_ids = []
        self.time_start = None
        self.time_stop = None
        self.execution_command = None
        self.nextflow_version = None
        self.abort_cause = None
        self._c = 0
        # Clean up of tag running status
        for p in self.processes.values():
            p["barrier"] = "W"
            for i in ["submitted", "finished", "failed", "retry"]:
                p[i] = set()

    def _update_pipeline_status(self):
        """Parses the .nextflow.log file for signatures of pipeline status.
        It sets the :attr:`status_info` attribute.
        """

        with open(self.log_file) as fh:

            try:
                first_line = next(fh)
            except:
                raise eh.InspectionError("Could not read .nextflow.log file. Is file empty?")
            time_str = " ".join(first_line.split()[:2])
            self.time_start = time_str

            if not self.execution_command:
                try:
                    self.execution_command = re.match(
                        ".*nextflow run (.*)", first_line).group(1)
                except AttributeError:
                    self.execution_command = "Unknown"

            for line in fh:

                if "DEBUG nextflow.cli.CmdRun" in line:
                    if not self.nextflow_version:
                        try:
                            vline = next(fh)
                            self.nextflow_version = re.match(
                                ".*Version: (.*)", vline).group(1)
                        except AttributeError:
                            self.nextflow_version = "Unknown"

                if "Session aborted" in line:
                    self.run_status = "aborted"
                    # Get abort cause
                    try:
                        self.abort_cause = re.match(
                            ".*Cause: (.*)", line).group(1)
                    except AttributeError:
                        self.abort_cause = "Unknown"
                    # Get time of pipeline stop
                    time_str = " ".join(line.split()[:2])
                    self.time_stop = time_str
                    self.send = True
                    return
                if "Execution complete -- Goodbye" in line:
                    self.run_status = "complete"
                    # Get time of pipeline stop
                    time_str = " ".join(line.split()[:2])
                    self.time_stop = time_str
                    self.send = True
                    return

        if self.run_status not in ["running", ""]:
            self._clear_inspect()
            # Take a break to allow nextflow to restart before refreshing
            # pipeine processes
            sleep(5)
            self._get_pipeline_processes()

        self.run_status = "running"

    def _update_tag_status(self, process, vals):
        """ Updates the 'submitted', 'finished', 'failed' and 'retry' status
        of each process/tag combination.

        Process/tag combinations provided to this method already appear on
        the trace file, so their submission status is updated based on their
        execution status from nextflow.

        For instance, if a tag is successfully
        complete, it is moved from the 'submitted' to the 'finished' list.
        If not, it is moved to the 'failed' list.

        Parameters
        ----------
        process : str
            Name of the current process. Must be present in attr:`processes`
        vals : list
            List of tags for this process that have been gathered in the
            trace file.
        """

        good_status = ["COMPLETED", "CACHED"]

        # Update status of each process
        for v in list(vals)[::-1]:
            p = self.processes[process]
            tag = v["tag"]

            # If the process/tag is in the submitted list, move it to the
            # complete or failed list
            if tag in p["submitted"]:
                p["submitted"].remove(tag)
                if v["status"] in good_status:
                    p["finished"].add(tag)
                elif v["status"] == "FAILED":
                    if not v["work_dir"]:
                        v["work_dir"] = ""
                    self.process_tags[process][tag]["log"] = \
                        self._retrieve_log(join(v["work_dir"], ".command.log"))
                    p["failed"].add(tag)

            # It the process/tag is in the retry list and it completed
            # successfully, remove it from the retry and fail lists. Otherwise
            # maintain it in the retry/failed lists
            elif tag in p["retry"]:
                if v["status"] in good_status:
                    p["retry"].remove(tag)
                    p["failed"].remove(tag)
                    del self.process_tags[process][tag]["log"]
                elif self.run_status == "aborted":
                    p["retry"].remove(tag)

            elif v["status"] in good_status:
                p["finished"].add(tag)

            # Filter tags without a successfull status.
            if v["status"] not in good_status:
                if v["tag"] in list(p["submitted"]) + list(p["finished"]):
                    vals.remove(v)

        return vals

    def _update_barrier_status(self):
        """Checks whether the channels to each process have been closed.
        """

        with open(self.log_file) as fh:

            for line in fh:

                # Exit barrier update after session abort signal
                if "Session aborted" in line:
                    return

                if "<<< barrier arrive" in line:
                    # Retrieve process name from string
                    process_m = re.match(".*process: (.*)\)", line)
                    if process_m:
                        process = process_m.group(1)
                        # Updates process channel to complete
                        if process in self.processes:
                            self.processes[process]["barrier"] = "C"

    @staticmethod
    def _retrieve_log(path):
        """Method used to retrieve the contents of a log file into a list.

        Parameters
        ----------
        path

        Returns
        -------
        list or None
            Contents of the provided file, each line as a list entry
        """

        if not os.path.exists(path):
            return None

        with open(path) as fh:
            return fh.readlines()

    def _update_trace_info(self, fields, hm):
        """Parses a trace line and updates the :attr:`status_info` attribute.

        Parameters
        ----------
        fields : list
            List of the tab-seperated elements of the trace line
        hm : dict
            Maps the column IDs to their position in the fields argument.
            This dictionary object is retrieve from :func:`_header_mapping`.
        """

        process = fields[hm["process"]]

        if process not in self.processes:
            return

        # Get information from a single line of trace file
        info = dict((column, fields[pos]) for column, pos in hm.items())

        # The headers that will be used to populate the process
        process_tag_headers = ["realtime", "rss", "rchar", "wchar"]
        for h in process_tag_headers:

            # In the rare occasion the tag is parsed first in the trace
            # file than the log file, add the new tag.
            if info["tag"] not in self.process_tags[process]:
                # If the 'start' tag is present in the trace, use that
                # information. If not, it will be parsed in the log file.
                try:
                    timestart = info["start"].split()[1]
                except KeyError:
                    timestart = None
                self.process_tags[process][info["tag"]] = {
                    "workdir": self._expand_path(info["hash"]),
                    "start": timestart
                }

            if h in info and info["tag"] != "-":
                if h != "realtime" and info[h] != "-":
                    self.process_tags[process][info["tag"]][h] = \
                        round(self._size_coverter(info[h]), 2)
                else:
                    self.process_tags[process][info["tag"]][h] = info[h]

        # Set allocated cpu and memory information to process
        if "cpus" in info and not self.processes[process]["cpus"]:
            self.processes[process]["cpus"] = info["cpus"]
        if "memory" in info and not self.processes[process]["memory"]:
            try:
                self.processes[process]["memory"] = self._size_coverter(
                    info["memory"])
            except ValueError:
                self.processes[process]["memory"] = None

        if info["hash"] in self.stored_ids:
            return

        # If the task hash code is provided, expand it to the work directory
        # and add a new entry
        if "hash" in info:
            hs = info["hash"]
            info["work_dir"] = self._expand_path(hs)

        if "tag" in info:
            tag = info["tag"]
            if tag != "-" and tag not in self.samples and \
                    tag.split()[0] not in self.samples:
                self.samples.append(tag)

        self.trace_info[process].append(info)
        self.stored_ids.append(info["hash"])

    def _update_process_resources(self, process, vals):
        """Updates the resources info in :attr:`processes` dictionary.
        """

        resources = ["cpus"]

        for r in resources:
            if not self.processes[process][r]:
                try:
                    self.processes[process][r] = vals[0]["cpus"]
                # When the trace column is not present
                except KeyError:
                    pass

    def _cpu_load_parser(self, cpus, cpu_per, t):
        """Parses the cpu load from the number of cpus and its usage
        percentage and returnsde cpu/hour measure

        Parameters
        ----------
        cpus : str
            Number of cpus allocated.
        cpu_per : str
            Percentage of cpu load measured (e.g.: 200,5%).
        t : str
            The time string can be something like '20s', '1m30s' or '300ms'.
        """

        try:
            _cpus = float(cpus)
            _cpu_per = float(cpu_per.replace(",", ".").replace("%", ""))
            hours = self._hms(t) / 60 / 24

            return ((_cpu_per / (100 * _cpus)) * _cpus) * hours

        except ValueError:
            return 0

    def _assess_resource_warnings(self, process, vals):
        """Assess whether the cpu load or memory usage is above the allocation

        Parameters
        ----------
        process : str
            Process name
        vals : vals
            List of trace information for each tag of that process

        Returns
        -------
        cpu_warnings : dict
            Keys are tags and values are the excessive cpu load
        mem_warnings : dict
            Keys are tags and values are the excessive rss
        """

        cpu_warnings = {}
        mem_warnings = {}

        for i in vals:
            try:
                expected_load = float(i["cpus"]) * 100
                cpu_load = float(i["%cpu"].replace(",", ".").replace("%", ""))

                if expected_load * 0.9 > cpu_load > expected_load * 1.10:
                    cpu_warnings[i["tag"]] = {
                        "expected":  expected_load,
                        "value": cpu_load
                    }
            except (ValueError, KeyError):
                pass

            try:
                rss = self._size_coverter(i["rss"])
                mem_allocated = self._size_coverter(i["memory"])

                if rss > mem_allocated * 1.10:
                    mem_warnings[i["tag"]] = {
                        "expected": mem_allocated,
                        "value": rss
                    }
            except (ValueError, KeyError):
                pass

        return cpu_warnings, mem_warnings

    def _update_process_stats(self):
        """Updates the process stats with the information from the processes

        This method is called at the end of each static parsing of the nextflow
        trace file. It re-populates the :attr:`process_stats` dictionary
        with the new stat metrics.
        """

        good_status = ["COMPLETED", "CACHED"]

        for process, vals in self.trace_info.items():

            # Update submission status of tags for each process
            vals = self._update_tag_status(process, vals)

            # Update process resources
            self._update_process_resources(process, vals)

            self.process_stats[process] = {}

            inst = self.process_stats[process]

            # Get number of completed samples
            inst["completed"] = "{}".format(
                len([x for x in vals if x["status"] in good_status]))

            # Get average time
            try:
                time_array = [self._hms(x["realtime"]) for x in vals]
                mean_time = round(sum(time_array) / len(time_array), 1)
                mean_time_str = strftime('%H:%M:%S', gmtime(mean_time))
                inst["realtime"] = mean_time_str
            # When the realtime column is not present
            except KeyError:
                inst["realtime"] = "-"

            # Get cumulative cpu/hours
            try:
                cpu_hours = [self._cpu_load_parser(
                    x["cpus"], x["%cpu"], x["realtime"]) for x in vals]
                inst["cpuhour"] = round(sum(cpu_hours), 2)
            # When the realtime, cpus or %cpus column are not present
            except KeyError:
                inst["cpuhour"] = "-"

            # Assess resource warnings
            inst["cpu_warnings"], inst["mem_warnings"] = \
                self._assess_resource_warnings(process, vals)

            # Get maximum memory
            try:
                rss_values = [self._size_coverter(x["rss"]) for x in vals
                              if x["rss"] != "-"]
                if rss_values:
                    max_rss = round(max(rss_values))
                    rss_str = self._size_compress(max_rss)
                else:
                    rss_str = "-"
                inst["maxmem"] = rss_str
            except KeyError:
                inst["maxmem"] = "-"

            # Get read size
            try:
                rchar_values = [self._size_coverter(x["rchar"]) for x in vals
                                if x["rchar"] != "-"]
                if rchar_values:
                    avg_rchar = round(sum(rchar_values) / len(rchar_values))
                    rchar_str = self._size_compress(avg_rchar)
                else:
                    rchar_str = "-"
            except KeyError:
                rchar_str = "-"
            inst["avgread"] = rchar_str

            # Get write size
            try:
                wchar_values = [self._size_coverter(x["wchar"]) for x in vals
                                if x["wchar"] != "-"]
                if wchar_values:
                    avg_wchar = round(sum(wchar_values) / len(wchar_values))
                    wchar_str = self._size_compress(avg_wchar)
                else:
                    wchar_str = "-"
            except KeyError:
                wchar_str = "-"
            inst["avgwrite"] = wchar_str

    #################
    # PARSING METHODS
    #################

    def trace_parser(self):
        """Parses the trace file once and updates the inspection attributes
        with the new entries.

        Parsing is skipped when the size of the trace file is non-zero and
        did not change since the previous call. Otherwise every non-empty
        line is handed to :func:`_update_trace_info`, and the process stats
        and barrier status are refreshed afterwards.
        """

        # Only proceed with the parsing if the file size changed from the
        # previous time.
        current_size = os.path.getsize(self.trace_file)
        self.trace_retry = 0
        if current_size and current_size == self.trace_sizestamp:
            return

        logger.debug("Updating trace size stamp to: {}".format(current_size))
        self.trace_sizestamp = current_size

        with open(self.trace_file) as fh:

            # The header is the first non-empty line of the file
            header = ""
            while not header:
                header = next(fh).strip()

            # Get header mappings before parsing the file
            hm = self._header_mapping(header)

            for line in fh:

                stripped = line.strip()
                if stripped == "":
                    continue

                fields = stripped.split("\t")

                # Skip entries whose task ID is in the stored IDs
                if fields[hm["task_id"]] in self.stored_ids:
                    continue

                # Parse trace entry and flag that there is new data
                self._update_trace_info(fields, hm)
                self.send = True

        self._update_process_stats()
        self._update_barrier_status()

    def log_parser(self):
        """Method that parses the nextflow log file once and updates the
        submitted number of samples for each process

        Parsing is skipped when the size of the log file is non-zero and did
        not change since the previous call. Otherwise the submission lines
        are used to update the tag status sets and barrier of each process
        and to fill :attr:`process_tags`. The pipeline status is refreshed
        at the end.
        """

        # Only proceed with the parsing if the file size changed from the
        # previous time.
        size_stamp = os.path.getsize(self.log_file)
        self.log_retry = 0
        if size_stamp and size_stamp == self.log_sizestamp:
            return

        logger.debug("Updating log size stamp to: {}".format(size_stamp))
        self.log_sizestamp = size_stamp

        # Regular expression to catch four groups:
        # 1. Start timestamp
        # 2. Work directory hash
        # 3. Process name
        # 4. Tag name
        # Raw string literal: the bracket/parenthesis escapes are not valid
        # escape sequences in a regular string.
        submission_re = re.compile(
            r".* (.*) \[.*\].*\[(.*)\].*process > (.*) \((.*)\).*")

        signatures = ("Submitted process >",
                      "Re-submitted process >",
                      "Cached process >")

        with open(self.log_file) as fh:

            for line in fh:
                if not any(sig in line for sig in signatures):
                    continue

                m = submission_re.match(line)
                if not m:
                    continue

                time_start, workdir, process, tag = m.groups()

                # Skip if this line has already been parsed
                log_id = time_start + tag
                if log_id in self.stored_log_ids:
                    continue
                self.stored_log_ids.append(log_id)

                # Ignore processes that are not being tracked
                if process not in self.processes:
                    continue
                p = self.processes[process]

                # Skip if process/tag combination has finished or is retrying
                if tag in p["finished"] or tag in p["retry"]:
                    continue

                # Update failed process/tags when they have been re-submitted
                if tag in p["failed"] and "Re-submitted process >" in line:
                    p["retry"].add(tag)
                    self.send = True
                    continue

                # Set process barrier to running. Check for barrier status
                # are performed at the end of the trace parsing in the
                # _update_barrier_status method.
                p["barrier"] = "R"
                if tag in p["submitted"]:
                    continue

                p["submitted"].add(tag)
                # Update the process_tags attribute with the new tag.
                # Update only when the tag does not exist. This may rarely
                # occur when the tag is parsed first in the trace file
                if tag not in self.process_tags[process]:
                    self.process_tags[process][tag] = {
                        "workdir": self._expand_path(workdir),
                        "start": time_start
                    }
                    self.send = True
                # When the tag is filled in the trace file parsing,
                # the timestamp may not be present in the trace. In
                # those cases, fill that information here.
                elif not self.process_tags[process][tag]["start"]:
                    self.process_tags[process][tag]["start"] = time_start
                    self.send = True

        self._update_pipeline_status()

    def update_inspection(self):
        """Runs one update cycle of the inspection.

        Calls the log parser followed by the trace parser. It is meant to be
        used inside a loop (like while), so that the class attributes are
        continuously updated from the trace and log files. The parsers
        already implement checks to parse these files only when they
        change, and they ignore entries that have been previously processed.

        A ``FileNotFoundError`` or ``StopIteration`` raised by a parser
        increments its retry counter and is re-raised only when the counter
        equals ``MAX_RETRIES``.
        """

        parsers = (
            (self.log_parser, "log_retry"),
            (self.trace_parser, "trace_retry"),
        )

        for parser, counter in parsers:
            try:
                parser()
            except (FileNotFoundError, StopIteration) as err:
                logger.debug("ERROR: " + str(sys.exc_info()[0]))
                setattr(self, counter, getattr(self, counter) + 1)
                if getattr(self, counter) == self.MAX_RETRIES:
                    raise err

    #################
    # CURSES METHODS
    #################

    def display_overview(self):
        """Displays the default pipeline inspection overview

        Initialises the curses screen and then loops over key handling,
        inspection update and screen flush, sleeping ``refresh_rate`` between
        iterations. The loop ends when an exception is raised, and the
        terminal settings are restored on exit.
        """

        self.screen = curses.initscr()

        self.screen.keypad(True)
        self.screen.nodelay(-1)
        curses.cbreak()
        curses.noecho()
        curses.start_color()

        self.screen_lines, _ = self.screen.getmaxyx()

        try:
            while True:
                # Provide functionality to certain keybindings
                self._curses_keybindings()
                # Updates main inspector attributes
                self.update_inspection()
                # Display curses interface
                self.flush_overview()

                sleep(self.refresh_rate)
        except FileNotFoundError:
            sys.stderr.write(colored_print(
                "ERROR: nextflow log and/or trace files are no longer "
                "reachable!", "red_bold"))
        except Exception as e:
            sys.stderr.write(str(e))
        finally:
            # Restore the terminal settings
            curses.nocbreak()
            self.screen.keypad(0)
            curses.echo()
            curses.endwin()

    def _curses_keybindings(self):
        """Reads one key from the curses screen and acts on it.

        Arrow keys scroll or pad the overview, a resize event refreshes
        :attr:`screen_lines`, and ``q`` raises ``Exception`` to leave the
        interface.
        """

        key = self.screen.getch()

        # Provide scroll up/down and left/right with the arrow keys
        navigation = {
            curses.KEY_UP: (self._updown, "up"),
            curses.KEY_DOWN: (self._updown, "down"),
            curses.KEY_LEFT: (self._rightleft, "left"),
            curses.KEY_RIGHT: (self._rightleft, "right"),
        }

        if key in navigation:
            handler, direction = navigation[key]
            handler(direction)
        # Trigger screen size update on resize
        elif key == curses.KEY_RESIZE:
            self.screen_lines = self.screen.getmaxyx()[0]
        # Exit interface when pressing q
        elif key == ord('q'):
            raise Exception

    def _updown(self, direction):
        """Provides curses scroll functionality.
        """

        if direction == "up" and self.top_line != 0:
            self.top_line -= 1
        elif direction == "down" and \
                self.screen.getmaxyx()[0] + self.top_line\
                <= self.content_lines + 3:
            self.top_line += 1

    def _rightleft(self, direction):
        """Provides curses horizontal padding"""

        if direction == "left" and self.padding != 0:
            self.padding -= 1

        if direction == "right" and \
                self.screen.getmaxyx()[1] + self.padding < self.max_width:
            self.padding += 1

    def flush_overview(self):
        """Draws the default overview of the pipeline execution.

        The overview is built from the :attr:`processes`,
        :attr:`process_stats` and :attr:`run_status` attributes and written
        to a curses pad: a status header, the global tag counts and one row
        per process within the current scroll window.
        """

        # Color pair IDs for each barrier status and run status
        barrier_colors = {
            "W": 1,
            "R": 2,
            "C": 3
        }
        status_colors = {
            "running": 3,
            "complete": 3,
            "aborted": 4,
            "error": 4
        }

        foregrounds = (curses.COLOR_WHITE, curses.COLOR_BLUE,
                       curses.COLOR_GREEN, curses.COLOR_MAGENTA)
        for pair_id, foreground in enumerate(foregrounds, start=1):
            curses.init_pair(pair_id, foreground, curses.COLOR_BLACK)

        height, width = self.screen.getmaxyx()
        win = curses.newpad(height, 2000)

        # Add static header
        header = "Pipeline [{}] inspection at {}. Status: ".format(
            self.pipeline_tag, strftime("%Y-%m-%d %H:%M:%S", gmtime()))

        win.addstr(0, 0, header)
        win.addstr(0, len(header), self.run_status,
                   curses.color_pair(status_colors[self.run_status]))

        def count_tags(status):
            # Number of tags with the given status across all processes
            return sum([len(x[status]) for x in self.processes.values()])

        submission_str = "{0:23.23}  {1:23.23}  {2:23.23}  {3:23.23}".format(
            "Running: {}".format(count_tags("submitted")),
            "Failed: {}".format(count_tags("failed")),
            "Retrying: {}".format(count_tags("retry")),
            "Completed: {}".format(count_tags("finished"))
        )
        win.addstr(1, 0, submission_str, curses.color_pair(1))

        headers = ["", "Process", "Running", "Complete", "Error",
                   "Avg Time", "Max Mem", "Avg Read", "Avg Write"]
        header_str = ("{0: ^1} "
                      "{1: ^25}  "
                      "{2: ^7} "
                      "{3: ^7} "
                      "{4: ^7} "
                      "{5: ^10} "
                      "{6: ^10} "
                      "{7: ^10} "
                      "{8: ^10} ").format(*headers)
        self.max_width = len(header_str)
        win.addstr(3, 0, header_str, curses.A_UNDERLINE | curses.A_REVERSE)

        row_template = ("{0: ^1} "
                        "{1:25.25}  "
                        "{2: ^7} "
                        "{3: ^7} "
                        "{4: ^7} "
                        "{5: ^10} "
                        "{6: ^10} "
                        "{7: ^10} "
                        "{8: ^10} ")

        # Processes that fit in the current scroll window
        first = self.top_line
        last = self.screen_lines - 4 + self.top_line
        visible = list(self.processes.keys())[first:last]

        for offset, process in enumerate(visible):

            proc = self.processes[process]

            if process in self.process_stats:
                ref = self.process_stats[process]
                vals = [ref["completed"],
                        len(proc["failed"]),
                        ref["realtime"],
                        ref["maxmem"], ref["avgread"],
                        ref["avgwrite"]]
                txt_fmt = curses.A_BOLD
            else:
                vals = ["-"] * 8
                txt_fmt = curses.A_NORMAL

            # Running tags, with the retrying ones in parenthesis
            if proc["retry"]:
                running = "{}({})".format(len(proc["submitted"]),
                                          len(proc["retry"]))
            else:
                running = "{}".format(len(proc["submitted"]))

            row = row_template.format(proc["barrier"], process, running,
                                      *vals)
            row_attr = curses.color_pair(
                barrier_colors[proc["barrier"]]) | txt_fmt
            win.addstr(4 + offset, 0, row, row_attr)

        win.clrtoeol()
        win.refresh(0, self.padding, 0, 0, height-1, width-1)

    ###################
    # BROADCAST METHODS
    ###################

    def _convert_process_dict(self):

        d = {}

        for k, v in self.processes.items():
            d[k] = {
                "barrier": v["barrier"],
                "cpus": v["cpus"],
                "memory": v["memory"]
            }
            for i in ["submitted", "finished", "failed", "retry"]:
                d[k][i] = list(v[i])

        return d

    def _prepare_table_data(self):

        # Set data mappings
        mappings = {
            "Barrier": "barrier",
            "Process": "process",
            "Running": "running",
            "Complete": "complete",
            "Error": "error",
            "Avg Time": "avgTime",
            "CPU/hour": "cpuhour",
            "Max Mem": "maxMem",
            "Avg Read": "avgRead",
            "Avg Write": "avgWrite"
        }

        # Set table data
        data = []
        table_headers = ["avgTime", "cpuhour", "maxMem", "avgRead", "avgWrite"]
        for process in list(self.processes):

            proc = self.processes[process]
            # Add general data that is always available for all processes
            current_data = {
                "process": process,
                "barrier": proc["barrier"],
                "complete": list(proc["finished"]),
                "error": list(proc["failed"]),
                "running": list(proc["submitted"])
            }

            # Add stats data that is only available for processes that have
            # finished once.
            if process not in self.process_stats:
                current_data = {
                    **current_data,
                    **dict((x, "-") for x in table_headers),
                    **{"cpuWarn": {}, "memWarn": {}}
                }

            else:
                ref = self.process_stats[process]
                current_data = {
                    **current_data,
                    **{"avgTime": ref["realtime"],
                       "cpuhour": ref["cpuhour"],
                       "maxMem": ref["maxmem"],
                       "avgRead": ref["avgread"],
                       "avgWrite": ref["avgwrite"],
                       "cpuWarn": ref["cpu_warnings"],
                       "memWarn": ref["mem_warnings"]}
                }

            data.append(current_data)

        return mappings, data

    def _prepare_overview_data(self):

        return [
            {
                "header": "Pipeline name",
                "value": self.pipeline_name
            },
            {
                "header": "Pipeline tag",
                "value": self.pipeline_tag
            },
            {
                "header": "Number of processes",
                "value": len(self.processes)
            }]

    def _prepare_general_details(self):
        return [
            {
                "header": "Pipeline directory",
                "value": self.workdir
            },
            {
                "header": "Work directory",
                "value": join(self.workdir, "work")
            },
            {
                "header": "Nextflow command",
                "value": self.execution_command
            },
            {
                "header": "Nextflow version",
                "value": self.nextflow_version
            }
        ]

    def _get_log_lines(self, n=300):
        """Returns a list with the last ``n`` lines of the nextflow log file

        Parameters
        ----------
        n : int
            Number of last lines from the log file

        Returns
        -------
        list
            List of strings with the nextflow log
        """

        with open(self.log_file) as fh:
            last_lines = fh.readlines()[-n:]

        return last_lines

    def _prepare_run_status_data(self):

        if self.run_status == "aborted":
            log_lines = self._get_log_lines()
        else:
            log_lines = None

        return {
            "value": self.run_status,
            "abortCause": self.abort_cause,
            "logLines": log_lines
        }

    def _send_status_info(self, run_id):
        """Sends the current inspection status to the broadcast address.

        The status payload is assembled from the table, overview, details
        and run status data and sent in a PUT request to
        :attr:`broadcast_address`. The program exits with status 1 when the
        connection cannot be established.

        Parameters
        ----------
        run_id
            Identifier of the run, sent along with the status payload.
        """

        mappings, data = self._prepare_table_data()
        overview_data = self._prepare_overview_data()
        general_details = self._prepare_general_details()
        status_data = self._prepare_run_status_data()

        # Add current year to start and stop dates
        time_start = "{} {}".format(time.strftime("%Y"), self.time_start)
        if self.time_stop:
            time_stop = "{} {}".format(time.strftime("%Y"), self.time_stop)
        else:
            time_stop = "-"
        # Get locale for proper parsing of time
        time_locale = locale.getlocale()[0]

        status_json = {
            "generalOverview": overview_data,
            "generalDetails": general_details,
            "tableData": data,
            "tableMappings": mappings,
            "processInfo": self._convert_process_dict(),
            "processTags": self.process_tags,
            "runStatus": status_data,
            "timeStart": time_start,
            "timeStop": time_stop,
            "timeLocale": time_locale,
            "processes": list(self.processes)
        }

        # Keep track of the number of payloads and log their size
        self._c += 1
        payload_size = asizeof.asizeof(json.dumps(status_json))
        logger.debug("Payload [{}] sent with size: {}".format(
            self._c, payload_size))

        payload = {"run_id": run_id, "status_json": status_json}
        try:
            requests.put(self.broadcast_address, json=payload)
        except requests.exceptions.ConnectionError:
            logger.error(colored_print(
                "ERROR: Could not establish connection with server. The server"
                " may be down or there is a problem with your internet "
                "connection.", "red_bold"))
            sys.exit(1)

    def _prepare_static_info(self):
        """Prepares the first batch of information, containing static
        information such as the pipeline file, and configuration files

        Returns
        -------
        dict
            Dict with the static information for the first POST request
        """

        pipeline_files = {}

        with open(join(self.workdir, self.pipeline_name)) as fh:
            pipeline_files["pipelineFile"] = fh.readlines()

        nf_config = join(self.workdir, "nextflow.config")
        if os.path.exists(nf_config):
            with open(nf_config) as fh:
                pipeline_files["configFile"] = fh.readlines()

        # Check for specific flowcraft configurations files
        configs = {
            "params.config": "paramsFile",
            "resources.config": "resourcesFile",
            "containers.config": "containersFile",
            "user.config": "userFile",
        }
        for config, key in configs.items():
            cfile = join(self.workdir, config)
            if os.path.exists(cfile):
                with open(cfile) as fh:
                    pipeline_files[key] = fh.readlines()

        return pipeline_files

    def _dag_file_to_dict(self):
        """Function that opens the dotfile named .treeDag.json in the current
        working directory

        Returns
        -------
        Returns a dictionary with the dag object to be used in the post
        instance available through the method _establish_connection

        """
        try:
            dag_file = open(os.path.join(self.workdir, ".treeDag.json"))
            dag_json = json.load(dag_file)
        except (FileNotFoundError, json.decoder.JSONDecodeError):
            logger.warning(colored_print(
                "WARNING: dotfile named .treeDag.json not found or corrupted",
                "red_bold"))
            dag_json = {}

        return dag_json

    def _establish_connection(self, run_id, dict_dag):
        """Sends the initial POST request with the static run information

        The payload contains the run id, the DAG object and the content of
        the pipeline/configuration files. The program exits with status 1
        when the server does not answer with status code 201 or when the
        connection cannot be established.

        Parameters
        ----------
        run_id : str
            Identifier of the run, sent as ``run_id`` in the payload.
        dict_dag : dict
            DAG object, sent as ``dag_json`` in the payload.
        """

        try:

            static_info = self._prepare_static_info()

            logger.debug("Sending initial data with run id: {}".format(run_id))

            payload = {"run_id": run_id, "dag_json": dict_dag,
                       "pipeline_files": static_info}
            logger.debug("Connection payload size: {}".format(
                asizeof.asizeof(payload)))

            r = requests.post(self.broadcast_address,
                              json=payload)

            logger.debug("Response received: {}".format(r.status_code))
            if r.status_code != 201:
                # Note the trailing space before "with reason", so that the
                # two string pieces do not run together in the message
                logger.error(colored_print(
                    "ERROR: There was a problem sending data to the server "
                    "with reason: {}".format(r.reason), "red_bold"))
                sys.exit(1)
        except requests.exceptions.ConnectionError:
            logger.error(colored_print(
                "ERROR: Could not establish connection with server. The server"
                " may be down or there is a problem with your internet "
                "connection.", "red_bold"))
            sys.exit(1)

    def _close_connection(self, run_id):
        """Sends a DELETE request signaling the end of the broadcast

        A response with a status code other than 202 is only logged as an
        error. A failure to connect is logged and makes the program exit
        with status 1.

        Parameters
        ----------
        run_id : str
            Identifier of the run, sent as ``run_id`` in the request body.
        """

        try:
            r = requests.delete(self.broadcast_address,
                                json={"run_id": run_id})
            if r.status_code != 202:
                # Note the trailing space before "with reason", so that the
                # two string pieces do not run together in the message
                logger.error(colored_print(
                    "ERROR: There was a problem sending data to the server "
                    "with reason: {}".format(r.reason), "red_bold"))
        except requests.exceptions.ConnectionError:
            logger.error(colored_print(
                "ERROR: Could not establish connection with server. The server"
                " may be down or there is a problem with your internet "
                "connection.", "red_bold"))
            sys.exit(1)

    def _get_run_hash(self):
        """Builds the identifier of the run from two md5 digests

        Returns
        -------
        str
            Hex digest of the pipeline file content followed by the hex
            digest of the work directory, hostname and hardware address.
        """

        # The path of the pipeline file is retrieved from the log file
        pipeline_path = get_nextflow_filepath(self.log_file)

        # Digest of the whole pipeline file, read in blocks of 4096 bytes
        file_digest = hashlib.md5()
        with open(pipeline_path, "rb") as fh:
            while True:
                block = fh.read(4096)
                if not block:
                    break
                file_digest.update(block)

        # Digest of the work directory, hostname and hardware address
        location_parts = (
            self.workdir,
            socket.gethostname(),
            str(uuid.getnode()),
        )
        location_digest = hashlib.md5(
            b"".join(part.encode("utf8") for part in location_parts))

        return file_digest.hexdigest() + location_digest.hexdigest()

    def _print_msg(self, run_id):
        """Logs the address where the progress of the run can be inspected

        Parameters
        ----------
        run_id : str
            Identifier of the run, appended to the inspect address.
        """

        logger.info(colored_print(
            "Starting broadcast. You can see the pipeline progress on the "
            "link below:", "green_bold"))

        inspect_url = "{}inspect/{}".format(self.app_address, run_id)
        logger.info(inspect_url)

    def broadcast_status(self):
        """Broadcasts the status of the run until interrupted

        Establishes the connection with the server and then enters a loop
        that updates the inspection and sends the status information
        whenever ``self.send`` is set, sleeping ``self.refresh_rate`` between
        iterations. The connection is always closed when the loop ends,
        whatever the reason.
        """

        logger.info(colored_print("Preparing broadcast data...", "green_bold"))

        run_hash = self._get_run_hash()
        dict_dag = self._dag_file_to_dict()
        logger.debug("Establishing connection...")
        self._establish_connection(run_hash, dict_dag)

        try:
            logger.debug("Starting inspection loop")
            # The inspect address is announced once, before the first update
            self._print_msg(run_hash)

            while True:
                self.update_inspection()

                if self.send:
                    logger.debug("Updating inspection")
                    self._send_status_info(run_hash)
                    self.send = False

                sleep(self.refresh_rate)

        except FileNotFoundError:
            logger.error(colored_print(
                "ERROR: nextflow log and/or trace files are no longer "
                "reachable!", "red_bold"))
        except Exception:
            logger.exception("ERROR: " + str(sys.exc_info()[0]))
        finally:
            logger.info("Closing connection")
            self._close_connection(run_hash)


================================================
FILE: flowcraft/generator/pipeline_parser.py
================================================
import os
import logging
import re
from difflib import SequenceMatcher

try:
    from generator.error_handling import SanityError
    from generator.process_details import colored_print
except ImportError:
    from flowcraft.generator.error_handling import SanityError
    from flowcraft.generator.process_details import colored_print

logger = logging.getLogger("main.{}".format(__name__))

# Set the tokens used for the main syntax of the pipeline string, e.g.:
#   'processA processB (processC | processD)'
# Token signaling the start of a fork
FORK_TOKEN = "("
# Token separating different lanes from a fork
LANE_TOKEN = "|"
# Token that closes a fork
CLOSE_TOKEN = ")"


def guess_process(query_str, process_map):
    """
    Suggests processes with a name similar to a string that is not available
    in process_map. Every process whose similarity ratio with the query is
    higher than 0.5 is logged as a suggestion. A hint on how to list the
    available processes is always logged.

    Parameters
    ----------
    query_str: str
        The string of the process with potential typos
    process_map:
        The dictionary that contains all the available processes

    """

    # collects the processes that are more than 50% similar to the query
    candidates = [
        name for name in process_map
        if SequenceMatcher(None, name, query_str).ratio() > 0.5
    ]

    # only prints suggestions when there is something to suggest
    if candidates:
        suggestions = "\n\t".join(candidates)
        logger.info(colored_print(
            "Maybe you meant:\n\t{}".format(suggestions), "white"))

    logger.info(colored_print("Hint: check the available processes by using "
                              "the '-l' or '-L' flag.", "white"))


def remove_inner_forks(text):
    """Strips every balanced bracket group, nested or not, from a string

    Each pass removes the innermost bracket groups (the ones without any
    bracket inside). Passes are repeated until nothing is left to remove.

    Parameters
    ----------
    text: str
        The string that contains brackets with inner forks to be removed

    Returns
    -------
    text: str
        the string with only the processes that are not in inner forks, thus
        the processes that belong to a given fork.

    """

    innermost = re.compile(r'\([^()]*\)')

    while True:
        text, removed = innermost.subn('', text)
        # a pass without substitutions means that no group is left
        if not removed:
            break

    return text


def empty_tasks(p_string):
    """
    Raises an error when the pipeline string is empty or only has whitespace

    Parameters
    ----------
    p_string: str
         String with the definition of the pipeline, e.g.::
             'processA processB processC(ProcessD | ProcessE)'

    Raises
    ------
    SanityError
        When nothing is left of the string after stripping whitespace.

    """
    if not p_string.strip():
        raise SanityError("'-t' parameter received an empty string or "
                          "an empty file.")


def brackets_but_no_lanes(p_string):
    """
    Raises an error when a lane separator '|' is present in the pipeline
    string but no fork was started with '('.

    Parameters
    ----------
    p_string: str
         String with the definition of the pipeline, e.g.::
             'processA processB processC(ProcessD | ProcessE)'

    Raises
    ------
    SanityError
        When the string has a '|' character but no '(' character.

    """

    has_lane = "|" in p_string
    has_fork = "(" in p_string

    if has_lane and not has_fork:
        raise SanityError("No fork initiation character '(' was "
                          "provided but there is a fork lane separator "
                          "character '|'")


def brackets_insanity_check(p_string):
    """
    Raises an error when the number of '(' and ')' characters differ, which
    indicates that some forks are poorly constructed.

    Parameters
    ----------
    p_string: str
         String with the definition of the pipeline, e.g.::
             'processA processB processC(ProcessD | ProcessE)'

    Raises
    ------
    SanityError
        When the counts differ. The message states how many brackets are
        in excess and of which type.

    """

    n_open = p_string.count(FORK_TOKEN)
    n_close = p_string.count(CLOSE_TOKEN)

    if n_open == n_close:
        return

    # the counts differ here, so one of the brackets is strictly in excess
    max_bracket = FORK_TOKEN if n_open > n_close else CLOSE_TOKEN
    excess = str(abs(n_open - n_close))

    raise SanityError(
        "A different number of '(' and ')' was specified. There are "
        "{} extra '{}'. The number of '(' and ')'should be equal.".format(
            excess, max_bracket))


def lane_char_insanity_check(p_string):
    """
    Raises an error when two lane tokens '|' appear next to each other in
    the pipeline string.

    Parameters
    ----------
    p_string: str
         String with the definition of the pipeline, e.g.::
             'processA processB processC(ProcessD | ProcessE)'

    Raises
    ------
    SanityError
        When the string contains two consecutive lane tokens.

    """

    doubled_token = LANE_TOKEN * 2

    if doubled_token in p_string:
        raise SanityError("Duplicated fork separator character '|'.")


def final_char_insanity_check(p_string):
    """
    Raises an error when the pipeline string ends with the lane token.

    Parameters
    ----------
    p_string: str
         String with the definition of the pipeline, e.g.::
             'processA processB processC(ProcessD | ProcessE)'

    Raises
    ------
    SanityError
        When the lane token is the last element of the string.

    """

    # Nothing to report unless the string is terminated by a LANE_TOKEN
    if not p_string.endswith(LANE_TOKEN):
        return

    raise SanityError("Fork separator character '|' cannot be the "
                      "last element of pipeline string")


def fork_procs_insanity_check(p_string):
    """
    Raises an error when the pipeline string has no process between a fork
    start or close token and the lane token. Three token pairs are searched
    for: '(|' and '|)', which mean that one of the branches of the fork is
    empty, and '|(', which means that an inner fork starts without a process
    before it.

    Parameters
    ----------
    p_string: str
         String with the definition of the pipeline, e.g.::
             'processA processB processC(ProcessD | ProcessE)'

    Raises
    ------
    SanityError
        When any of the three token pairs is found in the string.

    """

    forbidden_pairs = (
        FORK_TOKEN + LANE_TOKEN,   # '(|'
        LANE_TOKEN + CLOSE_TOKEN,  # '|)'
        LANE_TOKEN + FORK_TOKEN,   # '|('
    )

    if any(pair in p_string for pair in forbidden_pairs):
        raise SanityError("There must be a process between the fork "
                          "start character '(' or end ')' and the separator of "
                          "processes character '|'")


def start_proc_insanity_check(p_string):
    """
    Raises an error when a fork starts right after the start of another
    fork, i.e., when the string contains duplicated start tokens '(('.

    Parameters
    ----------
    p_string: str
         String with the definition of the pipeline, e.g.::
             'processA processB processC(ProcessD | ProcessE)'

    Raises
    ------
    SanityError
        When the string contains two consecutive fork start tokens.

    """

    doubled_token = FORK_TOKEN * 2

    if doubled_token in p_string:
        raise SanityError("There must be a starting process after the "
                          "fork before adding a new fork. E.g: proc1 ( proc2.1 "
                          "(proc3.1 | proc3.2) | proc 2.2 )")


def late_proc_insanity_check(p_string):
    """
    This function checks if there are processes after the close token. It
    searches for everything that isn't "|" or ")" after a ")" token.

    Parameters
    ----------
    p_string: str
         String with the definition of the pipeline, e.g.::
             'processA processB processC(ProcessD | ProcessE)'

    Raises
    ------
    SanityError
        When a close token is followed by any character other than the
        lane token or another close token.

    """

    # The tokens are escaped with re.escape instead of a hand written
    # backslash, which was an invalid escape sequence in a regular string
    # literal. The resulting pattern matches the same strings.
    close = re.escape(CLOSE_TOKEN)
    lane = re.escape(LANE_TOKEN)
    pattern = r"{0}[^{1}{0}]".format(close, lane)

    if re.search(pattern, p_string):
        raise SanityError("After a fork it is not allowed to have any "
                          "alphanumeric value.")


def inner_fork_insanity_checks(pipeline_string):
    """
    This function assures that each fork of the pipeline string contains a
    lane token '|' of its own, i.e., one that does not belong to any of
    its inner forks.

    Parameters
    ----------
    pipeline_string: str
         String with the definition of the pipeline, e.g.::
             'processA processB processC(ProcessD | ProcessE)'

    Raises
    ------
    SanityError
        When one of the forks has no lane token after its inner forks are
        removed.

    """

    # first lets get all forks to a list.
    list_of_forks = []  # stores forks
    left_indexes = []  # stores indexes of left brackets

    # iterate through the string looking for '(' and ')'.
    for pos, char in enumerate(pipeline_string):
        if char == FORK_TOKEN:
            # saves pos to left_indexes list
            left_indexes.append(pos)
        elif char == CLOSE_TOKEN and left_indexes:
            # saves the fork delimited by the last open bracket, which is
            # removed from the left_indexes list
            list_of_forks.append(pipeline_string[left_indexes.pop() + 1: pos])

    # sort list in descending order of number of forks, so that the forks
    # with more inner forks are checked (and reported) first
    list_of_forks.sort(key=lambda x: x.count(FORK_TOKEN), reverse=True)

    # Now, we can iterate through list_of_forks and check for errors in each
    # fork
    for fork in list_of_forks:
        # remove ALL inner forks for this check, since each fork has its own
        # entry in list_of_forks. Removing only one of them would let the
        # lane tokens of the remaining inner forks hide a missing separator
        # in the fork being checked.
        fork_simplified = remove_inner_forks(fork)

        # Checks if there is no fork separator character '|' within each fork
        if LANE_TOKEN not in fork_simplified:
            raise SanityError("One of the forks doesn't have '|' "
                              "separator between the processes to fork. This is"
                              " the prime suspect: '({})'".format(fork))


def insanity_checks(pipeline_str):
    """Runs every sanity check on the pipeline string, in a fixed order

    Most checks receive the pipeline string without spaces. The inner fork
    check runs last and receives the string as provided by the user.

    Parameters
    ----------
    pipeline_str : str
        String with the pipeline definition
    """

    # Version of the string without any space
    compact_str = pipeline_str.replace(" ", "").strip()

    # checks that are executed on the string without spaces
    compact_checks = (
        empty_tasks,
        brackets_but_no_lanes,
        brackets_insanity_check,
        lane_char_insanity_check,
        final_char_insanity_check,
        fork_procs_insanity_check,
        start_proc_insanity_check,
        late_proc_insanity_check,
    )
    # checks that are executed on the string as the user provided it
    raw_checks = (
        inner_fork_insanity_checks,
    )

    for check in compact_checks:
        check(compact_str)

    for check in raw_checks:
        check(pipeline_str)


def parse_pipeline(pipeline_str):
    """Parses a pipeline string into a list of dictionaries with the connections
     between processes

    When ``pipeline_str`` is the path of an existing file, the pipeline
    string is read from that file. The string goes through the sanity checks
    before being parsed.

    Parameters
    ----------
    pipeline_str : str
        String with the definition of the pipeline, e.g.::
            'processA processB processC(ProcessD | ProcessE)'

    Returns
    -------
    pipeline_links : list
        List of dictionaries, each with an ``input`` and an ``output`` key
        that hold the ``process`` name and the ``lane`` of each end of a
        connection.

    """

    if os.path.exists(pipeline_str):
        logger.debug("Found pipeline file: {}".format(pipeline_str))
        # NOTE(review): the lines are stripped and joined without any
        # separator, so processes on consecutive lines end up glued together
        # unless a token sits at the line boundary - confirm this is intended
        with open(pipeline_str) as fh:
            pipeline_str = "".join([x.strip() for x in fh.readlines()])

    logger.info(colored_print("Resulting pipeline string:\n"))
    logger.info(colored_print(pipeline_str + "\n"))

    # Perform pipeline insanity checks
    insanity_checks(pipeline_str)

    logger.debug("Parsing pipeline string: {}".format(pipeline_str))

    pipeline_links = []
    lane = 1

    # Add unique identifiers to each process to allow a correct connection
    # between forks with same processes
    pipeline_str_modified, identifiers_to_tags = add_unique_identifiers(
        pipeline_str)

    # Get number of forks in the pipeline
    nforks = pipeline_str_modified.count(FORK_TOKEN)
    logger.debug("Found {} fork(s)".format(nforks))

    # If there are no forks, connect the pipeline as purely linear
    if not nforks:
        logger.debug("Detected linear pipeline string : {}".format(
            pipeline_str))
        linear_pipeline = ["__init__"] + pipeline_str_modified.split()
        pipeline_links.extend(linear_connection(linear_pipeline, lane))
        # Removes unique identifiers used for correctly assign fork parents with
        #  a possible same process name
        pipeline_links = remove_unique_identifiers(identifiers_to_tags,
                                                   pipeline_links)
        return pipeline_links

    # Each iteration handles one fork, in the order the fork start tokens
    # appear in the pipeline string
    for i in range(nforks):

        logger.debug("Processing fork {} in lane {}".format(i, lane))
        # Split the pipeline at each fork start position. fields[-1] will
        # hold the process after the fork. fields[-2] will hold the processes
        # before the fork.
        fields = pipeline_str_modified.split(FORK_TOKEN, i + 1)

        # Get the processes before the fork. This may be empty when the
        # fork is at the beginning of the pipeline.
        previous_process = fields[-2].split(LANE_TOKEN)[-1].split()
        logger.debug("Previous processes string: {}".format(fields[-2]))
        logger.debug("Previous processes list: {}".format(previous_process))
        # Get lanes after the fork
        next_lanes = get_lanes(fields[-1])
        logger.debug("Next lanes object: {}".format(next_lanes))
        # Get the immediate targets of the fork
        fork_sink = [x[0] for x in next_lanes]
        logger.debug("The fork sinks into the processes: {}".format(fork_sink))

        # The first fork is a special case, where the processes before AND
        # after the fork (until the start of another fork) are added to
        # the ``pipeline_links`` variable. Otherwise, only the processes
        # after the fork will be added
        if i == 0:
            # If there are no previous process, the fork is at the beginning
            # of the pipeline string. In this case, inject the special
            # "init" process.
            if not previous_process:
                previous_process = ["__init__"]
                lane = 0
            else:
                previous_process = ["__init__"] + previous_process

            # Add the linear modules before the fork
            pipeline_links.extend(
                linear_connection(previous_process, lane))

        # The last process before the fork is the one that forks
        fork_source = previous_process[-1]
        logger.debug("Fork source is set to: {}".format(fork_source))
        fork_lane = get_source_lane(previous_process, pipeline_links)
        logger.debug("Fork lane is set to: {}".format(fork_lane))
        # Add the forking modules
        pipeline_links.extend(
            fork_connection(fork_source, fork_sink, fork_lane, lane))
        # Add the linear connections in the subsequent lanes
        pipeline_links.extend(
            linear_lane_connection(next_lanes, lane))

        # Each lane of this fork took one lane number
        lane += len(fork_sink)

    # Restore the original process names in the final list of connections
    pipeline_links = remove_unique_identifiers(identifiers_to_tags,
                                               pipeline_links)
    return pipeline_links


def get_source_lane(fork_process, pipeline_list):
    """Finds the lane of the process that is the source of a fork

    The connections are searched from the last to the first. A connection
    is a match when its output process is the last process of
    ``fork_process`` and when the sequence of output processes in its lane
    is equal to ``fork_process`` without the ``__init__`` process.

    Parameters
    ----------
    fork_process : list
        List of processes before the fork.
    pipeline_list : list
        List with the pipeline connection dictionaries.

    Returns
    -------
    int
        Lane of the last process that matches fork_process, or 0 when no
        match is found.
    """

    fork_source = fork_process[-1]
    fork_sig = [proc for proc in fork_process if proc != "__init__"]

    for position, link in enumerate(reversed(pipeline_list)):

        target = link["output"]
        if target["process"] != fork_source:
            continue

        lane = target["lane"]
        logger.debug("Possible source match found in position {} in lane"
                     " {}".format(position, lane))

        # All the processes that were connected in the candidate lane
        lane_sequence = [
            other["output"]["process"] for other in pipeline_list
            if other["output"]["lane"] == lane
        ]
        logger.debug("Testing lane sequence '{}' against fork signature"
                     " '{}'".format(lane_sequence, fork_sig))

        if lane_sequence == fork_sig:
            return lane

    return 0


def get_lanes(lanes_str):
    """Retrieves the lanes of a fork from the string that follows its start

    The string is read character by character. Everything inside nested
    forks is ignored and the reading stops at the token that closes the
    current fork. What is left is split into lanes at each lane token.

    Parameters
    ----------
    lanes_str : str
        Pipeline string after a fork split

    Returns
    -------
    lanes : list
        List of lists, with the list of processes for each lane

    """

    logger.debug("Parsing lanes from raw string: {}".format(lanes_str))

    # Characters that belong to the current fork, nested forks excluded
    kept_chars = []
    # Number of nested forks that are open at the cursor position
    depth = 0

    for char in lanes_str:

        # Nested fork started. The token itself is never kept
        if char == FORK_TOKEN:
            depth += 1
            continue

        # A fork stopped. The token itself is never kept
        if char == CLOSE_TOKEN:
            depth -= 1
            # A negative depth means that the current fork was closed
            if depth < 0:
                break
            continue

        # Save only when outside of any nested fork
        if depth == 0:
            kept_chars.append(char)

    parsed_lanes = "".join(kept_chars)

    return [lane.split() for lane in parsed_lanes.split(LANE_TOKEN)]


def linear_connection(plist, lane):
    """Links each process of a list to the next one, all in the same lane

    Parameters
    ----------
    plist : list
        List with process names. This list should contain at least two entries.
    lane : int
        Corresponding lane of the processes

    Returns
    -------
    res : list
        List of dictionaries with the links between processes
    """

    logger.debug(
        "Establishing linear connection with processes: {}".format(plist))

    links = []
    source = None

    for target in plist:
        # There is nothing to link to until a source process is set, which
        # is the case of the first process
        if source:
            links.append({
                "input": {"process": source, "lane": lane},
                "output": {"process": target, "lane": lane},
            })
        source = target

    return links


def fork_connection(source, sink, source_lane, lane):
    """Links a forking process to the first process of each resulting lane

    The input of every link is ``source`` in ``source_lane``. The outputs
    get consecutive lane numbers, starting at ``lane + 1`` for the first
    element of ``sink``.

    Parameters
    ----------
    source : str
        Name of the process that is forking
    sink : list
        List of the processes where the source will fork to. Each element
        corresponds to the start of a lane.
    source_lane : int
        Lane of the forking process
    lane : int
        Lane counter before the fork. The new lanes are numbered after it.

    Returns
    -------
    res : list
        List of dictionaries with the links between processes
    """

    logger.debug("Establishing forking of source '{}' into processes"
                 " '{}'. Source lane set to '{}' and lane set to '{}'".format(
                    source, sink, source_lane, lane))

    # The first new lane is the one right after the current lane counter
    return [
        {
            "input": {"process": source, "lane": source_lane},
            "output": {"process": target, "lane": target_lane},
        }
        for target_lane, target in enumerate(sink, start=lane + 1)
    ]


def linear_lane_connection(lane_list, lane):
    """Builds the linear connections inside each lane of a fork

    The lanes get consecutive numbers, starting at ``lane + 1`` for the
    first element of ``lane_list``.

    Parameters
    ----------
    lane_list : list
        Each element should correspond to a list of processes for a given lane
    lane : int
        Lane counter before the fork start

    Returns
    -------
    res : list
        List of dictionaries with the links between processes
    """

    logger.debug(
        "Establishing linear connections for lanes: {}".format(lane_list))

    links = []

    # The first lane is the one right after the current lane counter
    for lane_number, processes in enumerate(lane_list, start=lane + 1):
        links.extend(linear_connection(processes, lane_number))

    return links


def add_unique_identifiers(pipeline_str):
    """Returns the pipeline string with unique identifiers and a dictionary with
     references between the unique keys and the original values

    The identifier of a process is its name followed by an underscore and
    the position of the process in the pipeline string, e.g. ``procA_0``.
    When the process has directives (``name=directives``), the position is
    added to the name only: ``name_0=directives``.

    All processes are tagged in a single left to right pass, so each
    occurrence gets exactly one identifier, even when the name of a process
    is contained in the name of another one (e.g. ``procA2 procA``) or has
    characters with a special meaning in regular expressions.

    Parameters
    ----------
    pipeline_str : str
        Pipeline string

    Returns
    -------
    str
        Pipeline string with unique identifiers
    dict
        Match between process unique values and original names
    """

    # Add space at beginning and end of pipeline, so that every process
    # name, including the final one, is followed by whitespace
    pipeline_str_modified = " {} ".format(pipeline_str)

    # force to add a space around each run of tokens, so that process names
    # are never glued to a fork, lane or close token
    find_tokens = r'[{}{}{}]+'.format(FORK_TOKEN, LANE_TOKEN, CLOSE_TOKEN)
    pipeline_str_modified = re.sub(
        find_tokens,
        lambda match: " {} ".format(match.group()),
        pipeline_str_modified)

    # dict: Matches new process names (identifiers) with original process
    # names
    identifiers_to_tags = {}

    def _tag_process(match):
        # Builds the identifier of one process and registers it
        val = match.group(1)
        # One entry is added per process, so the size of the dict is the
        # position of the current process
        index = len(identifiers_to_tags)
        # Split at the first '=' only, to keep the directives untouched
        name, sep, directives = val.partition("=")
        new_id = "{}_{}{}{}".format(name, index, sep, directives)
        # makes a match between new process name and original process name
        identifiers_to_tags[new_id] = val
        # The whitespace character that follows the name is replaced by a
        # single space
        return new_id + " "

    # Regex to get all process names. Catch all words without spaces and that
    # are not fork tokens or pipes, along with the whitespace after them
    reg_find_proc = r"([^\s{}{}{}]+)\s".format(
        LANE_TOKEN, FORK_TOKEN, CLOSE_TOKEN)
    # A function is used as replacement, so the identifiers are inserted
    # verbatim, without any processing of backslash escapes
    pipeline_str_modified = re.sub(reg_find_proc, _tag_process,
                                   pipeline_str_modified)

    return pipeline_str_modified, identifiers_to_tags


def remove_unique_identifiers(identifiers_to_tags, pipeline_links):
    """Restores the original process names in an already parsed pipeline

    The dictionaries of ``pipeline_links`` are modified in place. The
    special ``__init__`` process has no identifier and is left untouched.

    Parameters
    ----------
    identifiers_to_tags : dict
        Match between unique process identifiers and process names
    pipeline_links: list
        Parsed pipeline list with unique identifiers

    Returns
    -------
    list
        Pipeline list with original identifiers
    """

    for link in pipeline_links:
        # Both ends of each connection hold a process identifier
        for side in ("input", "output"):
            endpoint = link[side]
            if endpoint["process"] != "__init__":
                endpoint["process"] = identifiers_to_tags[endpoint["process"]]

    return pipeline_links


================================================
FILE: flowcraft/generator/process.py
================================================
import os
import jinja2
import logging

from os.path import dirname, join, abspath

try:
    import generator.error_handling as eh
except ImportError:
    import flowcraft.generator.error_handling as eh

logger = logging.getLogger("main.{}".format(__name__))


class Process:
    """Main interface for basic process functionality

    The ``Process`` class is intended to be inherited by specific process
    classes (e.g., :py:class:`IntegrityCoverage`) and provides the basic
    functionality to build the channels and links between processes.

    Child classes are expected to inherit the ``__init__`` execution, which
    basically means that at least, the child must be defined as::

        class ChildProcess(Process):
            def __init__(self, **kwargs):
                super().__init__(**kwargs)

    This ensures that when the ``ChildProcess`` class is instantiated, it
    automatically sets the attributes of the parent class.

    This also means that child processes must be instantiated providing
    information on the process type and jinja2 template with the nextflow code.

    Parameters
    ----------
    template : str
        Name of the jinja2 template with the nextflow code for that process.
        Templates are stored in ``generator/templates``.
    """

    # Each entry describes one raw input type. The "{0}" placeholder of
    # "channel_str" is filled with the parameter name through str.format
    # (see Init.set_extra_inputs), which is why the literal braces of the
    # nextflow code are doubled. "checks" follows the same placeholder
    # convention -- presumably formatted the same way; confirm in the caller.
    RAW_MAPPING = {
        "fastq": {
            "params": "fastq",
            "description": "Path expression to paired-end fastq files."
                           " (default: $params.fastq)",
            "default_value": "'fastq/*_{1,2}.*'",
            "channel": "IN_fastq_raw",
            "channel_str":
                "Channel.fromFilePairs(params.{0})"
                ".ifEmpty {{ exit 1, \"No fastq files provided with pattern:"
                "'${{params.{0}}}'\" }}",
            "checks":
                "if (params.{0} instanceof Boolean){{"
                "exit 1, \"'{0}' must be a path pattern. Provide value:"
                "'$params.{0}'\"}}\n"
                "if (!params.{0}){{ exit 1, \"'{0}' parameter "
                "missing\"}}"
        },
        "fasta": {
            "params": "fasta",
            # NOTE(review): the description refers to $params.fastq although
            # this entry is for fasta input -- looks like a copy-paste
            # leftover; confirm before changing the runtime string.
            "description": "Path fasta files. (default: $params.fastq)",
            "default_value": "'fasta/*.fasta'",
            "channel": "IN_fasta_raw",
            "channel_str":
                "Channel.fromPath(params.{0})."
                "map{{ it -> file(it).exists() ? [it.toString()"
                ".tokenize('/').last()"
                ".tokenize('.')[0..-2].join('.'), it] : null }}"
                ".ifEmpty {{ exit 1, \"No fasta files provided with pattern:"
                "'${{params.{0}}}'\" }}",
            "checks":
                "if (params.{0} instanceof Boolean){{"
                "exit 1, \"'{0}' must be a path pattern. Provide value:"
                "'$params.{0}'\"}}\n"
                "if (!params.{0}){{ exit 1, \"'{0}' parameter "
                "missing\"}}"
        },
        "accessions": {
            "params": "accessions",
            # NOTE(review): same remark as for "fasta": the description
            # mentions $params.fastq and reads "perline" -- confirm.
            "description": "Path file with accessions, one perline. ("
                           "default: $params.fastq)",
            "default_value": "null",
            "channel": "IN_accessions_raw",
            "channel_str":
                "Channel.fromPath(params.{0})"
                ".ifEmpty {{ exit 1, \"No accessions file provided with path:"
                "'${{params.{0}}}'\" }}",
            "checks":
                "if (!params.{0}){{ exit 1, \"'{0}' parameter "
                "missing\" }}\n"
        }
    }
    """
    dict: Contains the mapping between the :attr:`Process.input_type` attribute
    and the corresponding nextflow parameter and main channel definition,
    e.g.::

        "fastq" : {
            "params": "fastq",
            "channel": "<channel>"
        }
    """

    def __init__(self, template):

        self.pid = None
        """
        int: Process ID number that represents the order and position in the
        generated pipeline
        """

        self.template = template
        """
        str: Template name for the current process. This string will be used
        to fetch the file containing the corresponding jinja2 template
        in the :py:func:`_set_template` method
        """

        self._template_path = None
        """
        str: Path to the file containing the jinja2 template file. It's
        set in :py:func:`_set_template`.
        """
        self._set_template(template)

        self.input_type = None
        """
        str: Type of expected input data. Used to verify the connection between
        two processes is viable.
        """

        self.output_type = None
        """
        str: Type of output data. Used to verify the connection between
        two processes is viable.
        """

        self.ignore_type = False
        """
        boolean: If True, this process will ignore the input/output type
        requirements. This attribute is set to True for terminal singleton
        forks in the pipeline.
        """

        self.ignore_pid = False
        """
        boolean: If True, this process will not make the pid advance. This
        is used for terminal forks before the end of the pipeline.
        """

        self.dependencies = []
        """
        list: Contains the dependencies of the current process in the form
        of the :py:attr:`Process.template` attribute (e.g., [``fastqc``])
        """

        self.lane = None
        self.parent_lane = None

        self.input_channel = None
        """
        str: Place holder of the main input channel for the current process.
        This attribute can change dynamically depending on the forks and
        secondary channels in the final pipeline.
        """

        self.output_channel = None
        """
        str: Place holder of the main output channel for the current process.
        This attribute can change dynamically depending on the forks and
        secondary channels in the final pipeline.
        """

        self.input_user_channel = None
        """
        dict: Stores a dictionary of two key:value pairs containing
        the raw input channel for the process. This is automatically
         determined by the :attr:`~Process.input_type` attribute, and will
        fetch the information that is mapped in the :attr:`RAW_MAPPING`
         variable. It will only be used by the first process(es) defined in
         a pipeline.
        """

        self.link_start = []
        """
        list: List of strings with the starting points for secondary channels.
        When building the pipeline, these strings will be matched with equal
        strings in the :py:attr:`link_end` attribute of other Processes.
        """

        self.link_end = []
        """
        list: List of dictionaries containing the a string of the ending point
        for a secondary channel. Each dictionary should contain at least
        two key/vals:
        ``{"link": <link string>, "alias":<string for template>}``
        """

        self.status_channels = ["STATUS_{}".format(template)]
        """
        list: Name of the status channels produced by the process. By default,
        it sets a single status channel. If more than one status channels
        are required for the process, list each one in this attribute
        (e.g., :py:attr:`FastQC.status_channels`)
        """
        self.status_strs = []
        """
        str: Name of the status channel for the current process. These strings
        will be provided to the StatusCompiler process to collect and
        compile status reports
        """

        self.forks = []
        """
        list: List of strings with the literal definition of the forks for
        the current process, ready to be added to the template string.
        """

        self.main_forks = []
        """
        list: List of the channels onto which the main output should be
        forked into. They will be automatically added to the
        :attr:`~Process.main_forks` attribute when setting the secondary
        channels
        """

        self.secondary_inputs = []
        """
        list: List of dictionaries with secondary input channels from nextflow
        parameters. This dictionary should contain two key:value pairs
        with the ``params`` key, containing the parameter name, and the
        ``channel`` key, containing the nextflow channel definition::

            {
                "params": "pathoSpecies",
                "channel": "IN_pathoSpecies = Channel
                                                .value(params.pathoSpecies)"
            }
        """
        self.secondary_input_str = ""

        self.extra_input = ""
        """
        str:  with the name of the params that will be used to provide
        extra input into the process. This extra input will be mixed with
        the main input channel using nextflow's ``mix`` operator. Its
        channel will be defined at the start of the pipeline, based on the
        ``channel_str`` key of the :attr:`~Process.RAW_MAPPING` for the
        corresponding input type.
        """

        self.params = {}
        """
        dict: Maps the parameter names to the corresponding default values.
        """

        self.param_id = ""
        """
        str: The parameter id suffix that will be added to each parameter. In
        case it is empty, the multiple identical parameters in different
        components will be merged.
        """

        self._context = {}
        """
        dict: Dictionary with the keyword placeholders for the string template
        of the current process.
        """

        self.directives = {
            self.template: {}
        }
        """
        dict: Specifies the directives (cpus, memory, container) for each
        nextflow process in the template. If specified, this directives
        will be added to the nextflow configuration file. Otherwise,
        the default values for cpus and memory will be used. In the case
        of containers, they will not run inside any container.

        The current supported directives are:
            - cpus
            - memory
            - container
            - container tag/version

        An example of directives for two process is as follows::
        
            self.directives = {
                "processA": {"cpus": 1, "memory": "1GB"},
                "processB": {"memory": "5GB", "container": "my/image",
                             "version": "0.5.0"}
            }
        """

        self.compiler = {}
        """
        dict: Specifies channels from the current process that are received
        by a compiler process. Each key in this dictionary should match
        a compiler process key in
        :attr:`~flowcraft.generator.engine.NextflowGenerator.compilers`.
        The value should be a list of the channels that will be fed to the
        compiler process::
        
            self.compiler["patlas_consensus"] = ["mashScreenOutputChannel"]
        """

    def _set_template(self, template):
        """Resolves the template name into the path of its jinja file

        The template is searched as ``<template>.nf`` inside the
        ``templates`` directory that sits next to this module. When the file
        exists, its path is stored in the :py:attr:`Process._template_path`
        attribute.

        Parameters
        ----------
        template : str
            Name of the template, without the ``.nf`` extension.

        Raises
        ------
        ProcessError
            When the template file is not found.
        """

        module_dir = dirname(abspath(__file__))
        nf_file = join(module_dir, "templates", template + ".nf")

        if os.path.exists(nf_file):
            self._template_path = nf_file
            return

        raise eh.ProcessError(
            "Template {} does not exist".format(nf_file))

    def set_main_channel_names(self, input_suffix, output_suffix, lane):
        """Sets the main channel names based on the provide input and
        output channel suffixes. This is performed when connecting processes.

        Parameters
        ----------
        input_suffix : str
            Suffix added to the input channel. Should be based on the lane
            and an arbitrary unique id
        output_suffix : str
            Suffix added to the output channel. Should be based on the lane
            and an arbitrary unique id
        lane : int
            Sets the lane of the process.
        """

        self.input_channel = "{}_in_{}".format(self.template, input_suffix)
        self.output_channel = "{}_out_{}".format(self.template, output_suffix)
        self.lane = lane

    def set_param_id(self, param_id):
        """Sets the param_id for the process, which will be used to render
        the template.

        Parameters
        ----------
        param_id : str
            The :attr:`param_id` attribute of the process.
        """

        self._context = {**self._context, "param_id": param_id}

    def get_user_channel(self, input_channel, input_type=None):
        """Returns the main raw channel for the process

        Provided with at least a channel name, this method returns the raw
        channel name and specification (the nextflow string definition)
        for the process. By default, it will fork from the raw input of
        the process' :attr:`~Process.input_type` attribute. However, this
        behaviour can be overridden by providing the ``input_type`` argument.

        If the specified or inferred input type exists in the
        :attr:`~Process.RAW_MAPPING` dictionary, the channel info dictionary
        will be retrieved along with the specified input channel. Otherwise,
        it will return None.

        An example of the returned dictionary is::

             {"input_channel": "myChannel",
             "params": "fastq",
             "channel": "IN_fastq_raw",
             "channel_str":"IN_fastq_raw = Channel.fromFilePairs(params.fastq)"
            }

        Returns
        -------
        dict or None
            Dictionary with the complete raw channel info. None if no
            channel is found.
        """

        res = {"input_channel": input_channel}

        itype = input_type if input_type else self.input_type

        if itype in self.RAW_MAPPING:

            channel_info = self.RAW_MAPPING[itype]

            return {**res, **channel_info}

    @staticmethod
    def render(template, context):
        """Renders a jinja2 template file with the provided context

        Parameters
        ----------
        template : str
            Path to template file.
        context : dict
            Dictionary with kwargs context to populate the template

        Returns
        -------
        str
            The rendered template.
        """

        tpl_dir, tpl_name = os.path.split(template)

        # A bare file name has no directory part: search the current one
        loader = jinja2.FileSystemLoader(tpl_dir or './')
        environment = jinja2.Environment(loader=loader)

        return environment.get_template(tpl_name).render(context)

    @property
    def template_str(self):
        """Class property that returns a populated template string

        The template of the process is rendered with the current
        :py:attr:`Process._context` every time ``Process.template_str`` is
        accessed.

        Returns
        -------
        str
            String with the complete and populated process template

        Raises
        ------
        ProcessError
            When the context of the process is still empty.
        """

        if self._context:
            logger.debug("Setting context for template {}: {}".format(
                self.template, self._context
            ))
            return self.render(self._template_path, self._context)

        raise eh.ProcessError("Channels must be setup first using the "
                              "set_channels method")

    def set_channels(self, **kwargs):
        """ General purpose method that sets the main channels

        This method will take a variable number of keyword arguments to
        set the :py:attr:`Process._context` attribute with the information
        on the main channels for the process. This is done by appending
        the process ID (:py:attr:`Process.pid`) attribute to the input,
        output and status channel prefix strings. In the output channel,
        the process ID is incremented by 1 to allow the connection with the
        channel in the next process.

        The ``**kwargs`` system for setting the :py:attr:`Process._context`
        attribute also provides additional flexibility. In this way,
        individual processes can provide additional information not covered
        in this method, without changing it.

        Parameters
        ----------
        kwargs : dict
            Dictionary with the keyword arguments for setting up the template
            context
        """

        if not self.pid:
            self.pid = "{}_{}".format(self.lane, kwargs.get("pid"))

        for i in self.status_channels:
            if i.startswith("STATUS_"):
                self.status_strs.append("{}_{}".format(i, self.pid))
            else:
                self.status_strs.append("STATUS_{}_{}".format(i, self.pid))

        if self.main_forks:
            logger.debug("Setting main fork channels: {}".format(
                self.main_forks))
            operator = "set" if len(self.main_forks) == 1 else "into"
            self.forks = ["\n{}.{}{{ {} }}\n".format(
                self.output_channel, operator, ";".join(self.main_forks))]

        self._context = {**kwargs, **{"input_channel": self.input_channel,
                                      "output_channel": self.output_channel,
                                      "template": self.template,
                                      "forks": "\n".join(self.forks),
                                      "pid": self.pid}}

    def update_main_input(self, input_str):

        self.input_channel = input_str
        self._context["input_channel"] = self.input_channel

    def update_main_forks(self, sink):
        """Updates the forks attribute with the sink channel destination

        Parameters
        ----------
        sink : str
            Channel onto which the main input will be forked to

        """

        if not self.main_forks:
            self.main_forks = [self.output_channel]
            self.output_channel = "_{}".format(self.output_channel)
        self.main_forks.append(sink)

        # fork_lst = self.forks + self.main_forks
        operator = "set" if len(self.main_forks) == 1 else "into"
        self.forks = ["\n{}.{}{{ {} }}\n".format(
            self.output_channel, operator, ";".join(self.main_forks))]

        self._context = {**self._context,
                         **{"forks": "".join(self.forks),
                            "output_channel": self.output_channel}}

    def set_secondary_channel(self, source, channel_list):
        """ General purpose method for setting a secondary channel

        This method allows a given source channel to be forked into one or
        more channels and adds that fork to the :py:attr:`Process.forks`
        attribute. The process ID (:py:attr:`Process.pid`) is appended to
        the source name, which means that this method should be called only
        after setting the main channels.

        Duplicated channels are removed and the remaining ones are sorted.
        A single destination channel is connected with the ``set`` operator,
        while several channels use the ``into`` operator::

            SOURCE_CHANNEL_1.into{ SINK_1;SINK_2 }

        The ``forks`` entry of the template context is updated with all the
        forks of the process.

        Parameters
        ----------
        source : str
            String with the name of the source channel
        channel_list : list
            List of channels that will receive a fork of the secondary
            channel
        """

        logger.debug("Setting secondary channel for source '{}': {}".format(
            source, channel_list))

        fork_source = "{}_{}".format(source, self.pid)

        # Removes possible duplicate channels, when the fork is terminal
        sinks = sorted(set(channel_list))

        # A single destination requires the 'set' operator instead of 'into'
        fork_op = "into" if len(sinks) != 1 else "set"
        fork_str = "\n{}.{}{{ {} }}\n".format(
            fork_source, fork_op, ";".join(sinks))
        self.forks.append(fork_str)

        logger.debug("Setting forks attribute to: {}".format(self.forks))

        updated_context = dict(self._context)
        updated_context["forks"] = "\n".join(self.forks)
        self._context = updated_context

    def update_attributes(self, attr_dict):
        """Updates the directives attribute from a dictionary object.

        This will only update the directives for processes that have been
        defined in the subclass.

        Parameters
        ----------
        attr_dict : dict
            Dictionary containing the attributes that will be used to update
            the process attributes and/or directives.

        """

        # Update directives
        # Allowed attributes to write
        valid_directives = ["pid", "ignore_type", "ignore_pid", "extra_input",
                            "group", "input_type"]

        for attribute, val in attr_dict.items():

            # If the attribute has a valid directive key, update that
            # directive
            if attribute in valid_directives and hasattr(self, attribute):
                setattr(self, attribute, val)

            # The params attribute is special, in the sense that it provides
            # information for the self.params attribute.
            elif attribute == "params":
                for name, value in val.items():
                    if name in self.params:
                        self.params[name]["default"] = value
                    else:
                        raise eh.ProcessError(
                            "The parameter name '{}' does not exist for "
                            "component '{}'".format(name, self.template))

            else:
                for p in self.directives:
                    self.directives[p][attribute] = val


class Compiler(Process):
    """Extends the Process methods to status-type processes

    Compiler processes ignore the input/output type requirements and have
    no starting points for secondary channels. Their template context only
    contains the ``compile_channels`` entry, set through the
    :py:func:`set_compiler_channels` method.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments forwarded to the ``Process`` constructor.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.ignore_type = True
        self.link_start = None

    def set_compiler_channels(self, channel_list, operator="mix"):
        """General method for setting the input channels for the status process

        Given a list of status channels that are gathered during the pipeline
        construction, this method will automatically set the input channel
        for the status process. A single channel is used as it is. Multiple
        channels are combined with the ``mix`` channel operator of
        nextflow::

            STATUS_1.mix(STATUS_2,STATUS_3,...)

        or chained with the ``join`` operator, followed by a ``map`` that
        groups all the joined elements after the first one::

            STATUS_1.join(STATUS_2).join(STATUS_3).map{ ... }

        This will replace the ``_context`` attribute of the process by a
        dictionary with the ``compile_channels`` key.

        Parameters
        ----------
        channel_list : list
            List of strings with the final name of the status channels
        operator : str
            Specifies the operator used to join the compiler channels.
            Available options are 'mix' and 'join'. It is only used when
            more than one channel is provided.

        Raises
        ------
        ProcessError
            When no channel is provided, or when several channels are
            provided with an unknown operator.
        """

        if not channel_list:
            raise eh.ProcessError("At least one status channel must be "
                                  "provided to include this process in the "
                                  "pipeline")

        if len(channel_list) == 1:
            logger.debug("Setting only one status channel: {}".format(
                channel_list[0]))
            self._context = {"compile_channels": channel_list[0]}
            return

        first_status = channel_list[0]
        other_channels = channel_list[1:]

        if operator == "mix":
            s = "{}.mix({})".format(first_status, ",".join(other_channels))

        elif operator == "join":
            s = first_status
            for ch in other_channels:
                s += ".join({})".format(ch)

            s += ".map{ ot -> [ ot[0], ot[1..-1] ] }"

        else:
            # Fail with a meaningful error instead of hitting an unbound
            # local variable further down
            raise eh.ProcessError(
                "Unknown operator '{}' for the compiler channels. Available "
                "options are 'mix' and 'join'".format(operator))

        logger.debug("Status channel string: {}".format(s))

        self._context = {"compile_channels": s}


class Init(Process):
    """Process that holds the input channel definitions of the pipeline

    This process has no input type, outputs the ``raw`` type and produces
    no status channels. Its template context gathers the main (raw) inputs,
    the secondary inputs and the extra inputs through the ``main_inputs``,
    ``secondary_inputs`` and ``extra_inputs`` entries, respectively.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments forwarded to the ``Process`` constructor.
    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = None
        self.output_type = "raw"

        self.status_channels = []

    def set_raw_inputs(self, raw_input):
        """Sets the main input channels of the pipeline and their forks.

        The ``raw_input`` dictionary input should contain one entry for each
        input type (fastq, fasta, etc). The corresponding value should be a
        dictionary/json with the following key:values:

        - ``channel``: Name of the raw input channel (e.g.: channel1)
        - ``channel_str``: The nextflow definition of the channel and
          eventual checks (e.g.: channel1 = Channel.fromPath(param))
        - ``raw_forks``: A list of channels to which the raw input channel
          will fork to.

        Each new type of input parameter is automatically added to the
        :attr:`params` attribute, so that they are automatically collected
        for the pipeline description and help.

        Parameters
        ----------
        raw_input : dict
            Contains an entry for each input type with the channel name,
            channel string and forks.
        """

        logger.debug("Setting raw inputs using raw input dict: {}".format(
            raw_input))

        channel_definitions = []

        for input_type, channel_info in raw_input.items():

            channel_definitions.append(channel_info["channel_str"])

            # Register the raw input as a parameter of the process
            mapping = self.RAW_MAPPING[input_type]
            self.params[input_type] = {
                "default": mapping["default_value"],
                "description": mapping["description"]
            }

            # A single destination requires 'set' instead of 'into'
            sinks = channel_info["raw_forks"]
            fork_op = "into" if len(sinks) != 1 else "set"
            fork_str = "\n{}.{}{{ {} }}\n".format(
                channel_info["channel"], fork_op, ";".join(sinks))
            self.forks.append(fork_str)

        logger.debug("Setting raw inputs: {}".format(channel_definitions))
        logger.debug("Setting forks attribute to: {}".format(self.forks))

        self._context = dict(
            self._context,
            forks="\n".join(self.forks),
            main_inputs="\n".join(channel_definitions)
        )

    def set_secondary_inputs(self, channel_dict):
        """ Adds secondary inputs to the start of the pipeline.

        The channels are inserted into the pipeline file as they are
        provided in the values of the argument, one per line.

        Parameters
        ----------
        channel_dict : dict
            Each entry should be <parameter>: <channel string>.
        """

        logger.debug("Setting secondary inputs: {}".format(channel_dict))

        self._context = dict(
            self._context,
            secondary_inputs="\n".join(channel_dict.values())
        )

    def set_extra_inputs(self, channel_dict):
        """Sets the initial definition of the extra input channels.

        The ``channel_dict`` argument should contain the input type and
        destination channel of each parameter (which is the key)::

            channel_dict = {
                "param1": {
                    "input_type": "fasta"
                    "channels": ["abricate_2_3", "chewbbaca_3_4"]
                }
            }

        Each parameter is also added to the :attr:`params` attribute, using
        the default value and description of its input type.

        Parameters
        ----------
        channel_dict : dict
            Dictionary with the extra_input parameter as key, and a dictionary
            as a value with the input_type and destination channels
        """

        definitions = []

        for param, info in channel_dict.items():

            # Register the extra input as a parameter of the process
            mapping = self.RAW_MAPPING[info["input_type"]]
            self.params[param] = {
                "default": mapping["default_value"],
                "description": mapping["description"]
            }

            extra_channel = "IN_{}_extraInput".format(param)
            definition = mapping["channel_str"].format(param)
            definitions.append("{} = {}".format(extra_channel, definition))

            # A single destination requires 'set' instead of 'into'
            sinks = info["channels"]
            fork_op = "into" if len(sinks) != 1 else "set"
            definitions.append("{}.{}{{ {} }}".format(
                extra_channel, fork_op, ";".join(sinks)))

        self._context = dict(
            self._context,
            extra_inputs="\n".join(definitions)
        )


class StatusCompiler(Compiler):
    """Status compiler process template interface

    This special process receives the status channels from all processes
    in the generated pipeline.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments forwarded unchanged to the ``Compiler``
        constructor.
    """

    def __init__(self, **kwargs):

        # No additional state: initialization is fully delegated to Compiler
        super().__init__(**kwargs)


class ReportCompiler(Compiler):
    """Reports compiler process template interface

    This special process receives the report channels from all processes
    in the generated pipeline.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments forwarded unchanged to the ``Compiler``
        constructor.
    """

    def __init__(self, **kwargs):

        # No additional state: initialization is fully delegated to Compiler
        super().__init__(**kwargs)


class PatlasConsensus(Compiler):
    """Patlas consensus compiler process template interface

    This special process receives the channels associated with the
    ``patlas_consensus`` key.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments forwarded unchanged to the ``Compiler``
        constructor.
    """

    def __init__(self, **kwargs):

        # No additional state: initialization is fully delegated to Compiler
        super().__init__(**kwargs)


================================================
FILE: flowcraft/generator/process_collector.py
================================================
import re
import pkgutil

try:
    from generator import components
except ImportError:
    from flowcraft.generator import components


def convert_camel_case(name):
    """Converts a CamelCase string into a snake_case one

    Parameters
    ----------
    name : str
        An arbitrary string that may be CamelCase

    Returns
    -------
    str
        The input string converted into snake_case

    """
    # First pass: underscore before each capitalized word that follows
    # another character (e.g. "CamelCase" -> "Camel_Case")
    words_split = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)

    # Second pass: underscore between a lower case letter or digit and the
    # capital letter that follows it (e.g. "FastQC" -> "Fast_QC")
    fully_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', words_split)

    return fully_split.lower()


def collect_process_map():
    """Collects Process classes and return dict mapping templates to classes

    This function crawls through the modules of the components package and
    retrieves every class found in the namespace of each module, except the
    ones named ``Process``. Then, it converts the name of the classes (which
    should be CamelCase) to snake_case, which is used as the template name.

    Returns
    -------
    dict
        Dictionary mapping the template name (snake_case) to the corresponding
        process class.
    """

    # Local import: keeps this function self-contained without touching the
    # module level imports
    import importlib

    process_map = {}

    prefix = "{}.".format(components.__name__)
    for _, modname, _ in pkgutil.iter_modules(components.__path__, prefix):

        # ``modname`` is already fully qualified thanks to the prefix.
        # importlib.import_module replaces the deprecated
        # find_module()/load_module() importer API, which was removed from
        # the standard finders in python 3.12.
        module = importlib.import_module(modname)

        for obj in list(vars(module).values()):
            if isinstance(obj, type) and obj.__name__ != "Process":
                process_map[convert_camel_case(obj.__name__)] = obj

    return process_map


================================================
FILE: flowcraft/generator/process_details.py
================================================
import logging
import sys

logger = logging.getLogger("main.{}".format(__name__))

COLORS = {
    "green_bold": "1;32m",
    "red_bold": "1;31m",
    "white": "0;38m",
    "white_bold": "1;38m",
    "white_underline": "4;38m",
    "blue_bold": "1;36m",
    "purple_bold": "1;34m",
    "yellow_bold": "1;93m"
}


def colored_print(msg, color_label="white_bold"):
    """
    This function adds a color to a message. Despite its name, nothing is
    printed: the message is returned wrapped in the ANSI escape sequences
    that set the color and reset it afterwards, ready to be printed or
    logged by the caller.

    When the standard output does not use an UTF-8 encoding, the non-ASCII
    characters are removed from the message.

    Parameters
    ----------
    msg: str
        The actual text to be colored
    color_label: str
        Either one of the labels available in the COLORS dictionary or the
        color code itself (e.g. "1;31m"), which enables color change as
        well as background color change.

    Returns
    -------
    str
        The message wrapped in the ANSI color escape sequences.

    """

    # The encoding name reported by python changes between versions and
    # platforms ("UTF-8", "utf-8", "utf8") and it may be missing when the
    # standard output was replaced, so it is normalized before the check.
    encoding = getattr(sys.stdout, "encoding", None) or ""
    if encoding.lower().replace("_", "-") not in ("utf-8", "utf8"):
        msg = "".join(char for char in msg if ord(char) < 128)

    # Looks for the color in the COLORS dictionary, otherwise uses
    # color_label as the color code.
    col = COLORS.get(color_label, color_label)

    return "\x1b[{}{}\x1b[0m".format(col, msg)


def procs_dict_parser(procs_dict):
    """
    Logs a listing of the provided components, sorted alphabetically by
    template name. For every component, each stored attribute is logged on
    its own line next to an underlined attribute label.

    Parameters
    ----------
    procs_dict: dict
        A dictionary with the class attributes for all the components (or
        components that are used by the -t flag), that allow to create
        both the short_list and detailed_list. Dictionary example:
        {"abyss": {'input_type': 'fastq', 'output_type': 'fasta',
        'dependencies': [], 'directives': {'abyss': {'cpus': 4,
        'memory': '{ 5.GB * task.attempt }', 'container': 'flowcraft/abyss',
        'version': '2.1.1', 'scratch': 'true'}}}
    """

    def _render(key, value):
        # Lists are shown comma separated; an empty list is shown as "None"
        if isinstance(value, list):
            return ", ".join(value) if value else "None"

        # Any other attribute besides "directives" is shown as is
        if key != "directives":
            return value

        # "directives" is a dict of dicts: one indented line per template
        # with all its "directive: value" pairs
        if not value:
            return "None"

        return "".join(
            "\n      {}: {}".format(
                templt,
                " , ".join("{}: {}".format(dr, val)
                           for dr, val in drs.items()))
            for templt, drs in value.items()
        )

    logger.info(colored_print(
        "\n===== L I S T   O F   P R O C E S S E S =====\n", "green_bold"))

    # Alphabetical order eases reading of the list
    for template in sorted(procs_dict):
        logger.info(colored_print("=> {}".format(template), "blue_bold"))

        for info, value in procs_dict[template].items():
            arg_msg = _render(info, value)
            label = colored_print("{}:".format(info), "white_underline")
            logger.info("   {} {}".format(label, arg_msg))


def proc_collector(process_map, args, pipeline_string):
    """
    Gathers the attributes of the available processes that are relevant for
    the requested listing, hands them to procs_dict_parser and exits the
    program. When no listing was requested, nothing is done.

    Parameters
    ----------
    process_map: dict
        The dictionary with the Processes currently available in flowcraft
        and their corresponding classes as values
    args: argparse.Namespace
        The arguments passed through argparser that will be access to check the
        type of list to be printed
    pipeline_string: str
        the pipeline string. When not empty, only processes whose name occurs
        in this string are listed.

    """

    wanted = []

    # the detailed list shows most of the process class attributes
    if args.detailed_list:
        wanted.extend([
            "input_type",
            "output_type",
            "description",
            "dependencies",
            "conflicts",
            "directives"
        ])

    # the short list shows only the description of each process
    if args.short_list:
        wanted.append("description")

    # no listing was requested
    if not wanted:
        return

    collected = {}
    for name, cls in process_map.items():

        # every Process class is instantiated to read its attributes
        instance = cls(template=name)

        # when a pipeline string is provided, skip processes not present in it
        if pipeline_string and name not in pipeline_string:
            continue

        collected[name] = {
            key: value for key, value in vars(instance).items()
            if key in wanted
        }

    procs_dict_parser(collected)

    sys.exit(0)


================================================
FILE: flowcraft/generator/recipe.py
================================================
try:
    from generator.process_details import colored_print
    import generator.error_handling as eh
    from generator import recipes
except ImportError:
    from flowcraft.generator.process_details import colored_print
    import flowcraft.generator.error_handling as eh
    from flowcraft.generator import recipes

from collections import OrderedDict
import sys
import json
import logging
import pkgutil

logger = logging.getLogger("main.{}".format(__name__))


class InnuendoRecipe:
    """Base class that builds a pipeline string from a list of processes.

    Subclasses are expected to fill the ``process_descriptions`` attribute,
    which maps each process name to a list with three positions:
    ``[forkable, input_process, output_process]``. The input and output
    positions are either ``None`` or a string where alternative processes
    are separated by ``|``.
    """

    def __init__(self):
        """Class to build automatic pipelines based on the processes provided.

        This class provides the methods to build the most efficient pipeline
        based on the processes provided. It automatically creates the
        flowcraft pipeline string based on the relationships between the
        possible processes.

        """

        self.count_forks = 0
        """
        int : counts the total possible number of forks
        """

        self.forks = []
        """
        list : a list with all the possible forks
        """

        self.pipeline_string = ""
        """
        str : the generated pipeline string
        """

        self.process_to_id = {}
        """
        dict: key value between the process name and its identifier
        """

        self.process_descriptions = {}
        """
        dict: maps each process name to
        [forkable, input_process, output_process]
        """

    @staticmethod
    def validate_pipeline(pipeline_string):
        """Validate pipeline string

        Validates the pipeline string by searching for forbidden characters
        ("(", ")" and "|").

        Parameters
        ----------
        pipeline_string : str
            String with the processes provided

        Returns
        -------
        bool
            False (after logging an error) when a forbidden character is
            present, True otherwise.

        """
        if any(char in pipeline_string for char in "()|"):
            logger.error(
                colored_print("Please provide a valid task list!", "red_bold")
            )
            return False

        return True

    def build_upstream(self, process_descriptions, task, all_tasks,
                       task_pipeline,
                       count_forks, total_tasks, forks):
        """Builds the upstream pipeline of the current process

        Checks for the upstream processes to the current process and
        adds them to the current pipeline fragment if they were provided in
        the process list. When the single upstream process is missing from
        the provided tasks, an error is logged and the program exits with
        status 1.

        Parameters
        ----------
        process_descriptions : dict
            Information of processes input, output and if is forkable
        task : str
            Current process
        all_tasks : list
            A list of all provided processes
        task_pipeline : list
            Current pipeline fragment
        count_forks : int
            Current number of forks
        total_tasks : str
            All space separated processes
        forks : list
            Current forks
        Returns
        -------
        list : resulting pipeline fragment
        """
        # Unknown process: nothing to add, hand the fragment back untouched
        if task not in process_descriptions:
            return task_pipeline

        upstream = process_descriptions[task][1]

        # No input process: the upstream end of the fragment was reached
        if upstream is None:
            return task_pipeline

        local_forks = upstream.split("|")

        if len(local_forks) > 1:
            # Produces a new pipeline fragment for each forkable process
            for local_fork in local_forks:
                # NOTE: total_tasks is a string, so this is a substring test
                if local_fork in total_tasks:
                    count_forks += 1
                    task_pipeline.insert(0, upstream)
                    self.define_pipeline_string(
                        process_descriptions,
                        local_fork,
                        False,
                        True,
                        count_forks,
                        total_tasks,
                        forks
                    )

            return task_pipeline

        # Adds the process to the pipeline fragment in case it is provided in
        # the task list
        if upstream in total_tasks:
            task_pipeline.insert(0, upstream)

            # Proceeds building upstream until the input for a process is None
            self.build_upstream(
                process_descriptions,
                upstream,
                all_tasks,
                task_pipeline,
                count_forks,
                total_tasks,
                forks
            )
        else:
            logger.error(
                colored_print("{} not in provided protocols as "
                              "input for {}".format(upstream, task),
                              "red_bold")
            )

            # This is a fatal error, so exit with a non-zero status
            sys.exit(1)

        return task_pipeline

    def build_downstream(self, process_descriptions, task, all_tasks,
                         task_pipeline,
                         count_forks, total_tasks, forks):
        """Builds the downstream pipeline of the current process

        Checks for the downstream processes to the current process and
        adds them to the current pipeline fragment.

        Parameters
        ----------
        process_descriptions : dict
            Information of processes input, output and if is forkable
        task : str
            Current process
        all_tasks : list
            A list of all provided processes
        task_pipeline : list
            Current pipeline fragment
        count_forks : int
            Current number of forks
        total_tasks : str
            All space separated processes
        forks : list
            Current forks
        Returns
        -------
        list : resulting pipeline fragment
        """
        # Unknown process: nothing to add, hand the fragment back untouched
        if task not in process_descriptions:
            return task_pipeline

        downstream = process_descriptions[task][2]

        # No output process: the downstream end of the fragment was reached
        if downstream is None:
            return task_pipeline

        local_forks = downstream.split("|")

        if len(local_forks) > 1:
            # Adds the process to the pipeline fragment downstream
            # and defines a new pipeline fragment for each fork.
            # Those will only look for downstream processes
            for local_fork in local_forks:
                # NOTE: total_tasks is a string, so this is a substring test
                if local_fork in total_tasks:
                    count_forks += 1
                    task_pipeline.append(downstream)
                    self.define_pipeline_string(
                        process_descriptions,
                        local_fork,
                        False,
                        True,
                        count_forks,
                        total_tasks,
                        forks
                    )

            return task_pipeline

        # A downstream process missing from the provided tasks simply ends
        # the fragment; unlike the upstream case, it is not an error.
        if downstream in total_tasks:
            task_pipeline.append(downstream)

            # Proceeds building downstream until the output for a process is
            # None
            self.build_downstream(
                process_descriptions,
                downstream,
                all_tasks,
                task_pipeline,
                count_forks,
                total_tasks,
                forks
            )

        return task_pipeline

    def define_pipeline_string(self, process_descriptions, tasks,
                               check_upstream,
                               check_downstream, count_forks, total_tasks,
                               forks):
        """Builds the possible forks and connections between the provided
        processes

        This method loops through all the provided tasks and builds the
        upstream and downstream pipeline if required. It then returns all
        possible forks that need to be merged afterwards. A task that is not
        present in ``process_descriptions`` is a fatal error: it is logged
        and the program exits with status 1.

        Parameters
        ----------
        process_descriptions : dict
            Information of processes input, output and if is forkable
        tasks : str
            Space separated processes. Each one may carry an identifier in
            the form ``name=id``, which is stored in ``process_to_id``.
        check_upstream : bool
            If is to build the upstream pipeline of the current task
        check_downstream : bool
            If is to build the downstream pipeline of the current task
        count_forks : int
            Number of current forks
        total_tasks : str
            All space separated processes
        forks : list
            Current forks. This list is modified in place.

        Returns
        -------
        list : List with all the possible pipeline forks
        """

        tasks_array = tasks.split()

        for task_unsplit in tasks_array:
            process_split = task_unsplit.split("=")
            task = process_split[0]

            if task not in process_descriptions:
                logger.error(
                    colored_print(
                        "{} not in the possible processes".format(task),
                        "red_bold"
                    )
                )

                # This is a fatal error, so exit with a non-zero status
                sys.exit(1)

            if len(process_split) > 1:
                self.process_to_id[task] = process_split[1]

            downstream = process_descriptions[task][2]
            task_in_forks = any(task in fork for fork in forks)
            downstream_in_forks = any(downstream in fork for fork in forks)

            # Only uses the process if it is not already in the possible forks
            if not task_in_forks and not downstream_in_forks:
                task_pipeline = []

                if check_upstream:
                    task_pipeline = self.build_upstream(
                        process_descriptions,
                        task,
                        tasks_array,
                        task_pipeline,
                        count_forks,
                        total_tasks,
                        forks
                    )

                task_pipeline.append(task)

                if check_downstream:
                    task_pipeline = self.build_downstream(
                        process_descriptions,
                        task,
                        tasks_array,
                        task_pipeline,
                        count_forks,
                        total_tasks,
                        forks
                    )

                # Adds the pipeline fragment to the list of possible forks,
                # dropping repeated entries while keeping their order
                forks.append(list(OrderedDict.fromkeys(task_pipeline)))

            # Checks for task in fork. Case order of input processes is
            # reversed: the task is placed right before its output process
            elif downstream_in_forks:
                for fork in forks:
                    if task not in fork:
                        try:
                            dependent_index = fork.index(downstream)
                            fork.insert(dependent_index, task)
                        except ValueError:
                            continue

        # Expands every "a|b|c" entry into the list of alternatives that were
        # actually provided in the tasks
        for fork in forks:
            for position, element in enumerate(fork):
                try:
                    options = element.split("|")
                except AttributeError:
                    # Entry without a split method (e.g. already a list)
                    continue

                if len(options) > 1:
                    fork[position] = [s for s in options if s in total_tasks]

        return forks

    def build_pipeline_string(self, forks):
        """Parses, filters and merge all possible pipeline forks into the
        final pipeline string

        This method checks for shared start and end sections between forks
        and merges them according to the shared processes::

            [[spades, ...], [skesa, ...], [...,[spades, skesa]]]
                -> [..., [[spades, ...], [skesa, ...]]]

        Then it defines the pipeline string by replacing the arrays levels
        to the flowcraft fork format::

            [..., [[spades, ...], [skesa, ...]]]
                -> ( ... ( spades ... | skesa ... ) )

        Finally, process names are replaced by ``name=id`` for every process
        registered in ``process_to_id``.

        Parameters
        ----------
        forks : list
            List with all the possible pipeline forks. This list is modified
            in place while merging.

        Returns
        -------
        str : String with the pipeline definition used as input for
        parse_pipeline
        """

        final_forks = []

        for i in range(0, len(forks)):
            # Slots: [merge found, i, i2, j, j2, shared process]
            needs_merge = [False, 0, 0, 0, 0, ""]
            is_merged = False
            for i2 in range(0, len(forks[i])):
                for j in range(i, len(forks)):
                    needs_merge[0] = False
                    for j2 in range(0, len(forks[j])):
                        try:
                            j2_fork = forks[j][j2].split("|")
                        except AttributeError:
                            j2_fork = forks[j][j2]

                        # Gets the indexes of the forks matrix that need to
                        # be merged
                        if forks[i][i2] in j2_fork \
                                and (i2 == 0 or j2 == 0) and i != j:
                            needs_merge[0] = True
                            needs_merge[1] = i
                            needs_merge[2] = i2
                            needs_merge[3] = j
                            needs_merge[4] = j2
                            needs_merge[5] = forks[i][i2]

                    if needs_merge[0]:
                        index_merge_point = \
                            forks[needs_merge[3]][-1].index(needs_merge[5])

                        # Merges the forks. If only one fork is possible,
                        # that fork is neglected and it merges into a single
                        # channel.
                        if needs_merge[2] == 0 or needs_merge[4] == 0:
                            if len(forks[needs_merge[3]][-1]) < 2:
                                forks[needs_merge[3]] = \
                                    forks[needs_merge[3]][:-1] + \
                                    forks[needs_merge[1]][:]
                            else:
                                forks[needs_merge[3]][-1][index_merge_point] \
                                    = forks[needs_merge[1]]

                        is_merged = True

            # Adds forks that dont need merge to the final forks
            if not is_merged:
                if any("|" in nf for nf in forks[i]):
                    continue
                final_forks.append(forks[i])

        if len(final_forks) == 1:
            final_forks = str(final_forks[0])

        # parses the string array to the flowcraft nomenclature
        pipeline_string = " " + str(final_forks)\
            .replace("[[", "( ")\
            .replace("]]", " )")\
            .replace("]", " |")\
            .replace(", [", " ")\
            .replace("'", "")\
            .replace(",", "")\
            .replace("[", "")

        # Drops the trailing fork separator left by the replacements above
        if pipeline_string[-1] == "|":
            pipeline_string = pipeline_string[:-1]

        to_search = " {} "
        to_replace = " {}={} "

        # Replace only names by names + process ids. Names are searched
        # surrounded by spaces so that only whole names are replaced.
        for key, val in self.process_to_id.items():
            pipeline_string = pipeline_string\
                .replace(to_search.format(key),
                         to_replace.format(key, val))

        return pipeline_string

    def run_auto_pipeline(self, tasks):
        """Main method to run the automatic pipeline creation

        This method aggregates the functions required to build the pipeline
        string that can be used as input for the workflow generator.

        Parameters
        ----------
        tasks : str
            A string with the space separated tasks to be included in the
            pipeline

        Returns
        -------
        str : String with the pipeline definition used as input for
        parse_pipeline
        """

        self.forks = self.define_pipeline_string(
            self.process_descriptions,
            tasks,
            True,
            True,
            self.count_forks,
            tasks,
            self.forks
        )

        self.pipeline_string = self.build_pipeline_string(self.forks)

        return self.pipeline_string

    # def get_process_info(self):
    #     return list(self.process_descriptions.keys())


class Innuendo(InnuendoRecipe):
    """
    Recipe for the INNUENDO Project. It declares the processes available in
    the platform and how they connect to each other, so that pipelines in the
    scope of the project can be assembled quickly.
    """

    def __init__(self, *args, **kwargs):

        super().__init__(*args, **kwargs)

        # Each entry reads: name -> [forkable, input_process, output_process]
        # Alternative processes are separated by "|". The order of the
        # entries is kept as is, since it defines the default task list.
        self.process_descriptions = {
            "reads_download": [
                False, None, "integrity_coverage|seq_typing|patho_typing"],
            "patho_typing": [True, None, None],
            "seq_typing": [True, None, None],
            "integrity_coverage": [True, None, "fastqc_trimmomatic"],
            "fastqc_trimmomatic": [
                False, "integrity_coverage", "true_coverage"],
            "true_coverage": [False, "fastqc_trimmomatic", "fastqc"],
            "fastqc": [False, "true_coverage", "check_coverage"],
            "check_coverage": [False, "fastqc", "spades|skesa"],
            "spades": [False, "fastqc_trimmomatic", "process_spades"],
            "skesa": [False, "fastqc_trimmomatic", "process_skesa"],
            "process_spades": [False, "spades", "assembly_mapping"],
            "process_skesa": [False, "skesa", "assembly_mapping"],
            "assembly_mapping": [False, "process_spades", "pilon"],
            "pilon": [False, "assembly_mapping", "mlst"],
            "mlst": [False, "pilon", "abricate|prokka|chewbbaca|sistr"],
            "sistr": [True, "mlst", None],
            "abricate": [True, "mlst", None],
            # "prokka": [True, "mlst", None],
            "chewbbaca": [True, "mlst", None],
        }


def brew_innuendo(args):
    """Builds the pipeline string of the Innuendo recipe for the given tasks

    When no tasks are provided, every process described in the recipe is
    used. The program exits with status 1 when the task list does not pass
    the recipe validation.

    Parameters
    ----------
    args : argparse.Namespace
        The arguments passed through argparser. Only the ``tasks`` attribute
        is read here.

    Returns
    -------
    str
        The final pipeline string, ready for the engine.
    """

    automatic_pipeline = Innuendo()

    # Fall back to all the processes known by the recipe
    if args.tasks:
        input_processes = args.tasks
    else:
        input_processes = " ".join(
            automatic_pipeline.process_descriptions.keys())

    # Refuse task lists that contain forbidden characters
    if not automatic_pipeline.validate_pipeline(input_processes):
        sys.exit(1)

    return automatic_pipeline.run_auto_pipeline(input_processes)


class Recipe:
    """Base class of the recipes.

    A recipe is expected to set a ``name`` attribute, a ``pipeline_str``
    attribute and, optionally, a ``directives`` attribute. The ``brew``
    method combines them into the final pipeline string.
    """

    def __init__(self):

        self.pipeline_str = None
        """
        str: The raw pipeline string, with no attribute or directives, except
        for number indicators for when there are duplicate components.

        e.g.: "fastqc trimmomatic spades"
        e.g.: "fastqc trimmomatic (spades#1 | spades#2)
        """

        self.directives = {}
        """
        dict: Dictionary with the parameters and directives for each component
        in the pipeline_str attribute. Missing components will be left with
        the default parameters and directives.
        """

    def brew(self):
        """Adds the parameters and directives to the pipeline string

        Each component of ``pipeline_str`` that has an entry in the
        ``directives`` attribute is replaced by its component string (see
        ``_get_component_str``). A number indicator (e.g. ``spades#1``) is
        used to find the component in the pipeline string but it is not
        included in the resulting component string. The result is stored
        back in ``pipeline_str``.

        Returns
        -------
        str
            Pipeline string with the parameters and directives of each
            component.

        Raises
        ------
        eh.RecipeError
            When the recipe has no ``name`` attribute or when the
            ``pipeline_str`` attribute is empty.
        """
        # Local import: ``re`` is not among the module level imports
        import re

        if not hasattr(self, "name"):
            raise eh.RecipeError("Recipe class '{}' does not have a 'name' "
                                 "attribute set".format(self.__class__))

        if not self.pipeline_str:
            raise eh.RecipeError("Recipe with name '{}' does not have a "
                                 "pipeline_str attribute set".format(self.name))

        replacements = {}
        for component, vals in self.directives.items():

            # Removes the component number symbol, when present
            _component = component.split("#")[0]

            replacements[component] = self._get_component_str(
                _component, vals.get("params"), vals.get("directives"))

        if replacements:
            # All components are replaced in a single pass, matching only
            # whole names. This prevents a component from being replaced
            # inside a longer name (e.g. "fastqc" in "fastqc_trimmomatic")
            # or inside the text inserted for another component.
            names = sorted(replacements, key=len, reverse=True)
            pattern = r"(?<![\w#])(?:{})(?![\w#])".format(
                "|".join(re.escape(name) for name in names))

            # A function is used as replacement so that backslashes in the
            # component string are inserted literally
            self.pipeline_str = re.sub(
                pattern,
                lambda match: replacements[match.group(0)],
                self.pipeline_str)

        return self.pipeline_str

    @staticmethod
    def _get_component_str(component, params=None, directives=None):
        """ Generates a component string based on the provided parameters and
        directives

        Parameters
        ----------
        component : str
            Component name
        params : dict
            Dictionary with parameter information
        directives : dict
            Dictionary with directives information. It is not modified.

        Returns
        -------
        str
            Component string with the parameters and directives, ready for
            parsing by flowcraft engine
        """

        # Works on a copy, so that adding the "params" key below does not
        # change the dictionary provided by the caller
        final_directives = dict(directives) if directives else {}

        if params:
            final_directives["params"] = params

        if final_directives:
            return "{}={}".format(
                component, json.dumps(final_directives, separators=(",", ":")))

        return component


def brew_recipe(recipe_name):
    """Returns a pipeline string from a recipe name.

    Every class found in the modules of the recipes subpackage is
    instantiated and the first one whose ``name`` attribute matches the
    provided name is brewed. When there is no match, an error is logged and
    the program exits with status 1.

    Parameters
    ----------
    recipe_name : str
        Name of the recipe. Must match the name attribute in one of the classes
        defined in :mod:`flowcraft.generator.recipes`

    Returns
    -------
    str
        Pipeline string ready for parsing and processing by flowcraft engine
    """
    # Local import: the finder/loader ``find_module``/``load_module`` API that
    # was used here previously was removed in Python 3.12.
    import importlib

    # This will iterate over all modules included in the recipes subpackage
    # It will return the module name, along with the correct prefix, which
    # makes it an absolute module path that can be imported directly
    prefix = "{}.".format(recipes.__name__)
    for _, modname, _ in pkgutil.iter_modules(recipes.__path__, prefix):

        # Import the current module
        _module = importlib.import_module(modname)

        # Fetch all available classes in module
        _recipe_classes = [cls for cls in vars(_module).values() if
                           isinstance(cls, type)]

        # Iterate over each Recipe class, and check for a match with the
        # provided recipe name.
        for cls in _recipe_classes:
            # Create instance of class to allow fetching the name attribute
            recipe_cls = cls()
            if getattr(recipe_cls, "name", None) == recipe_name:
                return recipe_cls.brew()

    logger.error(
        colored_print("Recipe name '{}' does not exist.".format(recipe_name))
    )
    sys.exit(1)


def list_recipes(full=False):
    """Method that iterates over all available recipes and logs their
    information. The program exits with status 0 afterwards.

    Parameters
    ----------
    full : bool
        If true, it will provide the recipe documentation and the pipeline
        string along with the recipe name
    """
    # Local import: the finder/loader ``find_module``/``load_module`` API that
    # was used here previously was removed in Python 3.12.
    import importlib

    logger.info(colored_print(
        "\n===== L I S T   O F   R E C I P E S =====\n",
        "green_bold"))

    # This will iterate over all modules included in the recipes subpackage
    # It will return the module name, along with the correct prefix, which
    # makes it an absolute module path that can be imported directly
    prefix = "{}.".format(recipes.__name__)
    for _, modname, _ in pkgutil.iter_modules(recipes.__path__, prefix):

        # Import the current module
        _module = importlib.import_module(modname)

        # Fetch all available classes in module
        _recipe_classes = [cls for cls in vars(_module).values() if
                           isinstance(cls, type)]

        # Iterate over each class. Only the ones with a name attribute are
        # listed.
        for cls in _recipe_classes:

            recipe_cls = cls()

            if hasattr(recipe_cls, "name"):
                logger.info(colored_print(
                    "=> {}".format(recipe_cls.name), "blue_bold"))
                if full:
                    logger.info(colored_print(
                        "\t {}".format(recipe_cls.__doc__), "purple_bold"))
                    logger.info(colored_print(
                        "Pipeline string: {}\n".format(
                            recipe_cls.pipeline_str),
                        "yellow_bold"))

    sys.exit(0)


================================================
FILE: flowcraft/generator/recipes/__init__.py
================================================


================================================
FILE: flowcraft/generator/recipes/denim.py
================================================
try:
    from generator.recipe import Recipe
except ImportError:
    from flowcraft.generator.recipe import Recipe


class Denim(Recipe):
    """
    DEN-IM: Dengue Virus Identification from Metagenomic and Targeted Sequencing
    Standalone version available at https://github.com/assemblerflow/DEN-IM
    """

    def __init__(self):
        # Initialise the base recipe first, like every other recipe class
        # does, so that any state set up by ``Recipe`` is available here too.
        super().__init__()

        # Recipe name
        self.name = "denim"

        # Recipe pipeline: space separated components in execution order
        self.pipeline_str = "integrity_coverage " \
                            "fastqc_trimmomatic " \
                            "filter_poly " \
                            "bowtie " \
                            "retrieve_mapped " \
                            "check_coverage " \
                            "viral_assembly " \
                            "assembly_mapping " \
                            "pilon " \
                            "split_assembly " \
                            "dengue_typing " \
                            "mafft " \
                            "raxml"

        # Recipe parameters and directives, keyed by component name
        self.directives = {
            "integrity_coverage": {
                "params": {"genomeSize": "0.012", "minCoverage": "15"}
            },
            "check_coverage": {
                "params": {"genomeSize": "0.012", "minCoverage": "15"}
            },
            "bowtie": {
                "directives": {"container": "flowcraft/bowtie_dengue",
                               "version": "2-1"},
                "params": {
                    "reference": "\"ref/1_GenotypesDENV_14-05-18.fasta\""}
            },
            "assembly_mapping": {
                "params": {"AMaxContigs": "1000", "genomeSize": "0.01"}
            },
            "split_assembly": {
                "params": {"size": "10000"}
            }
        }

================================================
FILE: flowcraft/generator/recipes/innuca.py
================================================
try:
    from generator.recipe import Recipe
except ImportError:
    from flowcraft.generator.recipe import Recipe


class Innuca(Recipe):
    """
    Bacterial genome assembly pipeline based on the SPAdes assembler and using
    pre-assembly quality control and read trimming and post-assembly polishing
    with Pilon
    """

    def __init__(self):
        super().__init__()

        # Identifier of this recipe
        self.name = "innuca"

        # Components in execution order. The trailing space after the last
        # component is part of the original value and is kept on purpose.
        self.pipeline_str = (
            "integrity_coverage "
            "fastqc_trimmomatic "
            "fastqc "
            "check_coverage "
            "true_coverage "
            "spades "
            "process_spades "
            "pilon "
            "mlst "
        )

        # Settings applied to the integrity_coverage component
        integrity_coverage_settings = {
            "directives": {"cpus": "1", "memory": "\"2GB\""},
            "params": {"genomeSize": "1", "minCoverage": "15"},
        }

        # Recipe parameters and directives, keyed by component name
        self.directives = {
            "integrity_coverage": integrity_coverage_settings,
        }


================================================
FILE: flowcraft/generator/recipes/plasmids.py
================================================
try:
    from generator.recipe import Recipe
except ImportError:
    from flowcraft.generator.recipe import Recipe


class Plasmids(Recipe):
    """
    Plasmid detection pipeline using mapping, mash_screen and assembly with
    SPAdes, with gene annotations with abricate. Outputs json files that
    can be imported into pATLAS.
    """

    def __init__(self):
        super().__init__()

        # Identifier of this recipe
        self.name = "plasmids"

        # Two shared components followed by a parenthesised group whose
        # branches are separated by "|"
        self.pipeline_str = (
            "integrity_coverage "
            "fastqc_trimmomatic "
            "( spades pilon (mash_dist | abricate) |"
            "mash_screen | "
            "mapping_patlas)"
        )

        # Recipe parameters and directives, keyed by component name
        coverage_params = {"genomeSize": "0"}
        self.directives = {
            "integrity_coverage": {"params": coverage_params},
        }


class PlasmidsMapping(Recipe):
    """
    Plasmid detection pipeline using mapping with bowtie2. Outputs json
    files that can be imported into pATLAS.
    """

    def __init__(self):
        super().__init__()

        # Identifier of this recipe
        self.name = "plasmids_mapping"

        # Components in execution order, joined by single spaces
        components = (
            "integrity_coverage",
            "fastqc_trimmomatic",
            "mapping_patlas",
        )
        self.pipeline_str = " ".join(components)

        # Recipe parameters and directives, keyed by component name
        coverage_params = {"genomeSize": "0"}
        self.directives = {
            "integrity_coverage": {"params": coverage_params},
        }


class PlasmidsAssembly(Recipe):
    """
    Plasmid detection pipeline using assembly with SPAdes and mash dist.
    Outputs json files that can be imported into pATLAS.
    """

    def __init__(self):
        super().__init__()

        # Identifier of this recipe
        self.name = "plasmids_assembly"

        # Components in execution order, joined by single spaces
        components = (
            "integrity_coverage",
            "fastqc_trimmomatic",
            "spades",
            "pilon",
            "mash_dist",
        )
        self.pipeline_str = " ".join(components)

        # Recipe parameters and directives, keyed by component name
        coverage_params = {"genomeSize": "0"}
        self.directives = {
            "integrity_coverage": {"params": coverage_params},
        }


class PlasmidsMash(Recipe):
    """
    Plasmid detection pipeline using mash screen. Outputs json files that can
    be imported into pATLAS.
    """

    def __init__(self):
        super().__init__()

        # Identifier of this recipe
        self.name = "plasmids_mash"

        # Components in execution order, joined by single spaces
        components = (
            "integrity_coverage",
            "fastqc_trimmomatic",
            "mash_screen",
        )
        self.pipeline_str = " ".join(components)

        # Recipe parameters and directives, keyed by component name
        coverage_params = {"genomeSize": "0"}
        self.directives = {
            "integrity_coverage": {"params": coverage_params},
        }


================================================
FILE: flowcraft/generator/report.py
================================================
import os
import re
import sys
import json
import uuid
import signal
import socket
import hashlib
import logging
import requests

from os.path import join, abspath
from time import sleep
from pympler.asizeof import asizeof

try:
    import generator.error_handling as eh
    from generator.process_details import colored_print
    from generator.utils import get_nextflow_filepath
except ImportError:
    import flowcraft.generator.error_handling as eh
    from flowcraft.generator.process_details import colored_print
    from flowcraft.generator.utils import get_nextflow_filepath

# Module logger, registered under the "main" logger namespace
logger = logging.getLogger("main.{}".format(__name__))


def signal_handler():
    """Handler for interrupt requests (like ctrl+c): prints a farewell
    message and terminates the program with exit status 0.
    """

    farewell = "Exiting flowcraft report brodcast... Bye"
    print(farewell)
    raise SystemExit(0)


class FlowcraftReport:

    def __init__(self, report_file, trace_file=None, log_file=None,
                 watch=False, ip_addr=None):

        self.report_file = report_file
        """
        str: Path to Report JSON file.
        """

        if not ip_addr:
            self.app_address = "http://www.flowcraft.live:80/"
        else:
            self.app_address = ip_addr
            """
            str: Address of flowcraft web app
            """

        self.broadcast_address = "{}reports/broadcast/api/reports".format(
            self.app_address)

        self.refresh_rate = 1

        self.send = True
        """
        boolean: This attribute is used when the report mode is used with the
        --watch option. It will be set to False after sending a request, and 
        set to True when there is a change in the pipeline reports.
        """

        self.watch = watch
        """
        boolean: When False, the reports mode will try to open the provided
        report JSON file and send it to the flowcraft service. When True, 
        it will try to open the nextflow trace file instead and continuously 
        compile the report JSON files from the `report` processes as they 
        are created. 
        """

        self.log_file = log_file
        """
        str: Path to .nextflow.log file.
        """

        self.log_sizestamp = None
        """
        str: Stores the sizestamp of the last modification of the trace file.
        This is used to parse the file only when it has changed.
        """

        self.status_info = None
        """
        str: Status of the pipeline execution. Used in the watch report mode
        and varies between 'running', 'aborted', 'complete'.
        """

        self.trace_file = trace_file
        """
        str: Path to nextflow trace file.
        """

        self.trace_sizestamp = None
        """
        str: Stores the sizestamp of the last modification of the trace file.
        This is used to parse the file only when it has changed.
        """

        self.trace_retry = 0
        """
        int: Each time the log file is not found, this counter is 
        increased. Only when it matches the :attr:`MAX_RETRIES` attribute
        does it raises a FileNotFoundError.
        """

        self.stored_ids = []
        """
        list: Stores the task_ids that have already been parsed. It is used
        to skip them when parsing the trace files multiple times.
        """

        self.report_queue = []
        """
        list: Stores the paths of the report JSON files that are on queue to
        be sent to the flowcraft service. This list will be emptied when these
        JSONs are sent.
        """

        # Checks if report file is available
        self._check_required_files()

        signal.signal(signal.SIGINT, lambda *x: signal_handler())

    def _check_required_files(self):

        if not os.path.exists(self.report_file) and not self.watch:
            raise eh.ReportError("The provided report JSON file could not be"
                                 " opened: {}".format(self.report_file))

    @staticmethod
    def _header_mapping(header):
        """Parses the trace file header and retrieves the positions of each
        column key.

        Parameters
        ----------
        header : str
            The header line of nextflow's trace file

        Returns
        -------
        dict
            Mapping the column ID to its position (e.g.: {"tag":2})
        """

        return dict(
            (x.strip(), pos) for pos, x in enumerate(header.split("\t"))
        )

    @staticmethod
    def _expand_path(hash_str):
        """Expands the hash string of a process (ae/1dasjdm) into a full
        working directory

        Parameters
        ----------
        hash_str : str
            Nextflow process hash with the beggining of the work directory

        Returns
        -------
        str
            Path to working directory of the hash string
        """

        try:
            first_hash, second_hash = hash_str.split("/")
            first_hash_path = join(abspath("work"), first_hash)

            for l in os.listdir(first_hash_path):
                if l.startswith(second_hash):
                    return join(first_hash_path, l)
        except FileNotFoundError:
            return None

    def _get_report_id(self):
        """Returns a hash of the reports JSON file
        """

        if self.watch:

            # Searches for the first occurence of the nextflow pipeline
            # file name in the .nextflow.log file
            pipeline_path = get_nextflow_filepath(self.log_file)

            # Get hash from the entire pipeline file
            pipeline_hash = hashlib.md5()
            with open(pipeline_path, "rb") as fh:
                for chunk in iter(lambda: fh.read(4096), b""):
                    pipeline_hash.update(chunk)
            # Get hash from the current working dir and hostname
            workdir = os.getcwd().encode("utf8")
            hostname = socket.gethostname().encode("utf8")
            hardware_addr = str(uuid.getnode()).encode("utf8")
            dir_hash = hashlib.md5(workdir + hostname + hardware_addr)

            return pipeline_hash.hexdigest() + dir_hash.hexdigest()

        else:
            with open(self.report_file) as fh:
                report_json = json.loads(fh.read())

            metadata = report_json["data"]["results"][0]["nfMetadata"]

            try:
                report_id = metadata["scriptId"] + metadata["sessionId"]
            except KeyError:
                raise eh.ReportError("Incomplete or corrupt report JSON file "
                                     "missing the 'scriptId' and/or 'sessionId' "
                                     "metadata information")

            return report_id

    def _update_pipeline_status(self):
        """
        Parses the .nextflow.log file for signatures of pipeline status and sets
        the :attr:`status_info` attribute.
        """

        prev_status = self.status_info

        with open(self.log_file) as fh:

            for line in fh:

                if "Session aborted" in line:
                    self.status_info = "aborted"
                    self.send = True if prev_status != self.status_info \
                        else self.send
                    return

                if "Execution complete -- Goodbye" in line:
                    self.status_info = "complete"
                    self.send = True if prev_status != self.status_info \
                        else self.send
                    return

            self.status_info = "running"
            self.send = True if prev_status != self.status_info \
                else self.send

    def update_trace_watch(self):
        """Parses the nextflow trace file and retrieves the path of report JSON
        files that have not been sent to the service yet.
        """

        # Check the size stamp of the tracefile. Only proceed with the parsing
        # if it changed from the previous size.
        size_stamp = os.path.getsize(self.trace_file)
        self.trace_retry = 0
        if size_stamp and size_stamp == self.trace_sizestamp:
            return
        else:
            logger.debug("Updating trace size stamp to: {}".format(size_stamp))
            self.trace_sizestamp = size_stamp

        with open(self.trace_file) as fh:

            # Skip potential empty lines at the start of file
            header = next(fh).strip()
            while not header:
                header = next(fh).strip()

            # Get header mappings before parsing the file
            hm = self._header_mapping(header)

            for line in fh:
                # Skip empty lines
                if line.strip() == "":
                    continue

                fields = line.strip().split("\t")

                # Skip if task ID was already processes
                if fields[hm["task_id"]] in self.stored_ids:
                    continue

                if fields[hm["process"]] == "report":
                    self.report_queue.append(
                        self._expand_path(fields[hm["hash"]])
                    )
                    self.send = True

                # Add the processed trace line to the stored ids. It will be
                # skipped in future parsers
                self.stored_ids.append(fields[hm["task_id"]])

    def update_log_watch(self):
        """Parses nextflow log file and updates the run status
        """

        # Check the size stamp of the tracefile. Only proceed with the parsing
        # if it changed from the previous size.
        size_stamp = os.path.getsize(self.log_file)
        self.trace_retry = 0
        if size_stamp and size_stamp == self.log_sizestamp:
            return
        else:
            logger.debug("Updating log size stamp to: {}".format(size_stamp))
            self.log_sizestamp = size_stamp

        self._update_pipeline_status()

    def _send_live_report(self, report_id):
        """Sends a PUT request with the report JSON files currently in the
        report_queue attribute.

        The queued directories are processed in batches. For each directory,
        the first file ending in ".json" is loaded. Directories without such
        a file, and queue entries that are empty or None, are skipped. When
        there is nothing queued, a single request with an empty report list
        is sent so that a change of the run status still reaches the
        service. The queue is emptied at the end.

        Parameters
        ----------
        report_id : str
            Run identifier as retrieved from :func:`~_get_report_id`
        """

        # Determines the maximum number of reports sent at the same time in
        # the same payload
        buffer_size = 100
        logger.debug("Report buffer size set to: {}".format(buffer_size))

        # Drop unresolved entries: ``os.listdir(None)`` would silently list
        # the current working directory and pick an unrelated JSON file.
        report_dirs = [x for x in self.report_queue if x]

        for i in range(0, len(report_dirs), buffer_size):

            # Reset the report compilation batch
            reports_compilation = []

            # Iterate over report JSON batches determined by buffer_size
            for report in report_dirs[i: i + buffer_size]:
                try:
                    report_file = [x for x in os.listdir(report)
                                   if x.endswith(".json")][0]
                except IndexError:
                    continue
                with open(join(report, report_file)) as fh:
                    reports_compilation.append(json.load(fh))

            logger.debug("Payload sent with size: {}".format(
                asizeof(json.dumps(reports_compilation))
            ))

            self._put_reports(report_id, reports_compilation)

        # When there is no change in the report queue, but there is a change
        # in the run status of the pipeline
        if not report_dirs:
            self._put_reports(report_id, [])

        # Reset the report queue after sending the request
        self.report_queue = []

    def _put_reports(self, report_id, reports):
        """Sends one PUT request with the given reports and the run status.

        The program exits with status 1 when the connection to the server
        cannot be established.

        Parameters
        ----------
        report_id : str
            Run identifier sent as ``run_id``
        reports : list
            Report JSON objects sent as ``report_json``
        """

        logger.debug("status: {}".format(self.status_info))

        try:
            requests.put(
                self.broadcast_address,
                json={"run_id": report_id,
                      "report_json": reports,
                      "status": self.status_info}
            )
        except requests.exceptions.ConnectionError:
            logger.error(colored_print(
                "ERROR: Could not establish connection with server. The server"
                " may be down or there is a problem with your internet "
                "connection.", "red_bold"))
            sys.exit(1)

    def _init_live_reports(self, report_id):
        """Sends a POST request to initialize the live reports

        The payload carries the contents of the ``.metadata.json`` file of
        the current working directory when that file can be read and parsed.
        Otherwise an empty results list is sent. The program exits with
        status 1 when the connection to the server cannot be established.

        Parameters
        ----------
        report_id : str
            Run identifier as retrieved from :func:`~_get_report_id`
        """

        logger.debug("Sending initial POST request to {} to start report live"
                     " update".format(self.broadcast_address))

        try:
            with open(".metadata.json") as fh:
                metadata = [json.load(fh)]
        except (OSError, ValueError):
            # Best effort: a missing, unreadable or malformed metadata file
            # must not prevent the broadcast. Unlike a bare ``except``, this
            # lets SystemExit and KeyboardInterrupt through.
            metadata = []

        start_json = {
            "data": {"results": metadata}
        }

        try:
            requests.post(
                self.broadcast_address,
                json={"run_id": report_id, "report_json": start_json,
                      "status": self.status_info}
            )
        except requests.exceptions.ConnectionError:
            logger.error(colored_print(
                "ERROR: Could not establish connection with server. The server"
                " may be down or there is a problem with your internet "
                "connection.", "red_bold"))
            sys.exit(1)

    def _close_connection(self, report_id):
        """Sends a delete request for the run identifier

        An error is logged when the server answers with a status code other
        than 202. The program exits with status 1 when the connection to the
        server cannot be established.

        Parameters
        ----------
        report_id : str
            Run identifier as retrieved from :func:`~_get_report_id`
        """

        logger.debug(
            "Closing connection and sending DELETE request to {}".format(
                self.broadcast_address))

        try:
            r = requests.delete(self.broadcast_address,
                                json={"run_id": report_id})
            if r.status_code != 202:
                # The message needs the space before "with" and uses the
                # same color as the other error messages
                logger.error(colored_print(
                    "ERROR: There was a problem sending data to the server "
                    "with reason: {}".format(r.reason), "red_bold"))
        except requests.exceptions.ConnectionError:
            logger.error(colored_print(
                "ERROR: Could not establish connection with server. The server"
                " may be down or there is a problem with your internet "
                "connection.", "red_bold"))
            sys.exit(1)

    def _send_report(self, report_id):
        """Sends the whole report JSON file in a single POST request

        The program exits with status 1 when the connection to the server
        cannot be established.

        Parameters
        ----------
        report_id : str
            Run identifier sent as ``run_id``
        """

        with open(self.report_file) as report_handle:
            report_json = json.load(report_handle)

        payload_size = asizeof(json.dumps(report_json))
        logger.debug("Unique payload sent with size: {}".format(payload_size))

        payload = {"run_id": report_id, "report_json": report_json}

        try:
            requests.post(self.broadcast_address, json=payload)
        except requests.exceptions.ConnectionError:
            logger.error(colored_print(
                "ERROR: Could not establish connection with server. The server"
                " may be down or there is a problem with your internet "
                "connection.", "red_bold"))
            sys.exit(1)

    def _print_msg(self, run_id):
        """Logs the address where the reports of the run can be found

        Parameters
        ----------
        run_id : str
            Run identifier appended to the reports address
        """

        announcement = colored_print(
            "The pipeline reports are available in the following link:",
            "green_bold")
        report_address = "{}reports/broadcast/{}".format(self.app_address,
                                                         run_id)

        logger.info(announcement)
        logger.info("{}".format(report_address))

    def broadcast_report(self):
        """Broadcasts the pipeline reports to the flowcraft service.

        Without the watch mode, the report JSON file is sent once and the
        method keeps sleeping until it is interrupted. In watch mode, the
        trace and log files are polled every :attr:`refresh_rate` seconds and
        a request is sent whenever :attr:`send` is set. In every case, a
        request to close the connection is sent when the loop ends.
        """

        logger.info(colored_print("Preparing to broacast reports...",
                                  "green_bold"))

        report_hash = self._get_report_id()

        # When in watch mode, fetch the run status, initialize the live
        # reports and perform a first parsing of the trace file
        if self.watch:
            logger.info(colored_print("\tFetching pipeline run status",
                                      "green_bold"))
            self._update_pipeline_status()
            logger.info(colored_print(
                "\tSending initial request to test service", "green_bold"))
            self._init_live_reports(report_hash)
            logger.info(colored_print("\tInitial parsing of trace file",
                                      "green_bold"))
            self.update_trace_watch()

            self._print_msg(report_hash)

        logger.debug("Establishing connection...")

        stay_alive = True
        _broadcast_sent = False
        try:
            while stay_alive:

                # When not in watch mode, send the report JSON once
                if not _broadcast_sent and not self.watch:
                    self._send_report(report_hash)
                    self._print_msg(report_hash)
                    _broadcast_sent = True

                # When in watch mode, continuously monitor the trace file for
                # updates
                if self.watch:
                    self.update_trace_watch()
                    self.update_log_watch()
                    # When new report JSON files are available, send then
                    # via a PUT request
                    if self.send:
                        self._send_live_report(report_hash)
                        self.send = False

                sleep(self.refresh_rate)

        except FileNotFoundError as e:
            print(e)
            logger.error(colored_print(
                "ERROR: Report JSON file is not reachable!", "red_bold"))
        except Exception as e:
            # Format the exception: concatenating it to a string raises a
            # TypeError that would hide the original error.
            logger.exception("ERROR: {}".format(e))
        finally:
            logger.info("Closing connection")
            self._close_connection(report_hash)


================================================
FILE: flowcraft/generator/templates/Helper.groovy
================================================
/**
 * Console messages of the pipeline: start banner, completion summary and
 * usage help. Text wrapped in double curly braces is a template placeholder
 * (presumably replaced when this template file is rendered).
 */
class Help {

    /**
     * Prints the start banner, the inputs found in the info map (keys
     * "fastq", "fasta" and "accessions" are reported when present), the
     * output locations, the profile and the start time.
     */
    static def start_info(Map info, String time, String profile) {

        println ""
        println "============================================================"
        println "                {{ pipeline_name }}"
        println "============================================================"
        println "Built using flowcraft v{{ version }}"
        println ""
        if (info.containsKey("fastq")){
        // Samples are counted as half of the FastQ value
        // (presumably two files per sample - TODO confirm)
        int nsamples = info.fastq / 2
        println " Input FastQ                 : $info.fastq"
        println " Input samples               : $nsamples"
        }
        if (info.containsKey("fasta")){
        println " Input Fasta                 : $info.fasta"
        }
        if (info.containsKey("accessions")){
        println " Input accessions            : $info.accessions"
        }
        println " Reports are found in        : ./reports"
        println " Results are found in        : ./results"
        println " Profile                     : $profile"
        println ""
        println "Starting pipeline at $time"
        println ""

    }

    /**
     * Prints the execution summary taken from the workflow metadata:
     * completion time, duration, success flag, work directory and exit
     * status.
     */
    static void complete_info(nextflow.script.WorkflowMetadata wf) {

        println ""
        println "Pipeline execution summary"
        println "=========================="
        println "Completed at                 : $wf.complete"
        println "Duration                     : $wf.duration"
        println "Success                      : $wf.success"
        println "Work directory               : $wf.workDir"
        println "Exit status                  : $wf.exitStatus"
        println ""

    }

    /**
     * Prints the banner and the usage message. The params argument is not
     * used in the body: the help lines come from the template loop below,
     * which emits one println per entry of help_list.
     */
    static def print_help(Map params) {

        println ""
        println "============================================================"
        println "                {{ pipeline_name }}"
        println "============================================================"
        println "Built using flowcraft v{{ version }}"
        println ""
        println ""
        println "Usage: "
        println "    nextflow run {{ nf_file }}"
        println ""
        {% for line in help_list -%}
        println "       {{ line }}"
        {% endfor %}
    }

}

/**
 * Collects the metadata of the pipeline run and stores it as a JSON file.
 */
class CollectInitialMetadata {

    /**
     * Writes the workflow metadata, together with the contents of the
     * .treeDag.json and .forkTree.json files of the project directory, into
     * the .metadata.json file of the current working directory.
     *
     * The document is built as a map and serialized with JsonOutput, so
     * values that contain quotes or backslashes (a command line, for
     * instance) are escaped instead of producing an invalid JSON file.
     *
     * NOTE(review): both dag files are parsed as strict JSON - confirm that
     * they are always written with double quoted strings.
     */
    public static void print_metadata(nextflow.script.WorkflowMetadata workflow){

        def slurper = new groovy.json.JsonSlurper()
        def treeDag = slurper.parseText(
            new File("${workflow.projectDir}/.treeDag.json").text)
        def forkTree = slurper.parseText(
            new File("${workflow.projectDir}/.forkTree.json").text)

        // Each value keeps the text that string interpolation gives it, so
        // an unset property is still stored as the string "null".
        def metadata = [nfMetadata: [
            scriptId       : "${workflow.scriptId}".toString(),
            scriptName     : "${workflow.scriptName}".toString(),
            profile        : "${workflow.profile}".toString(),
            container      : "${workflow.container}".toString(),
            containerEngine: "${workflow.containerEngine}".toString(),
            commandLine    : "${workflow.commandLine}".toString(),
            runName        : "${workflow.runName}".toString(),
            sessionId      : "${workflow.sessionId}".toString(),
            projectDir     : "${workflow.projectDir}".toString(),
            launchDir      : "${workflow.launchDir}".toString(),
            startTime      : "${workflow.start}".toString(),
            dag            : treeDag,
            forks          : forkTree
        ]]

        def json = groovy.json.JsonOutput.toJson(metadata)

        def jsonFile = new File(".metadata.json")
        jsonFile.write json
    }
}

================================================
FILE: flowcraft/generator/templates/abricate.nf
================================================
// Optional abricate data directory: when provided it must exist, and it is
// passed to abricate through the --datadir option. The error message reports
// the data directory that was actually checked.
if ( params.abricateDataDir{{ param_id }} ){
    if ( !file(params.abricateDataDir{{ param_id }}).exists() ){
        exit 1, "'abricateDataDir{{ param_id }}' data directory was not found: '${params.abricateDataDir{{ param_id }}}'"
    }
    dataDirOpt = "--datadir ${params.abricateDataDir{{ param_id }}}"
} else {
    dataDirOpt = ""
}

// The minimum identity must be numeric
if ( !params.abricateMinId{{ param_id }}.toString().isNumber() ){
    exit 1, "'abricateMinId{{ param_id }}' parameter must be a number. Provide value: '${params.abricateMinId{{ param_id }}}'"
}

// The minimum coverage must be numeric
if ( !params.abricateMinCov{{ param_id }}.toString().isNumber() ){
    exit 1, "'abricateMinCov{{ param_id }}' parameter must be a number. Provide value: '${params.abricateMinCov{{ param_id }}}'"
}


// Runs abricate on each assembly for every database listed in the
// abricateDatabases parameter and writes one TSV file per run.
process abricate_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    // Tasks are labelled with the sample and the database being screened
    tag { "${sample_id} ${db}" }
    publishDir "results/annotation/abricate_{{ pid }}/${sample_id}"

    input:
    set sample_id, file(assembly) from {{ input_channel }}
    // One task per database of the list
    each db from params.abricateDatabases{{ param_id }}
    val min_id from Channel.value(params.abricateMinId{{ param_id }})
    val min_cov from Channel.value(params.abricateMinCov{{ param_id }})

    output:
    file '*.tsv' into abricate_out_{{ pid }}
    {% with task_name="abricate", suffix="_$db" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // The first command group writes 'pass' to .status after abricate ran.
    // If that group fails, the second group writes 'fail' instead.
    script:
    """
    {
        # Run abricate
        abricate $dataDirOpt --minid $min_id --mincov $min_cov --db $db $assembly > ${sample_id}_abr_${db}.tsv
        echo pass > .status
    } || {
        echo fail > .status
    }
    """

}


// Processes all abricate TSV files at once with the process_abricate.py
// template script.
process process_abricate_{{ pid }} {

    tag "process_abricate_{{ pid }}"

    // Send POST request to platform
    {% with overwrite="false" %}
    {% include "report_post.txt" ignore missing %}
    {% endwith %}

    input:
    // collect() gathers every abricate output into a single task
    file abricate_file from abricate_out_{{ pid }}.collect()

    output:
    {% with task_name="process_abricate", sample_id="val('process_abricate')" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "process_abricate.py"


}


================================================
FILE: flowcraft/generator/templates/abyss.nf
================================================
// Assembles each pair of FastQ files with abyss-pe, using the k-mer size of
// the abyssKmer parameter, and publishes the scaffolds in FASTA and GFA
// formats.
process abyss_{{ pid }} {
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    publishDir 'results/assembly/abyss_{{ pid }}/', pattern: '*-scaffolds.fa'
    publishDir 'results/assembly/abyss_{{ pid }}/', pattern: '*-scaffolds.gfa'

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val k from Channel.value(params.abyssKmer{{ param_id }})

    output:
    // The FASTA scaffolds go to the main output channel, while the GFA
    // scaffolds go to a separate channel
    set sample_id, file('*-scaffolds.fa') into {{ output_channel }}
    file "*-scaffolds.gfa" into gfa1_{{ pid }}
    {% with task_name="abyss" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // Both files of the pair are passed in a single quoted 'in' argument
    script:
    "abyss-pe name=${sample_id} graph=gfa k=${k} v=-v in=\"${fastq_pair[0]} ${fastq_pair[1]}\""
}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/assembly_mapping.nf
================================================
// The minimum assembly coverage accepts a number or the literal 'auto'
if ( !params.minAssemblyCoverage{{ param_id }}.toString().isNumber() && params.minAssemblyCoverage{{ param_id }}.toString() != 'auto' ){
    exit 1, "'minAssemblyCoverage{{ param_id }}' parameter must be a number or 'auto'. Provided value: ${params.minAssemblyCoverage{{ param_id }}}"
}

// The maximum number of contigs must be numeric
if ( !params.AMaxContigs{{ param_id }}.toString().isNumber() ){
    exit 1, "'AMaxContigs{{ param_id }}' parameter must be a number. Provide value: '${params.AMaxContigs{{ param_id }}}'"
}

// Options shared by every task: [minimum coverage, maximum contigs]
IN_assembly_mapping_opts_{{ pid }} = Channel.value([
    params.minAssemblyCoverage{{ param_id }},
    params.AMaxContigs{{ param_id }}
])
IN_genome_size_{{ pid }} = Channel.value(params.genomeSize{{ param_id }})


// Maps the reads back to the assembly with bowtie2 and computes the read
// depth of the contigs with samtools.
process assembly_mapping_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    // The assembly is joined with the FastQ channel
    // (presumably matched on sample_id - TODO confirm)
    set sample_id, file(assembly), file(fastq) from {{ input_channel }}.join(_LAST_fastq_{{ pid }})

    output:
    set sample_id, file(assembly), 'coverages.tsv', 'coverage_per_bp.tsv', 'sorted.bam', 'sorted.bam.bai' into MAIN_am_out_{{ pid }}
    set sample_id, file("coverage_per_bp.tsv") optional true into SIDE_BpCoverage_{{ pid }}
    {% with task_name="assembly_mapping" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // Steps of the script: build the bowtie2 index of the assembly, map the
    // reads, sort and index the BAM file, compute the depth of each contig
    // into one .tab file, fill empty .tab files with a zero coverage line,
    // sum the depth per contig into coverages.tsv and concatenate all .tab
    // files into coverage_per_bp.tsv. The .status file gets 'pass' only when
    // coverages.tsv exists.
    // NOTE(review): in the 'samtools sort ... && rm *.sam' line the log
    // redirection applies to the rm command only - confirm this is intended.
    script:
    """
    {
        echo [DEBUG] BUILDING BOWTIE INDEX FOR ASSEMBLY: $assembly >> .command.log 2>&1
        bowtie2-build --threads ${task.cpus} $assembly genome_index >> .command.log 2>&1
        echo [DEBUG] MAPPING READS FROM $fastq >> .command.log 2>&1
        bowtie2 -q --very-sensitive-local --threads ${task.cpus} -x genome_index -1 ${fastq[0]} -2 ${fastq[1]} -S mapping.sam >> .command.log 2>&1
        echo [DEBUG] CONVERTING AND SORTING SAM TO BAM >> .command.log 2>&1
        samtools sort -o sorted.bam -O bam -@ ${task.cpus} mapping.sam && rm *.sam  >> .command.log 2>&1
        echo [DEBUG] CREATING BAM INDEX >> .command.log 2>&1
        samtools index sorted.bam >> .command.log 2>&1
        echo [DEBUG] ESTIMATING READ DEPTH >> .command.log 2>&1
        parallel -j ${task.cpus} samtools depth -ar {} sorted.bam \\> {}.tab  ::: \$(grep ">" $assembly | cut -c 2- | tr " " "_")
        # Insert 0 coverage count in empty files. See Issue #2
        echo [DEBUG] REMOVING EMPTY FILES  >> .command.log 2>&1
        find . -size 0 -print0 | xargs -0 -I{} sh -c 'echo -e 0"\t"0"\t"0 > "{}"'
        echo [DEBUG] COMPILING COVERAGE REPORT  >> .command.log 2>&1
        parallel -j ${task.cpus} echo -n {.} '"\t"' '&&' cut -f3 {} '|' paste -sd+ '|' bc >> coverages.tsv  ::: *.tab
        cat *.tab > coverage_per_bp.tsv
        rm *.tab
        if [ -f "coverages.tsv" ]
        then
            echo pass > .status
        else
            echo fail > .status
        fi
        echo -n "" > .report.json
        echo -n "" > .versions
    } || {
        echo fail > .status
    }
    """
}


/** PROCESS_ASSEMBLY_MAPPING -  MAIN
Processes the results from the assembly_mapping process and filters the
assembly contigs based on coverage and length thresholds.
The work is done by the process_assembly_mapping.py template script.
*/
process process_assembly_mapping_{{ pid }} {

    // Send POST request to platform
    {% with overwrite="false" %}
    {% include "post.txt" ignore missing %}
    {% endwith %}

    tag { sample_id }
    // This process can only use a single CPU
    cpus 1

    input:
    set sample_id, file(assembly), file(coverage), file(coverage_bp), file(bam_file), file(bam_index) from MAIN_am_out_{{ pid }}
    // opts holds [minimum assembly coverage, maximum contigs], as set in the
    // IN_assembly_mapping_opts channel
    val opts from IN_assembly_mapping_opts_{{ pid }}
    val gsize from IN_genome_size_{{ pid }}

    output:
    // Filtered assembly with the filtered BAM file and its index
    set sample_id, '*_filt.fasta', 'filtered.bam', 'filtered.bam.bai' into {{ output_channel }}
    {% with task_name="process_am" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "process_assembly_mapping.py"

}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/bandage.nf
================================================
// True when a GFA secondary channel is connected to this component.
has_gfa1_{{pid}} = binding.hasVariable('gfa1_{{pid}}')

// Renders the assembly as PNG and SVG images with Bandage. When
// params.reference is set, two additional images are rendered with the
// reference passed through --query.
process bandage_{{pid}} {
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    publishDir "reports/assembly/bandage_{{pid}}/$sample_id"

    input:
    set sample_id, file(fasta) from {{input_channel}}
    // "NA" is a placeholder value used when the optional input is absent.
    file gfa1 from has_gfa1_{{pid}} ? gfa1_{{pid}} : Channel.value("NA")
    file reference from params.reference{{param_id}} ?
        Channel.fromPath(params.reference{{param_id}}) :
        Channel.value("NA")

    output:
    file "*.png"
    file "*.svg"
    {% with task_name="bandage" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    // Use the GFA assembly when available and FASTA otherwise.
    assembly = has_gfa1_{{pid}} ? gfa1 : fasta
    // Base command: one PNG and one SVG named after the assembly file.
    command =
        """
        time Bandage image $assembly ${assembly}.png >>.command.log 2>&1
        time Bandage image $assembly ${assembly}.svg >>.command.log 2>&1
        """
    // Append the reference-annotated images only when a reference was given.
    if (params.reference{{param_id}})
        command +=
            """
            time Bandage image $assembly ${assembly}.ref.png --query $reference >>.command.log 2>&1
            time Bandage image $assembly ${assembly}.ref.svg --query $reference >>.command.log 2>&1
            """
    // The last expression is the script that gets executed.
    command
}


================================================
FILE: flowcraft/generator/templates/base_recalibrator.nf
================================================
// Last path component of the reference parameter, emitted as a value.
baseRecalibratorFasta_{{ pid }} = Channel.value(params.reference{{ param_id }}.split("/").last())
// Every file matching "<reference>.*", gathered into a single list item.
baseRecalibratorRef_{{ pid }} = Channel.fromPath("${params.reference{{ param_id }}}.*").collect().toList()
// Known-sites resources and their index files, taken from the parameters.
baseRecalibratorDbsnp_{{ pid }} = Channel.fromPath("${params.dbsnp{{ param_id }}}")
baseRecalibratorDbsnpIdx_{{ pid }} = Channel.fromPath("${params.dbsnpIdx{{ param_id }}}")
baseRecalibratorGoldenIndel_{{ pid }} = Channel.fromPath("${params.goldenIndel{{ param_id }}}")
baseRecalibratorGoldenIndelIdx_{{ pid }} = Channel.fromPath("${params.goldenIndelIdx{{ param_id }}}")

// Runs GATK BaseRecalibrator on each sample's BAM using the dbsnp and
// golden indel resources as known sites. Gzipped resources are decompressed
// in the task directory first. The recalibration table is forwarded together
// with the BAM and its index to apply_bqsr.
process base_recalibrator_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set val(sample_id), file(bam), file(bai) from {{ input_channel }}
    each file(reference) from baseRecalibratorRef_{{pid}}
    val(fasta) from baseRecalibratorFasta_{{pid}}
    each file(dbsnp) from baseRecalibratorDbsnp_{{pid}}
    each file(dbsnp_idx) from baseRecalibratorDbsnpIdx_{{pid}}
    each file(golden_indel) from baseRecalibratorGoldenIndel_{{pid}}
    each file(golden_indel_idx) from baseRecalibratorGoldenIndelIdx_{{pid}}

    output:
    // The linking channel carries the pid so that several instances of this
    // component in one pipeline do not declare the same channel name.
    set sample_id, file("${sample_id}_recal_data.table"), file(bam), file(bai) into baserecalibrator_table_{{ pid }}
    {% with task_name="base_recalibrator" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    """
    # gunzip dbsnp & golden_indel if gzipped
    [[ "\$(file --mime-type $dbsnp | cut -d' ' -f2)" == "application/x-gzip" ]] && gzip -d --force $dbsnp
    dbsnp=\$(basename $dbsnp .gz)
    [[ "\$(file --mime-type $dbsnp_idx | cut -d' ' -f2)" == "application/x-gzip" ]] && gzip -d --force $dbsnp_idx
    [[ "\$(file --mime-type $golden_indel | cut -d' ' -f2)" == "application/x-gzip" ]] && gzip -d --force $golden_indel
    golden_indel=\$(basename $golden_indel .gz)
    [[ "\$(file --mime-type $golden_indel_idx | cut -d' ' -f2)" == "application/x-gzip" ]] && gzip -d --force $golden_indel_idx

    gatk BaseRecalibrator \
      -I $bam \
      --known-sites \$dbsnp \
      --known-sites \$golden_indel \
      -O ${sample_id}_recal_data.table \
      -R ${fasta}.fasta
    """
}


// Applies the recalibration table produced by base_recalibrator to the BAM
// with GATK ApplyBQSR and publishes the recalibrated BAM and its index.
process apply_bqsr_{{ pid }} {

    {% include "post.txt" ignore missing %}

    publishDir "results/mapping/apply_bqsr_{{ pid }}"

    tag { sample_id }

    input:
    set sample_id, file(baserecalibrator_table), file(bam), file(bai) from baserecalibrator_table_{{ pid }}

    output:
    set sample_id, file("${sample_id}_recalibrated.bam"), file("${sample_id}_recalibrated.bai") into {{ output_channel }}
    {% with task_name="apply_bqsr" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    """
    gatk ApplyBQSR \
      -I $bam \
      -bqsr $baserecalibrator_table \
      -O ${sample_id}_recalibrated.bam \
      --create-output-bam-index
    """
}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/bcalm.nf
================================================
// The k-mer size must be numeric; abort early with the offending value.
if ( !params.bcalmKmerSize{{ param_id }}.toString().isNumber() ){
    exit 1, "'bcalmKmerSize{{ param_id }}' parameter must be a number. Provided value: '${params.bcalmKmerSize{{ param_id }}}'"
}

// Whether intermediate files should be removed after the run
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Builds unitigs from the input reads with bcalm and optionally removes
// bcalm's intermediate h5/glue files afterwards.
process bcalm_{{ pid }} {
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    // NOTE(review): results are published under a "quast_" directory, which
    // looks copied from another component — confirm the intended location.
    publishDir "reports/assembly/quast_{{pid}}/$sample_id"

    input:
    set sample_id, file(fastq) from {{input_channel}}
    val KmerSize from Channel.value(params.bcalmKmerSize{{param_id}})
    // Bind the flag per process instead of reading the script-level global,
    // which other components reassign.
    val clear from checkpointClear_{{ pid }}

    output:
    file "*.unitig.fa"
    {% with task_name="bcalm" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    """
    {
        bcalm -in $fastq -out unitig -kmer-size $KmerSize

        if [ "$clear" = "true" ];
        then
            find . -type f  -print | egrep "work/.*(h5)|(glue)" | xargs -L 1 rm
        fi
    }
    """
}


================================================
FILE: flowcraft/generator/templates/bowtie.nf
================================================
// Check for the presence or absence of both index and fasta reference:
// exactly one of the two parameters must be provided.
if (params.index{{ param_id }} == null && params.reference{{ param_id }} == null){
    exit 1, "An index or a reference fasta file must be provided."
} else if (params.index{{ param_id }} != null && params.reference{{ param_id }} != null){
    exit 1, "Provide only an index OR a reference fasta file."
}

// Flag controlling the removal of the input reads after mapping
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

if (params.reference{{ param_id }}){

    // Emits [file name without its last extension, path] for the reference,
    // or null when the path does not exist.
    reference_in_{{ pid }} = Channel.fromPath(params.reference{{ param_id }})
        .map{it -> file(it).exists() ? [it.toString().tokenize('/').last().tokenize('.')[0..-2].join('.') ,it] : null}

    // Builds the bowtie2 index from the reference fasta. The index files are
    // kept in 'bowtie_index/' through the storeDir directive.
    process bowtie_build_{{ pid }} {

        // Send POST request to platform
        {% include "post.txt" ignore missing %}

        tag { build_id }
        storeDir 'bowtie_index/'
        maxForks 1

        input:
        set build_id, file(fasta) from reference_in_{{ pid }}

        output:
        val build_id into bowtieIndexId_{{ pid }}
        file "${build_id}*.bt2" into bowtieIndex_{{ pid }}

        script:
        """
        # checking if reference file is empty. Moved here due to allow reference file to be inside the container.
        if [ ! -f "$fasta" ]
        then
            echo "Error: ${fasta} file not found."
            exit 1
        fi

        bowtie2-build ${fasta} $build_id > ${build_id}_bowtie2_build.log
        """
    }
} else {
    // A pre-built index was given: use its base name and collect its .bt2 files.
    bowtieIndexId_{{ pid }} = Channel.value(params.index{{ param_id }}.split("/").last())
    bowtieIndex_{{ pid }} = Channel.fromPath("${params.index{{ param_id }}}*.bt2").collect().toList()
}


// Maps each read pair against the bowtie2 index. The alignment is written to
// "<sample_id>.bam" and bowtie2's stderr to "<sample_id>_bowtie2.log", which
// is forwarded to the report process. When the clear flag is "true", input
// reads that resolve into a Nextflow work directory are removed.
process bowtie_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    publishDir 'results/mapping/bowtie_{{ pid }}/'

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    each index from bowtieIndexId_{{pid}}
    each file(index_files) from bowtieIndex_{{ pid }}
    // Bind the flag per process instead of reading the script-level global,
    // which other components reassign.
    val clear from checkpointClear_{{ pid }}

    output:
    set sample_id , file("*.bam") into {{ output_channel }}
    set sample_id, file("*_bowtie2.log") into into_json_{{ pid }}
    {% with task_name="bowtie" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    // NOTE(review): bowtie2's stdout is redirected straight into a file named
    // .bam without conversion — confirm what downstream components expect.
    """
    {
        bowtie2 -x $index -1 ${fastq_pair[0]} -2 ${fastq_pair[1]} -p $task.cpus 1> ${sample_id}.bam 2> ${sample_id}_bowtie2.log

        if [ "$clear" = "true" ];
        then
            work_regex=".*/work/.{2}/.{30}/.*"
            file_source1=\$(readlink -f \$(pwd)/${fastq_pair[0]})
            file_source2=\$(readlink -f \$(pwd)/${fastq_pair[1]})
            if [[ "\$file_source1" =~ \$work_regex ]]; then
                rm \$file_source1 \$file_source2
            fi
        fi

        echo pass > .status
    } || {
        echo fail > .status
    }
    """
}


// Turns the bowtie2 log of each sample into report data through the
// "process_mapping.py" template. Its only outputs are the shared
// status/report channels.
process report_bowtie_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set sample_id, file(bowtie_log) from into_json_{{ pid }}

    output:
    {% with task_name="report_bowtie" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "process_mapping.py"

}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/bwa.nf
================================================
// Last path component of the index parameter and the files matching "<index>.*"
bwaIndexId_{{ pid }} = Channel.value(params.bwaIndex{{ param_id }}.split("/").last())
bwaIndex_{{ pid }} = Channel.fromPath("${params.bwaIndex{{ param_id }}}.*").collect().toList()

// Aligns the reads with "bwa mem" (adding a read group built from sample_id),
// then sorts the SAM into a BAM and indexes it with samtools.
process bwa_{{ pid }} {

    {% include "post.txt" ignore missing %}

    publishDir "results/mapping/bwa_{{ pid }}"

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    each index from bwaIndexId_{{pid}}
    each file(index_file) from bwaIndex_{{pid}}
   
    output:
    set sample_id, file("${sample_id}.bam"), file("${sample_id}.bam.bai") into {{ output_channel }}
    {% with task_name="bwa" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    """
    bwa mem -M -R '@RG\\tID:${sample_id}\\tSM:${sample_id}\\tPL:Illumina' -t $task.cpus $index $fastq_pair > ${sample_id}.sam
    samtools sort -o ${sample_id}.bam -O BAM ${sample_id}.sam
    samtools index ${sample_id}.bam
    """
}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/card_rgi.nf
================================================
IN_alignment_tool_{{ pid }} = Channel.value(params.alignmentTool{{ param_id }})


// Runs "rgi main" on each assembly with the configured alignment tool and
// publishes the resulting "<sample_id>_card_rgi.txt" report.
process card_rgi_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir "results/annotation/card_rgi/", pattern: "*.txt"

    input:
    set sample_id, file(assembly) from {{ input_channel }}
    val alignmentTool from IN_alignment_tool_{{ pid }}

    output:
    file("${sample_id}_card_rgi.txt")
    {% with task_name="card_rgi" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    """
    # Place card_rgi source in a read/write location for shifter container
    mkdir card_temp && cp -r /usr/local/lib/python3.5/dist-packages/app/ card_temp
    # Prepend the copy to the Python module search path, keeping any existing
    # PYTHONPATH entries (the expansion is safe when PYTHONPATH is unset)
    export PYTHONPATH="\$(pwd)/card_temp\${PYTHONPATH:+:\$PYTHONPATH}"

    rgi main --input_sequence ${assembly} --output_file ${sample_id}_card_rgi --input_type contig --alignment_tool ${alignmentTool} --low_quality --include_loose -d wgs --clean
    """
}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/check_coverage.nf
================================================
// Both parameters must be numeric; the pipeline exits with an error otherwise.
IN_genome_size_{{ pid }} = Channel.value(params.genomeSize{{ param_id }})
    .map{it -> it.toString().isNumber() ? it : exit (1, "The genomeSize parameter must be a number or a float. Provided value: '${params.genomeSize{{ param_id }}}'")}
IN_min_coverage_{{ pid }} = Channel.value(params.minCoverage{{ param_id }})
    .map{it -> it.toString().isNumber() ? it : exit (1, "The minCoverage parameter must be a number or a float. Provided value: '${params.minCoverage{{ param_id }}}'")}

// Runs the "integrity_coverage.py" template on each read pair with the
// genome size, minimum coverage and the '-e' option. Emits the reads with
// the '*_coverage' and '*_max_len' files, plus a '*_report' file.
process integrity_coverage2_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    cpus 1

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val gsize from IN_genome_size_{{ pid }}
    val cov from IN_min_coverage_{{ pid }}
    // Use -e option for skipping encoding guess
    val opts from Channel.value('-e')

    output:
    // Declared optional, so a task that produces no such files still succeeds
    set sample_id,
        file(fastq_pair),
        file('*_coverage'),
        file('*_max_len') optional true into MAIN_integrity_{{ pid }}
    file('*_report') into LOG_report_coverage_{{ pid }}
    {% with task_name="check_coverage" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "integrity_coverage.py"
}

{{ output_channel }} = Channel.create()
SIDE_max_len_{{ pid }} = Channel.create()

// Drop samples whose coverage file contains "fail", then split each item:
// the main output receives [sample_id, fastq_pair] and the side channel
// receives [sample_id, text of the max_len file].
MAIN_integrity_{{ pid }}
    .filter{ it[2].text != "fail" }
    .separate({{ output_channel }}, SIDE_max_len_{{ pid }}){
        a -> [ [a[0], a[1]], [a[0], a[3].text]]
    }


// Concatenates all per-sample coverage reports (skipping those whose text is
// "corrupt") into a single CSV with a header line.
process report_coverage2_{{ pid }} {

    // This process can only use a single CPU
    cpus 1
    publishDir 'reports/coverage_{{ pid }}/'

    input:
    // collect() gathers every report so the process runs once for all samples
    file(report) from LOG_report_coverage_{{ pid }}.filter{ it.text != "corrupt" }.collect()

    output:
    file 'estimated_coverage_second.csv'

    """
    echo Sample,Estimated coverage,Status >> estimated_coverage_second.csv
    cat $report >> estimated_coverage_second.csv
    """
}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/chewbbaca.nf
================================================
// The schema location is mandatory.
if ( !params.schemaPath{{ param_id }} ){
    exit 1, "'schemaPath{{ param_id }}' parameter missing"
}

// Optional file parameters: when one is provided it must point to an
// existing file.
if ( params.chewbbacaTraining{{ param_id }} && !file(params.chewbbacaTraining{{ param_id }}).exists() ){
    exit 1, "'chewbbacaTraining{{ param_id }}' file was not found: '${params.chewbbacaTraining{{ param_id }}}'"
}
if ( params.schemaSelectedLoci{{ param_id }} && !file(params.schemaSelectedLoci{{ param_id }}).exists() ){
    exit 1, "'schemaSelectedLoci{{ param_id }}' file was not found: '${params.schemaSelectedLoci{{ param_id }}}'"
}
if ( params.schemaCore{{ param_id }} && !file(params.schemaCore{{ param_id }}).exists() ){
    exit 1, "'schemaCore{{ param_id }}' file was not found: '${params.schemaCore{{ param_id }}}'"
}

IN_schema_{{ pid }} = Channel.fromPath(params.schemaPath{{ param_id }})


// Command line fragments handed to chewBBACA; empty when the option is off.
jsonOpt = (params.chewbbacaJson{{ param_id }} == true) ? "--json" : ""

training = params.chewbbacaTraining{{ param_id }} ?
    "--ptf ${params.chewbbacaTraining{{ param_id }}}" :
    ""

// If chewbbaca is executed in batch mode, wait for all assembly files
// to be collected on the input channel, and only then execute chewbbaca
// providing all samples simultaneously
if (params.chewbbacaBatch{{ param_id }}) {
    // Single allele-calling run over every collected assembly.
    process chewbbaca_batch_{{ pid }} {

        {% include "post.txt" ignore missing %}
        maxForks 1
        scratch false
        // Only set the queue directive when a queue was provided
        if (params.chewbbacaQueue{{ param_id }} != null) {
            queue "${params.chewbbacaQueue{{ param_id}}}"
        }
        publishDir "results/chewbbaca_alleleCall_{{ pid }}/", mode: "copy"

        input:
        // Keep only the assembly file of each item and gather them all
        file assembly from {{ input_channel }}.map{ it[1] }.collect()
        each file(schema) from IN_schema_{{ pid }}

        output:
        file 'chew_results*'
        file 'cgMLST.tsv' optional true into chewbbacaProfile_{{ pid }}
        // There is no per-sample id in batch mode, so a fixed value is used
        {% with task_name="chewbbaca", sample_id="val('single')" %}
        {%- include "compiler_channels.txt" ignore missing -%}
        {% endwith %}

        script:
        """
        {
            set -x
            if [ -d "$schema/temp" ];
            then
                rm -r $schema/temp
            fi

            if [ "$params.schemaSelectedLoci{{ param_id }}" = "null" ];
            then
                inputGenomes=$schema
            else
                inputGenomes=${params.schemaSelectedLoci{{ param_id }}}
            fi

            echo $assembly | tr " " "\n" >> input_file.txt
            chewBBACA.py AlleleCall -i input_file.txt -g \$inputGenomes -o chew_results $jsonOpt --cpu $task.cpus $training
            if [ "$jsonOpt" = "--json" ]; then
                merge_json.py ${params.schemaCore{{ param_id }}} chew_results/*/results*
            else
                cp chew_results*/*/results_alleles.tsv cgMLST.tsv
            fi
        } || {
            echo fail > .status
        }
        """
    }

} else {
    // One allele-calling run per sample; at most one task runs at a time.
    process chewbbaca_{{ pid }} {

        // Send POST request to platform
        {% include "post.txt" ignore missing %}

        maxForks 1
        tag { sample_id }
        scratch true
        // Only set the queue directive when a queue was provided
        if (params.chewbbacaQueue{{ param_id }} != null) {
            queue "${params.chewbbacaQueue{{ param_id }}}"
        }
        publishDir "results/chewbbaca_alleleCall_{{ pid }}/", mode: "copy"

        input:
        set sample_id, file(assembly) from {{ input_channel }}
        each file(schema) from IN_schema_{{ pid }}

        output:
        file 'chew_results*'
        file '*_cgMLST.tsv' optional true into chewbbacaProfile_{{ pid }}
        {% with task_name="chewbbaca" %}
        {%- include "compiler_channels.txt" ignore missing -%}
        {% endwith %}

        script:
        """
        {
            set -x
            if [ -d "$schema/temp" ];
            then
                rm -r $schema/temp
            fi

            if [ "$params.schemaSelectedLoci{{ param_id }}" = "null" ];
            then
                inputGenomes=$schema
            else
                inputGenomes=${params.schemaSelectedLoci{{ param_id }}}
            fi

            echo $assembly >> input_file.txt
            chewBBACA.py AlleleCall -i input_file.txt -g \$inputGenomes -o chew_results_${sample_id} $jsonOpt --cpu $task.cpus $training --fc
            if [ "$jsonOpt" = "--json" ]; then
                merge_json.py ${params.schemaCore{{ param_id }}} chew_results_*/*/results* ${sample_id}
            else
                mv chew_results_*/*/results_alleles.tsv ${sample_id}_cgMLST.tsv
            fi
        } || {
            echo fail > .status
        }
        """
    }
}


// Merges all collected allele profiles into one table (header of the first
// profile plus the second line of every profile) and runs
// "chewBBACA.py ExtractCgMLST" on it with the configured percentage.
process chewbbacaExtractMLST_{{ pid }} {

    publishDir "results/chewbbaca_{{ pid }}/", mode: "copy", overwrite: true

    input:
    // collect() makes this process run once with every profile
    file profiles from chewbbacaProfile_{{ pid }}.collect()

    output:
    file "results/cgMLST.tsv"

    """
    head -n1 ${profiles[0]} > chewbbaca_profiles.tsv
    awk 'FNR == 2' $profiles >> chewbbaca_profiles.tsv
    chewBBACA.py ExtractCgMLST -i chewbbaca_profiles.tsv -o results -p $params.chewbbacaProfilePercentage{{ param_id }}
    """

}


================================================
FILE: flowcraft/generator/templates/compiler_channels.txt
================================================
{#- Output declarations included into process output blocks: a STATUS tuple
    (.status, .warning, .fail, .command.log), a REPORT tuple (.report.json,
    .versions, .command.trace) and the .versions file. The including template
    supplies task_name and may override sample_id. -#}
set {{ sample_id|default("sample_id") }}, val("{{ pid }}_{{ task_name }}{{ suffix }}"), file(".status"), file(".warning"), file(".fail"), file(".command.log") into STATUS_{{task_name}}_{{ pid }}
set {{ sample_id|default("sample_id") }}, val("{{ task_name }}_{{ pid }}{{ suffix }}"), val("{{ pid }}"), file(".report.json"), file(".versions"), file(".command.trace") into REPORT_{{task_name}}_{{ pid }}
file ".versions"

================================================
FILE: flowcraft/generator/templates/concoct.nf
================================================
// CONCOCT settings taken from the pipeline parameters
IN_max_clusters_{{ pid }} = Channel.value(params.clusters{{ param_id }})
IN_length_threshold_{{ pid }} = Channel.value(params.lengthThreshold{{ param_id }})
IN_read_length_{{ pid }} = Channel.value(params.readLength{{ param_id }})
IN_iterations_{{ pid }} = Channel.value(params.iterations{{ param_id }})

// Flag controlling the removal of the input files after binning
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Bins the contigs of an assembly with CONCOCT: the assembly is cut up,
// the reads are mapped back with bowtie2, a coverage table is built and
// the resulting clusters are extracted as individual FASTA files.
process concoct_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir "results/assembly/binning/concoct_{{ pid }}/${sample_id}/"

    input:
    // The assembly is paired with the reads of the same sample
    set sample_id, file(assembly), file(fastq) from {{ input_channel }}.join(_LAST_fastq_{{ pid }})
    val maxClusters from IN_max_clusters_{{ pid }}
    val read_length from IN_read_length_{{ pid }}
    val length_threshold from IN_length_threshold_{{ pid }}
    val iterations from IN_iterations_{{ pid }}
    val clear from checkpointClear_{{ pid }}

    output:
    set sample_id, file(assembly), file('concoct_output/*.fa') into binCh_{{ pid }}
    set sample_id, file("concoct_output/clustering_merged.csv"), file(assembly) into intoReport_{{ pid }}
    file("concoct_output/*.csv")
    file("concoct_output/*.txt")
    {% with task_name="concoct" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    """
    {
        # cut up the contigs into chunks of 10Kb to mitigate assembly errors and give more weight to larger contigs
        cut_up_fasta.py -c 10000 -o 0 -b ${sample_id}_bedfile -m ${assembly} > ${sample_id}_split_contigs.fasta

        # map reads to cut up assembly
        echo [DEBUG] BUILDING BOWTIE INDEX FOR ASSEMBLY: $assembly >> .command.log 2>&1
        bowtie2-build ${sample_id}_split_contigs.fasta ${sample_id}_split_contigs_index >> .command.log 2>&1
        echo [DEBUG] MAPPING READS FROM $fastq >> .command.log 2>&1
        bowtie2 --threads ${task.cpus} -x ${sample_id}_split_contigs_index -1 ${fastq[0]} -2 ${fastq[1]} -S mapping.sam >> .command.log 2>&1
        echo [DEBUG] CONVERTING AND SORTING SAM TO BAM >> .command.log 2>&1
        samtools sort -o sorted.bam -O bam -@ ${task.cpus} mapping.sam && rm *.sam  >> .command.log 2>&1
        echo [DEBUG] CREATING BAM INDEX >> .command.log 2>&1
        samtools index sorted.bam >> .command.log 2>&1

        # create coverage table for concoct
        concoct_coverage_table.py ${sample_id}_bedfile sorted.bam > ${sample_id}_coverage_file.tab

        # run CONCOCT
        concoct --coverage_file ${sample_id}_coverage_file.tab --composition_file ${sample_id}_split_contigs.fasta \
        -b concoct_output/ -c ${maxClusters} -l ${length_threshold} -r ${read_length } -i ${iterations} -t ${task.cpus}

        # Merge subcontig clustering into original contig clustering
        merge_cutup_clustering.py concoct_output/clustering_*.csv > concoct_output/clustering_merged.csv

        # Extract bins as individual FASTA
        extract_fasta_bins.py --output_path concoct_output/ ${assembly} concoct_output/clustering_merged.csv

        echo pass > .status

        if [ "$clear" = "true" ];
        then
            work_regex=".*/work/.{2}/.{30}/.*"
            file_source1=\$(readlink -f \$(pwd)/${fastq[0]})
            file_source2=\$(readlink -f \$(pwd)/${fastq[1]})
            assembly_file=\$(readlink -f \$(pwd)/${assembly})
            if [[ "\$file_source1" =~ \$work_regex ]]; then
                rm \$file_source1 \$file_source2 \$assembly_file
            fi
        fi
    } || {
        echo fail > .status
    }
    """
}

// Builds the report for a CONCOCT run from the merged clustering table and
// the assembly through the "process_concoct.py" template. Its only outputs
// are the shared status/report channels.
process report_concoct_{{ pid }}{

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set sample_id, file(cluster), file(contigs) from intoReport_{{ pid }}

    output:
    {% with task_name="report_concoct" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "process_concoct.py"

}

// emits one bin per channel item: the bin files of each sample are spread
// into separate items with transpose(), and every item is re-keyed as
// [bin file name without its last extension, bin file]
{{ output_channel }} = Channel.create()
binCh_{{ pid }}.map{ it -> [it[2].toString().tokenize('/').last().tokenize('.')[0..-2].join('.'), it[2]]}
    .transpose()
    .map{it -> [it[1].toString().tokenize('/').last().tokenize('.')[0..-2].join('.'),it[1]]}
    .into({{ output_channel }})

{{ forks }}

================================================
FILE: flowcraft/generator/templates/containers.config
================================================
// Process scope of the containers configuration; its content is filled in
// from the container_info template variable.
process {
{{ container_info }}

}

================================================
FILE: flowcraft/generator/templates/dengue_typing.nf
================================================
// Check for the presence or absence of the fasta reference
if (params.reference{{ param_id }} == null) {
    exit 1, "Dengue_typing: A reference fasta file must be provided."
}

// "true"/"false" flag built from params.get_genome and handed to both the
// reads-based and the assembly-based typing processes
getRef_{{ pid }} = params.get_genome{{ param_id }} ? "true" : "false"
checkpointReferenceGenome_{{ pid }} = Channel.value(getRef_{{ pid }})
checkpointReferenceGenome_{{ pid }}.into{ reference_reads_{{ pid }} ; reference_assembly_{{ pid }} }

reference_{{ pid }} = Channel.fromPath(params.reference{{ param_id }})

/**
 * Checks whether a FASTA file contains at least one sequence whose length
 * reaches a given threshold.
 */
class VerifyCompletnessTyping {

    /**
     * Returns true when any sequence in the file is at least
     * {@code threshold} characters long.
     *
     * @param filename  path of the FASTA file to scan
     * @param threshold minimum sequence length to look for
     * @return true when such a sequence exists, false otherwise
     */
    public static boolean contigs(String filename, int threshold){
        BufferedReader reader = new BufferedReader(new FileReader(filename));
        // Close the reader even when reading fails part-way through.
        try {
            return processContigs(reader, threshold);
        } finally {
            reader.close()
        }
    }

    /**
     * Accumulates the length of the sequence lines of each record and stops
     * as soon as one record reaches the threshold. The counter restarts on
     * every header line (a line starting with '>').
     */
    private static boolean processContigs(BufferedReader reader, int threshold){
        String line;
        int lineThreshold = 0;

        while ((line = reader.readLine()) != null) {
            if (line.startsWith('>')) {
                lineThreshold = 0
            } else {
                lineThreshold += line.length()
                if(lineThreshold >= threshold) {
                    return true;
                }
             }
        }

        return false;
    }
}


// Route each sample by its assembly file: index 0 (type_assembly) when the
// file holds a sequence of at least 10000 characters, index 1 (type_reads)
// otherwise. When the second element stringifies to "null" the closure
// returns false — NOTE(review): presumably this discards the item; confirm.
type_reads_{{ pid }} = Channel.create()
type_assembly_{{ pid }} = Channel.create()
{{ input_channel }}.choice(type_assembly_{{ pid }}, type_reads_{{ pid }}){a -> a[1].toString() == "null" ? false : VerifyCompletnessTyping.contigs(a[1].toString(), 10000) == true ? 0 : 1}

// Types samples from their assembly through the "dengue_typing_assembly.py"
// template. Emits the assembly, the seq_typing result files and, when
// produced, the '*.fa' reference file.
process dengue_typing_assembly_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir "results/dengue_typing/${sample_id}/"


    input:
    // Items routed to type_assembly hold only the sample id and its assembly;
    // the reference is supplied once, by the 'each' input below.
    set sample_id, file(assembly) from type_assembly_{{ pid }}
    val get_reference from reference_assembly_{{ pid }}
    each file(reference) from Channel.fromPath("${params.reference{{ param_id }}}")

    output:
    file "seq_typing*"
    set sample_id, file(assembly) into out_typing_assembly_{{ pid }}
    file("*.fa") optional true into _ref_seqTyping_assembly_{{ pid }}
    {% with task_name="dengue_typing_assembly" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "dengue_typing_assembly.py"

}


// Types samples from their reads through the "dengue_typing_reads.py"
// template. Emits the consensus sequence, the seq_typing result files and,
// when produced, the '*.fa' reference file.
process dengue_typing_reads_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir "results/dengue_typing/${sample_id}/"

    // Exit status 120 is ignored; any other failure is retried
    errorStrategy { task.exitStatus == 120 ? 'ignore' : 'retry' }

    input:
    // The routed assembly is paired with the reads of the same sample
    set sample_id, file(assembly), file(fastq_pair) from type_reads_{{ pid }}.join(_LAST_fastq_{{ pid }})
    val get_reference from reference_reads_{{ pid }}
    each file(reference) from Channel.fromPath("${params.reference{{ param_id }}}")

    output:
    file "seq_typing*"
    set sample_id, file("*consensus.fasta") into out_typing_reads_{{ pid }}
    file("*.fa") optional true into _ref_seqTyping_reads_{{ pid }}
    {% with task_name="dengue_typing_reads" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "dengue_typing_reads.py"

}

// Merge the results of both typing routes into the component's main output
out_typing_assembly_{{ pid }}.mix(out_typing_reads_{{ pid }}).set{ {{ output_channel }} }

// Merge the optional reference files of both routes into one channel
_ref_seqTyping_assembly_{{ pid }}.mix(_ref_seqTyping_reads_{{ pid }}).set{ _ref_seqTyping_{{ pid }} }

{{ forks }}


================================================
FILE: flowcraft/generator/templates/diamond.nf
================================================
// check that at least one of the database parameters is defined before
// executing the process.
if (!params.pathToDb{{ param_id }} && !params.fastaToDb{{ param_id }})
    exit 1, "'You must specify either a pathToDb or fastaToDb parameter.'"
// checks if both are defined and if so raises an error.
else if (params.pathToDb{{ param_id }} && params.fastaToDb{{ param_id }})
    exit 1, "'Both pathToDb and fastaToDb were given, choose just one.'"

// list of blasts allowed for diamond
allowedBlasts = ["blastp", "blastx"]
// checks that the blast type is one of the allowed values
if (!allowedBlasts.contains(params.blastType{{ param_id }}))
    exit 1, "Provide a valid blast type: blastx or blastp"

// Runs a diamond search (blastp or blastx) of each assembly against either
// an existing database (pathToDb) or one built on the fly from a fasta file
// (fastaToDb), writing tabular output to a '.txt' file.
process diamond_{{ pid }}  {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    publishDir "results/annotation/diamond_{{ pid }}/${sample_id}"

    input:
    set sample_id, file(assembly) from {{ input_channel }}
    // "NA" is a placeholder value for whichever database input was not given
    file pathToDb from params.pathToDb{{ param_id }} ?
        Channel.fromPath(params.pathToDb{{ param_id }}) : Channel.value("NA")
    file fastaToDb from params.fastaToDb{{ param_id }} ?
        Channel.fromPath(params.fastaToDb{{ param_id }}) : Channel.value("NA")
    val blast from params.blastType{{ param_id }}

    output:
    file "*.txt" into diamondOutputs
    {% with task_name="diamond"%}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    // Use database when available or otherwise use Fasta file
    if (params.pathToDb{{ param_id }})
        """
        diamond ${blast} -d ${pathToDb} -q ${assembly} \
        -o ${pathToDb}.txt -e 1E-20 -p ${task.cpus} \
        -f 6 qseqid sseqid pident length mismatch gapopen qstart qend slen sstart send evalue bitscore
        """
    else if (params.fastaToDb{{ param_id }})
        """
        diamond makedb --in ${fastaToDb} -d ${fastaToDb}
        diamond ${blast} -d ${fastaToDb}.dmnd -q ${assembly} \
        -o ${fastaToDb}.txt -e 1E-20 -p ${task.cpus} \
        -f 6 qseqid sseqid pident length mismatch gapopen qstart qend slen sstart send evalue bitscore
        """

}

================================================
FILE: flowcraft/generator/templates/downsample_fastq.nf
================================================

// Each of the three parameters must be numeric; the pipeline exits with an
// error message showing the provided value otherwise.
IN_genome_size_{{ pid }} = Channel.value(params.genomeSize{{ param_id }})
    .map{it -> it.toString().isNumber() ? it : exit(1, "The genomeSize parameter must be a number or a float. Provided value: '${params.genomeSize{{ param_id }}}'")}

IN_depth_{{ pid }} = Channel.value(params.depth{{ param_id }})
    .map{it -> it.toString().isNumber() ? it : exit(1, "The depth parameter must be a number or a float. Provided value: '${params.depth{{ param_id }}}'")}

IN_seed_{{ pid }} = Channel.value(params.seed{{ param_id }})
    .map{it -> it.toString().isNumber() ? it : exit(1, "The seed parameter must be a number or a float. Provided value: '${params.seed{{ param_id }}}'")}

// "true"/"false" flag built from params.clearInput and passed to the process
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Runs the "downsample_fastq.py" template on each read pair with the genome
// size, target depth, seed and clear flag, and emits the resulting
// '*_ss.*' files.
process downsample_fastq_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { "${sample_id}" }
    // The pattern is a glob, so it needs the leading '*' to select the
    // '*_ss.*' files declared as output below.
    publishDir "results/downsample_fastq_{{ pid }}/", pattern: "*_ss.*"

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val gsize from IN_genome_size_{{ pid }}
    val depth from IN_depth_{{ pid }}
    val seed from IN_seed_{{ pid }}
    val clear from checkpointClear_{{ pid }}

    output:
    set sample_id, file('*_ss.*') into {{ output_channel }}
    {% with task_name="downsample_fastq" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "downsample_fastq.py"

}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/fast_ani.nf
================================================
IN_fragLen_{{ pid }} = Channel.value(params.fragLen{{ param_id }})

// runs fast ani for multiple comparisons (many to many mode): the input
// fasta is split by fasta_spliter.py and the resulting file list is used as
// both query and reference list.
process fastAniMatrix_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir 'results/fast_ani/fast_ani_{{ pid }}/'

    input:
    set sample_id, file(fasta) from {{ input_channel }}
    val fragLenValue from IN_fragLen_{{ pid }}

    output:
    set sample_id, fasta, file("*.out")
    {% with task_name="fastAniMatrix", sample_id="sample_id" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // The output file is named after sample_id up to its last '.'
    """
    mkdir fasta_store
    fasta_spliter.py ${fasta}
    fastANI --ql files_fastani.txt --rl files_fastani.txt \
    -t ${task.cpus} --fragLen ${fragLenValue} \
    -o ${sample_id.take(sample_id.lastIndexOf("."))}_fastani.out
    """

}


================================================
FILE: flowcraft/generator/templates/fasterq_dump.nf
================================================
// check if option file is provided or not
// When the parameter is false no extra argument is passed; otherwise the
// "--option-file <value>" argument string is built for fasterq-dump.
optionFile = (params.option_file{{ param_id }} == false) ? "" :
    "--option-file ${params.option_file{{ param_id }}}"

// process to run fasterq-dump from sra-tools
// One task is run per non-empty, trimmed line of the input channel; each line
// is used as an accession id. FastQ outputs are optional, so an accession that
// could not be downloaded does not emit anything downstream.
process fasterqDump_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { accession_id }
    publishDir "reads/${accession_id}/", pattern: "*.fastq*"
    maxRetries 1

    input:
    val accession_id from {{ input_channel }}.splitText(){ it.trim() }.filter{ it.trim() != "" }

    output:
    set accession_id, file("*.fastq*") optional true into {{ output_channel }}
    {% with task_name="fasterqDump", sample_id="accession_id" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    """
    # Commands on the left-hand side of '||' run with errexit ignored, so the
    # download and the compression step are chained explicitly with '&&'.
    # That way "pass" is only written after everything succeeded and any
    # failure reaches the handler that writes the "fail" status.
    {
        echo "Downloading the following accession: ${accession_id}"
        fasterq-dump ${accession_id} -e ${task.cpus} -p ${optionFile} && {
            if [ ${params.compress_fastq{{ param_id }}} = true ]
            then
                echo "Compressing FastQ files..."
                if [ -f ${accession_id}_1.fastq ]
                then
                    pigz -p ${task.cpus} ${accession_id}_1.fastq ${accession_id}_2.fastq
                elif [ -f ${accession_id}_3.fastq ]
                then
                    echo "No paired end reads were found to compress."
                    pigz -p ${task.cpus} ${accession_id}_3.fastq
                else
                    echo "FastQ files weren't compressed. Check if FastQ files were downloaded."
                fi
            else
                echo "FastQ files won't be compressed because compress_fastq options was set to: '${params.compress_fastq{{ param_id }}}.'"
            fi
        } && echo "pass" > .status
    } || {
        # Reached only when the download or the compression failed
        echo "fail" > .status
        echo "Could not download accession $accession_id" > .fail
    }
    """
}


================================================
FILE: flowcraft/generator/templates/fastqc.nf
================================================
// Adapters parameter, exposed to every task as a value channel
IN_adapters_{{ pid }} = Channel.value(params.adapters{{ param_id }})

// Runs the "fastqc.py" template on each sample's FastQ pair. The adapters
// value is available to the template as 'ad'.
process fastqc2_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    // Only the HTML outputs are published
    publishDir "reports/fastqc_{{ pid }}/", pattern: "*.html"

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val ad from IN_adapters_{{ pid }}

    output:
    // The input FastQ files are forwarded together with the 'pair_1*' and
    // 'pair_2*' result files for the report process that follows
    set sample_id, file(fastq_pair), file('pair_1*'), file('pair_2*') into MAIN_fastqc_out_{{ pid }}
    file "*html"
    {% with task_name="fastqc2" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "fastqc.py"
}


// Runs the "fastqc_report.py" template on the FastQC result files of each
// sample. It emits the sample's '.status' file together with the FastQ pair,
// a '*_status_report' log file and, optionally, per-sample summary files.
process fastqc2_report_{{ pid }} {

    // Send POST request to platform
    {% with overwrite="false" %}
    {% include "post.txt" ignore missing %}
    {% endwith %}

    tag { sample_id }
    // This process can only use a single CPU
    cpus 1
    publishDir 'reports/fastqc_{{ pid }}/run_2/', pattern: '*summary.txt', mode: 'copy'

    input:
    set sample_id, file(fastq_pair), file(result_p1), file(result_p2) from MAIN_fastqc_out_{{ pid }}
    // An empty options string is passed to the template
    val opts from Channel.value("")

    output:
    set sample_id, file(fastq_pair), '.status' into MAIN_fastqc_report_{{ pid }}
    file "*_status_report" into LOG_fastqc_report_{{ pid }}
    file "${sample_id}_*_summary.txt" optional true
    {% with task_name="fastqc2_report" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "fastqc_report.py"

}


// Concatenates the status reports of all samples into a single CSV file,
// preceded by a header line. The input channel is collected, so this process
// runs once with every report staged together.
process compile_fastqc_status2_{{ pid }} {

    publishDir 'reports/fastqc_{{ pid }}/', mode: 'copy'

    input:
    file rep from LOG_fastqc_report_{{ pid }}.collect()

    output:
    file 'FastQC_2run_report.csv'

    """
    echo Sample, Failed? >> FastQC_2run_report.csv
    cat $rep >> FastQC_2run_report.csv
    """

}

// Only samples whose '.status' file (index 2) contains "pass" are forwarded,
// reduced to the first two elements of the tuple: sample id and FastQ pair.
{{ output_channel }} = Channel.create()

MAIN_fastqc_report_{{ pid }}
        .filter{ rec -> rec[2].text == "pass" }
        .map{ rec -> [rec[0], rec[1]] }
        .into({{ output_channel }})

{{ forks }}


================================================
FILE: flowcraft/generator/templates/fastqc_trimmomatic.nf
================================================
// Parameter validation: the sliding window must hold exactly two
// ':'-separated fields and the remaining trimming options must be numeric.
// Any violation stops the pipeline with exit code 1.
if ( params.trimSlidingWindow{{ param_id }}.toString().split(":").size() != 2 ){
    exit(1, "'trimSlidingWindow{{ param_id }}' parameter must contain two values separated by a ':'. Provided value: '${params.trimSlidingWindow{{ param_id }}}'")
}
if ( !params.trimLeading{{ param_id }}.toString().isNumber() ){
    exit(1, "'trimLeading{{ param_id }}' parameter must be a number. Provide value: '${params.trimLeading{{ param_id }}}'")
}
if ( !params.trimTrailing{{ param_id }}.toString().isNumber() ){
    exit(1, "'trimTrailing{{ param_id }}' parameter must be a number. Provide value: '${params.trimTrailing{{ param_id }}}'")
}
if ( !params.trimMinLength{{ param_id }}.toString().isNumber() ){
    exit(1, "'trimMinLength{{ param_id }}' parameter must be a number. Provide value: '${params.trimMinLength{{ param_id }}}'")
}

// Trimmomatic options travel as one list, in the order:
// sliding window, leading, trailing, minimum length
IN_trimmomatic_opts_{{ pid }} = Channel.value([
    params.trimSlidingWindow{{ param_id }},
    params.trimLeading{{ param_id }},
    params.trimTrailing{{ param_id }},
    params.trimMinLength{{ param_id }}
])
IN_adapters_{{ pid }} = Channel.value(params.adapters{{ param_id }})

// The clearInput flag is handed to the process as the literal string
// "true" or "false"
clear = (params.clearInput{{ param_id }}) ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Runs the "fastqc.py" template on each sample's FastQ pair. In this
// component the adapters value handed to the template is the string 'None'.
process fastqc_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    // Only the HTML outputs are published
    publishDir "reports/fastqc_{{ pid }}/", pattern: "*.html"

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val ad from Channel.value('None')

    output:
    // The input FastQ files are forwarded together with the 'pair_1*' and
    // 'pair_2*' result files for the report process that follows
    set sample_id, file(fastq_pair), file('pair_1*'), file('pair_2*') into MAIN_fastqc_out_{{ pid }}
    file "*html"
    {% with task_name="fastqc" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "fastqc.py"
}

/** FASTQC_REPORT - MAIN
This process will parse the result files from a FastQC analysis and output
the optimal_trim information for Trimmomatic.

The main output tuple carries the sample id, the FastQ pair, the
'optimal_trim' file and the '.status' file. Trim and status reports are sent
to separate LOG channels, and per-sample summary files are optional.
*/
process fastqc_report_{{ pid }} {

    // Send POST request to platform
    {% with overwrite="false" %}
    {% include "post.txt" ignore missing %}
    {% endwith %}

    tag { sample_id }
    // This process can only use a single CPU
    cpus 1
    publishDir 'reports/fastqc_{{ pid }}/run_1/', pattern: '*summary.txt', mode: 'copy'

    input:
    set sample_id, file(fastq_pair), file(result_p1), file(result_p2) from MAIN_fastqc_out_{{ pid }}
    // The template receives the "--ignore-tests" option string
    val opts from Channel.value("--ignore-tests")

    output:
    set sample_id, file(fastq_pair), 'optimal_trim', ".status" into _MAIN_fastqc_trim_{{ pid }}
    file '*_trim_report' into LOG_trim_{{ pid }}
    file "*_status_report" into LOG_fastqc_report_{{ pid }}
    file "${sample_id}_*_summary.txt" optional true
    {% with task_name="fastqc_report" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "fastqc_report.py"

}

// Keep only samples whose '.status' file (index 3) reads "pass" and replace
// the 'optimal_trim' file (index 2) by its text content.
MAIN_fastqc_trim_{{ pid }} = Channel.create()
_MAIN_fastqc_trim_{{ pid }}
        .filter{ rec -> rec[3].text == "pass" }
        .map{ rec -> [rec[0], rec[1], file(rec[2]).text] }
        .into(MAIN_fastqc_trim_{{ pid }})


/** TRIM_REPORT - PLUG-IN
This will collect the optimal trim points assessed by the fastqc_report
process and write the results of all samples in a single csv file.

The input channel is collected, so the process runs once with every trim
report staged together; a header line is written before the reports.
*/
process trim_report_{{ pid }} {

    publishDir 'reports/fastqc_{{ pid }}/', mode: 'copy'

    input:
    file trim from LOG_trim_{{ pid }}.collect()

    output:
    file "FastQC_trim_report.csv"

    """
    echo Sample,Trim begin, Trim end >> FastQC_trim_report.csv
    cat $trim >> FastQC_trim_report.csv
    """
}


// Concatenates the status reports of all samples into a single CSV file,
// preceded by a header line. The input channel is collected, so this process
// runs once with every report staged together.
process compile_fastqc_status_{{ pid }} {

    publishDir 'reports/fastqc_{{ pid }}/', mode: 'copy'

    input:
    file rep from LOG_fastqc_report_{{ pid }}.collect()

    output:
    file 'FastQC_1run_report.csv'

    """
    echo Sample, Failed? >> FastQC_1run_report.csv
    cat $rep >> FastQC_1run_report.csv
    """

}


/** TRIMMOMATIC - MAIN
This process will execute trimmomatic. Currently, the main channel requires
information on the trim_range and phred score.

The trim range comes from the filtered fastqc_report channel, which is joined
with the SIDE_phred channel to obtain the phred value.
*/
process trimmomatic_{{ pid }} {

    // Send POST request to platform
    {% with overwrite="false" %}
    {% include "post.txt" ignore missing %}
    {% endwith %}

    tag { sample_id }
    // Only gzipped outputs are published
    publishDir "results/trimmomatic_{{ pid }}", pattern: "*.gz"

    input:
    set sample_id, file(fastq_pair), trim_range, phred from MAIN_fastqc_trim_{{ pid }}.join(SIDE_phred_{{ pid }})
    // List with the sliding window, leading, trailing and minimum length
    // options, in that order
    val opts from IN_trimmomatic_opts_{{ pid }}
    val ad from IN_adapters_{{ pid }}
    // "true"/"false" string derived from the clearInput parameter
    val clear from checkpointClear_{{ pid }}

    output:
    set sample_id, "${sample_id}_*trim.fastq.gz" into {{ output_channel }}
    file 'trimmomatic_report.csv'
    {% with task_name="trimmomatic" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "trimmomatic.py"

}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/filter_poly.nf
================================================
// Value handed to prinseq through its --custom_params option
IN_adapter_{{ pid }} = Channel.value(params.adapter{{ param_id }})

// The clearInput flag is handed to the process as the literal string
// "true" or "false"
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Filters each FastQ pair with prinseq-lite.pl using the custom parameters in
// 'adapter', then gzips the filtered reads.
//
// Script outline:
//   1. decompress both input files to <sample_id>_1.fq / <sample_id>_2.fq
//   2. exit with status 120 when any decompressed file is empty
//   3. run prinseq-lite.pl and gzip its '<sample_id>_filtered_*' output
//   4. when 'clear' is "true", delete the original input files, but only if
//      the resolved path of the first one lies inside a Nextflow work
//      directory (matched by the work_regex pattern)
process filter_poly_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    echo true

    // Exit status 120 (empty input, see the script) is ignored; any other
    // failure is retried
    errorStrategy { task.exitStatus == 120 ? 'ignore' : 'retry' }

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val adapter from IN_adapter_{{ pid }}
    val clear from checkpointClear_{{ pid }}

    output:
    set sample_id , file("${sample_id}_filtered_{1,2}.fastq.gz") into {{ output_channel }}
    {% with task_name="filter_poly" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    """
    gunzip -c ${fastq_pair[0]} >  ${sample_id}_1.fq
    gunzip -c ${fastq_pair[1]} >  ${sample_id}_2.fq

    for seqfile in *.fq;
    do if [ ! -s \$seqfile  ]
    then
        echo \$seqfile is empty && exit 120
    fi
    done

    prinseq-lite.pl --fastq ${sample_id}_1.fq  --fastq2 ${sample_id}_2.fq  --custom_params "${adapter}" -out_format 3 -out_good ${sample_id}_filtered

    gzip ${sample_id}_filtered_*.fastq

    #rm *.fq *.fastq

    if [ "$clear" = "true" ];
    then
        work_regex=".*/work/.{2}/.{30}/.*"
        file_source1=\$(readlink -f \$(pwd)/${fastq_pair[0]})
        file_source2=\$(readlink -f \$(pwd)/${fastq_pair[1]})
        if [[ "\$file_source1" =~ \$work_regex ]]; then
            rm \$file_source1 \$file_source2
        fi
    fi

    """
}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/haplotypecaller.nf
================================================
// Last '/'-separated component of the reference parameter
haplotypecallerIndexId_{{ pid }} = Channel.value(params.reference{{ param_id }}.split("/").last())
// Every file that shares the reference prefix, gathered into a single item
haplotypecallerRef_{{ pid }} = Channel.fromPath("${params.reference{{ param_id }}}.*").collect().toList()
// One channel item per trimmed line of the interval list file. The error
// message reports the same suffixed parameter that is read on the first line,
// not the unsuffixed 'params.intervals'.
interval_{{ pid }} = Channel.fromPath(params.intervals{{ param_id }})
           .ifEmpty { exit 1, "Interval list file for HaplotypeCaller not found: ${params.intervals{{ param_id }}}" }
           .splitText()
           .map { it -> it.trim() }

// Runs GATK HaplotypeCaller for every combination of input BAM and interval.
// The reference files and the reference basename are declared with 'each', so
// they are repeated for every task. The Java heap limit is taken from the
// task memory, expressed in megabytes.
process haplotypecaller_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag "$interval"

    input:
    set sample_id, file(bam), file(bai) from {{ input_channel }}
    each interval from interval_{{pid}}
    each file(ref_files) from haplotypecallerRef_{{pid}}
    each index from haplotypecallerIndexId_{{pid}}
   
    output:
    // VCF, VCF index and sample id are emitted on three separate channels
    file("*.vcf") into haplotypecallerGvcf
    file("*.vcf.idx") into gvcfIndex
    val(sample_id) into sampleId

    {% with task_name="haplotypecaller", suffix="_${interval}" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // NOTE(review): the reference is addressed as "<index>.fasta", which
    // assumes the staged reference files include one with that name - confirm.
    """
    gatk HaplotypeCaller \
      --java-options -Xmx${task.memory.toMega()}M \
      -R ${index}.fasta \
      -O ${sample_id}.vcf \
      -I $bam \
      -L $interval
    """
}

// Merges all VCF files produced by the HaplotypeCaller tasks into a single
// compressed VCF with GATK MergeVcfs. Both file channels are collected, so the
// process runs once; the output is named after the first sample id received.
process merge_vcfs_{{ pid }} {

    {% include "post.txt" ignore missing %}

    publishDir "results/variant_calling/merge_vcfs_{{ pid }}"

    tag { sample_id }

    input:
    file('*.vcf') from haplotypecallerGvcf.collect()
    file('*.vcf.idx') from gvcfIndex.collect()
    val(sample_id) from sampleId.first()

    output:
    set file("${sample_id}.vcf.gz"), file("${sample_id}.vcf.gz.tbi") into {{ output_channel }}
    {% with task_name="merge_vcfs" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // The staged VCF names are first written to a list file, which is then
    // given to MergeVcfs as its input.
    script:
    """
    ## make list of input variant files
    for vcf in \$(ls *vcf); do
      echo \$vcf >> input_variant_files.list
    done

    gatk MergeVcfs \
      --INPUT= input_variant_files.list \
      --OUTPUT= ${sample_id}.vcf.gz
    """

}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/init.nf
================================================

// Placeholder for main input channels
{{ main_inputs }}

// Placeholder for secondary input channels
{{ secondary_inputs }}

// Placeholder for extra input channels
{{ extra_inputs }}

// Placeholder to fork the raw input channel
{{ forks }}


================================================
FILE: flowcraft/generator/templates/integrity_coverage.nf
================================================
// Each numeric parameter is wrapped in a value channel and passed through a
// map operator that aborts the pipeline (exit code 1) when the string form of
// the value is not a number. The error messages read the same parameter name
// that is passed to Channel.value; an extra '_' before the parameter suffix
// would point at a parameter that does not exist.
IN_genome_size_{{ pid }} = Channel.value(params.genomeSize{{ param_id }})
    .map{it -> it.toString().isNumber() ? it : exit(1, "The genomeSize parameter must be a number or a float. Provided value: '${params.genomeSize{{ param_id }}}'")}

IN_min_coverage_{{ pid }} = Channel.value(params.minCoverage{{ param_id }})
    .map{it -> it.toString().isNumber() ? it : exit(1, "The minCoverage parameter must be a number or a float. Provided value: '${params.minCoverage{{ param_id }}}'")}

// Runs the "integrity_coverage.py" template on each sample's FastQ pair with
// the genome size and minimum coverage values. The main output tuple carries,
// in this order: sample id, FastQ pair, and the '*_encoding', '*_phred',
// '*_coverage' and '*_max_len' files. A '*_report' file is optional.
process integrity_coverage_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    // This process can only use a single CPU
    cpus 1

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val gsize from IN_genome_size_{{ pid }}
    val cov from IN_min_coverage_{{ pid }}
    // This channel is for the custom options of the integrity_coverage.py
    // script. See the script's documentation for more information.
    val opts from Channel.value('')

    output:
    set sample_id,
        file(fastq_pair),
        file('*_encoding'),
        file('*_phred'),
        file('*_coverage'),
        file('*_max_len') into MAIN_integrity_{{ pid }}
    file('*_report') optional true into LOG_report_coverage1_{{ pid }}
    {% with task_name="integrity_coverage" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "integrity_coverage.py"

}

// TRIAGE OF CORRUPTED SAMPLES
// Items whose encoding file (index 2) contains 'corrupt' are routed to the
// first target channel; every other item goes to the second one.
LOG_corrupted_{{ pid }} = Channel.create()
MAIN_PreCoverageCheck_{{ pid }} = Channel.create()
MAIN_integrity_{{ pid }}.choice(LOG_corrupted_{{ pid }}, MAIN_PreCoverageCheck_{{ pid }}) {
    rec -> rec[2].text != "corrupt" ? 1 : 0
}

// TRIAGE OF LOW COVERAGE SAMPLES
// Items whose coverage file (index 4) contains 'fail' are dropped. The
// remaining ones are split into three keyed channels:
//   - main output: [sample id, FastQ pair]
//   - SIDE_phred:  [sample id, text of the phred file (index 3)]
//   - SIDE_max_len: [sample id, text of the max_len file (index 5)]
{{ output_channel }} = Channel.create()
SIDE_phred_{{ pid }} = Channel.create()
SIDE_max_len_{{ pid }} = Channel.create()

MAIN_PreCoverageCheck_{{ pid }}
    .filter{ rec -> rec[4].text != "fail" }
    .separate({{ output_channel }}, SIDE_phred_{{ pid }}, SIDE_max_len_{{ pid }}){ rec ->
        [ [rec[0], rec[1]], [rec[0], rec[3].text], [rec[0], rec[5].text]  ]
    }

/** REPORT_COVERAGE - PLUG-IN
This process will report the expected coverage for each non-corrupted sample
and write the results to 'estimated_coverage_initial.csv', which is published
under the 'reports/coverage_' directory of this component.

Report files whose content is exactly "corrupt" are filtered out before the
remaining ones are collected, so the process runs once over all of them.
*/
process report_coverage_{{ pid }} {

    // This process can only use a single CPU
    cpus 1
    publishDir 'reports/coverage_{{ pid }}/'

    input:
    file(report) from LOG_report_coverage1_{{ pid }}.filter{ it.text != "corrupt" }.collect()

    output:
    file 'estimated_coverage_initial.csv'

    """
    echo Sample,Estimated coverage,Status >> estimated_coverage_initial.csv
    cat $report >> estimated_coverage_initial.csv
    """
}

/** REPORT_CORRUPT - PLUG-IN
This process will report the corrupted samples and write the results to
'corrupted_samples.txt', which is published under the 'reports/corrupted_'
directory of this component.

Only the sample id (first element) of each corrupted item is collected; the
ids are joined with commas and then written one per line.
*/
process report_corrupt_{{ pid }} {

    // This process can only use a single CPU
    cpus 1
    publishDir 'reports/corrupted_{{ pid }}/'

    input:
    val sample_id from LOG_corrupted_{{ pid }}.collect{it[0]}

    output:
    file 'corrupted_samples.txt'

    """
    echo ${sample_id.join(",")} | tr "," "\n" >> corrupted_samples.txt
    """

}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/kraken.nf
================================================
// Kraken database location, exposed to every task as a value channel
IN_kraken_DB_{{ pid }} = Channel.value(params.krakenDB{{ param_id }})


//Process to run Kraken
// Classifies each gzipped, paired FastQ input with kraken and then converts
// the raw classification into a report with kraken-report. Only the report
// file is declared as output.
process kraken_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir "results/taxonomy/kraken/", pattern: "*.txt"

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val krakenDB from IN_kraken_DB_{{ pid }}

    output:
    file("${sample_id}_kraken_report.txt")
    {% with task_name="kraken" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    """
    kraken --preload --fastq-input --db ${krakenDB} --threads $task.cpus --output ${sample_id}_kraken.txt --paired --gzip-compressed ${fastq_pair[0]} ${fastq_pair[1]}

    kraken-report --db ${krakenDB} ${sample_id}_kraken.txt > ${sample_id}_kraken_report.txt
    """
}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/kraken2.nf
================================================
// Kraken2 database location, exposed to every task as a value channel
IN_kraken2_DB_{{ pid }} = Channel.value(params.kraken2DB{{ param_id }})


//Process to run Kraken2
// Classifies each gzipped, paired FastQ input with kraken2 and writes the
// report straight to '<sample_id>_kraken_report.txt' via the --report option.
process kraken2_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir "results/taxonomy/kraken2/", pattern: "*.txt"

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val krakenDB from IN_kraken2_DB_{{ pid }}

    output:
    file("${sample_id}_kraken_report.txt")
    {% with task_name="kraken2" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    """
    kraken2 --memory-mapping --threads $task.cpus --report ${sample_id}_kraken_report.txt --db ${krakenDB} --paired \
    --gzip-compressed ${fastq_pair[0]} ${fastq_pair[1]}
    """
}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/mafft.nf
================================================
// Flag that is true when the reference sequence variable exists in the script
// binding, i.e. when a dengue_typing secondary channel is connected
has_ref_{{pid}} = binding.hasVariable('_ref_seqTyping_{{ pid }}')

// The second element of every input item is collected into 'mafft_input';
// when reference sequences are available they are mixed in, de-duplicated by
// file name.
if ( !has_ref_{{pid}} ){
    {{ input_channel }}.map{ it[1] }.collect().set{mafft_input}
} else {
    {{ input_channel }}.map{ it[1] }.collect().mix(_ref_seqTyping_{{pid}}.unique{it.name}).set{mafft_input}
}

//{{ input_channel }}.map{ it[1] }.mix(_ref_seqTyping_{{ pid }}.unique()).set{mafft_input}

// Concatenates all collected assemblies into one FASTA file and aligns them
// with mafft. The input channel is collected, so a single task is run; the
// alignment file is named after the workflow script.
process mafft_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { 'mafft' }

    publishDir "results/alignment/mafft_{{ pid }}/"

    input:
    file(assembly) from mafft_input.collect()

    output:
    file ("*.align") into {{ output_channel }}
    {% with task_name="mafft", sample_id="val('single')" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    """
    cat ${assembly} > all_assemblies.fasta

    mafft --adjustdirection --thread $task.cpus --auto all_assemblies.fasta > ${workflow.scriptName}.align
    """

}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/mapping_patlas.nf
================================================
// checks if cutoff value is higher than 0
// Negative values are rejected together with 0, as the message below asks for
// a value higher than 0.
if (Float.parseFloat(params.cov_cutoff{{ param_id }}.toString()) <= 0) {
    exit 1, "Cutoff value of 0 will output every plasmid in the database with coverage 0. Provide a value higher than 0."
}

// Parameters exposed to the processes below as value channels
IN_index_files_{{ pid }} = Channel.value(params.refIndex{{ param_id }})
IN_samtools_indexes_{{ pid }} = Channel.value(params.samtoolsIndex{{ param_id }})
IN_length_json_{{ pid }} = Channel.value(params.lengthJson{{ param_id }})
IN_cov_cutoff_{{ pid }} = Channel.value(params.cov_cutoff{{ param_id }})


// process that runs bowtie2 and samtools
// Paired reads are mapped with bowtie2 and piped through 'samtools view' and
// 'samtools sort'. The sorted BAM is indexed, per-position depth is written to
// 'samtoolsDepthOutput_<sample_id>.txt', and the BAM files are removed at the
// end, so the depth file is the only output.
process mappingBowtie_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set sample_id, file(reads) from {{ input_channel }}
    val bowtie2Index from IN_index_files_{{ pid }}
    val samtoolsIdx from IN_samtools_indexes_{{ pid }}

    output:
    set sample_id, file("samtoolsDepthOutput*.txt") into samtoolsResults
    {% with task_name="mappingBowtie" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:

    // Single-end support is disabled; reads are always passed as a pair
    //if (params.singleEnd == true) {
    //    readsString = "-U ${reads}"
    //}
    //else {
    readsString = "-1 ${reads[0]} -2 ${reads[1]}"
    //}

    """
    bowtie2 -x ${bowtie2Index} ${readsString} -p ${task.cpus} -a -5 ${params.trim5{{ param_id }}} | \
    samtools view -b -t ${samtoolsIdx} -@ ${task.cpus} - | \
    samtools sort -@ ${task.cpus} -o samtoolsSorted_${sample_id}.bam
    samtools index samtoolsSorted_${sample_id}.bam
    samtools depth samtoolsSorted_${sample_id}.bam > \
    samtoolsDepthOutput_${sample_id}.txt
    rm samtoolsSorted_${sample_id}.bam*
    """
}

/**
* This dumping process parses the depth file for each sample and filters it
* depending on the cutoff set by the user. The work is done by the
* "mapping2json.py" template; the JSON output is optional.
*/
process jsonDumpingMapping_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir 'results/mapping/mapping_json_{{ pid }}/'

    input:
    set sample_id, file(depthFile) from samtoolsResults
    val lengthJson from IN_length_json_{{ pid }}
    val cov_cutoff from IN_cov_cutoff_{{ pid }}

    output:
    set sample_id, file("samtoolsDepthOutput*.txt_mapping.json") optional true into mappingOutputChannel_{{ pid }}
    {% with task_name="jsonDumpingMapping", sample_id="sample_id" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "mapping2json.py"
}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/mark_duplicates.nf
================================================
// Runs GATK MarkDuplicates on each sample's BAM file. The marked BAM and the
// index requested with --CREATE_INDEX go to the main output channel; the
// metrics file is sent to a separate channel.
process mark_duplicates_{{ pid }} {

    {% include "post.txt" ignore missing %}

    input:
    set sample_id, file(bam), file(bai) from {{ input_channel }}
   
    output:
    set val(sample_id), file("${sample_id}_mark_dup.bam"), file("${sample_id}_mark_dup.bai") into {{ output_channel }}
    set file("metrics.txt") into markDupMultiQC_{{pid}}
    {% with task_name="mark_duplicates" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    """
    gatk MarkDuplicates \
      -I $bam \
      -M metrics.txt \
      -O ${sample_id}_mark_dup.bam \
      --CREATE_INDEX
    """
}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/mash_dist.nf
================================================
IN_shared_hashes_{{ pid }} = Channel.value(params.shared_hashes{{ param_id }})

// Every item sent to the mash dist process has three elements; the third one
// is the reference sketch to compare against.
IN_mash_dist_input = Channel.create()
if (!binding.hasVariable("SIDE_mashSketchOutChannel_{{ pid }}")){
    // No sketch side channel: always use the reference file given by the
    // refFile parameter
    {{ input_channel }}
        .map{ rec -> [rec[0], rec[1], params.refFile{{ param_id }}] }
        .into(IN_mash_dist_input)
} else {
    // The side channel with the sketch exists: join the corresponding .msh
    // file with the appropriate sample_id
    {{ input_channel }}
        .join(SIDE_mashSketchOutChannel_{{ pid }})
        .into(IN_mash_dist_input)
}

// runs mash dist
// Compares each FASTA file against the reference carried in the third element
// of the input tuple. The p-value and distance thresholds are read from the
// pValue and mash_distance parameters, and the result is written to
// '<fasta>_mashdist.txt'.
process runMashDist_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir 'results/mashdist/mashdist_txt_{{ pid }}/'

    input:
    set sample_id, file(fasta), refFile from IN_mash_dist_input

    output:
    set sample_id, file(fasta), file("*_mashdist.txt") into mashDistOutChannel_{{ pid }}
    {% with task_name="runMashDist", sample_id="sample_id" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    """
    mash dist -i -p ${task.cpus} -v ${params.pValue{{ param_id }}} \
    -d ${params.mash_distance{{ param_id }}} ${refFile} ${fasta} > ${fasta}_mashdist.txt
    """

}

// parses mash dist output to a json file that can be imported into pATLAS
// The conversion is done by the "mashdist2json.py" template, which also
// receives the shared_hashes value. The JSON output is optional.
process mashDistOutputJson_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir 'results/mashdist/mashdist_json_{{ pid }}/'

    input:
    set sample_id, fasta, file(mashtxt) from mashDistOutChannel_{{ pid }}
    val shared_hashes from IN_shared_hashes_{{ pid }}

    output:
    set sample_id, file("*.json") optional true into {{ output_channel }}
    {% with task_name="mashDistOutputJson", sample_id="sample_id" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "mashdist2json.py"

}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/mash_screen.nf
================================================
// Reference sketch: taken from the sketch side channel when that variable
// exists in the script binding, otherwise from the refFile parameter
IN_reference_file_{{ pid }} = binding.hasVariable("SIDE_mashSketchOutChannel_{{ pid }}") ?
    SIDE_mashSketchOutChannel_{{ pid }} :
    Channel.value(params.refFile{{ param_id }})

// check if noWinner is provided or not: the "-w" flag is only added while the
// noWinner parameter is false
winnerVar = (params.noWinner{{ param_id }} != false) ? "" : "-w"

// process to run mashScreen and sort the output into
// sortedMashScreenResults_{sampleId}.txt
// The identity and p-value thresholds are read from the identity and pValue
// parameters; 'sort -gr' orders the result lines numerically in reverse.
process mashScreen_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set sample_id, file(reads) from {{ input_channel }}
    val refFile from IN_reference_file_{{ pid }}

    output:
    set sample_id, file("sortedMashScreenResults*.txt") into mashScreenResults_{{ pid }}
    {% with task_name="mashScreen", sample_id="sample_id" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    """
    mash screen -i ${params.identity{{ param_id }}} -v ${params.pValue{{ param_id }}} -p \
    ${task.cpus} ${winnerVar} ${refFile} ${reads} > mashScreenResults_${sample_id}.txt
    sort -gr mashScreenResults_${sample_id}.txt > sortedMashScreenResults_${sample_id}.txt
    """
}

// process to parse the output to json format
// The conversion is done by the "mashscreen2json.py" template; the JSON
// output is optional and is published in copy mode.
process mashOutputJson_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir 'results/mashscreen/mashscreen_json_{{ pid }}', mode: 'copy'

    input:
    set sample_id, file(mashtxt) from mashScreenResults_{{ pid }}

    output:
    set sample_id, file("sortedMashScreenResults*.json") optional true into mashScreenOutputChannel_{{ pid }}
    {% with task_name="mashOutputJson", sample_id="sample_id" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "mashscreen2json.py"
}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/mash_sketch_fasta.nf
================================================
// k-mer size and sketch size, exposed to every task as value channels
IN_kmerSize_{{ pid }} = Channel.value(params.kmerSize{{ param_id }})
IN_sketchSize_{{ pid }} = Channel.value(params.sketchSize{{ param_id }})

// runs mash sketch
// The input FASTA is forwarded unchanged on the main output channel, while
// the generated '.msh' sketch is emitted, keyed by sample id, on a side
// channel.
process mashSketchFasta_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set sample_id, file(fasta) from {{ input_channel }}
    val kmer from IN_kmerSize_{{ pid }}
    val sketch from IN_sketchSize_{{ pid }}

    output:
    set sample_id, file(fasta) into  {{ output_channel }}
    set sample_id, file("*.msh") into SIDE_mashSketchOutChannel_{{ pid }}
    {% with task_name="mashSketchFasta", sample_id="sample_id" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    """
    mash sketch -i -k ${kmer} -s ${sketch} ${fasta}
    """

}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/mash_sketch_fastq.nf
================================================
// k-mer size, sketch size and minimum k-mer copy number, exposed to every
// task as value channels
IN_kmerSize_{{ pid }} = Channel.value(params.kmerSize{{ param_id }})
IN_sketchSize_{{ pid }} = Channel.value(params.sketchSize{{ param_id }})
//IN_genomeSize_{{ pid }} = Channel.value(params.genomeSize{{ param_id }})
IN_minKmer_{{ pid }} = Channel.value(params.minKmer{{ param_id }})


// checks if genomeSize was provided
// When the parameter is false no extra argument is passed; otherwise the
// "-g <value>" argument string is built for mash sketch.
genomeSize = (params.genomeSize{{ param_id }} == false) ? "" : "-g ${params.genomeSize{{ param_id }}}"

// runs mash sketch
process mashSketchFastq_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set sample_id, file(fastq) from {{ input_channel }}
    val kmer from IN_kmerSize_{{ pid }}
    val sketch from IN_sketchSize_{{ pid }}
    val minKmer from IN_minKmer_{{ pid }}

    output:
    set sample_id, file(fastq) into  {{ output_channel }}
    file("*.msh") into SIDE_mashSketchOutChannel_{{ pid }}
    {% with task_name="mashSketchFastq", sample_id="sample_id" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    """
    mash sketch -r -k ${kmer} -s ${sketch} -m ${minKmer} ${genomeSize} ${fastq}
    """

}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/maxbin2.nf
================================================
// MaxBin2 options wrapped in value channels (note: the "lenght" spelling is
// part of the public parameter name and must be kept).
IN_min_contig_lenght_{{ pid }} = Channel.value(params.min_contig_lenght{{ param_id }})
IN_max_iteration_{{ pid }} = Channel.value(params.max_iteration{{ param_id }})
IN_prob_threshold_{{ pid }} = Channel.value(params.prob_threshold{{ param_id }})

// "true"/"false" string derived from the clearInput parameter; the shell
// script compares against this string to decide whether to delete its inputs.
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Bins the contigs of an assembly with MaxBin2.
// Inputs : the assembly channel joined with the `_LAST_fastq` channel, giving
//          (sample_id, assembly, fastq pair), plus the MaxBin2 options.
// Outputs: (sample_id, assembly, bin fastas, bin_status.txt) for downstream
//          routing, the abundance/log/summary files, and the summary file for
//          the report process.
process maxbin2_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir "results/assembly/binning/maxbin2_{{ pid }}/${sample_id}/"

    input:
    set sample_id, file(assembly), file(fastq) from {{ input_channel }}.join(_LAST_fastq_{{ pid }})
    val minContigLenght from IN_min_contig_lenght_{{ pid }}
    val maxIterations from IN_max_iteration_{{ pid }}
    val probThreshold from IN_prob_threshold_{{ pid }}
    val clear from checkpointClear_{{ pid }}

    output:
    set sample_id, file(assembly), file ('*_maxbin.*.fasta'), file ('bin_status.txt') into binCh_{{ pid }}
    file '*_maxbin.{abundance,log,summary}'
    set sample_id, file("*_maxbin.summary") into intoReport_{{ pid }}

    {% with task_name="maxbin2" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // The script runs MaxBin2, records "pass" in .status, writes
    // bin_status.txt ("true" when at least one bin fasta exists, otherwise
    // "false" together with a placeholder false_maxbin.0.fasta so the output
    // glob still matches) and, when clear is "true", removes the resolved
    // input files if the first fastq lives inside a Nextflow work directory.
    // Any failure in that group writes "fail" to .status instead.
    script:
    """
    {
        run_MaxBin.pl -contig ${assembly} -out ${sample_id}_maxbin -reads ${fastq[0]} -reads2 ${fastq[1]} \
        -thread $task.cpus -min_contig_length ${minContigLenght} -max_iteration ${maxIterations} \
        -prob_threshold ${probThreshold}

        echo pass > .status

        #in case maxbin fails to bin sequences for a sample:
        if ls *_maxbin.*.fasta 1> /dev/null 2>&1; then echo "true" > bin_status.txt; else echo "false" \
        > false_maxbin.0.fasta; echo "false" > bin_status.txt; fi


        if [ "$clear" = "true" ];
        then
            work_regex=".*/work/.{2}/.{30}/.*"
            file_source1=\$(readlink -f \$(pwd)/${fastq[0]})
            file_source2=\$(readlink -f \$(pwd)/${fastq[1]})
            assembly_file=\$(readlink -f \$(pwd)/${assembly})
            if [[ "\$file_source1" =~ \$work_regex ]]; then
                rm \$file_source1 \$file_source2 \$assembly_file
            fi
        fi
    } || {
        echo fail > .status
    }
    """
}

// Feeds the MaxBin2 summary file of each sample to the `process_tsv.py`
// template. The only declared outputs are those contributed by the included
// compiler_channels snippet.
process report_maxbin2_{{ pid }}{

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set sample_id, file(tsv) from  intoReport_{{ pid }}

    output:
    {% with task_name="report_maxbin2" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "process_tsv.py"

}

// If maxbin fails to obtain bins for a sample, the workflow continues with the original assembly
{{ output_channel }} = Channel.create()

OUT_binned = Channel.create()
OUT_unbinned = Channel.create()

failedBinning = Channel.create()
successfulBinning = Channel.create()

// Route on the content of bin_status.txt (4th tuple element): the exact text
// "false\n" goes to failedBinning (index 0), anything else to
// successfulBinning (index 1).
binCh_{{ pid }}.choice(failedBinning, successfulBinning){ it -> it[3].text == "false\n" ? 0 : 1 }

// Unbinned samples are forwarded as (sample_id, original assembly).
failedBinning.map{ it -> [it[0], it[1]] }.into(OUT_unbinned)

// Binned samples are expanded to one item per bin file; each item is keyed by
// the bin file name with its directory and last extension removed, so every
// bin travels downstream as its own "sample".
successfulBinning.map{ it -> [it[2].toString().tokenize('/').last().tokenize('.')[0..-2].join('.'), it[2]]}
    .transpose()
    .map{it -> [it[1].toString().tokenize('/').last().tokenize('.')[0..-2].join('.'),it[1]]}
    .into(OUT_binned)

// Merge both routes into the component's main output channel.
OUT_binned.mix(OUT_unbinned).set{ {{ output_channel }} }


{{ forks }}

================================================
FILE: flowcraft/generator/templates/megahit.nf
================================================
// Parameter validation: a value that does not contain at least two
// space-separated tokens is only accepted when it is exactly 'auto';
// otherwise the pipeline exits with an error.
if ( params.megahitKmers{{ param_id }}.toString().split(" ").size() <= 1 ){
    if (params.megahitKmers{{ param_id }}.toString() != 'auto'){
        exit 1, "'megahitKmers{{ param_id }}' parameter must be a sequence of space separated numbers or 'auto'. Provided value: ${params.megahitKmers{{ param_id }}}"
    }
}
IN_megahit_kmers_{{ pid }} = Channel.value(params.megahitKmers{{ param_id }})

// "true"/"false" string derived from the clearInput parameter, handed to the
// task as a value input.
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Runs the `megahit.py` template on each read pair.
// Inputs : the reads channel joined with SIDE_max_len, giving
//          (sample_id, fastq_pair, max_len), plus the k-mer setting and the
//          clear flag.
// Outputs: the final assembly fasta on the main output channel and the
//          per-k intermediate contig files for the fastg conversion process.
process megahit_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    publishDir 'results/assembly/megahit_{{ pid }}/', pattern: '*_megahit*.fasta', mode: 'copy'

    input:
    set sample_id, file(fastq_pair), max_len from {{ input_channel }}.join(SIDE_max_len_{{ pid }})
    val kmers from IN_megahit_kmers_{{ pid }}
    val clear from checkpointClear_{{ pid }}

    output:
    set sample_id, file('*megahit*.fasta') into {{ output_channel }}
    set sample_id, file('megahit/intermediate_contigs/k*.contigs.fa') into IN_fastg{{ pid }}
    {% with task_name="megahit" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "megahit.py"

}

// "true"/"false" string derived from the fastg parameter; the shell script
// below only converts contigs when it equals "true".
fastg = params.fastg{{ param_id }} ? "true" : "false"

// Optionally converts the intermediate MEGAHIT contig files
// (k<size>.contigs.fa) of each sample into FASTG graphs with
// `megahit_toolkit contig2fastg`. The *.fastg files are optional outputs
// because nothing is produced when the conversion is disabled.
process megahit_fastg_{{ pid }}{

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    publishDir "results/assembly/megahit_{{ pid }}/$sample_id", pattern: "*.fastg"

    input:
    set sample_id, file(kmer_files) from IN_fastg{{ pid }}
    val run_fastg from fastg

    output:
    file "*.fastg" optional true
    {% with task_name="megahit_fastg" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    """
    if [ ${run_fastg} == "true" ]
    then
        for kmer_file in ${kmer_files};
        do
            echo \$kmer_file
            # File names look like k21.contigs.fa, while contig2fastg expects
            # the numeric k-mer size: keep the first dot-separated field and
            # strip its leading 'k'.
            k=\$(basename \$kmer_file | cut -d '.' -f 1 | sed 's/^k//');
            echo \$k
            megahit_toolkit contig2fastg \$k \$kmer_file > \$kmer_file'.fastg';
        done
    fi
    """
}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/metabat2.nf
================================================
// MetaBAT2 options wrapped in value channels.
IN_contig_percentage_{{ pid }} = Channel.value(params.maxPercentage{{ param_id }})
IN_length_threshold_{{ pid }} = Channel.value(params.minContig{{ param_id }})

// "true"/"false" string derived from the clearInput parameter.
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Bins the contigs of an assembly with MetaBAT2.
// Inputs : (sample_id, assembly, bam_file, bam_index) tuples plus the
//          MetaBAT2 options and the clear flag.
// Outputs: (sample_id, assembly, bin fastas, bin_status.txt) for downstream
//          routing and (sample_id, bin fastas) for the report process.
// NOTE(review): `clear` is declared as an input but the script below never
// references it, so no input clean-up happens in this process.
process metabat2_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    //publishDir "results/assembly/binning/metabat2_{{ pid }}/${sample_id}/"

    input:
    set sample_id, file(assembly), file(bam_file), file(bam_index) from {{ input_channel }}
    val contig_percentage from IN_contig_percentage_{{ pid }}
    val length_threshold from IN_length_threshold_{{ pid }}
    val clear from checkpointClear_{{ pid }}

    output:
    set sample_id, file(assembly), file('*metabat-bins*/*.fa'), file ('bin_status.txt') into binCh_{{ pid }}
    set sample_id, file('*metabat-bins*/*.fa') into intoReport_{{ pid }}
    {% with task_name="metabat2"%}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // The script re-sorts and re-indexes the BAM, runs runMetaBat.sh and then
    // writes bin_status.txt: "false" (plus a placeholder false_bin.fa moved
    // into the bins directory so the output glob still matches) when the bins
    // directory is empty, "true" otherwise. A failure anywhere in the group
    // writes "fail" to .status.
    script:
    """
    {
        # prevent indexing errors
        samtools sort ${bam_file} sorted
        samtools index sorted.bam

        # run METABAT2
        runMetaBat.sh -m ${length_threshold} --unbinned --maxP ${contig_percentage} ${assembly} sorted.bam

        # In case no sequences are binned
        if [ -z "\$(ls -A *metabat-bins*/)" ]; then
            echo "false" > false_bin.fa
            mv false_bin.fa *metabat-bins*/
            echo "false" > bin_status.txt;
        else
            echo "true" > bin_status.txt
        fi

    } || {
        echo fail > .status
    }
    """
}

// Feeds the MetaBAT2 bin files of each sample to the `process_metabat.py`
// template. The only declared outputs are those contributed by the included
// compiler_channels snippet.
process report_metabat2_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set sample_id, file(bins) from intoReport_{{ pid }}

    output:
    {% with task_name="report_metabat2" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "process_metabat.py"
}

// If MetaBAT2 fails to obtain bins for a sample, the workflow continues with the original assembly
{{ output_channel }} = Channel.create()

OUT_binned = Channel.create()
OUT_unbinned = Channel.create()

failedBinning = Channel.create()
successfulBinning = Channel.create()

// Route on the content of bin_status.txt (4th tuple element): the exact text
// "false\n" goes to failedBinning (index 0), anything else to
// successfulBinning (index 1).
binCh_{{ pid }}.choice(failedBinning, successfulBinning){ it -> it[3].text == "false\n" ? 0 : 1 }

// Unbinned samples are forwarded as (sample_id, original assembly).
failedBinning.map{ it -> [it[0], it[1]] }.into(OUT_unbinned)

// Binned samples are expanded to one item per bin file; each item is keyed by
// the bin file name with its directory and last extension removed.
successfulBinning.map{ it -> [it[2].toString().tokenize('/').last().tokenize('.')[0..-2].join('.'), it[2]]}
    .transpose()
    .map{it -> [it[1].toString().tokenize('/').last().tokenize('.')[0..-2].join('.'),it[1]]}
    .into(OUT_binned)

// Merge both routes into the component's main output channel.
OUT_binned.mix(OUT_unbinned).set{ {{ output_channel }} }

{{ forks }}

================================================
FILE: flowcraft/generator/templates/metamlst.nf
================================================
// MetaMLST database and bowtie2 index locations wrapped in value channels.
IN_metamlstDB_{{ pid }} = Channel.value(params.metamlstDB{{ param_id }})
IN_metamlstDB_index_{{ pid }} = Channel.value(params.metamlstDB_index{{ param_id }})


// Maps a read pair against the MetaMLST bowtie2 index, then runs
// metamlst.py and metamlst-merge.py on the resulting BAM.
// The merged text reports are optional outputs; `saveAs` publishes them under
// their bare file name (everything up to the last "/" is dropped).
process metamlst_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    publishDir "results/annotation/metamlst_{{ pid }}/${sample_id}", saveAs: { it.split("/").last() }

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val metamlstDB from IN_metamlstDB_{{ pid }}
    val metamlstDB_index from IN_metamlstDB_index_{{ pid }}

    output:
    file 'out/merged/*.txt' optional true
    {% with task_name="metamlst" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    """
    bowtie2 --very-sensitive-local -a --no-unal -x ${metamlstDB_index} -1 ${fastq_pair[0]} -2 ${fastq_pair[1]} | samtools view -bS - > ${sample_id}.bam

    metamlst.py -d ${metamlstDB} ${sample_id}.bam

    metamlst-merge.py -d ${metamlstDB} out/
    """

}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/metaprob.nf
================================================
// MetaProb options wrapped in value channels.
IN_feature_{{ pid }} = Channel.value(params.feature{{ param_id }})
IN_metaProbQMer_{{ pid }} = Channel.value(params.metaProbQMer{{ param_id }})

// runs metaProb
// Inputs : (sample_id, fastq_pair) tuples plus the `feature` and
//          `metaProbQMer` options.
// Outputs: (sample_id, *clusters.csv) on metaProbOutChannel.
process metaProb_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir 'results/metaprob/'

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val feature from IN_feature_{{ pid }}
    val metaProbQMer from IN_metaProbQMer_{{ pid }}

    output:
    set sample_id, file("*clusters.csv") into metaProbOutChannel_{{ pid }}
    {% with task_name="metaProb", sample_id="sample_id" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // The reads are decompressed to plain fastq files before being handed to
    // MetaProb. The -m option must reference the `metaProbQMer` input declared
    // above; any other name is an undefined variable at task creation time.
    """
    gunzip -c ${fastq_pair[0]} > ${sample_id}_read1.fastq
    gunzip -c ${fastq_pair[1]} > ${sample_id}_read2.fastq

    MetaProb -pi ${sample_id}_read1.fastq ${sample_id}_read2.fastq -feature ${feature} -m ${metaProbQMer}
    """

}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/metaspades.nf
================================================
// Parameter validation: a value that does not contain at least two
// space-separated tokens is only accepted when it is exactly 'auto';
// otherwise the pipeline exits with an error.
if ( params.metaspadesKmers{{ param_id }}.toString().split(" ").size() <= 1 ){
    if (params.metaspadesKmers{{ param_id }}.toString() != 'auto'){
        exit 1, "'metaspadesKmers{{ param_id }}' parameter must be a sequence of space separated numbers or 'auto'. Provided value: ${params.metaspadesKmers{{ param_id }}}"
    }
}
IN_metaspades_kmers_{{pid}} = Channel.value(params.metaspadesKmers{{ param_id }})

// "true"/"false" string derived from the clearInput parameter, handed to the
// task as a value input.
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Runs the `metaspades.py` template on each read pair.
// Inputs : the reads channel joined with SIDE_max_len, giving
//          (sample_id, fastq_pair, max_len), plus the k-mer setting and the
//          clear flag.
// Outputs: (sample_id, *_metaspades*.fasta) on the main output channel.
process metaspades_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    publishDir 'results/assembly/metaspades_{{ pid }}/', pattern: '*_metaspades*.fasta', mode: 'copy'

    input:
    set sample_id, file(fastq_pair), max_len from {{ input_channel }}.join(SIDE_max_len_{{ pid }})
    val kmers from IN_metaspades_kmers_{{pid}}
    val clear from checkpointClear_{{ pid }}

    output:
    set sample_id, file('*_metaspades*.fasta') into {{ output_channel }}
    {% with task_name="metaspades" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "metaspades.py"

}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/midas_species.nf
================================================
// The MIDAS database path has no usable default: abort early when it is null.
if (params.midasDB{{ param_id }} == null){
    exit 1, "The path to the midas database must be provided with the 'midasDB{{ param_id }}' option."
}

IN_midas_DB_{{ pid }} = Channel.value(params.midasDB{{ param_id }})

// Runs the MIDAS species step on each read pair and renames the resulting
// species profile so that it carries the sample id; the renamed *.txt file is
// published under results/taxonomy/midas/.
process midas_species_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir "results/taxonomy/midas/", pattern: "*.txt"

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val midasDB from IN_midas_DB_{{ pid }}

    output:
    file("${sample_id}_midas_species_profile.txt")
    {% with task_name="midas_species" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    """
    run_midas.py species midas/ -d ${midasDB} -t $task.cpus -1 ${fastq_pair[0]} -2 ${fastq_pair[1]} --remove_temp

    mv midas/species/species_profile.txt ./${sample_id}_midas_species_profile.txt
    """
}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/mlst.nf
================================================

// Runs `mlst` on an assembly and compares the detected scheme/species
// (2nd column of the mlst output) with the expected one from the
// `mlstSpecies` parameter.
// Outputs: the per-sample *.mlst.txt report on LOG_mlst, and
//          (sample_id, assembly, .status) on MAIN_mlst_out, where .status
//          holds "pass" or "fail" without a trailing newline (downstream
//          filtering compares against the exact text "fail").
process mlst_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    // This process can only use a single CPU
    cpus 1

    input:
    set sample_id, file(assembly) from {{ input_channel }}

    output:
    file '*.mlst.txt' into LOG_mlst_{{ pid }}
    set sample_id, file(assembly), file(".status") into MAIN_mlst_out_{{ pid }}
    {% with task_name="mlst" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // The species comparison quotes both operands: with unquoted operands an
    // empty detected species turns the test into `[ ! = <expected> ]`, which
    // evaluates to false and would wrongly record "pass".
    script:
    """
    {
        expectedSpecies="${params.mlstSpecies{{ param_id }}}"
        mlst $assembly >> ${sample_id}.mlst.txt
        mlstSpecies=\$(cat *.mlst.txt | cut -f2)
        json_str="{'expectedSpecies':\'\$expectedSpecies\',\
            'species':'\$mlstSpecies',\
            'st':'\$(cat *.mlst.txt | cut -f3)',\
            'tableRow':[{'sample':'${sample_id}','data':[\
                {'header':'MLST species','value':'\$mlstSpecies','table':'typing'},\
                {'header':'MLST ST','value':'\$(cat *.mlst.txt | cut -f3)','table':'typing'}]}]}"
        echo \$json_str > .report.json

        if [ "\$mlstSpecies" != "\$expectedSpecies" ];
        then
            printf fail > .status
        else
            printf pass > .status
        fi

    } || {
        printf fail > .status
    }
    """
}

// Concatenates the per-sample mlst reports, collected into a single list,
// into one mlst_report.tsv that is published under results/annotation.
process compile_mlst_{{ pid }} {

    publishDir "results/annotation/mlst_{{ pid }}/"

    input:
    file res from LOG_mlst_{{ pid }}.collect()

    output:
    file "mlst_report.tsv"

    script:
    """
    cat $res >> mlst_report.tsv
    """
}

// Only samples whose .status file does not read exactly "fail" are forwarded,
// as (sample_id, assembly), on the component's main output channel.
{{ output_channel }} = Channel.create()
MAIN_mlst_out_{{ pid }}
    .filter{ it[2].text != "fail" }
    .map{ [it[0], it[1]] }
    .set{ {{output_channel}} }


{{ forks }}


================================================
FILE: flowcraft/generator/templates/momps.nf
================================================

// "true"/"false" string derived from the clearInput parameter; the shell
// script compares against this string to decide whether to delete its inputs.
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Runs mompS on an assembly and its read pair.
// Inputs : the assembly channel joined with `_LAST_fastq`, giving
//          (sample_id, assembly, fastq pair), plus the clear flag.
// Outputs: <sample>_st.tsv (sample id and sequence type, "ND" when no ST
//          could be extracted) and <sample>_profile.tsv (sample id and line 7
//          of the mompS result file).
// NOTE(review): the sample id and value are passed to an unquoted `echo`, so
// the tab between them is collapsed to a single space in the written files —
// confirm whether downstream consumers expect real tabs.
process momps_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set sample_id, file(assembly), file(fastq) from {{ input_channel }}.join(_LAST_fastq_{{ pid }})
    val clear from checkpointClear_{{ pid }}

    output:
    file("*_st.tsv") into momps_st_{{ pid }}
    file("*_profile.tsv") into momps_profile_{{ pid }}
    {% with task_name="momps" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // The clean-up of the `res` directory in the failure paths uses `rm -rf`:
    // when mompS fails before creating the directory a plain `rm -r` would
    // itself fail and abort the error handler.
    script:
    """
    {
        # Stage in momps source files. This cannot be a symlink because the files
        # need to be writable.
        cp -r /NGStools/mompS/* .
        momps.pl -r ${fastq[0]} -f ${fastq[1]} -a $assembly -o res -p $sample_id -t ${task.cpus}
        # Get the ST for the sample
        if [ -f "res/${sample_id}.MLST_res.txt" ]
        then
            st=\$(grep -oP "ST = \\K\\w+" res/*.MLST_res.txt)
            # If the ST cannot be determined, set string to ND
            if [ -z "\$st" ]
            then
                st="ND"
            fi
            echo $sample_id\t\${st}> ${sample_id}_st.tsv
            # Add ST information to report JSON
            json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'mompS','value':'\$st','table':'typing'}]}]}"
            echo \$json_str > .report.json
            # Get the profile for the sample
            echo $sample_id\t\$(awk "NR == 7" res/*.MLST_res.txt) > ${sample_id}_profile.tsv
            rm -r res

            # Remove temporary input files when the clearInput option is used
            if [ "$clear" = "true" ];
            then
                work_regex=".*/work/.{2}/.{30}/.*"
                file_source1=\$(readlink -f \$(pwd)/${fastq[0]})
                file_source2=\$(readlink -f \$(pwd)/${fastq[1]})
                if [[ "\$file_source1" =~ \$work_regex ]]; then
                    rm \$file_source1 \$file_source2
                fi
            fi
        else
            echo fail > .status
            # The results directory may not exist at this point
            rm -rf res
        fi
    } || {
        echo fail > .status
        # Remove results directory (it may not exist if mompS failed early)
        rm -rf res
    }
    """

}


// Concatenates the per-sample ST and profile files, each collected into a
// single list, into momps_st.tsv and momps_profile.tsv; every *.tsv in the
// task directory is declared as output and published under results/typing.
process momps_report_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}
    publishDir "results/typing/momps_{{ pid }}/", pattern: "*.tsv"

    input:
    file(st_file) from momps_st_{{ pid }}.collect()
    file(profile_file) from momps_profile_{{ pid }}.collect()

    output:
    file "*.tsv"


    script:
    """
    cat $st_file >> momps_st.tsv
    cat $profile_file >> momps_profile.tsv
    """

}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/nextflow.config
================================================
// Pipeline identity; both values are filled in by the template engine.
manifest {
    name = "{{ pipeline_name }}"
    mainScript = "{{ nf_file }}"
}

params {
    platformHTTP = null
    reportHTTP = null

    // Setting this option to true will trigger the removal of temporary
    // data (usually fastq reads) at particular checkpoint processes that
    // modify that data. These checkpoint processes include 'trimmomatic',
    // 'spades' and 'skesa'.
    // WARNING: This will remove temporary fastq files that are not necessary
    // for the completion of the pipeline but, consequently, will disable
    // the resume functionality of the pipeline. However, it is often necessary
    // for very large pipelines and whenever disk space is critical.
    // More precisely, these checkpoint components will check whether the
    // putative temporary files are inside the nextflow work directory by
    // matching the regex: ".*/work/.{2}/.{30}/.*"
    // If it is a match, then the file is assumed to be a temporary one and
    // will be removed.
    clearAtCheckpoint = false
}

// Make the pipeline's templates directory available on PYTHONPATH and PATH;
// the escaped \$ keeps the pre-existing value of each variable.
env {
    PYTHONPATH = "$baseDir/templates:\$PYTHONPATH"
    PATH = "$baseDir/templates:\$PATH"
}

// Defaults applied to every process unless overridden elsewhere.
process {
    cpus = 1
    memory = "1GB"

    // Retry a failing task while its attempt number is at most 7, then
    // ignore the failure.
    errorStrategy = { task.attempt <= 7 ? "retry" : "ignore" }
    maxRetries = 7
    container = "flowcraft/flowcraft_base:1.0.0-1"
}

docker {
    // Added default docker option to avoid docker permission errors. See issue
    // #142
    runOptions = "-u \$(id -u):\$(id -g)"
}


// Settings for the local executor.
executor {
  $local {
      cpus = 4
  }
}

singularity {
    cacheDir = "$HOME/.singularity_cache"
    autoMounts = true
}

// Execution trace written to pipeline_stats.txt with the listed columns.
trace {
    enabled = true
    file = "pipeline_stats.txt"
    fields = "task_id,\
              hash,\
              process,\
              tag,\
              status,\
              exit,\
              start,\
              container,\
              cpus,\
              time,\
              disk,\
              memory,\
              duration,\
              realtime,\
              queue,\
              %cpu,\
              %mem,\
              rss,\
              vmem,\
              rchar,\
              wchar"
}

//                             PROFILE OPTIONS                               //
///////////////////////////////////////////////////////////////////////////////

// Execution profiles selectable with `-profile <name>`. Each profile only
// overrides the settings it lists; everything else comes from the base
// scopes of this file.
profiles {

    // SLURM + docker
    oneida {

        process.executor = "slurm"
        docker.enabled = true

        process{

            // MEMORY USAGE PER PROCESS //
            // general memory usage
            memory = "4GB"

        }

    }

    // INCD PROFILE
    incd {

        process.executor = "slurm"
        singularity.enabled = true

        singularity {
            cacheDir = "/mnt/singularity_cache"
            autoMounts = true
        }

        // Error and retry strategies. `maxRetries` is a process directive and
        // must be set inside the process scope; a bare `maxRetries` at profile
        // level is not a recognised setting and would be silently ignored.
        process.errorStrategy = "retry"
        process.maxRetries = 3

        process.$chewbbaca.queue = "chewBBACA"

        process {

            // MEMORY USAGE PER PROCESS //
            // general memory usage
            memory = "4GB"

        }

    }

    // SLURM PROFILE
    slurm {

        // Change executor for SLURM
        process.executor = "slurm"
        // Change container engine for Shifter
        shifter.enabled = true

        process {

            clusterOptions = "--qos=oneida"

            errorStrategy = "retry"
            maxRetries = 5

            // MEMORY USAGE PER PROCESS //
            // general memory usage
            memory = "4GB"

        }

    }

    // SLURM PROFILE (Oneida QOS, dedicated QOS for chewbbaca)
    slurmOneida {

        // Change executor for SLURM
        process.executor = "slurm"
        // Change container engine for Shifter
        shifter.enabled = true

        process {

            clusterOptions = "--qos=oneida"

            // MEMORY USAGE PER PROCESS //
            // general memory usage
            memory = "4GB"

            // Set QOS for chewbbaca in order to run a single job
            $chewbbaca.clusterOptions = "--qos=chewbbaca"
        }
    }
}

includeConfig "profiles.config"
includeConfig "resources.config"
includeConfig "containers.config"
includeConfig "params.config"
includeConfig "user.config"


================================================
FILE: flowcraft/generator/templates/params.config
================================================
// Pipeline parameters; the body of this scope is filled in by the template
// engine through the `params_info` variable.
params {

{{ params_info }}

}

================================================
FILE: flowcraft/generator/templates/patho_typing.nf
================================================
// Parameter validation: `species` is mandatory and must consist of exactly
// two space-separated words.
if ( !params.species{{ param_id }}){ exit 1, "'species' parameter missing" }
if ( params.species{{ param_id }}.toString().split(" ").size() != 2 ){
    exit 1, "'species' parameter must contain two values (e.g.: 'escherichia coli'). Provided value: ${params.species{{ param_id }}}"
}

IN_pathoSpecies_{{ pid }} = Channel.value(params.species{{ param_id }})

// Runs patho_typing.py on a read pair for the configured species.
// Exit codes 0 and 2 are both accepted as success and any other task error
// is ignored. The patho_typing* result files are optional outputs published
// per sample.
process patho_typing_{{ pid }} {

    validExitStatus 0, 2

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    errorStrategy "ignore"
    publishDir "results/pathotyping/${sample_id}/"

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val species from IN_pathoSpecies_{{ pid }}

    output:
    file "patho_typing*" optional true
    {% with task_name="patho_typing" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // The script works on a private, writable copy of ReMatCh, runs
    // patho_typing.py, then writes the .versions, .status and .report.json
    // dotfiles. The report value is the content of patho_typing.report.txt
    // when that file is non-empty; otherwise (or on any failure) the value is
    // 'NA' and the status is "fail".
    script:
    """
    {
        # Prevents read-only issues
        mkdir rematch_temp
        cp -r /NGStools/ReMatCh rematch_temp
        export PATH="\$(pwd)/rematch_temp/ReMatCh:\$PATH"

        patho_typing.py -f \$(pwd)/${fastq_pair[0]} \$(pwd)/${fastq_pair[1]} -o \$(pwd) -j $task.cpus --trueCoverage --species $species

        # Add information to dotfiles
        version_str="[{'program':'patho_typing.py','version':'0.4'}]"
        echo \$version_str > .versions

        rm -r rematch_temp
        echo pass > .status

        if [ -s patho_typing.report.txt ];
        then
            json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'pathotyping','value':'\$(cat patho_typing.report.txt)','table':'typing'}]}]}"
            echo \$json_str > .report.json
            echo pass > .status
        else
            json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'pathotyping','value':'NA','table':'typing'}]}]}"
            echo \$json_str > .report.json
            echo fail > .status
        fi
    } || {
        echo fail > .status
        json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'pathotyping','value':'NA','table':'typing'}]}]}"
        echo \$json_str > .report.json
    }
    """

}


================================================
FILE: flowcraft/generator/templates/patlas_consensus.nf
================================================

/**
* A process that creates a consensus from all the outputted json files.
*
* Input : (sample_id, list of files) tuples from the compile channel injected
*         by the template engine.
* Output: the consensus_*.json file(s) produced by the
*         `pATLAS_consensus_json.py` template, published under
*         results/consensus_<pid>/.
*
* NOTE(review): unlike the other components the process name carries no pid
* suffix — confirm it can only be instantiated once per pipeline.
*/
process fullConsensus {

    tag { sample_id }

    publishDir 'results/consensus_{{ pid }}/'

    input:
    set sample_id, file(infile_list) from {{ compile_channels }}

    output:
    file "consensus_*.json"

    script:
    template "pATLAS_consensus_json.py"

}

================================================
FILE: flowcraft/generator/templates/pilon.nf
================================================

// "true"/"false" string derived from the clearInput parameter; the shell
// script compares against this string to decide whether to delete its inputs.
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Polishes an assembly with Pilon using the mapped reads.
// Inputs : (sample_id, assembly, bam_file, bam_index) tuples plus the clear
//          flag.
// Outputs: (sample_id, *_polished.fasta), sent both to the main output
//          channel and to the report process.
process pilon_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    echo false
    publishDir 'results/assembly/pilon_{{ pid }}/', mode: 'copy', pattern: "*.fasta"

    input:
    set sample_id, file(assembly), file(bam_file), file(bam_index) from {{ input_channel }}
    val clear from checkpointClear_{{ pid }}

    output:
    set sample_id, '*_polished.fasta' into {{ output_channel }}, pilon_report_{{ pid }}
    {% with task_name="pilon" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // pilon_mem is the string form of task.memory with its last character
    // dropped and all whitespace removed; it is used as the Java -Xmx value.
    // The output prefix is the assembly file name with its last extension
    // replaced by "_polished". When clear is "true" the resolved assembly and
    // BAM files are removed if the assembly lives inside a Nextflow work
    // directory. Any failure in the group writes "fail" to .status.
    script:
    """
    {
        pilon_mem=${String.valueOf(task.memory).substring(0, String.valueOf(task.memory).length() - 1).replaceAll("\\s", "")}
        java -jar -Xms256m -Xmx\${pilon_mem} /NGStools/pilon-1.22.jar --genome $assembly --frags $bam_file --output ${assembly.name.replaceFirst(~/\.[^\.]+$/, '')}_polished --changes --threads $task.cpus >> .command.log 2>&1
        echo pass > .status

        if [ "$clear" = "true" ];
        then
            work_regex=".*/work/.{2}/.{30}/.*"
            assembly_file=\$(readlink -f \$(pwd)/${assembly})
            bam_file=\$(readlink -f \$(pwd)/${bam_file})
            if [[ "\$assembly_file" =~ \$work_regex ]]; then
                rm \$assembly_file \$bam_file
            fi
        fi

    } || {
        echo fail > .status
    }
    """

}

// Builds the per-sample assembly report for the polished assembly by running
// the `assembly_report.py` template on the polished fasta joined with the
// SIDE_BpCoverage channel.
// Output: *_assembly_report.csv, consumed by the compile process.
process pilon_report_{{ pid }} {

    {% with overwrite="false" %}
    {% include "report_post.txt" ignore missing %}
    {% endwith %}

    tag { sample_id }

    input:
    set sample_id, file(assembly), file(coverage_bp) from pilon_report_{{ pid }}.join(SIDE_BpCoverage_{{ pid }})

    output:
    file "*_assembly_report.csv" into pilon_report_out_{{ pid }}
    {% with task_name="pilon_report" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "assembly_report.py"

}


// Merges all per-sample assembly reports, collected into a single list, into
// pilon_assembly_report.csv: a fixed header line followed by the concatenated
// report files. The result is copied to reports/assembly.
process compile_pilon_report_{{ pid }} {

    publishDir "reports/assembly/pilon_{{ pid }}/", mode: 'copy'

    input:
    file(report) from pilon_report_out_{{ pid }}.collect()

    output:
    file "pilon_assembly_report.csv"

    """
    echo Sample,Number of contigs,Average contig size,N50,Total assembly length,GC content,Missing data > pilon_assembly_report.csv
    cat $report >> pilon_assembly_report.csv
    """
}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/pipeline_graph.html
================================================
<!DOCTYPE html>
<title>FlowCraft DAG tool</title>
<meta charset="utf-8"/>
<style>
    /* Circle drawn for each node of the tree */
    .node circle {
        stroke: steelblue;
        stroke-width: 3px;
    }

    /* Label next to each node */
    .node text {
        font: 14px sans-serif;
        font-weight: bold;
    }

    /* Path connecting a parent node to a child node */
    .link {
        fill: none;
        stroke: #acacac;
        stroke-width: 2px;
    }

    /* Floating tooltip; pointer events are disabled so it never intercepts
       the mouse. */
    div.tooltip {
        position: absolute;
        text-align: center;
        padding: 10px 15px 10px 15px;
        font: 14px sans-serif;
        background: lightsteelblue;
        box-shadow: 1px 2px 8px #626262;
        border-radius: 8px;
        pointer-events: none;
    }
</style>
<body>
</body>
<script src="https://d3js.org/d3.v4.min.js"></script>
<script>
    // Tree data injected by the Jinja template engine when the page is
    // rendered.
    const inputData = {{ data }}

    /**
     * This function creates a tooltip with the node/process information
     * on mouse over in the respective node. The tooltip is faded in and
     * positioned at the pointer location (28px above it).
     *
     * @param {Object} d - stores information of the node data (containing
     * name, input, output, etc) and parent info for this node
     */
    const mouseover = (d) => {
        div.transition()
            .duration(200)
            .style("opacity", .9)
        div.html(`<b>pid:</b> ${d.data.process.pid},<br>
            <b>lane:</b> ${d.data.process.lane},<br>
            <b>input:</b> ${d.data.process.input},<br>
            <b>output:</b> ${d.data.process.output},<br>
            <b>directives:</b><br>
            ${d.data.process.directives}
            `)
            .style("left", (d3.event.pageX) + "px")
            .style("top", (d3.event.pageY - 28) + "px")
            .style("text-align", "left")
    }

    /**
     * Hides the tooltip by fading its opacity to zero over half a second.
     *
     * @param {Object} d - stores information of the node data (containing
     * name, input, output, etc) and parent info for this node
     */
    const mouseout = (d) => {
        const fadeOut = div.transition().duration(500)
        fadeOut.style("opacity", 0)
    }

    /**
     * Function that collapses a node and all of its children (currently
     * disabled).
     * @param {Object} d - stores information of the node data (containing
     * name, input, output, etc) and parent info for this node
     */
    // const collapse = (d) => {
    //     if(d.children) {
    //         d._children = d.children
    //         d._children.forEach(collapse)
    //         d.children = null
    //     }
    // }

    // Set the dimensions and margins of the diagram
    const margin = {top: 20, right: 20, bottom: 20, left: 20},
        width = 1870,
        height = 860

    // Tooltip element shared by the mouseover/mouseout handlers; it starts
    // fully transparent.
    const div = d3.select("body").append("div")
        .attr("class", "tooltip")
        .style("opacity", 0)

    let i = 0,
        duration = 750

    let root
    // Assigns parent, children, height, depth
    root = d3.hierarchy(inputData, (d) => { return d.children })
    // Initial position of the root: vertically centred, at the left edge.
    root.x0 = height / 2
    root.y0 = 0

    // declares a tree layout and assigns the size
    const treemap = d3.tree().size([height, width])

    // Assigns the x and y position for the nodes
    const treeData = treemap(root)

    // append the svg object to the body of the page
    // appends a 'group' element to 'svg'
    // moves the 'group' element to the top left margin
    // Zooming/panning rewrites the transform of that group; zoom on
    // double click is disabled.
    const svg = d3.select("body")
        .append("svg")
        .attr("width", width + margin.right + margin.left)
        .attr("height", height + margin.top + margin.bottom)
        .call(d3.zoom().on("zoom", function () {
            svg.attr("transform", d3.event.transform)
        }))
        .on("dblclick.zoom", null)
        .append("g")
        .attr("transform", "translate("
            + margin.left + "," + margin.top + ")"
        )

    /**
     * Function that (re)draws the graph on load and after every node click
     *
     * @param {Object} source - Node the transitions are anchored to: entering
     * nodes and links start at its previous position (x0, y0) and exiting
     * ones collapse into its current position (x, y). It is the root node on
     * load and the clicked node afterwards.
     */
    const update = (source) => {

        /**
         * Creates a curved (diagonal) path between two points
         *
         * @param {Object} s - start point, read through its x and y fields
         * @param {Object} d - end point, read through its x and y fields
         * @returns {string} the SVG path description of the curve
         */
        const diagonal = (s, d) => {
            // Declared locally: assigning to an undeclared name would create
            // an implicit global (and throw in strict mode)
            const path = `M ${s.y} ${s.x}
            C ${(s.y + d.y) / 2} ${s.x},
              ${(s.y + d.y) / 2} ${d.x},
              ${d.y} ${d.x}`
            return path
        }

        /**
         * Function that toggles children on click
         *
         * @param {Object} d - stores information of the node data (containing
         * name, input, output, etc) and parent info for this node
         */
        const click = (d) => {
            if (d.children) {
                // Collapse: park the children where descendants() skips them
                d._children = d.children
                d.children = null
            } else {
                // Expand: restore the parked children
                d.children = d._children
                d._children = null
            }
            update(d)
        }

        // Collect the nodes to draw; every node but the root also stands for
        // the link to its parent
        let nodes = treeData.descendants(),
            links = treeData.descendants().slice(1)

        // hide root node (its depth is 0)
        nodes = nodes.filter( (d) => {
            return d.depth
        })

        // hide links to root
        links = links.filter( (d) => {
            return d.depth !== 1
        })

        // ****************** Nodes section ***************************

        // Update the nodes, giving each one a stable id on first sight
        const node = svg.selectAll('g.node')
            .data(nodes, (d) => { return d.id || (d.id = ++i) })

        // Enter any new nodes at the source's previous position.
        const nodeEnter = node.enter().append('g')
            .attr('class', 'node')
            .attr("transform", (d) => {
                return "translate(" + source.y0 + "," + source.x0 + ")"
            })
            .on('click', click)
            .on("mouseover", mouseover)
            .on("mouseout", mouseout)

        // Add Circle for the nodes
        nodeEnter.append('circle')
            .attr('class', 'node')
            .attr('r', 1e-6)
        // .style("fill", (d) => {
        //     return d._children ? "lightsteelblue" : "#fff"
        // })

        // Add labels for the nodes
        nodeEnter.append('text')
            .attr("y", "-20")
            .attr("text-anchor", "middle")
            .text( (d) => { return d.data.name } )

        // gets labels variable
        const labels = d3.selectAll("text")
        // returns the label with max width value
        const maxTextWidth = d3.max(labels.nodes(),
            n => n.getComputedTextLength())

        // Normalize for fixed-depth, according to max_width
        nodes.forEach( (d) => { d.y = d.depth * maxTextWidth} )

        // UPDATE
        const nodeUpdate = nodeEnter.merge(node)

        // Transition to the proper position for the node
        nodeUpdate.transition()
            .duration(duration)
            .attr("transform", (d) => {
                return "translate(" + d.y + "," + d.x + ")"
            })

        // Update the node attributes and style; collapsed nodes (those with
        // parked children) get a different fill colour
        nodeUpdate.select('circle.node')
            .attr('r', 10)
            .style("fill", (d) => {
                return d._children ? "#ffad6b" : "lightsteelblue"
            })
            .attr('cursor', 'pointer')


        // Remove any exiting nodes
        const nodeExit = node.exit().transition()
            .duration(duration)
            .attr("transform", (d) => {
                return "translate(" + source.y + "," + source.x + ")"
            })
            .remove()

        // On exit reduce the node circles size to 0
        nodeExit.select('circle')
            .attr('r', 1e-6)

        // On exit reduce the opacity of text labels
        nodeExit.select('text')
            .style('fill-opacity', 1e-6)

        // ****************** links section ***************************

        // Update the links...
        const link = svg.selectAll('path.link')
            .data(links, (d) => { return d.id })

        // Enter any new links at the source's previous position.
        const linkEnter = link.enter().insert('path', "g")
            .attr("class", "link")
            .attr('d', (d) => {
                const o = {x: source.x0, y: source.y0}
                return diagonal(o, o)
            })

        // merge links
        const linkUpdate = linkEnter.merge(link)

        // Transition each link to the curve joining the node and its parent
        linkUpdate.transition()
            .duration(duration)
            .attr('d', function(d){ return diagonal(d, d.parent) })

        // Remove any exiting links, shrinking them into the source position
        link.exit().transition()
            .duration(duration)
            .attr('d', (d) => {
                const o = {x: source.x, y: source.y}
                return diagonal(o, o)
            })
            .remove()

        // Store the old positions for transition.
        nodes.forEach( (d) => {
            d.x0 = d.x
            d.y0 = d.y
        })

    }
    // Collapse after the second level (currently disabled, so nothing is
    // collapsed before the first draw)
    // root.children.forEach(collapse);

    // Initial render, anchored on the root node
    update(root)

</script>


================================================
FILE: flowcraft/generator/templates/post.txt
================================================
    // Task hooks shared by the process templates. With params.platformHTTP set,
    // startup_POST.sh runs before the task and final_POST.sh plus
    // report_POST.sh run after it; otherwise only set_dotfiles.sh runs
    // beforehand. Both branches prepend the project bin directory to PATH.
    if ( params.platformHTTP != null ) {
        beforeScript "PATH=${workflow.projectDir}/bin:\$PATH; export PATH; set_dotfiles.sh; startup_POST.sh $params.projectId $params.pipelineId {{ pid }} $params.platformHTTP"
        afterScript "final_POST.sh $params.projectId $params.pipelineId {{ pid }} $params.platformHTTP; report_POST.sh $params.projectId $params.pipelineId {{ pid }} $params.sampleName $params.reportHTTP $params.currentUserName $params.currentUserId {{ template }}_{{ pid }} \"$params.platformSpecies\" {{ overwrite|default("true") }}"
    } else {
        beforeScript "PATH=${workflow.projectDir}/bin:\$PATH; set_dotfiles.sh"
        }

================================================
FILE: flowcraft/generator/templates/process_skesa.nf
================================================
// Abort the pipeline early when any of the skesa filtering parameters is not
// numeric; the message reports the offending parameter and its value
if ( !params.skesaMinKmerCoverage{{ param_id }}.toString().isNumber() ){ 
    exit 1, "'skesaMinKmerCoverage{{ param_id }}' parameter must be a number. Provided value: ${params.skesaMinKmerCoverage{{ param_id }}}"
}
if ( !params.skesaMinContigLen{{ param_id }}.toString().isNumber() ){ 
    exit 1, "'skesaMinContigLen{{ param_id }}' parameter must be a number. Provided value: ${params.skesaMinContigLen{{ param_id }}}"
}
if ( !params.skesaMaxContigs{{ param_id }}.toString().isNumber() ){ 
    exit 1, "'skesaMaxContigs{{ param_id }}' parameter must be a number. Provided value: ${params.skesaMaxContigs{{ param_id }}}"
}

// Value channels with the filter options (minimum contig length, minimum
// k-mer coverage and maximum contigs, in that order) and the genome size
IN_process_skesa_opts_{{ pid }} = Channel.value([params.skesaMinContigLen{{ param_id }},params.skesaMinKmerCoverage{{ param_id }},params.skesaMaxContigs{{ param_id }}])
IN_genome_size_{{ pid }} = Channel.value(params.genomeSize{{ param_id }})

// Post-processes a skesa assembly by running the shared process_assembly.py
// template with the skesa filter options.
process process_skesa_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    // This process can only use a single CPU
    cpus 1
    // Only the CSV report is copied to the reports directory
    publishDir "reports/assembly/skesa_filter_{{ pid }}", pattern: '*.report.csv', mode: 'copy'

    input:
    // Sample assembly plus the option list, the genome size and the
    // assembler label; the template presumably reads the label to tell the
    // assemblers apart (the template is not visible here)
    set sample_id, file(assembly) from {{ input_channel }}
    val opts from IN_process_skesa_opts_{{ pid }}
    val gsize from IN_genome_size_{{ pid }}
    val assembler from Channel.value("skesa")

    output:
    // FASTA output goes downstream; the CSV report may be absent
    set sample_id, file('*.fasta') into {{ output_channel }}
    file '*.report.csv' optional true
    {% with task_name="process_skesa" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "process_assembly.py"

}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/process_spades.nf
================================================
// Abort the pipeline early when any of the spades filtering parameters is not
// numeric. The message carries the full parameter name, including the
// component suffix, so it matches the name the user actually has to set.
if ( !params.spadesMinKmerCoverage{{ param_id }}.toString().isNumber()){
    exit 1, "'spadesMinKmerCoverage{{ param_id }}' parameter must be a number. Provided value: ${params.spadesMinKmerCoverage{{ param_id }}}"
}
if ( !params.spadesMinContigLen{{ param_id }}.toString().isNumber() ){
    exit 1, "'spadesMinContigLen{{ param_id }}' parameter must be a number. Provided value: ${params.spadesMinContigLen{{ param_id }}}"
}
if ( !params.spadesMaxContigs{{ param_id }}.toString().isNumber() ){
    exit 1, "'spadesMaxContigs{{ param_id }}' parameter must be a number. Provided value: ${params.spadesMaxContigs{{ param_id }}}"
}

// Value channels with the filter options (minimum contig length, minimum
// k-mer coverage and maximum contigs, in that order) and the genome size
IN_process_spades_opts_{{ pid }} = Channel.value([params.spadesMinContigLen{{ param_id }}, params.spadesMinKmerCoverage{{ param_id }}, params.spadesMaxContigs{{ param_id }}])
IN_genome_size_{{ pid }} = Channel.value(params.genomeSize{{ param_id }})

// Post-processes a spades assembly by running the shared process_assembly.py
// template with the spades filter options.
process process_spades_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    // This process can only use a single CPU
    cpus 1
    // Only the CSV report is copied to the reports directory
    publishDir "reports/assembly/spades_filter_{{ pid }}", pattern: '*.report.csv', mode: 'copy'

    input:
    // Sample assembly plus the option list, the genome size and the
    // assembler label; the template presumably reads the label to tell the
    // assemblers apart (the template is not visible here)
    set sample_id, file(assembly) from {{ input_channel }}
    val opts from IN_process_spades_opts_{{ pid }}
    val gsize from IN_genome_size_{{ pid }}
    val assembler from Channel.value("spades")

    output:
    // FASTA output goes downstream; the CSV report may be absent
    set sample_id, file('*.fasta') into {{ output_channel }}
    file '*.report.csv' optional true
    {% with task_name="process_spades" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "process_assembly.py"

}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/progressive_mauve.nf
================================================
// Aligns all incoming assemblies in a single progressiveMauve run.
process progressive_mauve_{{ pid }} {

    {% include "post.txt" ignore missing %}

    // Fixed tag: the process runs once for the whole set, not per sample
    tag { 'progressive_mauve' }

    publishDir "results/alignment/progressive_mauve_{{ pid }}/", pattern: '*.align*', mode: 'copy'

    input:
    // Drop the sample ids and gather every assembly file into one list
    file(assembly) from {{ input_channel }}.map{ it[1] }.collect()

    output:
    file ("*.align") into {{ output_channel }}
    {% with task_name="progressive_mauve", sample_id="val('single')" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // The alignment file is named after the pipeline script
    script:
    """
    progressiveMauve --output=${workflow.scriptName}.align --collinear ${assembly}
    """

}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/prokka.nf
================================================

// Value channels with the sequencing centre and the annotation kingdom
IN_centre_{{ pid }} = Channel.value(params.centre{{ param_id }})

IN_kingdom_{{ pid }} = Channel.value(params.kingdom{{ param_id }})

// Optional genus flags for prokka: empty when the genus parameter is false.
// The variable name carries the process id because the script block below
// reads it lazily at task time; a name shared by every prokka instance would
// hold whatever the last instance in the pipeline assigned.
genusVar_{{ pid }} = (params.genus{{ param_id }} == false) ? "" : "--usegenus --genus ${params.genus{{param_id}}} "

// Annotates one assembly per sample with prokka.
process prokka_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    publishDir "results/annotation/prokka_{{ pid }}/${sample_id}"

    input:
    set sample_id, file(assembly) from {{ input_channel }}
    val centre from IN_centre_{{ pid }}
    val kingdom from IN_kingdom_{{ pid }}

    output:
    // Everything prokka writes into the per-sample output directory
    file "${sample_id}/*"
    {% with task_name="prokka" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // Writes pass to .status after the prokka command group, or fail when
    // that group returns a non-zero exit code
    script:
    """
    {
        prokka --outdir $sample_id --cpus $task.cpus --centre ${centre} \
        --compliant --kingdom ${kingdom} ${genusVar_{{ pid }}} --increment 10 $assembly
        echo pass > .status
    } || {
        echo fail > .status
    }
    """

}


================================================
FILE: flowcraft/generator/templates/quast.nf
================================================
// Exactly one of the reference and genomeSizeBp parameters must be provided:
// stop when both are missing or when both are set
if (params.reference{{param_id}} == null && params.genomeSizeBp{{param_id}} == null)
    exit 1, "Specify at least one of reference or genomeSizeBp"
if (params.reference{{param_id}} != null && params.genomeSizeBp{{param_id}} != null)
    exit 1, "Specify only one of reference or genomeSizeBp"

// The process is defined in one of two variants, depending on which of the
// two parameters was given
if (params.reference{{param_id}} != null) {
    // Variant that evaluates the assembly against a reference file
    process quast_{{pid}} {
        {% include "post.txt" ignore missing %}

        tag { sample_id }
        publishDir "results/assembly/quast_{{pid}}/$sample_id", pattern: "*.tsv"
        publishDir "reports/assembly/quast_{{pid}}/$sample_id"

        input:
        set sample_id, file(assembly) from {{input_channel}}
        file reference from Channel.fromPath(params.reference{{param_id}})

        output:
        file "*"
        {% with task_name="quast" %}
        {%- include "compiler_channels.txt" ignore missing -%}
        {% endwith %}

        // quast writes into the task directory; its output and the timing
        // information are appended to .command.log
        script:
        "/usr/bin/time -v quast -o . -r $reference -s $assembly -l $sample_id -t $task.cpus >> .command.log 2>&1"
    }
} else if (params.genomeSizeBp{{param_id}} != null) {
    // Variant that only receives an estimated reference size
    process quast_{{pid}} {
        {% include "post.txt" ignore missing %}

        tag { sample_id }
        publishDir "results/assembly/quast_{{pid}}/$sample_id", pattern: "*.tsv"
        publishDir "reports/assembly/quast_{{pid}}/$sample_id"

        input:
        set sample_id, file(assembly) from {{input_channel}}
        val genomeSizeBp from Channel.value(params.genomeSizeBp{{param_id}})

        output:
        file "*"
        {% with task_name="quast" %}
        {%- include "compiler_channels.txt" ignore missing -%}
        {% endwith %}

        // quast writes into the task directory; its output and the timing
        // information are appended to .command.log
        script:
        "/usr/bin/time -v quast -o . --est-ref-size=$genomeSizeBp -s $assembly -l $sample_id -t $task.cpus >> .command.log 2>&1"
    }
}


================================================
FILE: flowcraft/generator/templates/raxml.nf
================================================
// Value channels with the RAxML settings; the simple label channel is read
// by the reporting process of this component
IN_substitution_model_{{ pid }} = Channel.value(params.substitutionModel{{ param_id }})
IN_seed_number_{{ pid }} = Channel.value(params.seedNumber{{ param_id }})
IN_bootstrap_number_{{ pid }} = Channel.value(params.bootstrap{{ param_id }})
IN_simple_label_{{ pid}} = Channel.value(params.simpleLabel{{ param_id }})

// Runs raxmlHPC on the incoming alignment.
process raxml_{{ pid }} {

    {% include "post.txt" ignore missing %}

    // Fixed tag: the process is not run per sample
    tag { 'raxml' }

    publishDir "results/phylogeny/raxml_{{ pid }}/"

    input:
    file(alignment) from {{ input_channel }}
    val substitution_model from IN_substitution_model_{{ pid }}
    val seednumber from IN_seed_number_{{ pid }}
    val bootstrapnumber from IN_bootstrap_number_{{ pid }}

    output:
    // All RAxML files go downstream; the bipartitions file is additionally
    // handed to the reporting process. Its name ends in .nf because the run
    // name passed with -n is the pipeline script name.
    file ("RAxML_*") into {{ output_channel }}
    file ("RAxML_bipartitions.*.nf") into into_json_{{ pid }}
    {% with task_name="raxml", sample_id="val('single')" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // The seed parameter feeds -x while -p uses a fixed value; the program
    // version written to .versions is hard-coded
    script:
    """
    raxmlHPC -s ${alignment} -p 12345 -m ${substitution_model} -T $task.cpus -n $workflow.scriptName -f a -x ${seednumber} -N ${bootstrapnumber}

    # Add information to dotfiles
    version_str="[{'program':'raxmlHPC','version':'8.2.11'}]"
    echo \$version_str > .versions
    """

}

// Builds the report for the RAxML bipartitions tree through the
// process_newick.py template.
process report_raxml_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { 'raxml' }

    input:
    // Tree file emitted by the raxml process and the simple label setting
    file(newick) from into_json_{{ pid }}
    val label from IN_simple_label_{{ pid}}

    output:
    {% with task_name="report_raxml", sample_id="val('single')"  %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "process_newick.py"

}


{{ forks }}

================================================
FILE: flowcraft/generator/templates/reads_download.nf
================================================
// Aspera key channel: emits the key file when the parameter is set and the
// file exists, and an empty string in every other case
if (params.asperaKey{{ param_id }}){
    if (file(params.asperaKey{{ param_id }}).exists()){
        IN_asperaKey_{{ pid }} = Channel.fromPath(params.asperaKey{{ param_id }})
    } else {
        IN_asperaKey_{{ pid }} = Channel.value("")
    }
} else {
    IN_asperaKey_{{ pid }} = Channel.value("")
}

// Downloads the reads of one accession per task with getSeqENA.py.
process reads_download_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { accession_id }
    publishDir "reads", pattern: "${accession_id}/*fq.gz"
    maxRetries 1

    input:
    // Every unique, non-empty trimmed line of the input is split on
    // whitespace: the first token is the accession and an optional second
    // token is the name to use for the sample (null when missing).
    // NOTE(review): the source channel name is hard-coded here instead of
    // coming from the input channel placeholder - confirm this is intended.
    set val(accession_id), val(name) from reads_download_in_1_0.splitText(){ it.trim() }.unique().filter{ it != "" }.map{ it.split().length > 1 ? ["accession": it.split()[0], "name": it.split()[1]] : [it.split()[0], null] }
    each file(aspera_key) from IN_asperaKey_{{ pid }}

    output:
    // The emitted sample id is the provided name, falling back to the
    // accession; the fastq files are optional so a failed download does not
    // break the output declaration
    set val({ "$name" != "null" ? "$name" : "$accession_id" }), file("${accession_id}/*fq.gz") optional true into {{ output_channel }}
    {% with task_name="reads_download", sample_id="accession_id" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // Steps: write the accession to a list file, download it (through aspera
    // when the key is a regular file), optionally rename the files to the
    // provided name, and record status, failure message and tool version in
    // the dotfiles.
    script:
    """
    {
        # getSeqENA requires accession numbers to be provided as a text file
        echo "${accession_id}" >> accession_file.txt
        # Set default status value. It will be overwritten if anything goes wrong
        echo "pass" > ".status"

        if [ -f $aspera_key ]; then
            asperaOpt="-a $aspera_key"
        else
            asperaOpt=""
        fi

        getSeqENA.py -l accession_file.txt \$asperaOpt -o ./ --SRAopt --downloadCramBam

        # If a name has been provided along with the accession, rename the
        # fastq files.
        if [ $name != null ];
        then
            echo renaming pattern '${accession_id}' to '${name}' && cd ${accession_id} && rename "s/${accession_id}/${name}/" *.gz
        fi
    } || {
        # If exit code other than 0
        if [ \$? -eq 0 ]
        then
            echo "pass" > .status
        else
            echo "fail" > .status
            echo "Could not download accession $accession_id" > .fail
        fi
    }
    version_str="{'version':[{'program':'getSeqENA.py','version':'1.3'}]}"
    echo \$version_str > .versions
    """

}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/remove_host.nf
================================================
// Value channel with the bowtie2 index given by the refIndex parameter
IN_index_files_{{ pid }} = Channel.value(params.refIndex{{ param_id }})

// The clearInput flag is turned into the strings "true"/"false" so it can be
// compared inside the bash script
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Maps the read pair against the index and keeps only the unmapped pairs.
process remove_host_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    publishDir 'results/mapping/remove_host_{{ pid }}/', pattern: '*_bowtie2.log', mode: 'copy'

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val bowtie2Index from IN_index_files_{{ pid }}
    val clear from checkpointClear_{{ pid }}

    output:
    // Renamed, gzipped reads go downstream; the bowtie2 log feeds the report
    // process of this component
    set sample_id , file("${sample_id}*.headersRenamed_*.fq.gz") into {{ output_channel }}
    set sample_id, file("*_bowtie2.log") into into_json_{{ pid }}
    {% with task_name="remove_host" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // Steps: bowtie2 mapping, samtools filtering with -f 12 (read and mate
    // both unmapped), conversion back to paired fastq, header renaming and
    // compression. Intermediate files are deleted along the way. When clear
    // is "true" and the first input file resolves to a path inside a work
    // directory, both input files are deleted as well.
    script:
    """
    {
        bowtie2 -x ${bowtie2Index} -1 ${fastq_pair[0]} -2 ${fastq_pair[1]} -p $task.cpus 1> ${sample_id}.bam 2> ${sample_id}_bowtie2.log

        samtools view -buh -f 12 -o ${sample_id}_samtools.bam -@ $task.cpus ${sample_id}.bam

        rm ${sample_id}.bam

        samtools fastq -1 ${sample_id}_unmapped_1.fq -2 ${sample_id}_unmapped_2.fq ${sample_id}_samtools.bam

        rm ${sample_id}_samtools.bam

        renamePE_samtoolsFASTQ.py -1 ${sample_id}_unmapped_1.fq -2 ${sample_id}_unmapped_2.fq

        gzip *.headersRenamed_*.fq
        rm *.fq

        if [ "$clear" = "true" ];
        then
            work_regex=".*/work/.{2}/.{30}/.*"
            file_source1=\$(readlink -f \$(pwd)/${fastq_pair[0]})
            file_source2=\$(readlink -f \$(pwd)/${fastq_pair[1]})
            if [[ "\$file_source1" =~ \$work_regex ]]; then
                rm \$file_source1 \$file_source2
            fi
        fi

    } || {
        echo fail > .status
    }
    """
}


// Builds the mapping report from the bowtie2 log through the
// process_mapping.py template.
process report_remove_host_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    // Log emitted by the remove_host process of the same component
    set sample_id, file(bowtie_log) from into_json_{{ pid }}

    output:
    {% with task_name="report_remove_host" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "process_mapping.py"

}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/report_compiler.nf
================================================

/** Reports
Compiles the reports from every process.
Each incoming item carries the sample id, the task name, the process id and
the report, version and trace files of one task; they are handed to
prepare_reports.py together with the workflow script id and run name.
*/
process report {

    tag { sample_id }

    input:
    set sample_id,
            task_name,
            pid,
            report_json,
            version_json,
            trace from {{ compile_channels }}

    output:
    // Whatever the script writes (possibly nothing) is collected later by
    // the compile_reports process
    file "*" optional true into master_report

    """
    prepare_reports.py $report_json $version_json $trace $sample_id $task_name 1 $pid $workflow.scriptId $workflow.runName
    """

}

// Optional resources located in the pipeline project directory
File forkTree = new File("${workflow.projectDir}/.forkTree.json")
File treeDag = new File("${workflow.projectDir}/.treeDag.json")
File js = new File("${workflow.projectDir}/resources/main.js.zip")


// Each channel emits its file when that file exists and a null value
// otherwise. Every channel checks its OWN file: the DAG and JS channels used
// to test the fork tree file, which left treeDag and js unused and tied all
// three channels to the presence of .forkTree.json.
forks_channel = forkTree.exists() ?  Channel.fromPath("${workflow.projectDir}/.forkTree.json") : Channel.value(null)
dag_channel = treeDag.exists() ?  Channel.fromPath("${workflow.projectDir}/.treeDag.json") : Channel.value(null)
js_channel = js.exists() ?  Channel.fromPath("${workflow.projectDir}/resources/main.js.zip") : Channel.value(null)

// Gathers every per-task report into the final pipeline report files through
// the compile_reports.py template.
process compile_reports {

    publishDir "pipeline_report/", mode: "copy"

    // With a report endpoint configured, the metadata is posted after the
    // task; the project bin directory is added to PATH for that script
    if ( params.reportHTTP != null ){
        beforeScript "PATH=${workflow.projectDir}/bin:\$PATH; export PATH;"
        afterScript "metadata_POST.sh $params.projectId $params.pipelineId 0 $params.sampleName $params.reportHTTP $params.currentUserName $params.currentUserId 0 \"$params.platformSpecies\""
    }

   input:
   // All report files at once, plus the fork tree, DAG and JS resources
   file report from master_report.collect()
   file forks from forks_channel
   file dag from dag_channel
   file js from js_channel

    output:
    file "pipeline_report.json"
    file "pipeline_report.html"
    file "src/main.js"

    script:
    template "compile_reports.py"
}


================================================
FILE: flowcraft/generator/templates/report_post.txt
================================================
    // Task hooks for report-only processes: set_dotfiles.sh always runs before
    // the task with the project bin directory on PATH, and report_POST.sh
    // runs after it only when params.platformHTTP is set.
    if ( params.platformHTTP != null ) {
        beforeScript "PATH=${workflow.projectDir}/bin:\$PATH; export PATH; set_dotfiles.sh"
        afterScript "report_POST.sh $params.projectId $params.pipelineId {{ pid }} $params.sampleName $params.reportHTTP $params.currentUserName $params.currentUserId {{ template }}_{{ pid }} \"$params.platformSpecies\" {{ overwrite|default("true") }}"
    } else {
        beforeScript "PATH=${workflow.projectDir}/bin:\$PATH; export PATH; set_dotfiles.sh"
        }

================================================
FILE: flowcraft/generator/templates/resources.config
================================================
// Process scope of the generated resources configuration; its content is
// filled in from the process_info template variable.
process {
{{ process_info }}

}

================================================
FILE: flowcraft/generator/templates/retrieve_mapped.nf
================================================
// Extracts the mapped read pairs from a BAM file as renamed, gzipped fastq.
process retrieve_mapped_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    publishDir 'results/mapping/retrieve_mapped_{{ pid }}/'

    input:
    set sample_id, file(bam) from {{ input_channel }}

    output:
    set sample_id , file("*.headersRenamed_*.fq.gz") into {{ output_channel }}
    {% with task_name="retrieve_mapped" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // Steps: samtools filtering with -F 12 (drops reads flagged as unmapped
    // or with an unmapped mate), conversion to paired fastq, header renaming
    // and compression. The input BAM and the intermediate files are removed
    // from the task directory along the way.
    script:
    """
    samtools view -buh -F 12 -o ${sample_id}_samtools.bam -@ $task.cpus ${bam}

    rm ${bam}

    samtools fastq -1 ${sample_id}_mapped_1.fq -2 ${sample_id}_mapped_2.fq ${sample_id}_samtools.bam

    rm ${sample_id}_samtools.bam

    renamePE_samtoolsFASTQ.py -1 ${sample_id}_mapped_1.fq -2 ${sample_id}_mapped_2.fq

    gzip *.headersRenamed_*.fq

    rm *.fq
    """
}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/seq_typing.nf
================================================
// Reference file for the O type: the parameter must be set, and the file it
// points to must exist by the time the channel emits it
file(params.referenceFileO{{ param_id }}) ? params.referenceFileO{{ param_id }} : exit(1, "'referenceFileO{{ param_id }}' parameter missing")
IN_refO_{{ pid }} = Channel.fromPath(params.referenceFileO{{ param_id }})
    .map{ it -> it.exists() ? it : exit(1, "referenceFileO file was not found: '${params.referenceFileO{{ param_id }}}'")}

// Same checks for the H type reference file
file(params.referenceFileH{{ param_id }}) ? params.referenceFileH{{ param_id }} : exit(1, "'referenceFileH{{ param_id }}' parameter missing")
IN_refH_{{ pid }} = Channel.fromPath(params.referenceFileH{{ param_id }})
    .map{ it -> it.exists() ? it : exit(1, "referenceFileH file was not found: '${params.referenceFileH{{ param_id }}}'")}

// Runs seq_typing.py on a read pair against the O and H reference files.
process seq_typing_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    // A failing task does not stop the pipeline
    errorStrategy "ignore"
    publishDir "results/seqtyping/${sample_id}/"

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    each file(refO) from IN_refO_{{ pid }}
    each file(refH) from IN_refH_{{ pid }}

    output:
    file "seq_typing*"
    {% with task_name="seq_typing" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // Steps: copy ReMatCh into the task directory and put it first on PATH,
    // run the typing, write the report JSON and version dotfiles, then set
    // the status to pass only when the report text file is non-empty. The
    // fallback branch records a failure with an NA value in the report.
    script:
    """
    {
        # Prevents read-only issues
        mkdir rematch_temp
        cp -r /NGStools/ReMatCh rematch_temp
        export PATH="\$(pwd)/rematch_temp/ReMatCh:\$PATH"

        seq_typing.py -f ${fastq_pair[0]} ${fastq_pair[1]} -r \$(pwd)/$refO \$(pwd)/$refH -o ./ -j $task.cpus --extraSeq 0 --mapRefTogether --minGeneCoverage 60

        # Add information to dotfiles
        json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'seqtyping','value':'\$(cat seq_typing.report.txt)','table':'typing'}]}]}"
        echo \$json_str > .report.json
        version_str="[{'program':'seq_typing.py','version':'0.1'}]"
        echo \$version_str > .versions

        rm -r rematch_temp

        if [ -s seq_typing.report.txt ];
        then
            echo pass > .status
        else
            echo fail > .status
        fi
    } || {
        echo fail > .status
        json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'seqtyping','value':'NA','table':'typing'}]}]}"
        echo \$json_str > .report.json
    }
    """

}


================================================
FILE: flowcraft/generator/templates/seroba.nf
================================================
// Value channel with the coverage setting passed to seroba
Coverage_{{ pid }} = Channel.value(params.coverage{{ param_id }})

// Serotypes one read pair per task with seroba.
process seroba_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set sample_id, file(fastq) from {{ input_channel }}
    val coverage from Coverage_{{ pid }}

    output:
    // Per-sample prediction table, gathered by the compile process below
    file("pred.tsv") into LOG_seroba_{{ pid }}
    {% with task_name="seroba" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // Steps: rename the reads, run the serotyping into /tmp/results, copy
    // the prediction table back with the /tmp prefix stripped and write the
    // second column of that table into the report JSON.
    // NOTE(review): /tmp/results is only removed on the failure paths, and
    // the directory name is shared by every task - confirm this is safe when
    // tasks share a /tmp.
    script:
    """
    {
        # create a directory in /tmp to store the results
        mkdir /tmp/results
        #rename input files for seroba (avoid match error)
        mv ${fastq[0]} ${sample_id}_1.fq.gz
        mv ${fastq[1]} ${sample_id}_2.fq.gz
        # run seroba typing module
        seroba runSerotyping --coverage ${coverage} /seroba/database/ ${sample_id}_1.fq.gz ${sample_id}_2.fq.gz /tmp/results/${sample_id}

        # Get the ST for the sample
        if [ -f "/tmp/results/${sample_id}/pred.tsv" ];
        then
            cp /tmp/results/${sample_id}/pred.tsv .
            sed -i -- 's|/tmp/results/||g' pred.tsv
            # Add ST information to report JSON
            json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'serotype','value':'\$(cat pred.tsv | cut -f2)','table':'typing'}]}]}"
            echo \$json_str > .report.json
        else
            echo fail > .status
            rm -r /tmp/results/
        fi
    } || {
        echo fail > .status
        # Remove results directory
        rm -r /tmp/results/
    }
    """

}

// Concatenates the prediction tables of every sample into a single report.
process compile_seroba_{{ pid }} {

    publishDir "results/typing/seroba_{{ pid }}/"

    input:
    // All per-sample tables, collected into one list
    file res from LOG_seroba_{{ pid }}.collect()

    output:
    file "seroba_report.tsv"

    script:
    """
    cat $res >> seroba_report.tsv
    """
}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/sistr.nf
================================================

// Runs sistr on one assembly per task and records the result in the report
// and version dotfiles.
process sistr_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    // Publish the sistr table. The pattern is a glob, so it needs the
    // wildcard to match the per-sample file name; it only selects among the
    // declared outputs, hence the output entry below.
    publishDir 'results/typing/sistr_{{ pid }}', pattern: "*.tab", mode: "copy"

    input:
    set sample_id, file(assembly) from {{ input_channel }}

    output:
    // Optional because a failed sistr run is recorded in .status instead of
    // failing the task, and may leave no table behind
    file "*.tab" optional true
    {% with task_name="sistr" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // Steps: run sistr, put column 14 of the second line of the table into
    // the report JSON, record the sistr version, and set the status to pass
    // only when the table is non-empty.
    script:
    """
    {
        sistr --qc -vv -t $task.cpus -f tab -o ${sample_id}_sistr.tab ${assembly}
        json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'sistr','value':'\$(awk \"FNR == 2\" *.tab | cut -f14)','table':'typing'}]}]}"
        echo \$json_str > .report.json
        sistr_version=\$(sistr --version | cut -d" " -f2)
        version_str="[{'program':'sistr','version':'\$sistr_version'}]"
        echo \$version_str > .versions

        if [ -s ${sample_id}_sistr.tab ];
        then
            echo pass > .status
        else
            echo fail > .status
        fi

    } || {
        echo fail > .status
    }
    """
}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/skesa.nf
================================================

// The clearInput flag is passed to the process as the string "true"/"false"
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Assembles one read pair per task through the skesa.py template.
process skesa_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    publishDir 'results/assembly/skesa_{{ pid }}', pattern: '*skesa*.fasta', mode: 'copy'

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val clear from checkpointClear_{{ pid }}

    output:
    set sample_id, file('*.fasta') into {{ output_channel }}
    {% with task_name="skesa" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "skesa.py"

}

{{ forks }}

================================================
FILE: flowcraft/generator/templates/spades.nf
================================================
// Abort the pipeline early when a coverage parameter is not numeric
if ( !params.spadesMinCoverage{{ param_id }}.toString().isNumber() ){
    exit 1, "'spadesMinCoverage{{ param_id }}' parameter must be a number. Provided value: '${params.spadesMinCoverage{{ param_id }}}'"
}
if ( !params.spadesMinKmerCoverage{{ param_id }}.toString().isNumber()){
    exit 1, "'spadesMinKmerCoverage{{ param_id }}' parameter must be a number. Provided value: '${params.spadesMinKmerCoverage{{ param_id }}}'"
}

// Value channel with the minimum coverage and minimum k-mer coverage, in
// that order
IN_spades_opts_{{ pid }} = Channel.value(
    [params.spadesMinCoverage{{ param_id }},
     params.spadesMinKmerCoverage{{ param_id }}
     ])

// A k-mer value made of at most one space-separated token is only accepted
// when it is 'auto'.
// NOTE(review): this also rejects a single numeric k-mer - confirm intended.
if ( params.spadesKmers{{ param_id }}.toString().split(" ").size() <= 1 ){
    if (params.spadesKmers{{ param_id }}.toString() != 'auto'){
        exit 1, "'spadesKmers{{ param_id }}' parameter must be a sequence of space separated numbers or 'auto'. Provided value: ${params.spadesKmers{{ param_id }}}"
    }
}
IN_spades_kmers_{{pid}} = Channel.value(params.spadesKmers{{ param_id }})

// Boolean flags are passed to the process as the strings "true"/"false"
clear = params.clearInput{{ param_id }} ? "true" : "false"
disable_rr = params.disableRR{{ param_id }} ? "true" : "false"

checkpointClear_{{ pid }} = Channel.value(clear)
disableRR_{{ pid }} = Channel.value(disable_rr)

// Assembles one read pair per task through the spades.py template.
process spades_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    // Assemblies are copied to the component directory; graph files go into
    // a per-sample sub-directory
    publishDir 'results/assembly/spades_{{ pid }}/', pattern: '*_spades*.fasta', mode: 'copy'
    publishDir "results/assembly/spades_{{ pid }}/$sample_id", pattern: "*.gfa", mode: "copy"
    publishDir "results/assembly/spades_{{ pid }}/$sample_id", pattern: "*.fastg", mode: "copy"

    input:
    // Reads joined by sample id with the value coming from the max_len side
    // channel
    set sample_id, file(fastq_pair), max_len from {{ input_channel }}.join(SIDE_max_len_{{ pid }})
    val opts from IN_spades_opts_{{ pid }}
    val kmers from IN_spades_kmers_{{ pid }}
    val clear from checkpointClear_{{ pid }}
    val disable_rr from disableRR_{{ pid }}

    output:
    // The fastg graph may be absent, while the gfa graph is required
    set sample_id, file('*_spades*.fasta') into {{ output_channel }}
    file "*.fastg" optional true
    file "*.gfa" into gfa1_{{ pid }}
    {% with task_name="spades" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "spades.py"

}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/split_assembly.nf
================================================
// Check for the presence or absence of the minimum contig size parameter.
// The run is aborted when the parameter is null.
if (params.size{{ param_id }} == null){
    exit 1, "A minimum contig size must be provided."
}

// Value channel with the minimum contig size, consumed by the process below
IN_min_contig_size_{{ pid }} = Channel.value(params.size{{ param_id }})

// Runs the "split_fasta.py" template on each assembly and emits every
// resulting '*.fasta' file on the splitCh channel.
process split_assembly_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    // No pattern is given, so all declared outputs are published per sample
    publishDir "results/assembly/split_assembly_{{ pid }}/${sample_id}/"

    input:
    set sample_id, file(assembly) from {{ input_channel }}
    val min_contig_size from IN_min_contig_size_{{ pid }}

    output:
    // Note that the sample id is not part of this output; it is rebuilt
    // from the file names after the process
    file('*.fasta') into splitCh_{{ pid }}
    {% with task_name="split_assembly" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "split_fasta.py"


}

// The main output channel is created explicitly and filled below
{{ output_channel }} = Channel.create()

// Flatten the emitted file lists and pair each file with an identifier built
// from its file name: the last path component with its final '.'-separated
// extension removed. Emits [identifier, file] items.
splitCh_{{ pid }}.flatMap().map{ it -> [it.toString().tokenize('/').last().tokenize('.')[0..-2].join('.'), it]}.into( {{ output_channel }} )

{{ forks }}

================================================
FILE: flowcraft/generator/templates/status_compiler.nf
================================================

/** STATUS
Reports the status of a sample in any given process.

For every item received from the compiler channels, writes one
"sample_id, task_name, content" line into a .status, .warning and .fail
file, and stores the command log as a .log file.
*/
process status {

    tag { sample_id }
    // Outputs are published in one directory per task name
    publishDir "pipeline_status/$task_name"

    input:
    // status, warning and fail are read with `cat` in the script below
    set sample_id, task_name, status, warning, fail, file(log) from {{ compile_channels }}

    output:
    file '*.status' into master_status
    file '*.warning' into master_warning
    file '*.fail' into master_fail
    // The log file is declared as output but not sent to any channel
    file '*.log'

    // The escaped dollar signs defer the command substitution to the shell
    """
    echo $sample_id, $task_name, \$(cat $status) > ${sample_id}_${task_name}.status
    echo $sample_id, $task_name, \$(cat $warning) > ${sample_id}_${task_name}.warning
    echo $sample_id, $task_name, \$(cat $fail) > ${sample_id}_${task_name}.fail
    echo "\$(cat .command.log)" > ${sample_id}_${task_name}.log
    """
}

// Intermediate compilation step: concatenates the individual status, warning
// and fail files in batches, writing one CSV per batch and per file type.
process compile_status_buffer {

    input:
    // Files are grouped in batches of up to 5000 items; `remainder: true`
    // requests that the last, possibly smaller, batch is emitted as well
    file status from master_status.buffer( size: 5000, remainder: true)
    file warning from master_warning.buffer( size: 5000, remainder: true)
    file fail from master_fail.buffer( size: 5000, remainder: true)

    output:
    file 'master_status_*.csv' into compile_status_buffer
    file 'master_warning_*.csv' into compile_warning_buffer
    file 'master_fail_*.csv' into compile_fail_buffer

    // The task index is part of the file name, giving each batch its own CSV
    """
    cat $status >> master_status_${task.index}.csv
    cat $warning >> master_warning_${task.index}.csv
    cat $fail >> master_fail_${task.index}.csv
    """
}

// Final compilation step: gathers every batch CSV produced by
// compile_status_buffer and concatenates them into master_status.csv,
// master_warning.csv and master_fail.csv.
process compile_status {

    publishDir 'reports/status'

    input:
    // collect() gathers all batch files so that they are handled together
    file status from compile_status_buffer.collect()
    file warning from compile_warning_buffer.collect()
    file fail from compile_fail_buffer.collect()

    output:
    // Declared as output (and therefore published) but not sent to a channel
    file "*.csv"

    """
    cat $status >> master_status.csv
    cat $warning >> master_warning.csv
    cat $fail >> master_fail.csv
    """

}


================================================
FILE: flowcraft/generator/templates/trace_compiler.nf
================================================


// Runs the "pipeline_status.py" template for each [sample_id, vals] item of
// the input channel. The process declares no outputs.
process compile_traces {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    input:
    set sample_id, vals from {{ input_channel }}

   script:
   template "pipeline_status.py"

}


================================================
FILE: flowcraft/generator/templates/trimmomatic.nf
================================================
// Check sliding window parameter: exactly two ':' separated values are required
if ( params.trimSlidingWindow{{ param_id}}.toString().split(":").size() != 2 ){
    exit 1, "'trimSlidingWindow{{ param_id}}' parameter must contain two values separated by a ':'. Provided value: '${params.trimSlidingWindow{{ param_id}}}'"
}
// trimLeading, trimTrailing and trimMinLength must all be numeric.
// The trimLeading message reports the same parameter that is validated
// (it previously interpolated a differently named, never validated parameter).
if ( !params.trimLeading{{ param_id}}.toString().isNumber() ){
    exit 1, "'trimLeading{{ param_id}}' parameter must be a number. Provide value: '${params.trimLeading{{ param_id}}}'"
}
if ( !params.trimTrailing{{ param_id}}.toString().isNumber() ){
    exit 1, "'trimTrailing{{ param_id}}' parameter must be a number. Provide value: '${params.trimTrailing{{ param_id}}}'"
}
if ( !params.trimMinLength{{ param_id}}.toString().isNumber() ){
    exit 1, "'trimMinLength{{ param_id}}' parameter must be a number. Provide value: '${params.trimMinLength{{ param_id}}}'"
}

// Options are passed to the process as a single list, in this order:
// [trimSlidingWindow, trimLeading, trimTrailing, trimMinLength]
IN_trimmomatic_opts_{{ pid }} = Channel.value([params.trimSlidingWindow{{ param_id}},params.trimLeading{{ param_id}},params.trimTrailing{{ param_id}},params.trimMinLength{{ param_id}}])
IN_adapters_{{ pid }} = Channel.value(params.adapters{{ param_id}})

// Truthiness of clearInput is converted into the string "true" or "false"
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClear_{{ pid }} = Channel.value(clear)

// Runs the "trimmomatic.py" template for each sample and emits the trimmed
// FastQ files on the main output channel.
process trimmomatic_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    publishDir "results/trimmomatic_{{ pid }}", pattern: "*.gz"

    tag { sample_id }

    input:
    // Main input joined with the SIDE_phred side channel, which contributes
    // the phred element of the tuple
    set sample_id, file(fastq_pair), phred from {{ input_channel }}.join(SIDE_phred_{{ pid }})
    // The trim range is fixed to the string "None" in this component
    val trim_range from Channel.value("None")
    // opts: [trimSlidingWindow, trimLeading, trimTrailing, trimMinLength]
    val opts from IN_trimmomatic_opts_{{ pid }}
    val ad from IN_adapters_{{ pid }}
    val clear from checkpointClear_{{ pid }}

    output:
    set sample_id, "${sample_id}_*trim.fastq.gz" into {{ output_channel }}
    // The report is declared as output but not sent to any channel
    file 'trimmomatic_report.csv'
    {% with task_name="trimmomatic" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "trimmomatic.py"

}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/true_coverage.nf
================================================
// The species parameter is mandatory ...
if ( !params.species{{ param_id }}){
    exit 1, "'species{{ param_id }}' parameter missing"
}
// ... and must consist of exactly two space separated words
if ( params.species{{ param_id }}.toString().split(" ").size() != 2 ){
    exit 1, "'species{{ param_id }}' parameter must contain two values (e.g.: 'escherichia coli').Provided value: '${params.species{{ param_id }}}'"
}

// Value channel with the species string, consumed by the process below
IN_pathoSpecies_{{ pid }} = Channel.value(params.species{{ param_id }})

// Runs trueCoverage_rematch.py on each sample and parses its JSON output.
// The input FastQ files are forwarded unchanged on the main output channel.
process true_coverage_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set sample_id, file(fastq_pair) from {{ input_channel }}
    val species from IN_pathoSpecies_{{ pid }}

    output:
    // The same fastq_pair received as input is emitted as output
    set sample_id, file(fastq_pair) into {{ output_channel }}
    {% with task_name="true_coverage" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    // The commands run inside a brace group: when the group succeeds "pass"
    // is written to .status, otherwise the fallback group writes "fail".
    // Any failing*.json files are given to the parser when they exist.
    """
    {
        trueCoverage_rematch.py -f $fastq_pair --species $species \
        -i /NGStools/true_coverage/data --json
        if ls failing* 1> /dev/null 2>&1;
        then
            parse_true_coverage.py sample_*.json failing*.json
        else
            parse_true_coverage.py sample_*.json
        fi
        echo pass > .status
    } || {
        echo fail > .status
    }
    """

}

{{ forks }}


================================================
FILE: flowcraft/generator/templates/unicycler.nf
================================================
// Assembles each pair of FastQ files with unicycler and emits the resulting
// 'assembly.fasta' on the main output channel.
process unicycler_{{pid}} {
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    // NOTE(review): the published file names do not include the sample id -
    // confirm that results of different samples cannot overwrite each other
    publishDir 'results/assembly/unicycler_{{pid}}/', pattern: 'assembly.fasta'
    publishDir 'results/assembly/unicycler_{{pid}}/', pattern: 'assembly.gfa'

    input:
    set sample_id, file(fastq_pair) from {{input_channel}}

    output:
    set sample_id, file('assembly.fasta') into {{output_channel}}
    file "assembly.gfa" into gfa1_{{pid}}
    {% with task_name="unicycler" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    // Output is written to the task directory; the first and second elements
    // of fastq_pair are passed to the -1 and -2 options respectively
    "unicycler -t $task.cpus -o . --no_correct --no_pilon -1 ${fastq_pair[0]} -2 ${fastq_pair[1]}"
}

{{forks}}


================================================
FILE: flowcraft/generator/templates/user.config
================================================
// User configuration file that is not overwritten by flowcraft
// Use this file to provide persistent configurations in the same pipeline
// directory


================================================
FILE: flowcraft/generator/templates/viral_assembly.nf
================================================
//MAIN INPUT - FASTQ FILES
// The main input is duplicated so that both assemblers can read it
spades_in = Channel.create()
megahit_in = Channel.create()
{{ input_channel }}.into{ spades_in; megahit_in }

//MINIMUM CONTIG SIZE
if ( !params.minimumContigSize{{ param_id }}.toString().isNumber() ){
    exit 1, "'minimumContigSize{{ param_id }}' parameter must be a number. Provided value: '${params.minimumContigSize{{ param_id }}}'"
}

//SPADES OPTIONS
if ( !params.spadesMinCoverage{{ param_id }}.toString().isNumber() ){
    exit 1, "'spadesMinCoverage{{ param_id }}' parameter must be a number. Provided value: '${params.spadesMinCoverage{{ param_id }}}'"
}
if ( !params.spadesMinKmerCoverage{{ param_id }}.toString().isNumber()){
    exit 1, "'spadesMinKmerCoverage{{ param_id }}' parameter must be a number. Provided value: '${params.spadesMinKmerCoverage{{ param_id }}}'"
}

// The k-mer option is accepted when it holds more than one space separated
// token, or when it is exactly 'auto'
if ( params.spadesKmers{{ param_id }}.toString().split(" ").size() <= 1 ){
    if (params.spadesKmers{{ param_id }}.toString() != 'auto'){
        exit 1, "'spadesKmers{{ param_id }}' parameter must be a sequence of space separated numbers or 'auto'. Provided value: ${params.spadesKmers{{ param_id }}}"
    }
}

// The same "true"/"false" string is exposed through one value channel per
// assembler
clear = params.clearInput{{ param_id }} ? "true" : "false"
checkpointClearSpades_{{ pid }} = Channel.value(clear)
checkpointClearMegahit_{{ pid }} = Channel.value(clear)

//MEGAHIT OPTIONS
if ( params.megahitKmers{{ param_id }}.toString().split(" ").size() <= 1 ){
    if (params.megahitKmers{{ param_id }}.toString() != 'auto'){
        exit 1, "'megahitKmers{{ param_id }}' parameter must be a sequence of space separated numbers or 'auto'. Provided value: ${params.megahitKmers{{ param_id }}}"
    }
}

//SPADES INPUT CHANNELS
IN_spades_opts_{{ pid }} = Channel.value([params.spadesMinCoverage{{ param_id }},params.spadesMinKmerCoverage{{ param_id }}])
IN_spades_kmers_{{ pid }} = Channel.value(params.spadesKmers{{ param_id }})

//MEGAHIT INPUT CHANNELS
IN_megahit_kmers_{{ pid }} = Channel.value(params.megahitKmers{{ param_id }})

// The max_len side channel is duplicated, one copy per assembler.
// NOTE(review): unlike most names in this template, these two channel names
// carry no process id suffix - confirm this component is used at most once
// per pipeline
SIDE_max_len_spades = Channel.create()
SIDE_max_len_megahit = Channel.create()
SIDE_max_len_{{ pid }}.into{SIDE_max_len_spades ; SIDE_max_len_megahit}

// Here the flag is a plain string constant and not a params-driven channel
disableRR_{{ pid }} = "false"

// Runs the "spades.py" template for each sample. Exit status 1 is tolerated
// so that the sample can still be handled by the steps that follow.
process va_spades_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    // Both exit status 0 and 1 are treated as a valid task completion
    validExitStatus 0,1

    tag { sample_id }
    publishDir 'results/assembly/spades_{{ pid }}/', pattern: '*_spades*.fasta', mode: 'copy'

    input:
    set sample_id, file(fastq_pair), max_len from spades_in.join(SIDE_max_len_spades)
    // opts: [spadesMinCoverage, spadesMinKmerCoverage]
    val opts from IN_spades_opts_{{ pid }}
    val kmers from IN_spades_kmers_{{ pid }}
    val clear from checkpointClearSpades_{{ pid }}
    val disable_rr from disableRR_{{ pid }}

    output:
    // When the task exited with status 1 the ".exitcode" file is emitted in
    // place of the assembly FASTA
    set sample_id, file({task.exitStatus == 1 ? ".exitcode" : '*_spades*.fasta'}) into assembly_spades
    {% with task_name="va_spades" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "spades.py"

}

/**
 * Helper used to decide whether an assembly FASTA file contains at least one
 * contig whose header reports a value equal to or above a threshold.
 */
class VerifyCompletness {

    /**
     * Checks the FASTA headers of a file against a threshold.
     *
     * @param filename  path to the FASTA file to inspect
     * @param threshold minimum value that a header must report
     * @return true as soon as one header meets the threshold, false otherwise
     */
    public static boolean contigs(String filename, int threshold){
        BufferedReader reader = new BufferedReader(new FileReader(filename));
        // The reader is closed even when parsing a header throws
        try {
            return processContigs(reader, threshold);
        } finally {
            reader.close()
        }
    }

    /**
     * Scans the lines provided by the reader and inspects the header lines
     * (those starting with '>').
     *
     * The fourth '_' separated field of each header is converted to an
     * integer and compared with the threshold.
     * NOTE(review): for headers such as NODE_1_length_1234_cov_5.6 this field
     * is presumably the contig length - confirm against the assembler output.
     *
     * @param reader    reader positioned at the start of the FASTA content
     * @param threshold minimum value that a header must report
     * @return true when a header meets the threshold, false otherwise
     */
    private static boolean processContigs(BufferedReader reader, int threshold){
        String line;
        int lineThreshold = 0;
        List splittedLine

        while ((line = reader.readLine()) != null) {
            if (line.startsWith('>')) {
                splittedLine = line.split('_')
                lineThreshold = splittedLine[3].toInteger()
                // Stop at the first header that satisfies the threshold
                if(lineThreshold >= threshold) {
                    return true;
                }
             }
        }

        return false;
    }
}

// Routes every spades result into one of two channels.
// The closure is evaluated as: cond ? false : (check == true ? 0 : 1), where
//   - index 0 (good_assembly) is returned when VerifyCompletness.contigs finds
//     a header meeting minimumContigSize in the assembly file
//   - index 1 (megahit) is returned when no header meets it
//   - false is returned when the second element renders as the string "null"
// NOTE(review): how the choice operator treats a `false` return value is not
// visible in this file - confirm against the Nextflow documentation.
megahit = Channel.create()
good_assembly = Channel.create()
assembly_spades.choice(good_assembly, megahit){a -> a[1].toString() == "null" ? false : VerifyCompletness.contigs(a[1].toString(), params.minimumContigSize{{ param_id }}.toInteger()) == true ? 0 : 1}


// Runs the "megahit.py" template for the samples that were routed to the
// megahit channel after the spades step.
process va_megahit_{{ pid }}  {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }
    // NOTE(review): this pattern requires an underscore before "megahit",
    // while the output declaration below does not - confirm the file names
    // written by the template
    publishDir 'results/assembly/megahit_{{ pid }}/', pattern: '*_megahit*.fasta', mode: 'copy'

    input:
    // The original FastQ files are joined with the megahit channel; the map
    // keeps only the first two elements (dropping what the megahit channel
    // contributed) before joining with the max_len side channel
    set sample_id, file(fastq_pair), max_len from megahit_in.join(megahit).map{ ot -> [ot[0], ot[1]] }.join(SIDE_max_len_megahit)
    val kmers from IN_megahit_kmers_{{ pid }}
    // Use the clear flag channel created for megahit instead of the spades one
    val clear from checkpointClearMegahit_{{ pid }}

    output:
    set sample_id, file('*megahit*.fasta') into megahit_assembly
    {% with task_name="va_megahit" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "megahit.py"

}


// Assemblies accepted after spades and assemblies produced by megahit are
// merged, then duplicated into the report channel and the main output channel
good_assembly.mix(megahit_assembly).into{ to_report_{{ pid }} ; {{ output_channel }} }
// The minimum contig size is handed to the report process as min_size
orf_size = Channel.value(params.minimumContigSize{{ param_id }})


// Runs the "process_viral_assembly.py" template on every final assembly.
// Besides the included compiler channels, no outputs are declared.
process report_viral_assembly_{{ pid }} {

    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    set sample_id, file(assembly) from to_report_{{ pid }}
    // min_size carries the minimumContigSize parameter
    val min_size from orf_size

    output:
    {% with task_name="report_viral_assembly" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "process_viral_assembly.py"

}


{{ forks }}

================================================
FILE: flowcraft/generator/utils.py
================================================
import re

try:
    import generator.error_handling as eh
except ImportError:
    import flowcraft.generator.error_handling as eh


def get_nextflow_filepath(log_file):
    """Gets the nextflow file path from the nextflow log file. It searches for
    the nextflow run command throughout the file.

    The first line that contains a whitespace separated token ending in
    ``.nf`` determines the result.

    Parameters
    ----------
    log_file : str
        Path for the .nextflow.log file

    Returns
    -------
    str
        Path for the nextflow file

    Raises
    ------
    eh.LogError
        When no line of the log file contains a nextflow file path (which
        includes the case of an empty log file).
    """

    # Regex supports absolute paths and relative paths. The dot of the
    # extension is escaped so that only a literal ".nf" is accepted;
    # otherwise any token containing "nf" preceded by an arbitrary character
    # (e.g. "conf") would be returned as the pipeline path.
    nf_path_re = re.compile(r".*\s(.*\.nf).*")

    with open(log_file) as fh:
        # Searches for the first occurrence of the nextflow pipeline
        # file name in the .nextflow.log file
        for line in fh:
            match = nf_path_re.match(line)
            if match:
                return match.group(1)

    # The whole file was read without finding the pipeline file
    raise eh.LogError("Nextflow command path could not be found - Is "
                      ".nextflow.log empty?")


================================================
FILE: flowcraft/lib/CheckParams.groovy
================================================
/**
 * Sanity checks for the pipeline parameters. Every failed check prints an
 * error message and terminates the JVM with exit code 1.
 */
class Params {

    /**
     * Validates the parameters in the provided map.
     *
     * Numeric options are checked by coercing them to the expected type; a
     * value for which the coercion throws is reported as an error.
     *
     * @param params map with the pipeline parameters
     */
    static void check(Map params) {

        // Checks genomeSize for type
        try {
            params.genomeSize as Double
        } catch (e) {
            print_error("The genomeSize option must be a number")
        }

        // Checks minCoverage for type
        try {
            params.minCoverage as Double
        } catch (e) {
            print_error("the minCoverage option must be a number")
        }

        // Check if fastqc adapters file exists. The check is skipped when
        // the option is "none" (in any letter case).
        if (!params.adapters.equalsIgnoreCase("none")) {
            File f = new File(params.adapters)
            if (!f.exists()) {
                print_error("The provided adapters file does " +
                            "not exist ($params.adapters)")
            }
        }

        // Check for trimmomatic parameters. A single message covers the
        // three options, regardless of which one failed.
        try {
            params.trimLeading as Double
            params.trimTrailing as Double
            params.trimMinLength as Double
        } catch (e) {
            print_error("The trimLeading ($params.trimLeading), " +
                        "trimTrailing ($params.trimTrailing) and " +
                        "trimMinLength ($params.trimMinLength) " +
                        "options must be numbers")
        }

        // Check for Spades parameters. The keys are the labels shown in the
        // error message, so they must match the actual parameter names.
        [
            "spadesMinCoverage": params.spadesMinCoverage,
            "spadesMinKmerCoverage": params.spadesMinKmerCoverage,
            "spadesMinContigLen": params.spadesMinContigLen,
            "spadesMaxContigs": params.spadesMaxContigs
        ].each { k, v ->
            try {
                v as Integer
            } catch (e) {
                print_error("The spades parameter $k ($v) must be an integer")
            }
         }

    }

    /**
     * Prints the error message and terminates the JVM with exit code 1.
     *
     * @param msg message printed after the "ERROR: " prefix
     */
    static def print_error(String msg) {

        println "\nERROR: $msg"
        System.exit(1)

    }

}

================================================
FILE: flowcraft/profiles.config
================================================
// Compilation of commonly used profile combinations of executor and container
// engine
// Apart from "standard" and "docker", profile names follow the pattern
// <executor>_<container engine>, where the engine suffix is one of
// "sing" (singularity), "docker" or "shifter".
profiles {

    // Enables singularity without setting an executor
    standard {
        singularity.enabled = true
    }

    // Enables docker without setting an executor
    docker {
        docker.enabled = true
    }

    // SLURM executor
    slurm_sing {
        singularity.enabled = true
        process.executor = "slurm"
    }

    slurm_docker {
        docker.enabled = true
        process.executor = "slurm"
    }

    slurm_shifter {
        shifter.enabled = true
        process.executor = "slurm"
    }
    
    // SGE executor
    sge_sing {
        singularity.enabled = true
        process.executor = "sge"
    }

    sge_docker {
        docker.enabled = true
        process.executor = "sge"
    }

    sge_shifter {
        shifter.enabled = true
        process.executor = "sge"
    }

    // LSF executor
    lsf_sing {
        singularity.enabled = true
        process.executor = "lsf"
    }

    lsf_docker {
        docker.enabled = true
        process.executor = "lsf"
    }

    lsf_shifter {
        shifter.enabled = true
        process.executor = "lsf"
    }

    // PBS executor
    pbs_sing {
        singularity.enabled = true
        process.executor = "pbs"
    }

    pbs_docker {
        docker.enabled = true
        process.executor = "pbs"
    }

    pbs_shifter {
        shifter.enabled = true
        process.executor = "pbs"
    }
    
    // NQSII executor
    nqsii_sing {
        singularity.enabled = true
        process.executor = "nqsii"
    }

    nqsii_docker {
        docker.enabled = true
        process.executor = "nqsii"
    }

    nqsii_shifter {
        shifter.enabled = true
        process.executor = "nqsii"
    }
    
    // HTCondor executor
    condor_sing {
        singularity.enabled = true
        process.executor = "condor"
    }

    condor_docker {
        docker.enabled = true
        process.executor = "condor"
    }

    condor_shifter {
        shifter.enabled = true
        process.executor = "condor"
    }

}

================================================
FILE: flowcraft/templates/README.md
================================================
# Templates

A bunch of templates for processing HTS data. Particularly
useful for using with nextflow pipelines.

## Quick reference

* process_assembly_mapping.py - Processes the coverage report and checks
assembly filters from the `assembly_mapping` process. [[changelog](https://github.com/ODiogoSilva/templates/wiki/process_assembly_mapping-changelog), [API](http://assemblerflow.readthedocs.io/en/doc_galore/assemblerflow.templates.process_assembly_mapping.html)]

* mapping2json.py - exports results from a `samtools depth` file to a json
file that contains a `key:value` such as `accession number:coverage`.

* mashdist2json.py - exports results from `mash dist` to a json file
that contains a `key:value` such as `accession number:distance` .

* mashscreen2json.py - exports results from `mash screen` to a json
file that contains a `key:[values]` such as `accession number:[copy number, identity]` .

## How to use as a submodule

### Add templates to your project

```
git submodule add https://github.com/ODiogoSilva/templates.git templates
```

### Update templates on your project

```
git submodule foreach git pull origin master
```

================================================
FILE: flowcraft/templates/__init__.py
================================================
"""
Placeholder for template generation docs
"""

================================================
FILE: flowcraft/templates/assembly_report.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to provide a summary report for a given assembly
in Fasta format.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : Sample Identification string.
    - e.g.: ``'SampleA'``
- ``assembly`` : Path to assembly file in Fasta format.
    - e.g.: ``'assembly.fasta'``

Generated output
----------------

- ``${sample_id}_assembly_report.csv`` : CSV with summary information of the \
    assembly.
    - e.g.: ``'SampleA_assembly_report.csv'``

Code documentation
------------------

"""

__version__ = "1.0.1"
__build__ = "16012018"
__template__ = "assembly_report-nf"

import os
import re
import json
import traceback
import subprocess

from collections import OrderedDict
from subprocess import PIPE

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


def __get_version_pilon():
    """Returns the program name and version of Pilon.

    The version is the third whitespace separated token printed to stdout
    by the ``--version`` option of the Pilon jar. When the command cannot
    be executed or its output cannot be parsed, the problem is logged at
    debug level and the version falls back to ``"undefined"``.

    Returns
    -------
    dict
        Dictionary with the ``program`` and ``version`` keys.
    """

    jar_file = "/NGStools/pilon-1.22.jar"
    # Fallback that is kept whenever any step below fails
    pilon_version = "undefined"

    try:
        proc = subprocess.Popen(["java", "-jar", jar_file, "--version"],
                                stdout=PIPE, stderr=PIPE)
        # Only stdout is relevant, stderr is discarded
        out = proc.communicate()[0]
        pilon_version = out.split()[2].decode("utf8")
    except Exception as e:
        logger.debug(e)

    return {"program": "Pilon", "version": pilon_version}


# The input variables are only set when this file is executed under a name
# ending in ".command.sh".
# NOTE(review): the dollar-prefixed placeholders are presumably replaced by
# Nextflow when it renders this template - confirm against the process
# definition that uses it.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    ASSEMBLY_FILE = '$assembly'
    COVERAGE_BP_FILE = '$coverage_bp'
    # Log the received values to help debugging the task
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("ASSEMBLY_FILE: {}".format(ASSEMBLY_FILE))
    logger.debug("COVERAGE_BP_FILE: {}".format(COVERAGE_BP_FILE))


class Assembly:
    """Class that parses and summarizes an assembly file in Fasta format.

    This class parses an assembly file, collects a number
    of summary statistics and metadata from the contigs and reports.

    Parameters
    ----------
    assembly_file : str
        Path to assembly file.
    sample_id : str
        Name of the sample for the current assembly.
    """

    def __init__(self, assembly_file, sample_id):

        self.summary_info = OrderedDict([
            ("ncontigs", 0),
            ("avg_contig_size", []),
            ("n50", 0),
            ("total_len", 0),
            ("avg_gc", []),
            ("missing_data", 0)
        ])
        """
        OrderedDict: Initialize summary information dictionary. Contains keys:

            - ``ncontigs``: Number of contigs
            - ``avg_contig_size``: Average size of contigs
            - ``n50``: N50 metric
            - ``total_len``: Total assembly length
            - ``avg_gc``: Average GC proportion
            - ``missing_data``: Count of missing data characters
        """

        self.contigs = OrderedDict()
        """
        OrderedDict: Object that maps the contig headers to the corresponding
        sequence
        """

        self.contig_coverage = OrderedDict()
        """
        OrderedDict: Object that maps the contig headers to the corresponding
        list of per-base coverage
        """

        self.sample = sample_id
        """
        str: Sample id
        """

        self.contig_boundaries = {}
        """
        dict: Maps the boundaries of each contig in the genome
        """

        self._parse_assembly(assembly_file)

    def _parse_assembly(self, assembly_file):
        """Parse an assembly file in fasta format.

        This is a Fasta parsing method that populates the
        :py:attr:`Assembly.contigs` attribute with data for each contig in the
        assembly. Empty lines are ignored.

        Parameters
        ----------
        assembly_file : str
            Path to the assembly fasta file.

        Raises
        ------
        KeyError
            When sequence data is found before the first header line.

        """

        with open(assembly_file) as fh:

            header = None
            logger.debug("Starting iteration of assembly file: {}".format(
                assembly_file))

            for line in fh:

                # Skip empty lines
                if not line.strip():
                    continue

                if line.startswith(">"):
                    # Add contig header to contig dictionary
                    header = line[1:].strip()
                    self.contigs[header] = []

                else:
                    # Add sequence string for the current contig
                    self.contigs[header].append(line.strip())

            # After populating the contigs dictionary, convert the values
            # list into a string sequence
            self.contigs = OrderedDict(
                (header, "".join(seq)) for header, seq in self.contigs.items())

    @staticmethod
    def _get_contig_id(contig_str):
        """Tries to retrieve contig id. Returns the original string if it
        is unable to retrieve the id.

        Two header layouts are recognized: ``NODE_<id>_`` and
        ``Contig_<id>_``. When both are present in the string, the id of
        the ``Contig_`` layout is returned.

        Parameters
        ----------
        contig_str : str
            Full contig string (fasta header)

        Returns
        -------
        str
            Contig id
        """

        contig_id = contig_str

        for pattern in (".*NODE_([0-9]*)_.*", ".*Contig_([0-9]*)_.*"):
            match = re.search(pattern, contig_str)
            if match:
                contig_id = match.group(1)

        return contig_id

    def get_summary_stats(self, output_csv=None):
        """Generates a CSV report with summary statistics about the assembly

        The calculated statistics are:

            - Number of contigs
            - Average contig size
            - N50
            - Total assembly length
            - Average GC content
            - Amount of missing data

        The statistics are recomputed from scratch on every call, so the
        method can be safely called more than once. An assembly without
        contigs, or a contig without sequence, yields zeros instead of
        raising a ``ZeroDivisionError``.

        Only upper case ``G``, ``C`` and ``N`` characters are counted.

        Parameters
        ----------
        output_csv: str
            Name of the output CSV file. When not provided, no file is
            written.
        """

        # Local accumulators keep repeated calls from piling up on the
        # values stored by a previous call
        contig_size_list = []
        gc_list = []
        total_len = 0
        missing_data = 0

        for contig_id, sequence in self.contigs.items():

            logger.debug("Processing contig: {}".format(contig_id))

            # Get contig sequence size
            contig_len = len(sequence)

            # Add size for average contig size
            contig_size_list.append(contig_len)

            # Add to total assembly length
            total_len += contig_len

            # Add to average gc. A header without sequence has no GC
            # content and must not trigger a division by zero
            gc_count = sum(map(sequence.count, ["G", "C"]))
            gc_list.append(gc_count / contig_len if contig_len else 0)

            # Add to missing data
            missing_data += sequence.count("N")

        ncontigs = len(contig_size_list)

        self.summary_info["ncontigs"] = ncontigs
        self.summary_info["total_len"] = total_len
        self.summary_info["missing_data"] = missing_data

        # Get average contig size
        logger.debug("Getting average contig size")
        self.summary_info["avg_contig_size"] = \
            sum(contig_size_list) / ncontigs if ncontigs else 0

        # Get average gc content
        logger.debug("Getting average GC content")
        self.summary_info["avg_gc"] = \
            sum(gc_list) / ncontigs if ncontigs else 0

        # Get N50: size of the contig at which the cumulative size of the
        # contigs, sorted from largest to smallest, reaches half of the
        # total assembly length
        logger.debug("Getting N50")
        n50 = 0
        cum_size = 0
        for size in sorted(contig_size_list, reverse=True):
            cum_size += size
            if cum_size >= total_len / 2:
                n50 = size
                break
        self.summary_info["n50"] = n50

        if output_csv:
            logger.debug("Writing report to csv")
            # Write summary info to CSV
            # NOTE(review): the escaped newline is kept exactly as it was;
            # presumably it is unescaped when this file is rendered as a
            # Nextflow template - confirm before changing it
            with open(output_csv, "w") as fh:
                summary_line = "{}, {}\\n".format(
                    self.sample, ",".join(
                        [str(x) for x in self.summary_info.values()]))
                fh.write(summary_line)

    def _get_window_labels(self, window):
        """Returns the x-axis position where each contig ends and updates
        the :py:attr:`Assembly.contig_boundaries` attribute.

        Parameters
        ----------
        window : int
            Size of the window. Currently not used by the method; kept for
            interface compatibility.

        Returns
        -------
        xbars : list
            One tuple per contig with the contig id, the cumulative position
            of the end of the contig and the full contig header.

        """

        # Get contig boundary position
        c = 0
        xbars = []
        for contig, seq in self.contigs.items():
            contig_id = self._get_contig_id(contig)
            self.contig_boundaries[contig_id] = [c, c + len(seq)]
            c += len(seq)
            xbars.append((contig_id, c, contig))

        return xbars

    @staticmethod
    def _gc_prop(s, length):
        """Get proportion of GC from a string

        Only lower case ``c`` and ``g`` characters are counted, so the
        string is expected to be lower cased by the caller.

        Parameters
        ----------
        s : str
            Arbitrary string
        length : int
            Value by which the GC count is divided.

        Returns
        -------
        x : float
            GC proportion.
        """

        gc = sum(map(s.count, ["c", "g"]))

        return gc / length

    def get_gc_sliding(self, window=2000):
        """Calculates the GC content of the assembly in consecutive,
        non-overlapping windows.

        The sequences of all contigs are concatenated before the windows
        are defined, which means that a window may span two contigs. The
        last window may be shorter than the window size.

        Parameters
        ----------
        window : int
            Size of the window.

        Returns
        -------
        gc_res : list
            List of GC proportion floats (rounded to two decimal places)
            for each data point in the sliding window
        """

        gc_res = []

        # Get complete sequence to calculate sliding window values
        complete_seq = "".join(self.contigs.values()).lower()

        for i in range(0, len(complete_seq), window):

            seq_window = complete_seq[i:i + window]

            # Get GC proportion
            gc_res.append(round(self._gc_prop(seq_window, len(seq_window)), 2))

        return gc_res

    def _get_coverage_from_file(self, coverage_file):
        """Populates the :py:attr:`Assembly.contig_coverage` attribute from
        a coverage file.

        Each non-blank line must have at least three whitespace separated
        fields: the first is used as the contig header and the third as the
        integer coverage value. Blank lines are skipped.

        Parameters
        ----------
        coverage_file : str
            Path to file containing the coverage info at the per-base level

        """

        with open(coverage_file) as fh:

            for line in fh:

                fields = line.split()

                # Blank lines (e.g. a trailing newline) carry no data
                if not fields:
                    continue

                # Get header
                header = fields[0]
                coverage = int(fields[2])

                self.contig_coverage.setdefault(header, []).append(coverage)

    def get_coverage_sliding(self, coverage_file, window=2000):
        """Calculates the mean coverage of the assembly in consecutive,
        non-overlapping windows.

        The coverage file is only parsed when no coverage information has
        been loaded yet. The coverage values of all contigs are
        concatenated before the windows are defined.

        Parameters
        ----------
        coverage_file : str
            Path to file containing the coverage info at the per-base level
            (as generated by samtools depth)
        window : int
            Size of sliding window

        Returns
        -------
        cov_res : list
            List with the mean coverage, truncated to an integer, for each
            window

        """

        if not self.contig_coverage:
            self._get_coverage_from_file(coverage_file)

        # Stores the coverage results
        cov_res = []

        # Make flat list of coverage values across genome
        complete_cov = [x for y in self.contig_coverage.values() for x in y]

        for i in range(0, len(complete_cov), window):
            # Get coverage values for current window
            cov_window = complete_cov[i:i + window]
            # Get mean coverage
            cov_res.append(int(sum(cov_window) / len(cov_window)))

        return cov_res


@MainWrapper
def main(sample_id, assembly_file, coverage_bp_file=None):
    """Main executor of the assembly_report template.

    Writes the assembly summary statistics to
    ``<sample_id>_assembly_report.csv``, the JSON report to ``.report.json``
    and the string ``pass`` to ``.status``.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    assembly_file : str
        Path to assembly file in Fasta format.
    coverage_bp_file : str, optional
        Path to the file with the per-base coverage. When provided, the
        sliding window GC/coverage data is added to the JSON report. Errors
        raised while building that data are logged and the report is written
        without it.

    """

    logger.info("Starting assembly report")
    assembly_obj = Assembly(assembly_file, sample_id)

    logger.info("Retrieving summary statistics for assembly")
    assembly_obj.get_summary_stats("{}_assembly_report.csv".format(sample_id))

    size_dist = [len(x) for x in assembly_obj.contigs.values()]
    json_dic = {
        "tableRow": [{
            "sample": sample_id,
            "data": [
                {"header": "Contigs",
                 "value": assembly_obj.summary_info["ncontigs"],
                 "table": "assembly",
                 "columnBar": True},
                {"header": "Assembled BP",
                 "value": assembly_obj.summary_info["total_len"],
                 "table": "assembly",
                 "columnBar": True},
            ]
        }],
        "plotData": [{
            "sample": sample_id,
            "data": {
                "size_dist": size_dist
            }
        }]
    }

    if coverage_bp_file:
        try:
            window = 2000
            gc_sliding_data = assembly_obj.get_gc_sliding(window=window)
            cov_sliding_data = \
                assembly_obj.get_coverage_sliding(coverage_bp_file,
                                                  window=window)

            # Get total basepairs based on the individual coverage of each
            # contig position
            total_bp = sum(
                sum(x) for x in assembly_obj.contig_coverage.values()
            )

            # Add data to json report
            json_dic["plotData"][0]["data"]["genomeSliding"] = {
                "gcData": gc_sliding_data,
                "covData": cov_sliding_data,
                "window": window,
                "xbars": assembly_obj._get_window_labels(window),
                "assemblyFile": os.path.basename(assembly_file)
            }
            json_dic["plotData"][0]["data"]["sparkline"] = total_bp

        # The sliding window data is optional, so failures are only logged.
        # ``Exception`` (instead of a bare except) keeps SystemExit and
        # KeyboardInterrupt propagating.
        except Exception:
            logger.error("Unexpected error creating sliding window data:\\n"
                         "{}".format(traceback.format_exc()))

    # Write json report
    with open(".report.json", "w") as json_report:

        json_report.write(json.dumps(json_dic, separators=(",", ":")))

    with open(".status", "w") as status_fh:
        status_fh.write("pass")


# Run the template only when the file is executed as a script. SAMPLE_ID,
# ASSEMBLY_FILE and COVERAGE_BP_FILE are expected to exist at module level
# (their definition is not visible in this part of the file).
if __name__ == '__main__':

    main(SAMPLE_ID, ASSEMBLY_FILE, COVERAGE_BP_FILE)


================================================
FILE: flowcraft/templates/compile_reports.py
================================================
#!/usr/bin/python3
import os
import sys
import json
import zipfile
import logging

# Script inputs, forwarded to main() at the bottom of the file. The "${...}"
# values are placeholders, presumably substituted by Nextflow before the
# script is executed -- TODO confirm.
# Whitespace-separated paths of the individual JSON report files
REPORTS = "${report}".split()
# Path to the JSON file with the forks information
FORKS = "${forks}"
# Path to the JSON file with the DAG
DAG = "${dag}"
# Path to the zip archive that main() extracts into ./src
MAIN_JS = "${js}"


# HTML page written to pipeline_report.html by main(). The single "{}" in the
# _fileReportData assignment is the str.format slot that main() fills with the
# JSON report data.
html_template = """
<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet">
  <title>FlowCraft App</title>
</head>
<body style="background-color: #f2f2f2">
    <div id="app"><!-- React --></div>
</body>
<script> const _fileReportData = {} </script>
<script src="./src/main.js"></script>
</html>
"""


def main(reports, forks, dag, main_js):
    """Compile the individual JSON reports into the pipeline report files.

    The following files are written to the current working directory:

    - ``.metadata.json``: the metadata dictionary, including the forks and
      DAG information when they could be parsed.
    - ``pipeline_report.html``: ``html_template`` filled with the report data.
    - ``src/``: directory with the extracted content of ``main_js``.
    - ``pipeline_report.json``: the report data in JSON format.

    Parameters
    ----------
    reports : list
        Paths to the JSON report files. Each one must contain a
        ``processName`` key.
    forks : str
        Path to the JSON file with the forks information.
    dag : str
        Path to the JSON file with the DAG.
    main_js : str
        Path to the zip archive that is extracted into ``./src``.

    """

    metadata = {
        "nfMetadata": {
            "scriptId": "${workflow.scriptId}",
            "scriptName": "${workflow.scriptName}",
            "profile": "${workflow.profile}",
            "container": "${workflow.container}",
            "containerEngine": "${workflow.containerEngine}",
            "commandLine": "${workflow.commandLine}",
            "runName": "${workflow.runName}",
            "sessionId": "${workflow.sessionId}",
            "projectDir": "${workflow.projectDir}",
            "launchDir": "${workflow.launchDir}",
            "startTime": "${workflow.start}"
        }
    }

    # Stores the metadata followed by the content of each report file
    storage = []

    # Add forks dictionary
    try:
        with open(forks) as fh:
            metadata["nfMetadata"]["forks"] = json.load(fh)
    except json.JSONDecodeError:
        # Report the file that actually failed (the forks file, not the DAG)
        logging.warning("Could not parse forks JSON: {}".format(forks))

    # Add tree DAG in JSON format
    try:
        with open(dag) as fh:
            metadata["nfMetadata"]["dag"] = json.load(fh)
    except json.JSONDecodeError:
        logging.warning("Could not parse DAG JSON: {}".format(dag))

    storage.append(metadata)
    # Write metadata information to dotfile. This dotfile is then sent to the
    # ReportHTTP, when available in the afterScript process directive.
    with open(".metadata.json", "w") as fh:
        fh.write(json.dumps(metadata, separators=(",", ":")))

    for r in reports:
        with open(r) as fh:
            rjson = json.load(fh)
            storage.append(rjson)
            print("{}: {}".format(rjson["processName"],
                                  sys.getsizeof(json.dumps(rjson))))

    # The same serialized payload is used for the HTML and the JSON reports
    report_data = json.dumps({"data": {"results": storage}},
                             separators=(",", ":"))

    with open("pipeline_report.html", "w") as html_fh:
        html_fh.write(html_template.format(report_data))

    with zipfile.ZipFile(main_js) as zf:
        os.mkdir("src")
        zf.extractall("./src")

    with open("pipeline_report.json", "w") as rep_fh:
        rep_fh.write(report_data)


# Compile the reports only when the file is executed as a script, using the
# module-level inputs defined at the top of the file.
if __name__ == "__main__":
    main(REPORTS, FORKS, DAG, MAIN_JS)


================================================
FILE: flowcraft/templates/dengue_typing_assembly.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module intends to type DENV genome assembly with seqTyping (BLAST mode)

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : Sample Identification string.
    - e.g.: ``'SampleA'``
- ``fasta`` : A fasta file path.
    - e.g.: ``'SampleA.fasta'``
- ``fastq_pair`` : Pair of FastQ file paths.
    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``

Generated output
----------------

-  The sample fasta file path or, if a complete ORF isn't obtained, a consensus sequence
-  The closest reference fasta file path
"""

__version__ = "0.0.2"
__build__ = "01022019"
__template__ = "dengue_typing-nf"

import json
import os
import sys
import subprocess
from subprocess import PIPE
from itertools import groupby
from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)

# Define the script parameters only when the file name ends with
# ".command.sh". The '$...' values are placeholders, presumably substituted
# by Nextflow before the script is executed -- TODO confirm.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    ASSEMBLY = '$assembly'
    REFERENCE = '$reference'
    RESULT = '$get_reference'
    # Log the parameters received by the script
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("ASSEMBLY: {}".format(ASSEMBLY))
    logger.debug("REFERENCE: {}".format(REFERENCE))
    logger.debug("RESULT: {}".format(RESULT))


def __get_version_seq_typing():
    """
    Queries the seq_typing executable for its version

    Returns
    -------
    version : str
        Last whitespace-separated token of the first line that
        ``seq_typing.py --version`` prints to STDOUT, or ``"undefined"``
        when it cannot be retrieved"""

    version = "undefined"

    try:
        proc = subprocess.Popen(["seq_typing.py", "--version"],
                                stdout=PIPE, stderr=PIPE)
        out, _ = proc.communicate()

        first_line = out.splitlines()[0]
        version = first_line.split()[-1].decode("utf8")
    except Exception as e:
        # Any failure (missing executable, empty output, ...) is only logged
        logger.debug(e)
        version = "undefined"

    return version


def replace_char(text):
    """
    Replaces every problematic char of a string with an underscore

    Parameters
    ----------
    text : str
        String to clean

    Returns
    -------
    str
        The string with each problematic char replaced by "_" """

    problematic = ('/', '`', '*', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!', '\$', ':', '|')

    cleaned = text
    for char in problematic:
        # Only rebuild the string when the char is actually present
        if char in cleaned:
            cleaned = cleaned.replace(char, "_")

    return cleaned


def getSequence(ref, fasta):
    """
     Gets the fasta sequence from the Database with the header "ref" and
     writes it, in upper case, to a ".fa" file in the current working
     directory. The file name is built from "ref" with "/" replaced by "_"
     and truncated at the first "|".

     Parameters
     ----------
     ref : str
         Reference whose sequence needs to be fetched
     fasta: str
        Path to the multifasta

     Returns
     -------
     fasta_header : str
        Cleaned header of the matching entry, or an empty string when no
        entry matches "ref" """

    fasta_header = ""

    # The context manager closes the file even if an error is raised
    with open(fasta, "r") as fh_fasta:
        # Alternating groups of header lines and sequence lines
        entry = (x[1] for x in groupby(fh_fasta, lambda line: line[0] == ">"))

        for header in entry:
            headerStr = next(header)[1:].strip()

            # A header without sequence lines (e.g. at the end of the file)
            # yields an empty sequence instead of raising StopIteration
            seq = "".join(s.strip() for s in next(entry, ()))

            if ref == headerStr.replace('>', ''):
                filename = os.path.join(os.getcwd(), ref.replace('/', '_').split('|')[0])
                fasta_header = replace_char(headerStr)

                with open(filename + '.fa', "w") as output_file:
                    output_file.write(">" + fasta_header + "\\n" + seq.upper() + "\\n")

    return fasta_header


def get_reference_header(file):
    """
    Reads the header of the closest reference from the seqtyping report,
    i.e. the fourth field of the second line of the report

    Parameters
    ----------
    file: str
     Path to the seqtyping report"""

    with open(file, "r") as typing_report:
        report_rows = typing_report.readlines()

    # The first row is skipped; the second one is used as the best hit
    best_hit_fields = report_rows[1].split('\\t')

    return best_hit_fields[3]


def getType(file):
    """
    Reads the typing result, i.e. the first line of the seqtyping report
    stripped of surrounding whitespace

    Parameters
    ----------
    file: str
     Path to the seqtyping report"""

    with open(file, "r") as result:
        first_line = result.readline()

    return first_line.strip()


def getScore(file):
    """
    Method to write QC warnings based on the mapping statistics
    (sequence covered and identity)

    The statistics are read from the second line of the report. A coverage
    below 70 writes the ".fails" dotfile and a coverage from 70 up to (but
    not including) 90 writes the ".warnings" dotfile.

    Parameters
    ----------
    file: str
     Path to the seqtyping report

    Returns
    -------
    tuple
     Sequence identity and sequence covered, as floats"""

    with open(file, "r") as typing_report:
        lines = typing_report.readlines()

    best_hit = lines[1].split("\\t")
    sequence_covered = float(best_hit[4])
    sequence_identity = float(best_hit[6])

    if sequence_covered < 70:
        # NOTE(review): standard logging.Logger objects have no "fail"
        # method -- confirm the project logger provides it
        logger.fail("Sequence coverage below 70% on the best hit.")
        with open(".fails", "w") as fails:
            fails.write("Sequence coverage below 70% on the best hit.")

    # Coverage in [70, 90). The previous condition ("90 > x < 70") could
    # never be true here, so the warning was never written.
    elif sequence_covered < 90:
        logger.warning("Sequence coverage lower than 90% on the best hit.")
        with open(".warnings", "w") as warnings:
            warnings.write("Sequence coverage lower than 90% on the best hit.")

    return sequence_identity, sequence_covered


@MainWrapper
def main(sample_id, assembly, reference, result):
    """Main executor of the dengue_typing template.

    Runs seq_typing in assembly mode and writes the ".report.json",
    ".status" and ".version" dotfiles. When seq_typing fails, "fail" is
    written to ".status" and the script exits with code 1.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    assembly : str
        Assembly file.
    reference : str
        Reference multi-fasta, relative to the current working directory.
    result: str
        String stating is the reference genome is to be recovered"""

    json_report = {}

    st_version = __get_version_seq_typing()

    cli = ["seq_typing.py",
           "assembly",
           "-b", os.path.join(os.getcwd(), reference),
           "-j", "${task.cpus}",
           "-f", assembly,
           "-t", "nucl"]

    logger.info("Runnig seq_typing subprocess with command: {}".format(cli))

    p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()

    try:
        stderr = stderr.decode("utf8")
        stdout = stdout.decode("utf8")
    except (UnicodeDecodeError, AttributeError):
        stderr = str(stderr)
        stdout = str(stdout)

    logger.info("Finished seq_typing index subprocess with STDOUT:\\n"
                "======================================\\n{}".format(
        stdout))
    logger.info("Fished seq_typing index subprocesswith STDERR:\\n"
                "======================================\\n{}".format(
        stderr))
    logger.info("Finished seq_typing index with return code: {}".format(
        p.returncode))

    if p.returncode == 0:

        typing_result = getType("seq_typing.report.txt")

        logger.info("Type found: {}".format(typing_result))

        # Values reported when no type is found ("NT"). reference_name must
        # be defined here as well, otherwise building the report below
        # fails with an UnboundLocalError for untyped samples.
        identity = 0
        coverage = 0
        reference_name = ""

        if typing_result != "NT":
            # write appropriate QC dot files based on blast statistics
            identity, coverage = getScore("seq_typing.report_types.tab")

            best_reference = get_reference_header("seq_typing.report_types.tab")

            reference_name = getSequence(best_reference, os.path.join(os.getcwd(), reference))

        else:
            logger.info("No typing information was obtained.")

        # Fields of the cleaned reference header, with the "gb:" prefix
        # restored
        reference_fields = reference_name.replace("gb_", "gb:").split("_")

        if result == "true":

            json_report = {'tableRow': [{
                'sample': sample_id,
                'data': [
                    {'header': 'seqtyping',
                     'value': typing_result,
                     'table': 'typing'},
                    {'header': 'Identity',
                     'value': round(identity,2),
                     'table': 'typing'},
                    {'header': 'Coverage',
                     'value': round(coverage,2),
                     'table': 'typing'},
                    {'header': 'Reference',
                     'value': reference_fields[0],
                     'table': 'typing'}
                ]}],
                'metadata': [
                    {'sample': sample_id,
                     'treeData': typing_result,
                     'column': 'typing'},
                    {'sample': reference_name,
                     'treeData': typing_result,
                     'column': 'typing'}]}

        else:
            json_report = {'tableRow': [{
                'sample': sample_id,
                'data': [
                    {'header': 'seqtyping',
                     'value': typing_result,
                     'table': 'typing'},
                    {'header': 'Identity',
                     'value': round(identity),
                     'table': 'typing'},
                    {'header': 'Coverage',
                     'value': round(coverage),
                     'table': 'typing'},
                    {'header': 'Reference',
                     # Empty when the header has no second field (e.g. when
                     # no reference was obtained)
                     'value': (reference_fields[1]
                               if len(reference_fields) > 1 else ""),
                     'table': 'typing'}
                ]}],
                'metadata': [
                    {'sample': sample_id,
                     'treeData': typing_result,
                     'column': 'typing'}]}

    else:
        logger.error("Failed to run seq_typing for Dengue Virus.")
        with open(".status", "w") as status:
            status.write("fail")
        sys.exit(1)

    # Add information to dotfiles
    with open(".report.json", "w") as report, \
            open(".status", "w") as status, \
            open(".version", "w") as version:
        report.write(json.dumps(json_report, separators=(",", ":")))
        status.write("pass")
        version.write(st_version)

# Run the template only when the file is executed as a script, with the
# parameters defined in the ".command.sh" block at the top of the file.
if __name__ == '__main__':

    main(SAMPLE_ID, ASSEMBLY, REFERENCE, RESULT)


================================================
FILE: flowcraft/templates/dengue_typing_reads.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module intends to type DENV genome assembly with seqTyping
(mapping mode)

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : Sample Identification string.
    - e.g.: ``'SampleA'``
- ``fasta`` : A fasta file path.
    - e.g.: ``'SampleA.fasta'``
- ``fastq_pair`` : Pair of FastQ file paths.
    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``

Generated output
----------------

-  The sample fasta file path or, if a complete ORF isn't obtained, a consensus sequence
-  The closest reference fasta file path
"""

__version__ = "0.0.2"
__build__ = "01022019"
__template__ = "dengue_typing-nf"

import glob
import json
import os
import sys
import subprocess
from subprocess import PIPE
from itertools import groupby
from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)

# Define the script parameters only when the file name ends with
# ".command.sh". The '$...' values are placeholders, presumably substituted
# by Nextflow before the script is executed -- TODO confirm.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    ASSEMBLY = '$assembly'
    # Whitespace-separated FastQ paths are split into a list
    FASTQ_PAIR = '$fastq_pair'.split()
    REFERENCE = '$reference'
    RESULT = '$get_reference'
    # Log the parameters received by the script
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("ASSEMBLY: {}".format(ASSEMBLY))
    logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
    logger.debug("REFERENCE: {}".format(REFERENCE))
    logger.debug("RESULT: {}".format(RESULT))


def __get_version_seq_typing():
    """
    Queries the seq_typing executable for its version

    Returns
    -------
    version : str
        Last whitespace-separated token of the first line that
        ``seq_typing.py --version`` prints to STDOUT, or ``"undefined"``
        when it cannot be retrieved"""

    version = "undefined"

    try:
        proc = subprocess.Popen(["seq_typing.py", "--version"],
                                stdout=PIPE, stderr=PIPE)
        out, _ = proc.communicate()

        first_line = out.splitlines()[0]
        version = first_line.split()[-1].decode("utf8")
    except Exception as e:
        # Any failure (missing executable, empty output, ...) is only logged
        logger.debug(e)
        version = "undefined"

    return version


def replace_char(text):
    """
    Replaces every problematic char of a string with an underscore

    Parameters
    ----------
    text : str
        String to clean

    Returns
    -------
    str
        The string with each problematic char replaced by "_" """

    problematic = ('/', '`', '*', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!', '\$', ':', '|')

    cleaned = text
    for char in problematic:
        # Only rebuild the string when the char is actually present
        if char in cleaned:
            cleaned = cleaned.replace(char, "_")

    return cleaned


def getSequence(ref, fasta):
    """
     Gets the fasta sequence from the Database with the header "ref" and
     writes it, in upper case, to a ".fa" file in the current working
     directory. The file name is built from "ref" with "/" replaced by "_"
     and truncated at the first "|".

     Parameters
     ----------
     ref : str
         Reference whose sequence needs to be fetched
     fasta: str
        Path to the multifasta

     Returns
     -------
     fasta_header : str
        Cleaned header of the matching entry, or an empty string when no
        entry matches "ref" """

    fasta_header = ""

    # The context manager closes the file even if an error is raised
    with open(fasta, "r") as fh_fasta:
        # Alternating groups of header lines and sequence lines
        entry = (x[1] for x in groupby(fh_fasta, lambda line: line[0] == ">"))

        for header in entry:
            headerStr = next(header)[1:].strip()

            # A header without sequence lines (e.g. at the end of the file)
            # yields an empty sequence instead of raising StopIteration
            seq = "".join(s.strip() for s in next(entry, ()))

            if ref == headerStr.replace('>', ''):
                filename = os.path.join(os.getcwd(), ref.replace('/', '_').split('|')[0])
                fasta_header = replace_char(headerStr)

                with open(filename + '.fa', "w") as output_file:
                    output_file.write(">" + fasta_header + "\\n" + seq.upper() + "\\n")

    return fasta_header


def get_reference_header(file):
    """
    Reads the header of the closest reference from the seqtyping report,
    i.e. the fourth field of the second line of the report

    Parameters
    ----------
    file: str
     Path to the seqtyping report"""

    with open(file, "r") as typing_report:
        report_rows = typing_report.readlines()

    # The first row is skipped; the second one is used as the best hit
    best_hit_fields = report_rows[1].split('\\t')

    return best_hit_fields[3]


def getType(file):
    """
    Reads the typing result, i.e. the first line of the seqtyping report
    stripped of surrounding whitespace

    Parameters
    ----------
    file: str
     Path to the seqtyping report"""

    with open(file, "r") as result:
        first_line = result.readline()

    return first_line.strip()


def getConsesusSequence(best_reference, consensus, sample_id):
    """
    Gets the consensus sequence for the sample based
    on the closest reference and writes it, in upper case, to
    "<sample_id>_consensus.fasta" in the current working directory.
    Nothing is written when no entry of the consensus file matches.

    Parameters
    ----------
    best_reference: str
        Closest reference whose consensus is to be retrieved
    consensus: str
        Path to the consensus file produced by rematch
    sample_id: str
        sample id"""

    # Identifier searched for in the headers: the text before the first "|"
    # with ":" replaced by "_"
    gb_ID = best_reference.split('|')[0].replace(":", "_")

    # The context manager closes the file even if an error is raised
    with open(consensus, "r") as fh_consensus:
        # Alternating groups of header lines and sequence lines
        entry = (x[1] for x in groupby(fh_consensus, lambda line: line[0] == ">"))

        for header in entry:

            headerStr = next(header)[1:].strip()
            # A header without sequence lines (e.g. at the end of the file)
            # yields an empty sequence instead of raising StopIteration
            seq = "".join(s.strip() for s in next(entry, ()))

            if gb_ID in headerStr:
                with open(sample_id + '_consensus.fasta', "w") as output_file:
                    output_file.write(">" + sample_id + "_consensus_" +
                                      replace_char(best_reference.split("_")[0]) + "\\n" + seq.upper() + "\\n")


def getScore(file):
    """
    Method to write QC warnings based on the mapping statistics
    (sequence covered and identity)

    The statistics are read from the second line of the report. A coverage
    below 70 writes the ".fails" dotfile and a coverage from 70 up to (but
    not including) 90 writes the ".warnings" dotfile.

    Parameters
    ----------
    file: str
     Path to the seqtyping report

    Returns
    -------
    tuple
     Sequence identity and sequence covered, as floats"""

    with open(file, "r") as typing_report:
        lines = typing_report.readlines()

    best_hit = lines[1].split("\\t")
    sequence_covered = float(best_hit[4])
    sequence_identity = float(best_hit[6])

    if sequence_covered < 70:
        # NOTE(review): standard logging.Logger objects have no "fail"
        # method -- confirm the project logger provides it
        logger.fail("Sequence coverage below 70% on the best hit.")
        with open(".fails", "w") as fails:
            fails.write("Sequence coverage below 70% on the best hit.")

    # Coverage in [70, 90). The previous condition ("90 > x < 70") could
    # never be true here, so the warning was never written.
    elif sequence_covered < 90:
        logger.warning("Sequence coverage lower than 90% on the best hit.")
        with open(".warnings", "w") as warnings:
            warnings.write("Sequence coverage lower than 90% on the best hit.")

    return sequence_identity, sequence_covered


@MainWrapper
def main(sample_id, assembly, fastq_pair, reference, result):
    """Main executor of the dengue_typing template (reads mode).

    Runs seq_typing on the FastQ pair, retrieves the consensus and the
    closest reference sequences and writes the ".report.json", ".status"
    and ".version" dotfiles. On failure "fail" is written to ".status" and
    the script exits (code 1 when seq_typing fails, code 120 when no type
    is found).

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    assembly : str
        Assembly file.
    fastq_pair: list
        FastQ files
    reference: str
        Reference multi-fasta to be mapped against
    result: str
        String stating is the reference genome is to be recovered"""

    json_report = {}

    st_version = __get_version_seq_typing()

    cli = [
        "seq_typing.py",
        "reads",
        "-r", reference,
        "-j", "${task.cpus}",
        "--debug",
        '--bowtieAlgo="--very-fast"',
        "--doNotRemoveConsensus",
        "-f", fastq_pair[0], fastq_pair[1],
    ]

    logger.info("Runnig seq_typing subprocess with command: {}".format(cli))

    p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()

    try:
        stderr = stderr.decode("utf8")
        stdout = stdout.decode("utf8")
    except (UnicodeDecodeError, AttributeError):
        stderr = str(stderr)
        stdout = str(stdout)

    logger.info("Finished seq_typing index subprocess with STDOUT:\\n"
                "======================================\\n{}".format(
        stdout))
    logger.info("Fished seq_typing index subprocesswith STDERR:\\n"
                "======================================\\n{}".format(
        stderr))
    logger.info("Finished seq_typing index with return code: {}".format(
        p.returncode))

    # Guard clause: stop right away when seq_typing did not succeed
    if p.returncode != 0:
        logger.error("Failed to run seq_typing for Dengue Virus.")
        with open(".status", "w") as status:
            status.write("fail")
        sys.exit(1)

    typing_result = getType("seq_typing.report.txt")

    logger.info("Type found: {}".format(typing_result))

    best_reference = get_reference_header("seq_typing.report_types.tab")

    # Guard clause: without a type there is no consensus to retrieve
    if typing_result == "NT":
        logger.error("Failed to obtain a close reference sequence in read mode. No consensus sequence is obtained.")
        with open(".status", "w") as status:
            status.write("fail")
        sys.exit(120)

    logger.info("Getting consensus sequenceq")
    consensus_file = glob.glob("rematch/*/sample.noMatter.fasta")[0]
    getConsesusSequence(best_reference, consensus_file, sample_id)

    # check confidence and emit appropriate warnings
    identity, coverage = getScore("seq_typing.report_types.tab")

    reference_name = getSequence(best_reference, os.path.join(os.getcwd(), reference))

    recover_reference = result == "true"

    # Rows shared by both report flavours
    table_data = [
        {'header': 'seqtyping',
         'value': typing_result,
         'table': 'typing'},
        {'header': 'Identity',
         'value': round(identity, 2),
         'table': 'typing'},
        {'header': 'Coverage',
         'value': round(coverage, 2),
         'table': 'typing'},
    ]

    # The reported reference is the first field of the cleaned header when
    # the reference is recovered and the second field otherwise
    reference_fields = reference_name.replace("gb_", "gb:").split("_")
    if recover_reference:
        reference_value = reference_fields[0]
    else:
        reference_value = reference_fields[1]
    table_data.append({'header': 'Reference',
                       'value': reference_value,
                       'table': 'typing'})

    tree_metadata = [{'sample': sample_id,
                      'treeData': typing_result,
                      'column': 'typing'}]
    # The reference only gets its own metadata entry when it is recovered
    if recover_reference:
        tree_metadata.append({'sample': reference_name,
                              'treeData': typing_result,
                              'column': 'typing'})

    json_report = {
        'tableRow': [{'sample': sample_id, 'data': table_data}],
        'metadata': tree_metadata,
    }

    # Add information to dotfiles
    with open(".report.json", "w") as report, \
            open(".status", "w") as status, \
            open(".version", "w") as version:
        report.write(json.dumps(json_report, separators=(",", ":")))
        status.write("pass")
        version.write(st_version)


# Run the template only when the file is executed as a script, with the
# parameters defined in the ".command.sh" block at the top of the file.
if __name__ == '__main__':

    main(SAMPLE_ID, ASSEMBLY, FASTQ_PAIR, REFERENCE, RESULT)


================================================
FILE: flowcraft/templates/downsample_fastq.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to sub-sample FastQ files to a certain coverage, based
on the expected genome size.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : Sample Identification string.
    - e.g.: ``'SampleA'``
- ``fastq_pair`` : Pair of FastQ file paths.
    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``
- ``gsize`` : *Expected genome size*
    - e.g.: ``'2.5'``
- ``depth`` : Maximum depth threshold above which the subsampling will be
    performed.
    - e.g.: ``100``
- ``clear`` : If 'true', remove the input fastq files at the end of the
    component run, IF THE FILES ARE IN THE WORK DIRECTORY

Generated output
----------------

- ``*_ss.fq.gz`` : Subsample fastq reads
    - e.g.: ``sampleA_ss.fq.gz``

Code documentation
------------------

"""

__version__ = "1.0.0"
__build__ = "30072018"
__template__ = "sample_fastq-nf"

import os
import re
import json
import subprocess

from os.path import basename

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


# Define the script parameters only when the file name ends with
# ".command.sh". The '$...' values are placeholders, presumably substituted
# by Nextflow before the script is executed -- TODO confirm.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    # Whitespace-separated FastQ paths are split into a list
    FASTQ_PAIR = '$fastq_pair'.split()
    # Genome size and depth are converted to floats
    GSIZE = float('$gsize'.strip())
    DEPTH = float('$depth'.strip())
    CLEAR = '$clear'
    SEED = '$seed'
    # Log the parameters received by the script
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
    logger.debug("GENOME_SIZE: {}".format(GSIZE))
    logger.debug("DEPTH: {}".format(DEPTH))
    logger.debug("CLEAR: {}".format(CLEAR))
    logger.debug("SEED: {}".format(SEED))


def __get_version_spades():
    """Retrieve the seqtk version from the text it prints to STDERR.

    Returns
    -------
    dict
        Dictionary with the ``program`` ("seqtk") and ``version`` keys. The
        version is "undefined" when it cannot be retrieved.
    """

    version = "undefined"

    try:

        proc = subprocess.Popen(["seqtk"], stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stderr = proc.communicate()[1]

        # The version is searched for in the third line of STDERR
        version_line = stderr.splitlines()[2]
        try:
            matched = re.match("Version: (.*)", version_line.decode("utf8"))
            version = matched.group(1)
        except AttributeError:
            # The line did not match the expected pattern
            version = "undefined"

    except Exception as e:
        logger.debug(e)
        version = "undefined"

    return {"program": "seqtk", "version": version}


def _count_bases(fastq):
    """Return the base count of ``fastq`` parsed from ``seqtk fqchk``."""
    fqchk = subprocess.Popen(['seqtk', 'fqchk', fastq], stdout=subprocess.PIPE)
    stdout, _ = fqchk.communicate()
    # The count is the second field of the third output line
    return int(stdout.splitlines()[2].split()[1])


def _subsample_fastq(fastq, ratio, parsed_seed, out_path):
    """Write a gzipped ``seqtk sample`` sub-sample of ``fastq`` to ``out_path``."""
    sampler = subprocess.Popen(
        ('seqtk', 'sample', parsed_seed, fastq, str(ratio)),
        stdout=subprocess.PIPE)
    with open(out_path, 'wb') as outfile:
        compressor = subprocess.Popen(('gzip', '--fast', '-c'),
                                      stdin=sampler.stdout, stdout=outfile)
        # Close the parent's copy of the pipe so that seqtk receives a
        # SIGPIPE if gzip exits early
        sampler.stdout.close()
        # Wait for gzip as well; otherwise the output file may still be
        # incomplete when the script finishes
        compressor.wait()
    sampler.wait()


@MainWrapper
def main(sample_id, fastq_pair, genome_size, depth, clear, seed):
    """Main executor of the downsample_fastq template.

    Estimates the coverage of the FastQ pair from the number of bases and
    the genome size. When the target depth is lower than the estimated
    coverage, the reads are sub-sampled with seqtk into ``*_ss.fq.gz``
    files; otherwise symlinks to the original files are created. The
    estimated coverage is written to ``.report.json``.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    fastq_pair : list
        Two element list containing the paired FastQ files.
    genome_size : float
        Expected genome size, multiplied by 1E6 to obtain the number of
        bases.
    depth : float
        Maximum depth threshold above which the subsampling is performed.
    clear : str
        If 'true', remove the input fastq files after sub-sampling, when
        their real path is inside a work directory.
    seed : str
        Seed passed to ``seqtk sample``. When empty, ``-s100`` is used.

    """

    target_depth = depth
    p1 = fastq_pair[0]
    p2 = fastq_pair[1]
    bn1 = ".".join(basename(p1).split('.')[:-2])
    bn2 = ".".join(basename(p2).split('.')[:-2])

    B_P1 = _count_bases(p1)
    logger.debug("Bases p1: {}".format(B_P1))

    B_P2 = _count_bases(p2)
    logger.debug("Bases p2: {}".format(B_P2))

    estimated_coverage = (B_P1 + B_P2) / (genome_size * 1E6)
    logger.debug("Estimated coverage: {}".format(estimated_coverage))
    ratio = target_depth/estimated_coverage
    logger.debug("Estimated ration: {}".format(ratio))

    # if seed param is specified then use it, otherwise use the default -s100
    if seed:
        # through flowcraft everything should pass through here
        parsed_seed = "-s{}".format(str(seed))
        logger.info("Using seed parameter: {}.".format(parsed_seed))
    else:
        logger.debug("Seed parameter not specified. Using default value -s100.")
        parsed_seed = "-s100"

    if ratio < 1:
        _subsample_fastq(p1, ratio, parsed_seed, '{}_ss.fq.gz'.format(bn1))
        _subsample_fastq(p2, ratio, parsed_seed, '{}_ss.fq.gz'.format(bn2))

        if clear == "true":
            # Get real path of the symlink
            for fq in [p1, p2]:
                rp = os.path.realpath(fq)
                print("removing temporary fastq file path: {}".format(rp))
                # remove only when the file is in the work directory
                if re.match(".*/work/.{2}/.{30}/.*", rp):
                    os.remove(rp)

    else:
        # NOTE(review): these names carry an extra "." before "_ss" when
        # compared with the sub-sampled outputs -- confirm this is intended
        os.symlink(p1, "{}._ss.fq.gz".format(bn1))
        os.symlink(p2, "{}._ss.fq.gz".format(bn2))

    # Record the original estimated coverage
    with open(".report.json", "w") as fh:
        json_dic = {
            "tableRow": [
                {
                    "sample": sample_id,
                    "data": [{
                        "header": "Coverage",
                        "value": round(estimated_coverage, 1),
                        "table": "qc"
                    }]
                 }
            ]
        }
        fh.write(json.dumps(json_dic, separators=(",", ":")))


# Run the template only when the file is executed as a script, with the
# parameters defined in the ".command.sh" block at the top of the file.
if __name__ == "__main__":
        main(SAMPLE_ID, FASTQ_PAIR, GSIZE, DEPTH, CLEAR, SEED)


================================================
FILE: flowcraft/templates/fasta_spliter.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to split all fastas in a multifasta file into different
fasta files.

Code documentation
------------------

"""

import os
import sys


def main():
    """Split the multi-fasta given as first CLI argument into single files.

    Every record of the input file (``sys.argv[1]``) is written to its own
    ``.fas`` file inside the ``fasta_store`` directory of the current
    working directory. The file name is built from the first three
    underscore-separated fields of the record header (without the leading
    ``>``). The path of every file created is also written, one per line,
    to ``files_fastani.txt``, the list of files that will be used by
    fastANI.

    Lines that appear before the first header are ignored and an empty
    input produces an empty ``files_fastani.txt``.
    """

    # writes the output to fasta store folder inside cwd, respective workdir
    store_dir = os.path.join(os.getcwd(), "fasta_store")
    # make sure the destination exists so that the first write cannot fail
    os.makedirs(store_dir, exist_ok=True)

    # handle of the record being written; stays None until the first header
    # is found
    out_handle = None

    try:
        with open(sys.argv[1]) as input_file, \
                open("files_fastani.txt", "w") as list_files:
            # iterates by each entry in the fasta file
            for line in input_file:
                if line.startswith(">"):
                    if out_handle:
                        out_handle.close()
                    # drop the line terminator before building the name,
                    # otherwise headers with less than four fields would
                    # leak a newline into the file name and into the list
                    header = line.rstrip("\r\n")
                    file_name = "_".join(header.split("_")[0:3]).replace(
                        ">", "") + ".fas"
                    path_to_file = os.path.join(store_dir, file_name)
                    # writes to list of files
                    list_files.write(path_to_file + "\n")
                    out_handle = open(path_to_file, "w")
                    out_handle.write(line)
                elif out_handle:
                    out_handle.write(line)
    finally:
        # the last record is still open when the loop ends or fails
        if out_handle:
            out_handle.close()


# Run the splitter only when this file is executed as a script
if __name__ == "__main__":
    main()


================================================
FILE: flowcraft/templates/fastqc.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to run FastQC on paired-end FastQ files.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``fastq_pair`` : *Pair of FastQ file paths*
    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``

Generated output
----------------

The generated output are output files that contain an object, usually a string.

- ``pair_{1,2}_data`` : File containing FastQC report at the nucleotide level\
    for each pair
    - e.g.: ``'pair_1_data'`` and ``'pair_2_data'``
- ``pair_{1,2}_summary``: File containing FastQC report for each category and\
    for each pair
    - e.g.: ``'pair_1_summary'`` and ``'pair_2_summary'``

Code documentation
------------------

"""

__version__ = "1.0.1"
__build__ = "28032018"
__template__ = "fastqc-nf"

import os
import subprocess

from subprocess import PIPE
from os.path import exists, join

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


def __get_version_fastqc():
    """Return the FastQC program name and version as a dictionary.

    The version is read from the STDOUT of ``fastqc --version``: the second
    whitespace separated token, without its first character. Any error
    raised while doing so is logged at debug level and the version is
    reported as ``"undefined"``.
    """

    # Fallback that is only replaced when the version is parsed correctly
    version = "undefined"

    try:
        proc = subprocess.Popen(["fastqc", "--version"],
                                stdout=PIPE, stderr=PIPE)
        raw_out = proc.communicate()[0]
        version = raw_out.strip().split()[1][1:].decode("utf8")
    except Exception as err:
        logger.debug(err)

    return {"program": "FastQC", "version": version}


# The inputs of the template are only defined when this file is executed as
# a ``.command.sh`` script.
# NOTE(review): the quoted placeholders look like Nextflow template
# variables that are substituted before execution - confirm.
if __file__.endswith(".command.sh"):
    # Whitespace separated FastQ paths, split into a list
    FASTQ_PAIR = '$fastq_pair'.split()
    ADAPTER_FILE = '$ad'
    CPUS = '$task.cpus'
    # Log the received parameters to ease debugging
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
    logger.debug("ADAPTER_FILE: {}".format(ADAPTER_FILE))
    logger.debug("CPUS: {}".format(CPUS))


def convert_adatpers(adapter_fasta):
    """Generates an adapter file for FastQC from a fasta file.

    The provided adapters file is assumed to be a simple fasta file with the
    adapter's name as header and the corresponding sequence in the line
    that follows::

        >TruSeq_Universal_Adapter
        AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
        >TruSeq_Adapter_Index 1
        GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG

    Each header/sequence pair is written as one line of the output file
    ``fastqc_adapters.tab``. Headers that are not followed by a sequence
    (for instance a header in the last line of the file) are skipped with
    a warning.

    Parameters
    ----------
    adapter_fasta : str
        Path to Fasta file with adapter sequences.

    Returns
    -------
    adapter_out : str or None
        The path to the reformatted adapter file. Returns ``None`` if the
        adapters file does not exist or the path is incorrect.
    """

    adapter_out = "fastqc_adapters.tab"
    logger.debug("Setting output adapters file to: {}".format(adapter_out))

    try:

        with open(adapter_fasta) as fh, \
                open(adapter_out, "w") as adap_fh:

            for line in fh:
                if not line.startswith(">"):
                    continue

                head = line[1:].strip()
                # Get the next line with the sequence string. The default
                # value prevents a StopIteration when the header is the
                # last line of the file
                sequence = next(fh, "").strip()

                if not sequence:
                    logger.warning("Skipping adapter '{}' without "
                                   "sequence".format(head))
                    continue

                adap_fh.write("{}\\t{}\\n".format(head, sequence))

        logger.info("Converted adapters file")

        return adapter_out

    # If an invalid adapters file is provided, return None.
    except FileNotFoundError:
        logger.warning("Could not find the provided adapters file: {}".format(
            adapter_fasta))
        return


@MainWrapper
def main(fastq_pair, adapter_file, cpus):
    """ Main executor of the fastq template.

    Runs FastQC on the provided FastQ files and then checks whether the
    ``fastqc_data.txt`` file of every input was generated. The result of
    that check is written once to the ``.status`` file, as ``pass`` or
    ``fail``. When all outputs exist, the data and summary files of each
    input are renamed to ``pair_<n>_data`` and ``pair_<n>_summary``.

    Parameters
    ----------
    fastq_pair : list
        Two element list containing the paired FastQ files.
    adapter_file : str
        Path to adapters file.
    cpus : int or str
        Number of cpu's that will be used by FastQC.

    """

    logger.info("Starting fastqc")

    # If an adapter file was provided, convert it to FastQC format
    if os.path.exists(adapter_file):
        logger.info("Adapters file provided: {}".format(adapter_file))
        adapters = convert_adatpers(adapter_file)
    else:
        logger.info("Adapters file '{}' not provided or does not "
                    "exist".format(adapter_file))
        adapters = None

    # Setting command line for FastQC
    cli = [
        "fastqc",
        "--extract",
        "--nogroup",
        "--format",
        "fastq",
        "--threads",
        str(cpus)
    ]

    # Add adapters file to command line, if it exists
    if adapters:
        cli += ["--adapters", "{}".format(adapters)]

    # Add FastQ files at the end of command line
    cli += fastq_pair

    logger.debug("Running fastqc subprocess with command: {}".format(cli))

    p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE, shell=False)
    stdout, stderr = p.communicate()

    # Attempt to decode STDERR output from bytes. If unsuccessful, coerce to
    # string
    try:
        stderr = stderr.decode("utf8")
    except (UnicodeDecodeError, AttributeError):
        stderr = str(stderr)

    logger.info("Finished fastqc subprocess with STDOUT:\\n"
                "======================================\\n{}".format(stdout))
    logger.info("Finished fastqc subprocess with STDERR:\\n"
                "======================================\\n{}".format(stderr))
    logger.info("Finished fastqc with return code: {}".format(
        p.returncode))

    logger.info("Checking if FastQC output was correctly generated")
    # Check if the FastQC output was correctly generated.
    with open(".status", "w") as status_fh:
        for fastq in fastq_pair:
            fpath = join(fastq.rsplit(".", 2)[0] + "_fastqc",
                         "fastqc_data.txt")
            logger.debug("Checking path: {}".format(fpath))
            # If the FastQC output does not exist, write 'fail' to the
            # output status channel and exit
            if not exists(fpath):
                logger.warning("Path does not exist: {}".format(fpath))
                status_fh.write("fail")
                return

            logger.debug("Found path: {}".format(fpath))

        # All the output directories exist. Write 'pass' to the output
        # status channel a single time, after every file has been checked,
        # so that the status is never a mix like 'passpass' or 'passfail'
        status_fh.write("pass")

    logger.info("Retrieving relevant FastQC output files")

    # Both FastQC have been correctly executed. Get the relevant FastQC
    # output files for the output channel
    for i, fastq in enumerate(fastq_pair):
        # Get results for each pair
        fastqc_dir = fastq.rsplit(".", 2)[0] + "_fastqc"

        summary_file = join(fastqc_dir, "summary.txt")
        logger.debug("Retrieving summary file: {}".format(summary_file))
        fastqc_data_file = join(fastqc_dir, "fastqc_data.txt")
        logger.debug("Retrieving data file: {}".format(fastqc_data_file))

        # Rename output files to a file name that is easier to handle in the
        # output channel
        os.rename(fastqc_data_file, "pair_{}_data".format(i + 1))
        os.rename(summary_file, "pair_{}_summary".format(i + 1))


# Script entry point: run FastQC with the module-level inputs
if __name__ == "__main__":

    main(FASTQ_PAIR, ADAPTER_FILE, CPUS)


================================================
FILE: flowcraft/templates/fastqc_report.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to parse the results of FastQC for paired-end FastQ \
samples. It parses two reports:

    - Categorical report
    - Nucleotide level report.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : Sample identification string
    - e.g.: ``'SampleA'``

- ``result_p1`` : Path to both FastQC result files for pair 1
    - e.g.: ``'SampleA_1_data SampleA_1_summary'``

- ``result_p2`` : Path to both FastQC result files for pair 2
    - e.g.: ``'SampleA_2_data SampleA_2_summary'``

- ``opts`` : *Specify additional arguments for executing fastqc_report. \
    The arguments should be a string of command line arguments,\
    The accepted arguments are:*
    - ``'--ignore-tests'`` : Ignores test results from FastQC categorical\
    summary. This is used in the first run of FastQC.

Generated output
----------------

The generated output are output files that contain an object, usually a string.

- ``fastqc_health`` : Stores the health check for the current sample. If it
    passes all checks, it contains only the string 'pass'. Otherwise, contains
    the summary categories and their respective results
    - e.g.: ``'pass'``
- ``optimal_trim`` : Stores a tuple with the optimal trimming positions for 5'
    and 3' ends of the reads.
    - e.g.: ``'15 151'``

Code documentation
------------------

"""

__version__ = "1.0.2"
__build__ = "12052018"
__template__ = "fastqc_report-nf"

import os
import json

from collections import OrderedDict

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)

# The inputs of the template are only defined when this file is executed as
# a ``.command.sh`` script.
# NOTE(review): the quoted placeholders look like Nextflow template
# variables that are substituted before execution - confirm.
if __file__.endswith(".command.sh"):
    # Each result is a whitespace separated pair of paths (data, summary)
    RESULT_P1 = '$result_p1'.split()
    RESULT_P2 = '$result_p2'.split()
    SAMPLE_ID = '$sample_id'
    # Additional command line like options, split into a list
    OPTS = '$opts'.split()
    # Log the received parameters to ease debugging
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("RESULT_P1: {}".format(RESULT_P1))
    logger.debug("RESULT_P2: {}".format(RESULT_P2))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("OPTS: {}".format(OPTS))


def _get_quality_stats(d, start_str, field_start=1, field_end=2):
    """

    Parameters
    ----------
    d

    Returns
    -------

    """

    min_parsed = False
    parse = False
    report = []
    start_str = start_str
    end_str = ">>END_MODULE"

    with open(d) as fh:

        for line in fh:

            if line.startswith(start_str):
                next(fh)
                parse = True
                status = line.strip().split()[-1]

            # Exit parser when end string is found
            elif parse and line.startswith(end_str):
                return report, status

            elif parse:

                fields = line.strip().split()

                # This is triggered when the first value of a line series is
                # not 1. If the starting point of the series is a number
                # different from 1, fill the report with 0 until that point
                if not min_parsed:
                    if fields[0] != "1":
                        try:
                            blank_points = int(fields[0]) - 1
                            report.extend([0] * blank_points)
                        except ValueError:
                            pass
                    min_parsed = True

                report.append(";".join([
                    str(round(float(x), 2)) for x in
                    fields[field_start: field_end]
                ]))


def write_json_report(sample_id, data1, data2):
    """Builds the plot data report of a sample from its FastQC data files.

    Each of the supported FastQC modules is parsed from both data files
    with :py:func:`_get_quality_stats`. The report of a module stores the
    parsed values of both files and the worst of the two module statuses
    (``fail`` takes precedence over ``warn``, which takes precedence over
    ``pass``).

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    data1 : str
        Path to the FastQC data file of the first FastQ file.
    data2 : str
        Path to the FastQC data file of the second FastQ file.

    Returns
    -------
    json_dic : dict
        Dictionary with a single ``plotData`` entry for the sample, holding
        the ``status`` and ``data`` of each module.

    """

    parser_map = {
        "base_sequence_quality": ">>Per base sequence quality",
        "sequence_quality": ">>Per sequence quality scores",
        "base_gc_content": ">>Per sequence GC content",
        "base_n_content": ">>Per base N content",
        "sequence_length_dist": ">>Sequence Length Distribution",
        "per_base_sequence_content": ">>Per base sequence content"
    }

    json_dic = {
        "plotData": [{
            "sample": sample_id,
            "data": {
                cat: {"status": None, "data": []} for cat in parser_map
            }
        }]
    }
    plot_data = json_dic["plotData"][0]["data"]

    for cat, start_str in parser_map.items():

        # The sequence content module has four value columns, one for each
        # nucleotide. All the other modules have a single value column
        fs = 1
        fe = 5 if cat == "per_base_sequence_content" else 2

        report1, status1 = _get_quality_stats(data1, start_str,
                                              field_start=fs, field_end=fe)
        report2, status2 = _get_quality_stats(data2, start_str,
                                              field_start=fs, field_end=fe)

        # Keep the most severe status of the two files. Stopping at the
        # first match is required, otherwise a 'pass' in one of the files
        # would hide a 'fail' or a 'warn' in the other
        status = next(
            (i for i in ["fail", "warn", "pass"] if i in [status1, status2]),
            None)

        plot_data[cat]["data"] = [report1, report2]
        plot_data[cat]["status"] = status

    return json_dic


def get_trim_index(biased_list):
    """Returns the number of positions to trim from a ``bool`` list

    The list flags each position as biased (``True``) or unbiased
    (``False``). The returned value is taken from the first biased position
    whose following positions (up to two of them) are all unbiased: the
    result is the index of that biased position plus one. For example,
    in the following list::

        [True, True, False, True, True, False, False, False, False, ...]

    the returned value is 5, since the ``True`` element at index 4 is the
    first one followed by two ``False`` elements.

    The value 0 is returned when the first five positions of the list are
    all unbiased, which includes lists with no biased position at all. An
    empty list also returns 0. When no biased position is followed by
    unbiased ones, the length of the list is returned.

    Parameters
    ----------
    biased_list: list
        List of ``bool`` elements, where ``True`` means a biased site.

    Returns
    -------
        x : index position of the biased list for the optimal trim.

    """

    unbiased = {False}

    # Nothing to trim when the start of the list is clean. A list without
    # any biased position is also covered by this test
    if set(biased_list[:5]) == unbiased:
        return 0

    # Lazily search the biased positions that are followed only by
    # unbiased ones (e.g.: True, False, False)
    candidates = (
        pos + 1 for pos, flag in enumerate(biased_list)
        if flag and set(biased_list[pos + 1:pos + 3]) == unbiased
    )

    # Without candidates the list is basically all biased, so its length
    # is used instead
    return next(candidates, len(biased_list))


def trim_range(data_file):
    """Assess the optimal trim range for a given FastQC data file.

    Only the *'Per base sequence content'* module of the file is parsed.
    For each position, the G/C and A/T ratios are calculated (0.1 is added
    to every value before dividing). A position is unbiased when both
    ratios are between 0.8 and 1.2, and biased otherwise.

    The list of biased positions is then split in half. The first half is
    used to find the trim index of the 5' end and the reversed second half
    is used to find the trim index of the 3' end, both with
    :py:func:`get_trim_index`.

    Parameters
    ----------
    data_file: str
        Path to FastQC data file.

    Returns
    -------
    trim_nt: list
        List containing the range with the best trimming positions for the
        corresponding FastQ file. The first element is the 5' end trim index
        and the second element is the 3' end trim index.
    """

    logger.debug("Starting trim range assessment")

    # Header of the module used for the nucleotide bias assessment
    target_nuc_bias = ">>Per base sequence content"
    logger.debug("Target string to start nucleotide bias assessment set to "
                 "{}".format(target_nuc_bias))

    # One boolean per position: True for biased, False for unbiased
    biased = []
    # Becomes True while the lines of the target module are being read
    in_module = False

    with open(data_file) as handle:

        for row in handle:
            if row.startswith(target_nuc_bias):
                logger.debug("Found target string at line: {}".format(row))
                # The line after the module header is a comment
                next(handle)
                in_module = True
                continue

            if not in_module:
                continue

            if row.startswith(">>END_MODULE"):
                logger.debug("Stopping parsing at line: {}".format(row))
                break

            # Proportions of each nucleotide at this position
            g, a, t, c = [float(x) for x in row.strip().split()[1:]]
            gc_ratio = (g + 0.1) / (c + 0.1)
            at_ratio = (a + 0.1) / (t + 0.1)
            balanced = 0.8 <= gc_ratio <= 1.2 and 0.8 <= at_ratio <= 1.2
            biased.append(not balanced)

    logger.debug("Finished bias assessment with result: {}".format(biased))

    # First half for the 5' end, reversed second half for the 3' end
    half = len(biased) // 2
    biased_5end = biased[:half]
    biased_3end = biased[half:][::-1]

    logger.debug("Getting optimal trim range from biased list")
    # Number of nucleotides to clip at 5' end
    trim_5end = get_trim_index(biased_5end)
    logger.debug("Optimal trim range at 5' end set to: {}".format(trim_5end))
    # Position where the clipping of the 3' end starts
    trim_3end = len(biased) - get_trim_index(biased_3end)
    logger.debug("Optimal trim range at 3' end set to: {}".format(trim_3end))

    return [trim_5end, trim_3end]


def get_sample_trim(p1_data, p2_data):
    """Get the optimal read trim range from data files of paired FastQ reads.

    The trim range of each FastQC data file is assessed individually with
    :py:func:`trim_range`, which is based on the *'Per base sequence
    content'* module. The sample range is the most conservative combination
    of both: the largest 5' end index and the smallest 3' end index.

    Parameters
    ----------
    p1_data: str
        Path to FastQC data report file from pair 1
    p2_data: str
        Path to FastQC data report file from pair 2

    Returns
    -------
    optimal_5trim: int
        Optimal trim index for the 5' end of the reads
    optima_3trim: int
        Optimal trim index for the 3' end of the reads

    See Also
    --------
    trim_range

    """

    range_p1 = trim_range(p1_data)
    range_p2 = trim_range(p2_data)

    # Largest 5' end index of the pair
    optimal_5trim = max(range_p1[0], range_p2[0])
    # Smallest 3' end index of the pair
    optimal_3trim = min(range_p1[1], range_p2[1])

    return optimal_5trim, optimal_3trim


def get_summary(summary_file):
    """Parses a FastQC summary report file and returns it as a dictionary.

    Each non empty line of the summary file is split into tab separated
    columns and only the first two are used: the QC result and the name of
    the category. For instance, a line with the columns ``PASS``,
    ``Basic Statistics`` and ``SH10762A_1.fastq.gz`` adds the following
    entry to the returned dictionary::

        {"Basic Statistics": "PASS"}

    Parameters
    ----------
    summary_file: str
        Path to FastQC summary report.

    Returns
    -------
    summary_info: :py:data:`OrderedDict`
        Returns the information of the FastQC summary report as an ordered
        dictionary, with the categories as strings and the QC result as values.

    """

    logger.debug("Retrieving summary information from file: {}".format(
        summary_file))

    summary_info = OrderedDict()

    with open(summary_file) as fh:
        # Lines with only whitespace are filtered out
        for record in filter(str.strip, fh):
            columns = record.split("\t")
            # Category name as key, QC result as value
            summary_info[columns[1].strip()] = columns[0].strip()

    logger.debug("Retrieved summary information from file: {}".format(
        summary_info))

    return summary_info


def check_summary_health(summary_file, **kwargs):
    """Checks the health of a sample from the FastQC summary file.

    The summary file is parsed with :py:func:`get_summary` and each
    category is tested against four lists of categories. The first two
    lists decide whether the sample is healthy, while the last two only
    generate warnings. All of them can be replaced with a keyword argument
    named after the list.

    Categories that cannot fail::

        fail_sensitive = [
            "Per base sequence quality",
            "Overrepresented sequences",
            "Sequence Length Distribution",
            "Per sequence GC content"
        ]

    Categories that must pass::

        must_pass = [
            "Per base N content",
            "Adapter Content"
        ]

    Categories that issue a warning when they fail::

        warning_fail_sensitive = [
            "Per base sequence quality",
            "Overrepresented sequences"
        ]

    Categories that issue a warning when they do not pass::

        warning_must_pass = [
            "Per base sequence content"
        ]

    Parameters
    ----------
    summary_file: str
        Path to FastQC summary file.

    Returns
    -------
    x : bool
        Returns ``True`` if the sample passes all tests. ``False`` if not.
    failed : list
        A list with the FastQC categories that failed the tests, each one
        as a ``category:result`` string. Is empty if the sample passes all
        tests.
    warning : list
        A list with the warning messages of the flagged categories.
    """

    # Categories that cannot fail. If they fail, do not proceed with this
    # sample
    fail_sensitive = kwargs.get("fail_sensitive", [
        "Per base sequence quality",
        "Overrepresented sequences",
        "Sequence Length Distribution",
        "Per sequence GC content"
    ])
    logger.debug("Fail sensitive categories: {}".format(fail_sensitive))

    # Categories that must pass. If they do not, do not proceed with that
    # sample
    must_pass = kwargs.get("must_pass", [
        "Per base N content",
        "Adapter Content"
    ])
    logger.debug("Must pass categories: {}".format(must_pass))

    # Categories that only issue warnings
    warning_fail_sensitive = kwargs.get("warning_fail_sensitive", [
        "Per base sequence quality",
        "Overrepresented sequences",
    ])
    warning_must_pass = kwargs.get("warning_must_pass", [
        "Per base sequence content"
    ])

    # Failing categories and warning messages found in the summary
    failed = []
    warning = []

    for cat, test in get_summary(summary_file).items():

        logger.debug("Assessing category {} with result {}".format(cat, test))

        has_failed = test == "FAIL"
        not_passed = test != "PASS"

        # FAILURES
        if has_failed and cat in fail_sensitive:
            failed.append("{}:{}".format(cat, test))
            logger.error("Category {} failed a fail sensitive "
                         "category".format(cat))

        if not_passed and cat in must_pass:
            failed.append("{}:{}".format(cat, test))
            logger.error("Category {} failed a must pass category".format(
                cat))

        # WARNINGS
        if has_failed and cat in warning_fail_sensitive:
            warning.append("Failed category: {}".format(cat))
            logger.warning("Category {} flagged at a fail sensitive "
                           "category".format(cat))

        if not_passed and cat in warning_must_pass:
            warning.append("Did not pass category: {}".format(cat))
            logger.warning("Category {} flagged at a must pass "
                           "category".format(cat))

    # The sample is healthy only when no category has failed
    health = not failed

    return health, failed, warning


@MainWrapper
def main(sample_id, result_p1, result_p2, opts):
    """Main executor of the fastqc_report template.

    If the "--ignore-tests" option is present in the ``opts`` argument,
    the health check of the sample will be bypassed, and it will pass the
    check. This option is used in the first run of FastQC. In the second
    run (after filtering with trimmomatic) this option is not provided and
    the samples are submitted to a health check before proceeding in the
    pipeline.

    The warnings issued for both FastQ files are accumulated, without
    duplicates, in the ``warnings`` entry of the JSON report.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    result_p1 : list
        Two element list containing the path to the FastQC report files to
        the first FastQ pair.
        The first must be the nucleotide level report and the second the
        categorical report.
    result_p2: list
        Two element list containing the path to the FastQC report files to
        the second FastQ pair.
        The first must be the nucleotide level report and the second the
        categorical report.
    opts : list
        List of arbitrary options. See `Expected input`_.

    """

    logger.info("Starting fastqc report")
    json_dic = {}

    with open("{}_trim_report".format(sample_id), "w") as trep_fh, \
            open("optimal_trim", "w") as trim_fh, \
            open("{}_status_report".format(sample_id), "w") as rep_fh, \
            open(".status", "w") as status_fh, \
            open(".warning", "w") as warn_fh, \
            open(".fail", "w") as fail_fh, \
            open(".report.json", "w") as report_fh:

        # Perform health check according to the FastQC summary report for
        # each pair. If both pairs pass the check, send the 'pass' information
        # to the 'fastqc_health' channel. If at least one fails, send the
        # summary report.
        if "--ignore-tests" not in opts:

            # Get reports for each category in json format
            json_dic = write_json_report(sample_id, result_p1[0],
                                         result_p2[0])

            logger.info("Performing FastQ health check")
            for p, fastqc_summary in enumerate([result_p1[1], result_p2[1]]):

                logger.debug("Checking files: {}".format(fastqc_summary))
                # Get the boolean health variable and a list of failed
                # categories, if any
                health, f_cat, warnings = check_summary_health(fastqc_summary)
                logger.debug("Health checked: {}".format(health))
                logger.debug("Failed categories: {}".format(f_cat))

                # Write any warnings
                if warnings:
                    # Reuse the entry created for the previous pair, if
                    # any, so that its warnings are not lost in the report
                    warn_entry = json_dic.setdefault("warnings", [{
                        "sample": sample_id,
                        "table": "qc",
                        "value": []
                    }])
                    for w in warnings:
                        warn_fh.write("{}\\n".format(w))
                        if w not in warn_entry[0]["value"]:
                            warn_entry[0]["value"].append(w)

                # Rename category summary file to the channel that will publish
                # The results
                output_file = "{}_{}_summary.txt".format(sample_id, p)
                os.rename(fastqc_summary, output_file)
                logger.debug("Setting summary file name to {}".format(
                    output_file))

                # If one of the health flags returns False, send the summary
                # report through the status channel
                if not health:
                    fail_msg = "Sample failed quality control checks:" \
                               " {}".format(",".join(f_cat))
                    logger.warning(fail_msg)
                    fail_fh.write(fail_msg)
                    json_dic["fail"] = [{
                        "sample": sample_id,
                        "table": "qc",
                        "value": [fail_msg]
                    }]
                    report_fh.write(
                        json.dumps(json_dic, separators=(",", ":")))
                    status_fh.write("fail")
                    trim_fh.write("fail")
                    rep_fh.write("{}, {}\\n".format(sample_id, ",".join(f_cat)))
                    trep_fh.write("{},fail,fail\\n".format(sample_id))

                    return

            logger.info("Sample passed quality control checks")

        status_fh.write("pass")
        rep_fh.write("{}, pass\\n".format(sample_id))

        logger.info("Assessing optimal trim range for sample")
        # Get optimal trimming range for sample, based on the per base sequence
        # content
        optimal_trim = get_sample_trim(result_p1[0], result_p2[0])
        logger.info("Optimal trim range set to: {}".format(optimal_trim))
        trim_fh.write("{}".format(" ".join([str(x) for x in optimal_trim])))

        trep_fh.write("{},{},{}\\n".format(sample_id, optimal_trim[0],
                                           optimal_trim[1]))

        # The json dict report is only populated when the FastQC quality
        # checks are performed, that is, when the --ignore-tests option
        # is not provided
        if json_dic:
            report_fh.write(json.dumps(json_dic, separators=(",", ":")))


# Script entry point: build the report with the module-level inputs
if __name__ == '__main__':

    main(SAMPLE_ID, RESULT_P1, RESULT_P2, OPTS)


================================================
FILE: flowcraft/templates/flowcraft_utils/__init__.py
================================================


================================================
FILE: flowcraft/templates/flowcraft_utils/flowcraft_base.py
================================================
"""

"""

import os
import sys
import json
import logging
import traceback

from time import gmtime, strftime


def get_logger(filepath, level=logging.DEBUG):
    """Returns a console logger named after the base name of a file path.

    The logger and its console handler are both set to ``level`` and the
    records are formatted as ``<time> - <level name> - <message>``.

    Calling this function more than once with paths that have the same
    base name returns the same logger. In that case the console handler
    that is already attached is reused, so that each record is only
    emitted once.

    Parameters
    ----------
    filepath : str
        Path whose base name is used as the name of the logger.
    level : int
        Logging level of the logger and of its console handler.

    Returns
    -------
    logger : logging.Logger
        The configured logger.
    """

    # create logger
    logger = logging.getLogger(os.path.basename(filepath))
    logger.setLevel(level)

    # Look for a console handler attached by a previous call. The exact
    # type is checked on purpose, since handlers like FileHandler are
    # subclasses of StreamHandler
    ch = next((h for h in logger.handlers
               if type(h) is logging.StreamHandler), None)

    if ch is None:
        # create console handler and add it to the logger
        ch = logging.StreamHandler()
        logger.addHandler(ch)

    # set the level and the formatter of the console handler
    ch.setLevel(level)
    ch.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

    return logger


def log_error():
    """Flags an unexpected failure by writing ``error`` to ``.status``.

    The ``.status`` file is created in the current working directory and
    any previous content is replaced.
    """

    status_file = open(".status", "w")
    try:
        status_file.write("error")
    finally:
        status_file.close()


class MainWrapper:
    """Callable wrapper meant to decorate the ``main`` function of a template.

    On every call it writes the ``.versions`` file (see
    :py:meth:`build_versions`) and runs the wrapped function. If anything
    other than ``SystemExit`` is raised, the traceback is logged and the
    failure is recorded through :py:func:`log_error` instead of being
    propagated.

    Parameters
    ----------
    f : function
        Function to wrap. Its module globals are inspected for an optional
        ``logger`` object and for the version metadata described in
        :py:meth:`build_versions`.
    """

    def __init__(self, f):

        self.f = f
        # Global namespace of the module where the wrapped function lives
        self.context = self.f.__globals__
        # Module-level ``logger`` of that module, or None when it has none
        self.logger = self.context.get("logger", None)

    def _debug(self, msg):
        """Emit a debug record only when the wrapped module has a logger."""

        if self.logger:
            self.logger.debug(msg)

    def __call__(self, *args, **kwargs):
        """Run the wrapped function with the provided arguments.

        ``SystemExit`` is propagated unchanged. Any other exception is
        logged (when a logger is available) and flagged via
        :py:func:`log_error`; it is not re-raised.
        """

        self._debug("Starting template at {}".format(
            strftime("%Y-%m-%d %H:%M:%S", gmtime())))
        self._debug("Working directory: {}".format(os.getcwd()))

        try:
            self.build_versions()
            self.f(*args, **kwargs)
        except SystemExit:
            # Re-raise the original exception so that the requested exit
            # code is preserved. Wrapping it in a new ``sys.exit`` call
            # would turn a clean ``sys.exit(0)`` into a failing exit status.
            raise
        except BaseException:
            # Deliberately broad: any other failure of the template must be
            # recorded in the status file instead of crashing the wrapper.
            if self.logger:
                self.logger.error("Module exited unexpectedly with error:"
                                  "\\n{}".format(traceback.format_exc()))
            log_error()

        self._debug("Finished template at {}".format(
            strftime("%Y-%m-%d %H:%M:%S", gmtime())))

    def build_versions(self):
        """Writes versions JSON for a template file

        This method creates the JSON file ``.versions`` based on the metadata
        and specific functions that are present in a given template script.

        It starts by fetching the template metadata, which can be specified
        via the ``__version__``, ``__template__`` and ``__build__``
        attributes. If all of these attributes exist, it starts to populate
        a JSON/dict array (Note that the absence of any one of them will
        prevent the template version from being added).

        Then, it will search the template scope for objects whose name
        starts with the substring ``__get_version`` (For example
        ``def __get_version_fastqc()``). These functions are called without
        arguments and should gather the version of an arbitrary program and
        return a JSON/dict object with the following information::

            {
                "program": <program_name>,
                "version": <version>
                "build": <build>
            }

        Every gathered JSON/dict object is then written in the ``.versions``
        file as a single JSON list.
        """

        version_storage = []

        template_version = self.context.get("__version__", None)
        template_program = self.context.get("__template__", None)
        template_build = self.context.get("__build__", None)

        if template_version and template_program and template_build:
            if self.logger:
                self.logger.debug("Adding template version: {}; {}; "
                                  "{}".format(template_program,
                                              template_version,
                                              template_build))
            version_storage.append({
                "program": template_program,
                "version": template_version,
                "build": template_build
            })

        # Iterate over a snapshot of the namespace: the version functions
        # are arbitrary code and may add globals while the scan is running.
        for var, obj in list(self.context.items()):
            if var.startswith("__get_version"):
                ver = obj()
                version_storage.append(ver)
                if self.logger:
                    self.logger.debug("Found additional software version"
                                      "{}".format(ver))

        with open(".versions", "w") as fh:
            fh.write(json.dumps(version_storage, separators=(",", ":")))


================================================
FILE: flowcraft/templates/integrity_coverage.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module receives paired FastQ files, a genome size estimate and a minimum
coverage threshold and has three purposes while iterating over the FastQ files:

    - Checks the integrity of FastQ files (corrupted files).
    - Guesses the encoding of FastQ files (this can be turned off in the \
    ``opts`` argument).
    - Estimates the coverage for each sample.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : *Sample Identification string*
    - e.g.: ``'SampleA'``

- ``fastq_pair`` : *Pair of FastQ file paths*
    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``

- ``gsize`` : *Expected genome size*
    - e.g.: ``'2.5'``

- ``cov`` : *Minimum coverage threshold*
    - e.g.: ``'15'``

- ``opts`` : *Specify additional arguments for executing integrity_coverage. \
    The arguments should be a string of command line arguments, such as \
    '-e'. The accepted arguments are:*
    - ``'-e'`` : Skip encoding guess.

Generated output
----------------

The generated output are output files that contain an object, usually a string.
(Values within ``${}`` are substituted by the corresponding variable.)

- ``${sample_id}_encoding`` : Stores the encoding for the sample FastQ. If no \
    encoding could be guessed, write 'None' to file.
    - e.g.: ``'Illumina-1.8'`` or ``'None'``

- ``${sample_id}_phred`` : Stores the phred value for the sample FastQ. If no \
    phred could be guessed, write 'None' to file.
    - ``'33'`` or ``'None'``

- ``${sample_id}_coverage`` : Stores the expected coverage of the samples, \
    based on a given genome size.
    - ``'112'`` or ``'fail'``

- ``${sample_id}_report`` : Stores the report on the expected coverage \
    estimation. The string written to this file will appear in the \
    coverage report.
    - ``'${sample_id}, 112, PASS'``

- ``${sample_id}_max_len`` : Stores the maximum read length for the current \
    sample.
    - ``'152'``

Notes
-----

In case of a corrupted sample, all expected output files should have
``'corrupt'`` written.


Code documentation
------------------

"""

__version__ = "1.0.1"
__build__ = "03082018"
__template__ = "integrity_coverage-nf"

import os
import bz2
import gzip
import json
import zipfile

from itertools import chain

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)

# Set constants when running from Nextflow
# (detected by the script being executed under the name ".command.sh")
if __file__.endswith(".command.sh"):
    # CONSTANTS
    # NOTE(review): the quoted placeholders below are presumably replaced by
    # Nextflow with the process inputs before the script runs - confirm.
    # Whitespace separated FastQ paths, split into a list
    FASTQ_PAIR = '$fastq_pair'.split()
    SAMPLE_ID = '$sample_id'
    # Genome size and coverage threshold are converted to float up front
    GSIZE = float('$gsize')
    MINIMUM_COVERAGE = float('$cov')
    # Whitespace separated runtime options, split into a list
    OPTS = '$opts'.split()

    # Log the effective parameters of this run
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
    logger.debug("GSIZE: {}".format(GSIZE))
    logger.debug("MINIMUM_COVERAGE: {}".format(MINIMUM_COVERAGE))
    logger.debug("OPTS: {}".format(OPTS))

# Encoding name -> [phred offset, (lowest, highest) quality character code].
# Consumed by get_encodings_in_range to list the encodings compatible with
# an observed range of quality character codes.
RANGES = {
    'Sanger': [33, (33, 73)],
    'Illumina-1.8': [33, (33, 74)],
    'Solexa': [64, (59, 104)],
    'Illumina-1.3': [64, (64, 104)],
    'Illumina-1.5': [64, (66, 105)]
}
"""
dict: Dictionary containing the encoding values for several fastq formats. The
key contains the format and the value contains a list with the corresponding
phred score and a list with the range of encodings.
"""

# Compression type (as returned by guess_file_compression) -> callable that
# main uses to open a file of that type.
COPEN = {
    "gz": gzip.open,
    "bz2": bz2.open,
    "zip": zipfile.ZipFile
}

# Leading byte signature -> compression type.
# NOTE(review): the backslashes in the keys are doubled, presumably so that
# the real byte escapes are obtained once Nextflow has rendered this
# template - confirm before changing them.
MAGIC_DICT = {
    b"\\x1f\\x8b\\x08": "gz",
    b"\\x42\\x5a\\x68": "bz2",
    b"\\x50\\x4b\\x03\\x04": "zip"
}
"""
dict: Dictionary containing the binary signatures for three compression formats
(gzip, bzip2 and zip).
"""


def guess_file_compression(file_path, magic_dict=None):
    """Guesses the compression of an input file.

    The first bytes of the file are compared against a set of known binary
    signatures. The signatures default to the ones stored in
    :py:data:`MAGIC_DICT` (gzip, bzip2 and zip). The compression format of
    the first signature found at the very start of the file is returned.

    Parameters
    ----------
    file_path : str
        Path to input file.
    magic_dict : dict, optional
        Mapping of binary signature (key) to compression format (value).
        When ``None`` or empty, :py:data:`MAGIC_DICT` is used instead.

    Returns
    -------
    file_type : str or None
        Compression format of the matching signature, or ``None`` when no
        signature matches.
    """

    signatures = magic_dict if magic_dict else MAGIC_DICT

    # Reading as many bytes as the longest signature is enough to test all
    longest = max(map(len, signatures))

    with open(file_path, "rb") as handle:
        header = handle.read(longest)

    logger.debug("Binary signature start: {}".format(header))

    return next(
        (fmt for sig, fmt in signatures.items() if header.startswith(sig)),
        None)


def get_qual_range(qual_str):
    """ Get the lowest and highest Unicode code of the characters in a string.

    Each character is converted with the :py:func:`ord` built-in.

    Parameters
    ----------
    qual_str : str
        Arbitrary, non-empty string.

    Returns
    -------
    x : tuple
        (Minimum Unicode code, Maximum Unicode code).
    """

    codes = list(map(ord, qual_str))
    lowest = min(codes)
    highest = max(codes)

    return lowest, highest


def get_encodings_in_range(rmin, rmax):
    """ Returns the valid encodings for a given encoding range.

    Every entry of the :py:data:`RANGES` dictionary maps an encoding name
    to its phred score and to the range of codes it allows. An encoding is
    considered valid when the provided range lies entirely within the range
    of that encoding.

    Parameters
    ----------
    rmin : int
        Minimum Unicode code in range.
    rmax : int
        Maximum Unicode code in range.

    Returns
    -------
    valid_encodings : list
        List of all possible encodings for the provided range.
    valid_phred : list
        List with the phred score of each encoding in ``valid_encodings``,
        in the same order.

    """

    # Keep each (encoding, phred) pair whose range contains [rmin, rmax]
    compatible = [
        (name, offset)
        for name, (offset, (lowest, highest)) in RANGES.items()
        if lowest <= rmin and rmax <= highest
    ]

    valid_encodings = [name for name, _ in compatible]
    valid_phred = [offset for _, offset in compatible]

    return valid_encodings, valid_phred


@MainWrapper
def main(sample_id, fastq_pair, gsize, minimum_coverage, opts):
    """ Main executor of the integrity_coverage template.

    Parses the FastQ files once and, from that single pass, guesses the
    quality encoding (unless skipped), estimates the coverage and records
    the maximum read length. The results are written to the
    ``<sample_id>_encoding``, ``<sample_id>_phred``,
    ``<sample_id>_coverage``, ``<sample_id>_report`` and
    ``<sample_id>_max_len`` files, along with ``.report.json``, ``.status``
    and ``.fail``. If the parsing stops with an ``EOFError``, the five
    sample files get the string ``corrupt`` and the status is set to
    ``fail``.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    fastq_pair : list
        Two element list containing the paired FastQ files.
    gsize : float or int
        Estimate of genome size in Mb.
    minimum_coverage : float or int
        Minimum coverage required for a sample to pass the coverage check
    opts : list
        List of arbitrary options. See `Expected input`_.

    """

    logger.info("Starting integrity coverage main")

    # Check for runtime options
    skip_encoding = "-e" in opts

    # Information for encoding guess
    gmin, gmax = 99, 0
    encoding = []
    phred = None

    # Information for coverage estimation
    chars = 0
    nreads = 0

    # Information on maximum read length
    max_read_length = 0

    # Get compression of each FastQ pair file
    file_objects = []
    for fastq in fastq_pair:

        logger.info("Processing file {}".format(fastq))

        logger.info("[{}] Guessing file compression".format(fastq))
        ftype = guess_file_compression(fastq)

        # This can guess the compression of gz, bz2 and zip. If it cannot
        # find the compression type, it tries to open a regular file
        # NOTE(review): zipfile.ZipFile is documented to accept only the
        # modes r, w, x and a, so the zip branch looks unusable with "rt" -
        # confirm and handle zip archives explicitly if they are expected.
        if ftype:
            logger.info("[{}] Found file compression: {}".format(
                fastq, ftype))
            file_objects.append(COPEN[ftype](fastq, "rt"))
        else:
            logger.info("[{}] File compression not found. Assuming an "
                        "uncompressed file".format(fastq))
            file_objects.append(open(fastq))

    logger.info("Starting FastQ file parsing")

    # The '*_encoding' file stores a string with the encoding ('Sanger')
    # If no encoding is guessed, 'None' should be stored
    # The '*_phred' file stores a string with the phred score ('33')
    # If no phred is guessed, 'None' should be stored
    # The '*_coverage' file stores the estimated coverage ('88')
    # The '*_report' file stores a csv report of the file
    # The '*_max_len' file stores a string with the maximum read len ('155')
    with open("{}_encoding".format(sample_id), "w") as enc_fh, \
            open("{}_phred".format(sample_id), "w") as phred_fh, \
            open("{}_coverage".format(sample_id), "w") as cov_fh, \
            open("{}_report".format(sample_id), "w") as cov_rep, \
            open("{}_max_len".format(sample_id), "w") as len_fh, \
            open(".report.json", "w") as json_report, \
            open(".status", "w") as status_fh, \
            open(".fail", "w") as fail_fh:

        try:
            # Iterate over both pair files sequentially using itertools.chain
            for i, line in enumerate(chain(*file_objects)):

                # Parse only every 4th line of the file for the encoding
                # e.g.: AAAA/EEEEEEEEEEE<EEEEEEEEEEEEEEEEEEEEEEEEE (...)
                if (i + 1) % 4 == 0 and not skip_encoding:
                    # It is important to strip() the line so that any newline
                    # character is removed and not accounted for in the
                    # encoding guess
                    lmin, lmax = get_qual_range(line.strip())

                    # Guess new encoding if the range expands the previously
                    # set boundaries of gmin and gmax
                    if lmin < gmin or lmax > gmax:
                        gmin, gmax = min(lmin, gmin), max(lmax, gmax)
                        encoding, phred = get_encodings_in_range(gmin, gmax)
                        logger.debug(
                            "Updating estimates at line {} with range {} to"
                            " '{}' (encoding) and '{}' (phred)".format(
                                i, [lmin, lmax], encoding, phred))

                # Parse only the 2nd line of every 4-line record (the
                # sequence) for the coverage
                # e.g.: GGATAATCTACCTTGACGATTTGTACTGGCGTTGGTTTCTTA (...)
                if (i + 3) % 4 == 0:
                    read_len = len(line.strip())
                    chars += read_len
                    nreads += 1

                    # Evaluate maximum read length for sample
                    if read_len > max_read_length:
                        logger.debug("Updating maximum read length at line "
                                     "{} to {}".format(i, read_len))
                        max_read_length = read_len

            # End of FastQ parsing
            logger.info("Finished FastQ file parsing")

            # Estimated coverage: sequenced bases over the genome size in bp
            exp_coverage = round(chars / (gsize * 1e6), 2)

            # Set json report
            if not skip_encoding:

                json_dic = {
                    "tableRow": [{
                        "sample": sample_id,
                        "data": [
                            {"header": "Raw BP",
                             "value": chars,
                             "table": "qc",
                             "columnBar": True},
                            {"header": "Reads",
                             "value": nreads,
                             "table": "qc",
                             "columnBar": True},
                            {"header": "Coverage",
                             "value": exp_coverage,
                             "table": "qc",
                             "columnBar": True,
                             "failThreshold": minimum_coverage
                             }
                        ]
                    }],
                    "plotData": [{
                        "sample": sample_id,
                        "data": {
                            "sparkline": chars
                        }
                    }],
                }
            else:
                json_dic = {
                    "tableRow": [{
                        "sample": sample_id,
                        "data": [
                            {"header": "Coverage",
                             "value": exp_coverage,
                             "table": "qc",
                             "columnBar": True,
                             "failThreshold": minimum_coverage
                             }
                        ],
                    }],
                }

            # Get encoding
            if len(encoding) > 0:
                encoding = set(encoding)
                phred = set(phred)
                # Get encoding and phred as strings
                # e.g. enc: Sanger, Illumina-1.8
                # e.g. phred: 64
                enc = "{}".format(",".join([x for x in encoding]))
                phred = "{}".format(",".join(str(x) for x in phred))
                logger.info("Encoding set to {}".format(enc))
                # Report the phred value here, not the encoding again
                logger.info("Phred set to {}".format(phred))

                enc_fh.write(enc)
                phred_fh.write(phred)
            # Encoding not found
            else:
                if not skip_encoding:
                    encoding_msg = "Could not guess encoding and phred from " \
                                   "FastQ"
                    logger.warning(encoding_msg)
                    json_dic["warnings"] = [{
                        "sample": sample_id,
                        "table": "qc",
                        "value": [encoding_msg]
                    }]
                    enc_fh.write("None")
                    phred_fh.write("None")

            # Estimate coverage
            logger.info("Estimating coverage based on a genome size of "
                        "{}".format(gsize))
            logger.info("Expected coverage is {}".format(exp_coverage))

            if exp_coverage >= minimum_coverage:
                cov_rep.write("{},{},{}\\n".format(
                    sample_id, str(exp_coverage), "PASS"))
                cov_fh.write(str(exp_coverage))
                status_fh.write("pass")
            # Estimated coverage does not pass minimum threshold
            else:
                fail_msg = "Sample with low coverage ({}), below the {} " \
                           "threshold".format(exp_coverage, minimum_coverage)
                logger.error(fail_msg)
                fail_fh.write(fail_msg)
                cov_fh.write("fail")
                status_fh.write("fail")
                cov_rep.write("{},{},{}\\n".format(
                    sample_id, str(exp_coverage), "FAIL"))
                json_dic["fail"] = [{
                    "sample": sample_id,
                    "table": "qc",
                    "value": [fail_msg]
                }]

            json_report.write(json.dumps(json_dic, separators=(",", ":")))
            # Maximum read length
            len_fh.write("{}".format(max_read_length))

        # This exception is raised when the input FastQ files are corrupted
        except EOFError:
            logger.error("The FastQ files could not be correctly "
                         "parsed. They may be corrupt")
            for fh in [enc_fh, phred_fh, cov_fh, cov_rep, len_fh]:
                fh.write("corrupt")
            # The status and the fail message are written a single time,
            # outside of the loop over the sample output files
            status_fh.write("fail")
            fail_fh.write("Could not read/parse FastQ. "
                          "Possibly corrupt file")

        finally:
            # Release the input handles whatever the outcome of the parsing
            for fobj in file_objects:
                fobj.close()


if __name__ == "__main__":

    main(SAMPLE_ID, FASTQ_PAIR, GSIZE, MINIMUM_COVERAGE, OPTS)


================================================
FILE: flowcraft/templates/mapping2json.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to generate a json output for mapping results that
can be imported in pATLAS.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``depth_file`` : String with the name of the samtools depth output file.
    - e.g.: ``'samtoolsDepthOutput_sampleA.txt'``
- ``json_dict`` : the file that contains the dictionary with keys and values for
        accessions and their respective lengths.
    - e.g.: ``'reads_sample_result_length.json'``
- ``cutoff`` : The cutoff used to trim the unwanted matches for the minimum
        coverage results from mapping. This value may range between 0 and 1.
    - e.g.: ``0.6``


Code documentation
------------------

"""

__version__ = "1.1.0"
__build__ = "04072018"
__template__ = "mapping2json-nf"

import os
import json
import sys
from pympler.asizeof import asizeof

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)

if __file__.endswith(".command.sh"):
    DEPTH_TXT = '$depthFile'
    JSON_LENGTH = '$lengthJson'
    CUTOFF = '$cov_cutoff'
    SAMPLE_ID = '$sample_id'
else:
    DEPTH_TXT = sys.argv[1]
    JSON_LENGTH = sys.argv[2]
    CUTOFF = sys.argv[3]
    SAMPLE_ID = sys.argv[4]

logger.debug("List of arguments given: {}".format([
    DEPTH_TXT,
    JSON_LENGTH,
    CUTOFF,
    SAMPLE_ID
]))

# check if all variables are assigned
if DEPTH_TXT and JSON_LENGTH and SAMPLE_ID and CUTOFF:
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("DEPTH_TXT: {}".format(DEPTH_TXT))
    logger.debug("JSON_LENGHT: {}".format(JSON_LENGTH))
    logger.debug("CUTOFF: {}".format(CUTOFF))
else:
    logger.error("Args should be given to this template, either from sys.argv"
                 " or through nextflow variables")


def depth_file_reader(depth_file):
    """
    Parses an open samtools depth file into a nested dictionary with the
    depth recorded for each position of each reference. The file handle is
    closed once all lines have been read.

    Parameters
    ----------
    depth_file: textIO
        open handle of the depth file of a sample. Each line is expected to
        hold, separated by white space, the reference name, the position
        and the depth.

    Returns
    -------
    depth_dic_coverage: dict
            dictionary with the coverage per position for each plasmid. The
            reference is the key and the value is a dictionary that maps
            each position (str) to its depth (float)
    """

    coverage_by_ref = {}

    for record in depth_file:
        fields = record.split()  # any white space works as separator
        # only the first three underscore separated parts of the name are
        # used to identify the reference
        ref_name = "_".join(fields[0].strip().split("_")[:3])
        pos = fields[1]
        depth = float(fields[2].rstrip())

        coverage_by_ref.setdefault(ref_name, {})[pos] = depth

    logger.info("Finished parsing depth file.")
    depth_file.close()

    size_kb = asizeof(coverage_by_ref)/1024
    logger.debug("Size of dict_cov: {} kb".format(size_kb))

    return coverage_by_ref


def generate_jsons(depth_dic_coverage, plasmid_length, cutoff):
    """
    Computes, for each reference, the fraction of positions that have a
    depth record and, for the references that reach the cutoff, the mean
    coverage of consecutive windows of positions.

    Parameters
    ----------
    depth_dic_coverage: dict
         dictionary with the coverage per position for each plasmid
    plasmid_length: dict
        dictionary with the length of each reference, using the same keys
        as depth_dic_coverage
    cutoff: float
        minimum fraction of positions with a depth record that a reference
        needs in order to be reported

    Returns
    -------
    percentage_bases_covered: dict
        fraction of covered positions (rounded to two decimal places) for
        each reference that reaches the cutoff
    dict_cov:  dict
        for each reported reference, a dictionary with its ``length``, the
        window size (``interval``, about 1% of the length and never below
        1) and the list of mean coverages per window (``values``)

    """

    # initializes the dictionary with the fraction of covered bases per
    # plasmid
    percentage_bases_covered = {}
    # dict to store coverage results for a given interval of points
    dict_cov = {}

    for ref in depth_dic_coverage:
        ref_positions = depth_dic_coverage[ref]
        # calculates the percentage value per each reference
        perc_value_per_ref = float(len(ref_positions)) / \
            float(plasmid_length[ref])
        # references below the cutoff are not reported at all
        if perc_value_per_ref < cutoff:
            continue

        percentage_bases_covered[ref] = round(perc_value_per_ref, 2)

        ref_length = int(plasmid_length[ref])
        # window size is 1% of the reference; sequences smaller than 100 bp
        # would give a window of 0, so 1 is used as the lower bound
        interval = max(int(round(ref_length * 0.01, ndigits=0)), 1)

        logger.info("Generating plot data for plasmid: {}".format(ref))

        values = []
        # coverage of the positions of the window being filled
        window = []
        # NOTE(review): positions are looked up with 0-based keys, while
        # samtools depth presumably reports 1-based positions - confirm
        # whether the range should start at 1.
        for i in range(ref_length):
            # positions without a depth record count as 0 coverage
            window.append(int(ref_positions.get(str(i), 0)))

            # a full window outputs its own mean and a new window starts.
            # Keeping the previous positions would turn every value into a
            # running mean from the start of the reference, and counting
            # one extra position per window would not match the reported
            # interval.
            if len(window) == interval:
                values.append(int(sum(window) / len(window)))
                window = []

        dict_cov[ref] = {
            "length": ref_length,
            "interval": interval,
            "values": values
        }

    logger.info("Successfully generated dicts necessary for output json file "
                "and .report.json depth file.")
    logger.debug("Size of percentage_bases_covered: {} kb".format(
        asizeof(percentage_bases_covered)/1024))
    logger.debug("Size of dict_cov: {} kb".format(asizeof(dict_cov)/1024))
    return percentage_bases_covered, dict_cov


@MainWrapper
def main(depth_file, json_dict, cutoff, sample_id):
    """
    Function that handles the inputs required to parse depth files from bowtie
    and dumps a dict to a json file that can be imported into pATLAS.

    Two files are written: ``<depth_file>_mapping.json``, with the fraction
    of covered bases per reference, and ``.report.json``, with the report
    entry of the sample. The function exits with status 1 when the cutoff
    cannot be converted to a float or when the lengths dictionary is empty.

    Parameters
    ----------
    depth_file: str
         the path to depth file for each sample
    json_dict: str
        the file that contains the dictionary with keys and values for
        accessions
        and their respective lengths
    cutoff: str
        the cutoff used to trim the unwanted matches for the minimum coverage
        results from mapping. This value may range between 0 and 1.
    sample_id: str
        the id of the sample being parsed

    """

    # check for the appropriate value for the cutoff value for coverage results
    logger.debug("Cutoff value: {}. Type: {}".format(cutoff, type(cutoff)))
    try:
        cutoff_val = float(cutoff)
        if cutoff_val < 0.4:
            logger.warning("This cutoff value will generate a high volume of "
                           "plot data. Therefore '.report.json' can be too big")
    except ValueError:
        logger.error("Cutoff value should be a string such as: '0.6'. "
                     "The outputted value: {}. Make sure to provide an "
                     "appropriate value for --cov_cutoff".format(cutoff))
        sys.exit(1)

    # loads dict from file, this file is provided in docker image
    # (the context manager makes sure the handle is released after loading)
    with open(json_dict) as length_fh:
        plasmid_length = json.load(length_fh)

    if plasmid_length:
        logger.info("Loaded dictionary of plasmid lengths")
    else:
        logger.error("Something went wrong and plasmid lengths dictionary "
                     "could not be loaded. Check if process received this "
                     "param successfully.")
        sys.exit(1)

    # read depth file (the handle is closed by depth_file_reader)
    depth_file_in = open(depth_file)

    # first reads the depth file and generates dictionaries to handle the input
    # to a simpler format
    logger.info("Reading depth file and creating dictionary to dump.")
    depth_dic_coverage = depth_file_reader(depth_file_in)
    percentage_bases_covered, dict_cov = generate_jsons(depth_dic_coverage,
                                                        plasmid_length,
                                                        cutoff_val)

    if percentage_bases_covered and dict_cov:
        logger.info("percentage_bases_covered length: {}".format(
            str(len(percentage_bases_covered))))
        logger.info("dict_cov length: {}".format(str(len(dict_cov))))
    else:
        logger.error("Both dicts that dump to JSON file or .report.json are "
                     "empty.")

    # then dump to file
    logger.info("Dumping to {}".format("{}_mapping.json".format(depth_file)))
    with open("{}_mapping.json".format(depth_file), "w") as output_json:
        output_json.write(json.dumps(percentage_bases_covered))

    json_dic = {
        "tableRow": [{
            "sample": sample_id,
            "data": [{
                "header": "Mapping",
                "table": "plasmids",
                "patlas_mapping": percentage_bases_covered,
                "value": len(percentage_bases_covered)
            }]
        }],
        "sample": sample_id,
        "patlas_mapping": percentage_bases_covered,
        "plotData": [{
            "sample": sample_id,
            "data": {
                "patlasMappingSliding": dict_cov
            },
        }]
    }

    # the measured object is the whole report, hence the label
    logger.debug("Size of json_dic: {} kb".format(asizeof(json_dic)/1024))
    logger.info("Writing to .report.json")
    with open(".report.json", "w") as json_report:
        json_report.write(json.dumps(json_dic, separators=(",", ":")))


if __name__ == "__main__":
    main(DEPTH_TXT, JSON_LENGTH, CUTOFF, SAMPLE_ID)


================================================
FILE: flowcraft/templates/mashdist2json.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to generate a json output for mash dist results that
can be imported in pATLAS.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``mash_output`` : String with the name of the mash dist output file.
    - e.g.: ``'fastaFileA_mashdist.txt'``


Code documentation
------------------

"""

__version__ = "1.4.0"
__build__ = "04092018"
__template__ = "mashsdist2json-nf"

import json
import os

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)

if __file__.endswith(".command.sh"):
    MASH_TXT = '$mashtxt'
    HASH_CUTOFF = '$shared_hashes'
    SAMPLE_ID = '$sample_id'
    ASSEMBLY_IN = '$fasta'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("MASH_TXT: {}".format(MASH_TXT))
    logger.debug("HASH_CUTOFF: {}".format(HASH_CUTOFF))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("ASSEMBLY_IN: {}".format(ASSEMBLY_IN))


def send_to_output(master_dict, mash_output, sample_id, assembly_file):
    """Send dictionary to output json file
    When master_dict is populated, it is dumped to a json file named after
    the part of mash_output that precedes the first dot. The
    ``.report.json`` file is always written, whether or not there are hits.

    Parameters
    ----------
    master_dict: dict
        dictionary with one entry per reported accession. Each value is a
        list whose third element is the matching contig string
    mash_output: str
        the name/path of input file to main function, i.e., the name/path of
        the mash dist output txt file.
    sample_id: str
        The name of the sample being parsed to the .report.json file
    assembly_file: str
        value stored under the ``assemblyFile`` key of the plot data

    Returns
    -------

    """

    # contig string -> list of accessions reported for it
    contig_hits = {}
    number_hits = 0

    # the json file with the hits is only created when there are hits
    if master_dict:
        json_name = "{}.json".format(mash_output.split(".")[0])
        with open(json_name, "w") as hits_fh:
            hits_fh.write(json.dumps(master_dict))

        # invert master_dict so that the contigs become the keys
        for accession, hit in master_dict.items():
            contig_hits.setdefault(hit[2], []).append(accession)

        number_hits = len(master_dict)

    table_row = {
        "sample": sample_id,
        "data": [{
            "header": "Mash Dist",
            "table": "plasmids",
            "patlas_mashdist": master_dict,
            "value": number_hits
        }]
    }
    plot_data = {
        "sample": sample_id,
        "data": {
            "patlasMashDistXrange": contig_hits
        },
        "assemblyFile": assembly_file
    }
    report = {"tableRow": [table_row], "plotData": [plot_data]}

    with open(".report.json", "w") as report_fh:
        report_fh.write(json.dumps(report, separators=(",", ":")))


@MainWrapper
def main(mash_output, hash_cutoff, sample_id, assembly_file):
    """
    Main function that allows to dump a mash dist txt file to a json file

    Each line of the input is expected to be tab separated, with the
    reference in the first column, the query sequence in the second, the
    mash distance in the third and the shared hashes (``shared/total``) in
    the last column.

    Parameters
    ----------
    mash_output: str
        A string with the input file.
    hash_cutoff: str
        the percentage cutoff for the percentage of shared hashes between query
        and plasmid in database that is allowed for the plasmid to be reported
        to the results outputs
    sample_id: str
        The name of the sample.
    assembly_file: str
        Value forwarded unchanged to ``send_to_output`` together with the
        parsed results.
    """

    master_dict = {}

    # the cutoff does not change between lines, so convert it only once
    cutoff = float(hash_cutoff)

    # the context manager guarantees that the input file is closed, even
    # if a malformed line raises an exception while parsing
    with open(mash_output, "r") as input_f:

        for line in input_f:

            # blank lines carry no information and cannot be split in fields
            if not line.strip():
                continue

            tab_split = line.split("\t")
            current_seq = tab_split[1].strip()
            ref_accession = "_".join(tab_split[0].strip().split("_")[0:3])
            mash_dist = tab_split[2].strip()
            hashes_list = tab_split[-1].strip().split("/")

            # creates a percentage of the shared hashes between the sample
            # and the reference
            perc_hashes = float(hashes_list[0]) / float(hashes_list[1])

            # if ref_accession already in dict, i.e., if the same accession
            # number matches more than one contig, keep the previously
            # stored contigs appended to the current one.
            if ref_accession in master_dict:
                current_seq += ", {}".format(master_dict[ref_accession][-1])

            # assures that only the hashes with a given shared percentage
            # are reported to json file
            if perc_hashes > cutoff:

                master_dict[ref_accession] = [
                    round(1 - float(mash_dist), 2),
                    round(perc_hashes, 2),
                    current_seq
                ]

    # write the collected results once every line has been parsed
    send_to_output(master_dict, mash_output, sample_id, assembly_file)


# Script entry point: runs main with the module level parameters.
# NOTE(review): the constants used here are defined outside this section
# of the file - presumably in the template parameter block; confirm.
if __name__ == "__main__":

    main(MASH_TXT, HASH_CUTOFF, SAMPLE_ID, ASSEMBLY_IN)


================================================
FILE: flowcraft/templates/mashscreen2json.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to generate a json output for mash screen results that
can be imported in pATLAS.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``mash_output`` : String with the name of the mash screen output file.
    - e.g.: ``'sortedMashScreenResults_SampleA.txt'``


Code documentation
------------------

"""

__version__ = "1.1.0"
__build__ = "04072018"
__template__ = "mashscreen2json-nf"

from statistics import median
import os
import json

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)

# Parameters are only set when the script runs as a nextflow command file.
# The quoted placeholders are presumably replaced by the template engine
# before execution - verify against the process definition.
if __file__.endswith(".command.sh"):
    MASH_TXT = '$mashtxt'
    SAMPLE_ID = '$sample_id'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("MASH_TXT: {}".format(MASH_TXT))
    # report the sample id itself (the input file name was logged here
    # by mistake)
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))

@MainWrapper
def main(mash_output, sample_id):
    '''
    converts top results from mash screen txt output to json format

    The filtered results are written to a json file named after the input
    file (extension replaced by ``.json``) and to ``.report.json``.

    Parameters
    ----------
    mash_output: str
        this is a string that stores the path to this file, i.e, the name of
        the file
    sample_id: str
        sample name

    '''
    logger.info("Reading file : {}".format(mash_output))

    dic = {}
    median_list = []
    filtered_dic = {}

    logger.info("Generating dictionary and list to pre-process the final json")
    # the context manager assures that the input file is always closed
    with open(mash_output) as read_mash_output:
        for line in read_mash_output:
            # blank lines cannot be split into the expected fields
            if not line.strip():
                continue

            tab_split = line.split("\t")
            identity = tab_split[0]
            # shared_hashes = tab_split[1]
            median_multiplicity = tab_split[2]
            # p_value = tab_split[3]
            # strip protects the key from a trailing line break when the
            # query id happens to be the last field of the line
            query_id = tab_split[4].strip()
            # query-comment should not exist here and it is irrelevant

            # here identity is what in fact interests to report to json but
            # median_multiplicity also is important since it gives an rough
            # estimation of the coverage depth for each plasmid.
            # Plasmids should have higher coverage depth due to their
            # increased copy number in relation to the chromosome.
            dic[query_id] = [identity, median_multiplicity]
            median_list.append(float(median_multiplicity))

    # The cutoff is the median of all median_multiplicity values reported by
    # mash screen. In the case of plasmids, since the database has 9k entries
    # and reads shouldn't have that many sequences it seems ok...
    # NOTE(review): earlier comments mentioned "twice the median", but the
    # comparison has always been made against the median itself - confirm
    # which one is intended.
    if median_list:
        # this statement assures that median_list has indeed any entries
        median_cutoff = median(median_list)
        logger.info("Generating final json to dump to a file")
        for k, v in dic.items():
            multiplicity = float(v[1])
            # only report plasmids above the median coverage depth
            if multiplicity > median_cutoff:
                filtered_dic["_".join(k.split("_")[0:3])] = [
                    round(float(v[0]), 2),
                    # estimated copy number
                    int(multiplicity / median_cutoff)
                ]
        logger.info(
            "Exported dictionary has {} entries".format(len(filtered_dic)))
    else:
        # if no entries were found raise an error
        logger.error("No matches were found using mash screen for the queried reads")

    # Replace only the extension of the input file. Joining the dot
    # separated pieces with a space would mangle names with more than one
    # dot and produce a bare ".json" for names without extension.
    output_name = os.path.splitext(mash_output)[0] + ".json"
    with open(output_name, "w") as output_json:
        output_json.write(json.dumps(filtered_dic))

    json_dic = {
        "tableRow": [{
            "sample": sample_id,
            "data": [{
                "header": "Mash Screen",
                "table": "plasmids",
                "patlas_mashscreen": filtered_dic,
                "value": len(filtered_dic)
            }]
        }],
    }

    with open(".report.json", "w") as json_report:
        json_report.write(json.dumps(json_dic, separators=(",", ":")))


# Script entry point: runs main with the parameters set in the template
# parameter block of this module.
if __name__ == "__main__":

    main(MASH_TXT, SAMPLE_ID)


================================================
FILE: flowcraft/templates/megahit.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to execute megahit on paired-end FastQ files.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : Sample Identification string.
    - e.g.: ``'SampleA'``
- ``fastq_pair`` : Pair of FastQ file paths.
    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``
- ``kmers`` : Setting for megahit kmers. Can be either ``'auto'``, \
    ``'default'`` or a user provided list. All must be odd, in the range 15-255, increment <= 28
    - e.g.: ``'auto'`` or ``'default'`` or ``'55 77 99 113 127'``
- ``clear`` : If 'true', remove the input fastq files at the end of the
    component run, IF THE FILES ARE IN THE WORK DIRECTORY

Generated output
----------------

- ``contigs.fa`` : Main output of megahit with the assembly
    - e.g.: ``contigs.fa``
- ``megahit_status`` :  Stores the status of the megahit run. If it was \
    successfully executed, it stores ``'pass'``. Otherwise, it stores the\
    ``STDERR`` message.
    - e.g.: ``'pass'``

Code documentation
------------------

"""

__version__ = "1.0.1"
__build__ = "26042018"
__template__ = "megahit-nf"

import os
import re
import subprocess

from subprocess import PIPE

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


def is_odd(k_mer):
    """Checks whether an iterable of numbers contains any odd value.

    Parameters
    ----------
    k_mer : iterable
        Iterable of numeric values.

    Returns
    -------
    bool
        True when at least one of the values is odd. False when all values
        are even or when the iterable is empty.
    """

    return any(value % 2 != 0 for value in k_mer)


def __get_version_megahit():
    """Retrieves the version of the megahit executable.

    The last word printed by ``megahit --version`` is used, without its
    first character. Any failure is logged at debug level and results in
    the ``"undefined"`` version.

    Returns
    -------
    dict
        Dictionary with the ``program`` and ``version`` keys.
    """

    # Fallback used whenever the version cannot be determined
    version = "undefined"

    try:
        proc = subprocess.Popen(["megahit", "--version"],
                                stdout=PIPE, stderr=PIPE)
        output, _ = proc.communicate()
        version = output.strip().split()[-1][1:].decode("utf8")
    except Exception as e:
        logger.debug(e)

    return {"program": "megahit", "version": version}


# Parameters are only set when the script runs as a nextflow command file.
# The quoted placeholders are presumably replaced by the template engine
# before execution - verify against the process definition.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    FASTQ_PAIR = '$fastq_pair'.split()
    MAX_LEN = int('$max_len'.strip())
    KMERS = '$kmers'.strip()
    MEM = '$task.memory'
    CLEAR = '$clear'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
    logger.debug("MAX_LEN: {}".format(MAX_LEN))
    logger.debug("KMERS: {}".format(KMERS))
    # the memory setting was the only parameter missing from the log
    logger.debug("MEM: {}".format(MEM))
    logger.debug("CLEAR: {}".format(CLEAR))


def set_kmers(kmer_opt, max_read_len):
    """Returns a kmer list based on the provided kmer option and max read len.

    Manually provided k-mers are converted to integers and validated: all
    values must be odd and within the 15-255 range. When the validation
    fails, an empty list is returned so that megahit uses its own defaults.

    Parameters
    ----------
    kmer_opt : str
        The k-mer option. Can be either ``'auto'``, ``'default'`` or a
        sequence of integers separated by spaces and/or commas, e.g.
        ``'23 45 67'`` or ``'23, 45, 67'``.
    max_read_len : int
        The maximum read length of the current sample.

    Returns
    -------
    kmers : list
        List of k-mer values (integers) that will be provided to megahit.
        Empty when megahit should determine the values itself.

    """

    logger.debug("Kmer option set to: {}".format(kmer_opt))

    # Accept both space and comma separated values for manual k-mers
    manual_kmers = kmer_opt.replace(",", " ").split()

    # Check if kmer option is set to auto
    if kmer_opt == "auto":

        if max_read_len >= 175:
            kmers = [55, 77, 99, 113, 127]
        else:
            kmers = [21, 33, 55, 67, 77]

        logger.debug("Kmer range automatically selected based on max read "
                     "length of {}: {}".format(max_read_len, kmers))

    # Check if manual k-mers were specified
    elif len(manual_kmers) > 1:

        try:
            # The option arrives as text; the checks below need numbers
            kmers = [int(x) for x in manual_kmers]
        except ValueError:
            kmers = []
            logger.debug("Kmer list contains non-integer values: {} "
                         "(will be automatically determined by "
                         "megahit)".format(manual_kmers))
        else:
            # min/max are used so the check does not depend on the order in
            # which the values were provided. Any even value invalidates
            # the list.
            if min(kmers) < 15 or max(kmers) > 255 or \
                    any(k % 2 == 0 for k in kmers):
                kmers = []
                logger.debug("Kmer out of range or with even numbers "
                             "(will be automatically determined by megahit)")
            else:
                logger.debug("Kmer range manually set to: {}".format(kmers))

    else:

        kmers = []
        logger.debug("Kmer range set to empty (will be automatically "
                     "determined by megahit)")

    return kmers


def fix_contig_names(asseembly_path):
    """Replaces the spaces in the assembly contig names with underscores

    Only the header lines (starting with ``>``) are modified. All the
    other lines are copied unchanged to the new file, which is always
    named ``fixed_assembly.fa`` and created in the current directory.

    Parameters
    ----------
    asseembly_path : path to assembly file

    Returns
    -------
    str:
        Path to new assembly file with fixed contig names
    """

    fixed_assembly = "fixed_assembly.fa"

    with open(asseembly_path) as source, open(fixed_assembly, "w") as target:
        target.writelines(
            record.replace(" ", "_") if record.startswith(">") else record
            for record in source
        )

    return fixed_assembly


def clean_up(fastq):
    """
    Removes the temporary fastq files. Symlinks are resolved first, so it
    is the link target that gets removed. A file is only removed when its
    resolved path matches the layout ``<...>/work/<2 chars>/<30 chars>/``.

    Parameters
    ----------
    fastq : list
        List of fastq files.
    """

    work_dir_pattern = ".*/work/.{2}/.{30}/.*"

    # Resolve each path lazily, following symlinks
    for real_path in map(os.path.realpath, fastq):
        logger.debug(
            "Removing temporary fastq file path: {}".format(real_path))
        if re.match(work_dir_pattern, real_path):
            os.remove(real_path)


def _memory_to_bytes(mem):
    """Converts a memory string such as ``'4 GB'`` into a number of bytes.

    Parameters
    ----------
    mem : str
        Memory value followed by an optional unit (B, KB, MB, GB or TB),
        separated by a space. Values without unit are taken as gigabytes.

    Returns
    -------
    int or None
        Number of bytes, or None when the string cannot be interpreted.
    """

    units = {
        "B": 1,
        "KB": 1024,
        "MB": 1024 ** 2,
        "GB": 1024 ** 3,
        "TB": 1024 ** 4
    }

    fields = str(mem).split()
    if not fields or len(fields) > 2:
        return None

    # A bare number keeps the previous interpretation (gigabytes)
    unit = fields[1].upper() if len(fields) == 2 else "GB"

    try:
        return int(float(fields[0]) * units[unit])
    except (KeyError, ValueError):
        return None


@MainWrapper
def main(sample_id, fastq_pair, max_len, kmer, mem, clear):
    """Main executor of the megahit template.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    fastq_pair : list
        Two element list containing the paired FastQ files.
    max_len : int
        Maximum read length. This value is determined in
        :py:class:`templates.integrity_coverage`
    kmer : str
        Can be either ``'auto'``, ``'default'`` or a
        sequence of space separated integers, ``'23, 45, 67'``.
    mem : str
        Memory available for the task, e.g. ``'4 GB'``. When it cannot be
        interpreted, megahit runs with its own default memory setting.
    clear : str
        If ``'true'``, the input fastq files are removed at the end of the
        run, provided that the assembly file was created.

    """

    logger.info("Starting megahit")

    logger.info("Setting megahit kmers")
    kmers = set_kmers(kmer, max_len)
    logger.info("megahit kmers set to: {}".format(kmers))

    cli = [
        "megahit",
        "--num-cpu-threads",
        "$task.cpus"
    ]

    # The memory string may use units other than GB, decimal values, or be
    # undefined. Only provide the option when a valid value was obtained.
    mem_bytes = _memory_to_bytes(mem)
    if mem_bytes:
        cli += [
            "--memory",
            str(mem_bytes)
        ]
    else:
        logger.warning("Could not interpret memory value '{}'. Using "
                       "megahit default memory setting".format(mem))

    cli += [
        "-o",
        "megahit"
    ]

    # Add kmers, if any were specified
    if kmers:
        cli += [
            "--k-list",
            ",".join([str(x) for x in kmers])
        ]

    # Add FastQ files
    cli += [
        "-1",
        fastq_pair[0],
        "-2",
        fastq_pair[1]
    ]

    logger.debug("Running megahit subprocess with command: {}".format(cli))

    p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()

    # Attempt to decode STDERR output from bytes. If unsuccessful, coerce to
    # string
    try:
        stderr = stderr.decode("utf8")
        stdout = stdout.decode("utf8")
    except (UnicodeDecodeError, AttributeError):
        stderr = str(stderr)
        stdout = str(stdout)

    logger.info("Finished megahit subprocess with STDOUT:\\n"
                "======================================\\n{}".format(stdout))
    logger.info("Finished megahit subprocess with STDERR:\\n"
                "======================================\\n{}".format(stderr))
    logger.info("Finished megahit with return code: {}".format(
        p.returncode))

    # Stop right after recording the status when megahit failed, since
    # there is no assembly to process
    with open(".status", "w") as fh:
        if p.returncode != 0:
            fh.write("error")
            return
        else:
            fh.write("pass")

    assembly_path = "megahit/final.contigs.fa"
    fixed_assembly = fix_contig_names(assembly_path)

    # Change the default final.contigs.fa assembly name to a more informative
    #  one
    if "_trim." in fastq_pair[0]:
        sample_id += "_trim"
    # Get megahit version for output name
    info = __get_version_megahit()

    assembly_file = "{}_megahit{}.fasta".format(
        sample_id, info["version"].replace(".", ""))
    os.rename(fixed_assembly, assembly_file)
    logger.info("Setting main assembly file to: {}".format(assembly_file))

    # Remove input fastq files when clear option is specified.
    # Only remove temporary input when the expected output exists.
    if clear == "true" and os.path.exists(assembly_file):
        clean_up(fastq_pair)


# Script entry point: runs main with the parameters set in the template
# parameter block of this module.
if __name__ == '__main__':

    main(SAMPLE_ID, FASTQ_PAIR, MAX_LEN, KMERS, MEM, CLEAR)


================================================
FILE: flowcraft/templates/metaspades.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to execute metaSpades on paired-end FastQ files.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : Sample Identification string.
    - e.g.: ``'SampleA'``
- ``fastq_pair`` : Pair of FastQ file paths.
    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``
- ``kmers`` : Setting for Spades kmers. Can be either ``'auto'``, \
    ``'default'`` or a user provided list.
    - e.g.: ``'auto'`` or ``'default'`` or ``'55 77 99 113 127'``

Generated output
----------------

- ``contigs.fasta`` : Main output of spades with the assembly
    - e.g.: ``contigs.fasta``
- ``spades_status`` :  Stores the status of the spades run. If it was \
    successfully executed, it stores ``'pass'``. Otherwise, it stores the\
    ``STDERR`` message.
    - e.g.: ``'pass'``

Code documentation
------------------

"""

__version__ = "1.0.1"
__build__ = "16012018"
__template__ = "metaspades-nf"

import os
import re
import subprocess

from subprocess import PIPE

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


def __get_version_spades():
    """Retrieves the version of the metaSPAdes executable.

    The last word printed by ``metaspades.py --version`` is used, without
    its first character. Any failure is logged at debug level and results
    in the ``"undefined"`` version.

    Returns
    -------
    dict
        Dictionary with the ``program`` and ``version`` keys.
    """

    # Fallback used whenever the version cannot be determined
    version = "undefined"

    try:
        proc = subprocess.Popen(["metaspades.py", "--version"],
                                stdout=PIPE, stderr=PIPE)
        output, _ = proc.communicate()
        version = output.strip().split()[-1][1:].decode("utf8")
    except Exception as e:
        logger.debug(e)

    return {"program": "metaSPAdes", "version": version}


# Parameters are only set when the script runs as a nextflow command file.
# The quoted placeholders are presumably replaced by the template engine
# before execution - verify against the process definition.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    FASTQ_PAIR = '$fastq_pair'.split()
    MAX_LEN = int('$max_len'.strip())
    KMERS = '$kmers'.strip()
    CLEAR = '$clear'
    # Log every parameter to help debugging the task
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
    logger.debug("MAX_LEN: {}".format(MAX_LEN))
    logger.debug("KMERS: {}".format(KMERS))
    logger.debug("CLEAR: {}".format(CLEAR))


def clean_up(fastq):
    """
    Removes the temporary fastq files. Symlinks are resolved first, so it
    is the link target that gets removed. A file is only removed when its
    resolved path matches the layout ``<...>/work/<2 chars>/<30 chars>/``.

    Parameters
    ----------
    fastq : list
        List of fastq files.
    """

    work_dir_pattern = ".*/work/.{2}/.{30}/.*"

    # Resolve each path lazily, following symlinks
    for real_path in map(os.path.realpath, fastq):
        logger.debug(
            "Removing temporary fastq file path: {}".format(real_path))
        if re.match(work_dir_pattern, real_path):
            os.remove(real_path)


def set_kmers(kmer_opt, max_read_len):
    """Selects the list of k-mers from the k-mer option and max read length.

    Parameters
    ----------
    kmer_opt : str
        The k-mer option. ``'auto'`` selects a predefined list according to
        the maximum read length. A string with more than one whitespace
        separated value is split and returned as is. Any other value
        (e.g. ``'default'``) results in an empty list.
    max_read_len : int
        The maximum read length of the current sample.

    Returns
    -------
    kmers : list
        List of k-mer values that will be provided to Spades. Manually
        provided values are returned as strings.

    """

    logger.debug("Kmer option set to: {}".format(kmer_opt))

    # Automatic selection, based on the read length
    if kmer_opt == "auto":
        auto_kmers = [55, 77, 99, 113, 127] if max_read_len >= 175 \
            else [21, 33, 55, 67, 77]
        logger.debug("Kmer range automatically selected based on max read"
                     "length of {}: {}".format(max_read_len, auto_kmers))
        return auto_kmers

    # Manual selection requires at least two values
    manual_kmers = kmer_opt.split()
    if len(manual_kmers) > 1:
        logger.debug("Kmer range manually set to: {}".format(manual_kmers))
        return manual_kmers

    # Anything else leaves the decision to the assembler
    logger.debug("Kmer range set to empty (will be automatically "
                 "determined by SPAdes")
    return []


@MainWrapper
def main(sample_id, fastq_pair, max_len, kmer, clear):
    """Main executor of the spades template.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    fastq_pair : list
        Two element list containing the paired FastQ files.
    max_len : int
        Maximum read length. This value is determined in
        :py:class:`templates.integrity_coverage`
    kmer : str
        Can be either ``'auto'``, ``'default'`` or a
        sequence of space separated integers, ``'23, 45, 67'``.
    clear : str
        If ``'true'``, the input fastq files are removed at the end of the
        run, provided that the assembly file was created.

    """

    logger.info("Starting spades")

    logger.info("Setting SPAdes kmers")
    kmers = set_kmers(kmer, max_len)
    logger.info("SPAdes kmers set to: {}".format(kmers))

    cli = [
        "metaspades.py",
        "--only-assembler",
        "--threads",
        "$task.cpus",
        "-o",
        "."
    ]

    # Add kmers, if any were specified. The option and its value are
    # separate arguments, since no shell splits the command for us.
    if kmers:
        cli += [
            "-k",
            ",".join([str(x) for x in kmers])
        ]

    # Add FastQ files
    cli += [
        "-1",
        fastq_pair[0],
        "-2",
        fastq_pair[1]
    ]

    logger.debug("Running metaSPAdes subprocess with command: {}".format(cli))

    p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()

    # Attempt to decode STDERR output from bytes. If unsuccessful, coerce to
    # string
    try:
        stderr = stderr.decode("utf8")
        stdout = stdout.decode("utf8")
    except (UnicodeDecodeError, AttributeError):
        stderr = str(stderr)
        stdout = str(stdout)

    logger.info("Finished metaSPAdes subprocess with STDOUT:\\n"
                "======================================\\n{}".format(stdout))
    logger.info("Finished metaSPAdes subprocess with STDERR:\\n"
                "======================================\\n{}".format(stderr))
    logger.info("Finished metaSPAdes with return code: {}".format(
        p.returncode))

    # Stop right after recording the status when metaSPAdes failed, since
    # there is no assembly to rename
    with open(".status", "w") as fh:
        if p.returncode != 0:
            fh.write("error")
            return
        else:
            fh.write("pass")

    # Change the default contigs.fasta assembly name to a more informative one
    if "_trim." in fastq_pair[0]:
        sample_id += "_trim"

    assembly_file = "{}_metaspades.fasta".format(
        sample_id)
    os.rename("contigs.fasta", assembly_file)
    logger.info("Setting main assembly file to: {}".format(assembly_file))

    # Remove input fastq files when clear option is specified.
    # Only remove temporary input when the expected output exists.
    if clear == "true" and os.path.exists(assembly_file):
        clean_up(fastq_pair)


# Script entry point: runs main with the parameters set in the template
# parameter block of this module.
if __name__ == '__main__':

    main(SAMPLE_ID, FASTQ_PAIR, MAX_LEN, KMERS, CLEAR)


================================================
FILE: flowcraft/templates/pATLAS_consensus_json.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to generate a json output from the consensus results from
all the approaches available through options (mapping, assembly, mash screen)

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``mapping_json`` : String with the name of the json file with mapping results.
    - e.g.: ``'mapping_SampleA.json'``
- ``dist_json`` : String with the name of the json file with mash dist results.
    - e.g.: ``'mash_dist_SampleA.json'``
- ``screen_json`` : String with the name of the json file with mash screen results.
    - e.g.: ``'mash_screen_sampleA.json'``


Code documentation
------------------

"""

__version__ = "0.1.0"
__build__ = "24022018"
__template__ = "pATLAS_consensus_json-nf"

import os
import json

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)

# Parameters are only set when the script runs as a nextflow command file.
# The quoted placeholder is presumably replaced by the template engine with
# a whitespace separated list of json files - verify against the process
# definition.
if __file__.endswith(".command.sh"):
    LIST_OF_FILES = '$infile_list'.split()
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("LIST_OF_FILES: {}".format(LIST_OF_FILES))


@MainWrapper
def main(list_of_jsons):
    """
    Generates the consensus of several json files, i.e., the entries
    (accessions) that are present in every provided file.

    For each of those accessions, the values found in each input file are
    stored under the name of the respective file. The result is written to
    a ``consensus_<suffix>.json`` file, where the suffix is taken from the
    name of the first input file, and to ``.report.json``.

    Parameters
    ----------
    list_of_jsons: list
        A list of files provided by fullConsensus process provided by nextflow

    """

    # first lets gather a collection of the input and their corresponding dicts
    file_correspondence = {}

    for infile in list_of_jsons:
        # the context manager assures that each input file is closed
        with open(infile) as json_fh:
            file_correspondence[infile] = json.load(json_fh)

    # Only the accessions of the first file need to be checked, since a
    # consensus accession must be present in all files
    json_dict = {}
    for accession in list(file_correspondence.values())[0]:
        if all(accession in f_dict
               for f_dict in file_correspondence.values()):
            json_dict[accession] = {
                infile: f_dict[accession]
                for infile, f_dict in file_correspondence.items()
            }

    out_name = "consensus_{}.json".format(
        list_of_jsons[0].split(".")[0].split("_")[-1])

    with open(out_name, "w") as out_file:
        out_file.write(json.dumps(json_dict))

    json_dic = {
        "patlas_mashscreen": json_dict
        # TODO add information for report webapp
    }

    with open(".report.json", "w") as json_report:
        json_report.write(json.dumps(json_dic, separators=(",", ":")))


# Script entry point: runs main with the list of files set in the template
# parameter block of this module.
if __name__ == "__main__":
    main(LIST_OF_FILES)

================================================
FILE: flowcraft/templates/pipeline_status.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to collect pipeline run statistics (such as
time, cpu and RAM for each task) into a report JSON.

Expected input
--------------

- ``trace_file`` : *Trace file generated by nextflow*


Code documentation
------------------

"""

__version__ = "1.0.0"
__build__ = "16012018"
__template__ = "pipeline_status-nf"


import os
import json
import traceback

from os.path import join

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


LOG_STATS = ".pipeline_status.json"

# Parameters are only set when the script runs as a nextflow command file.
# NOTE(review): fastq_id and TRACE_FILE are plain string literals rather
# than template placeholders - confirm that this is intended.
if __file__.endswith(".command.sh"):
    fastq_id = 'sample_id'
    TRACE_FILE = 'pipeline_stats.txt'
    WORKDIR = '${workflow.projectDir}'


def get_json_info(fields, header):
    """Pairs the header names with the field values of a single line.

    Parameters
    ----------
    fields : list
        Values of one line.
    header : list
        Names of the columns, used as keys.

    Returns
    -------
    dict
        Dictionary with each header name mapped to the value at the same
        position. Extra elements of the longer list are ignored.

    """

    return dict(zip(header, fields))


def get_previous_stats(stats_path):
    """Loads the pipeline status data stored in a previous execution.

    Parameters
    ----------
    stats_path : str
        Path to the JSON file with the pipeline status data.

    Returns
    -------
    dict
        Content of the JSON file, or an empty dictionary when the file
        does not exist.

    """

    logger.debug("Path to pipeline status data set to: {}".format(stats_path))

    # Without previous data, start from an empty collection
    if not os.path.exists(stats_path):
        logger.debug("No pipeline status data found.")
        return {}

    logger.debug("Existing pipeline status data found. Loading JSON.")
    with open(stats_path) as stats_fh:
        return json.load(stats_fh)


@MainWrapper
def main(sample_id, trace_file, workdir):
    """
    Parses a nextflow trace file, searches for processes with a specific tag
    and sends a JSON report with the relevant information

    The expected fields for the trace file are::

        0. task_id
        1. process
        2. tag
        3. status
        4. exit code
        5. start timestamp
        6. container
        7. cpus
        8. duration
        9. realtime
        10. queue
        11. cpu percentage
        12. memory percentage
        13. real memory size of the process
        14. virtual memory size of the process

    Parameters
    ----------
    sample_id : str
        Identifier used to name the stored stats JSON file
        (``<sample_id>.stats.json``).
    trace_file : str
        Path to the nextflow trace file, relative to ``workdir``
    workdir : str
        Directory that contains the trace file and where the stats JSON
        file is stored.
    """

    # Determine the path of the stored JSON for the sample_id
    stats_suffix = ".stats.json"
    stats_path = join(workdir, sample_id + stats_suffix)
    trace_path = join(workdir, trace_file)

    logger.info("Starting pipeline status routine")

    logger.debug("Checking for previous pipeline status data")
    stats_array = get_previous_stats(stats_path)
    logger.info("Stats JSON object set to : {}".format(stats_array))

    # Search for this substring in the tags field. Only lines with this
    # tag will be processed for the reports
    tag = " getStats"
    logger.debug("Tag variable set to: {}".format(tag))

    logger.info("Starting parsing of trace file: {}".format(trace_path))
    with open(trace_path) as fh:

        # An empty trace file yields an empty header instead of an error
        header = next(fh, "").strip().split()
        logger.debug("Header set to: {}".format(header))

        for line in fh:
            fields = line.strip().split("\t")
            # Check if tag substring is in the tag field of the nextflow
            # trace. Blank or truncated lines do not have the tag and
            # status fields and are ignored as well.
            if len(fields) > 3 and tag in fields[2] and \
                    fields[3] == "COMPLETED":
                logger.debug(
                    "Parsing trace line with COMPLETED status: {}".format(
                        line))
                current_json = get_json_info(fields, header)

                # Entries are indexed by task id, so tasks that were
                # already stored are simply updated
                stats_array[fields[0]] = current_json
            else:
                logger.debug(
                    "Ignoring trace line without COMPLETED status"
                    " or stats specific tag: {}".format(
                        line))

    # The same content is stored for later executions and sent to the report
    stats_dump = json.dumps(stats_array, separators=(",", ":"))
    with open(stats_path, "w") as fh, open(".report.json", "w") as rfh:
        fh.write(stats_dump)
        rfh.write(stats_dump)


# Script entry point: runs main with the values set in the template
# parameter block of this module.
if __name__ == "__main__":

    main(fastq_id, TRACE_FILE, WORKDIR)


================================================
FILE: flowcraft/templates/process_abricate.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to parse the results of Abricate for one or more
samples.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``abricate_files`` : Path to abricate output file.
    - e.g.: ``'abr_resfinder.tsv'``

Generated output
----------------

None


Code documentation
------------------

"""

__version__ = "1.0.1"
__build__ = "26032018"
__template__ = "process_abricate-nf"

import re
import os
import json
import operator
import subprocess

from subprocess import PIPE

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


def __get_version_abricate():
    """Retrieves the version of abricate and the list of its databases.

    The version is the last word printed by ``abricate --version``. The
    databases are the whitespace separated fields of each line printed by
    ``abricate --list``, except for the first line. Each of the two queries
    falls back to ``"undefined"`` when it fails, logging the error at
    debug level.

    Returns
    -------
    dict
        Dictionary with the ``program``, ``version`` and ``databases`` keys.
    """

    def _stdout_of(command):
        # Run the command and return its raw (bytes) standard output
        proc = subprocess.Popen(command, stdout=PIPE, stderr=PIPE)
        captured, _ = proc.communicate()
        return captured

    # Get abricate version
    try:
        version_out = _stdout_of(["abricate", "--version"])
        version = version_out.strip().split()[-1].decode("utf8")
    except Exception as e:
        logger.debug(e)
        version = "undefined"

    # Get abricate database versions
    try:
        listing = _stdout_of(["abricate", "--list"])
        rows = [[field.decode("utf8") for field in row.strip().split()]
                for row in listing.splitlines()]
        # the first line is not a database entry
        databases = rows[1:]
    except Exception as e:
        logger.debug(e)
        databases = "undefined"

    return {
        "program": "abricate",
        "version": version,
        "databases": databases
    }


# Parameters are only set when the script runs as a nextflow command file.
# The quoted placeholder is presumably replaced by the template engine with
# a whitespace separated list of abricate files - verify against the
# process definition.
if __file__.endswith(".command.sh"):
    ABRICATE_FILES = '$abricate_file'.split()
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("ABRICATE_FILE: {}".format(ABRICATE_FILES))


class Abricate:
    """Main parser for Abricate output files.

    This class parses one or more output files from Abricate, usually from
    different databases. In addition to the parsing methods, it also provides
    a flexible method to filter and re-format the content of the abricate
    files.

    Parameters
    ----------
    fls : list
       List of paths to Abricate output files.
    """

    def __init__(self, fls):

        self.storage = {}
        """
        dic: Main storage of Abricate's file content. Each entry corresponds
        to a single line and contains the keys::

            - ``log_file``: Name of the summary log file containing abricate
              results
            - ``infile``: Input file of Abricate.
            - ``reference``: Reference of the query sequence.
            - ``seq_range``: Range of the query sequence in the database
             sequence.
            - ``gene``: AMR gene name.
            - ``accession``: The genomic source of the sequence.
            - ``database``: The database the sequence came from.
            - ``coverage``: Proportion of gene covered.
            - ``identity``: Proportion of exact nucleotide matches.
        """

        self._key = 0
        """
        int: Arbitrary key for unique entries in the storage attribute
        """

        self.parse_files(fls)

    def parse_files(self, fls):
        """Public method for parsing abricate output files.

        This method is called at class instantiation for the provided
        output files. Additional abricate output files can be added using
        this method after the class instantiation.

        Parameters
        ----------
        fls : list
            List of paths to Abricate files. Paths that do not exist are
            skipped with a warning.

        """

        for f in fls:
            # Make sure paths exists
            if os.path.exists(f):
                self._parser(f)
            else:
                logger.warning("File {} does not exist".format(f))

    def _parser(self, fl):
        """Parser for a single abricate output file.

        This parser will scan a single Abricate output file and populate
        the :py:attr:`Abricate.storage` attribute.

        Parameters
        ----------
        fl : str
            Path to abricate output file

        Notes
        -----
        This method will populate the :py:attr:`Abricate.storage` attribute
        with all compliant lines in the abricate output file. Entries are
        inserted using an arbitrary key that is set by the
        :py:attr:`Abricate._key` attribute.

        Coverage and identity values that cannot be converted to float are
        stored as ``None``. The accession is stored as ``None`` when the
        line has no twelfth column.

        """

        with open(fl) as fh:

            for line in fh:
                # Skip header and comment lines
                if line.startswith("#") or line.strip() == "":
                    continue

                fields = line.strip().split("\t")

                try:
                    coverage = float(fields[8])
                except ValueError:
                    coverage = None
                try:
                    identity = float(fields[9])
                except ValueError:
                    identity = None

                # The accession column is optional
                try:
                    accession = fields[11]
                except IndexError:
                    accession = None

                self.storage[self._key] = {
                    "log_file": os.path.basename(fl),
                    "infile": fields[0],
                    "reference": fields[1],
                    "seq_range": (int(fields[2]), int(fields[3])),
                    "gene": fields[4],
                    "accession": accession,
                    "database": fields[10],
                    "coverage": coverage,
                    "identity": identity
                }

                self._key += 1

    @staticmethod
    def _test_truth(x, op, y):
        """ Test the truth of a comparison between x and y using an operator.

        If you want to compare '100 > 200', this method can be called as
        self._test_truth(100, ">", 200).

        Parameters
        ----------
        x : int
            Arbitrary value to compare in the left.
        op : str
            Comparison operator. One of ``>``, ``<``, ``>=``, ``<=``,
            ``==`` or ``!=``.
        y : int
            Arbitrary value to compare in the right.

        Returns
        -------
        x : bool
            The 'truthness' of the test.

        Raises
        ------
        KeyError
            If ``op`` is not one of the supported operators.
        """

        ops = {
            ">": operator.gt,
            "<": operator.lt,
            ">=": operator.ge,
            "<=": operator.le,
            "==": operator.eq,
            "!=": operator.ne
        }

        return ops[op](x, y)

    def iter_filter(self, filters, databases=None, fields=None,
                    filter_behavior="and"):
        """General purpose filter iterator.

        This general filter iterator allows the filtering of entries based
        on one or more custom filters. These filters must contain
        an entry of the `storage` attribute, a comparison operator, and the
        test value. For example, to filter out entries with coverage below 80::

            my_filter = ["coverage", ">=", 80]

        Filters should always be provided as a list of lists::

            iter_filter([["coverage", ">=", 80]])
            # or
            my_filters = [["coverage", ">=", 80],
                          ["identity", ">=", 50]]

            iter_filter(my_filters)

        As a convenience, a list of the desired databases can be directly
        specified using the `databases` argument, which will only report
        entries for the specified databases::

            iter_filter(my_filters, databases=["plasmidfinder"])

        By default, this method will yield the complete entry record. However,
        the returned fields can be specified using the `fields` option::

            iter_filter(my_filters, fields=["reference", "coverage"])

        Entries whose value cannot be compared with the test value (for
        instance a coverage that was stored as ``None`` because it could not
        be parsed) fail the corresponding filter instead of interrupting the
        iteration.

        Parameters
        ----------
        filters : list
            List of lists with the custom filter. Each list should have three
            elements. (1) the key from the entry to be compared; (2) the
            comparison operator; (3) the test value. Example:
                ``[["identity", ">", 80]]``.
        databases : list
            List of databases that should be reported.
        fields : list
            List of fields from each individual entry that are yielded.
        filter_behavior : str
            options: ``'and'`` ``'or'``
            Sets the behaviour of the filters, if multiple filters have been
            provided. By default it is set to ``'and'``, which means that an
            entry has to pass all filters. It can be set to ``'or'``, in which
            case only one of the filters has to pass.

        yields
        ------
        dic : dict
            Dictionary object containing a :py:attr:`Abricate.storage` entry
            that passed the filters.

        Raises
        ------
        ValueError
            If ``filter_behavior`` is neither ``'and'`` nor ``'or'``.

        """

        if filter_behavior not in ["and", "or"]:
            raise ValueError("Filter behavior must be either 'and' or 'or'")

        for dic in self.storage.values():

            # Skip entry if not in one of the specified databases
            if databases and dic["database"] not in databases:
                continue

            # Stores the flags with the test results for each filter
            # The results will be either True or False
            flag = []

            for f in filters:
                # Get value of current filter
                val = dic[f[0]]
                try:
                    flag.append(bool(self._test_truth(val, f[1], f[2])))
                except TypeError:
                    # Values that cannot be ordered against the test value
                    # (e.g. None for an unparsed coverage) fail the filter
                    flag.append(False)

            # Test whether the entry will pass based on the test results
            # and the filter behaviour
            if filter_behavior == "and":
                _pass = all(flag)
            else:
                _pass = any(flag)

            if not _pass:
                continue

            if fields:
                yield dict((x, y) for x, y in dic.items() if x in fields)
            else:
                yield dic

    def get_filter(self, *args, **kwargs):
        """ Wrapper of the iter_filter method that returns a list with results

        It should be called exactly as in the `iter_filter`

        Returns
        -------
        _ : list
            List of dictionary entries that passed the filters in the
            `iter_filter` method.

        See Also
        --------
        iter_filter
        """

        return list(self.iter_filter(*args, **kwargs))


class AbricateReport(Abricate):
    """Report generator built on top of the Abricate parser.

    The parsed entries are grouped by sample and by database and converted
    into the JSON structures (plot data and table rows) that are written
    to the ``.report.json`` file.

    Parameters
    ----------
    fls : list
       List of paths to Abricate output files. All arguments are forwarded
       to the :py:class:`Abricate` constructor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def _get_contig_id(contig_str):
        """Extracts the numeric contig id from a contig string.

        The ``NODE_<id>_`` pattern is tried first and the ``Contig_<id>_``
        pattern afterwards. When both match, the latter takes precedence.
        When none matches, the original string is returned unchanged.

        Parameters
        ----------
        contig_str : str
            Full contig string (fasta header)

        Returns
        -------
        str
            Contig id
        """

        contig_id = contig_str

        for pattern in (".*NODE_([0-9]*)_.*", ".*Contig_([0-9]*)_.*"):
            hit = re.search(pattern, contig_str)
            if hit is not None:
                contig_id = hit.group(1)

        return contig_id

    def get_plot_data(self):
        """ Generates the JSON report to plot the gene boxes

        The entries are grouped by sample (the part of the log file name
        that precedes ``_abr``) and then by database. Each entry is
        reported as::

            {contig: <str>,
             seqRange: [<int>, <int>],
             gene: <str>,
             accession: <str>,
             coverage: <float>,
             identity: <float>
             }

        Single quotes are removed from the gene names. The assembly file
        reported for each sample is the ``infile`` of the first entry found
        for that sample.

        Returns
        -------
        json_dic : dict
            Dictionary with a single ``plotData`` key, holding a list with
            one JSON/dict object per sample.
        """

        per_sample = {}
        assembly_of = {}

        for record in self.storage.values():

            sample_id = re.match("(.*)_abr", record["log_file"]).group(1)
            by_database = per_sample.setdefault(sample_id, {})

            # Get contig ID using the same regex as in `assembly_report.py`
            # template
            contig_id = self._get_contig_id(record["reference"])
            hits = by_database.setdefault(record["database"], [])

            # Only the first assembly file seen for a sample is kept
            assembly_of.setdefault(sample_id, record["infile"])

            hits.append({
                "contig": contig_id,
                "seqRange": record["seq_range"],
                "gene": record["gene"].replace("'", ""),
                "accession": record["accession"],
                "coverage": record["coverage"],
                "identity": record["identity"],
            })

        plot_data = [
            {
                "sample": sample,
                "data": {"abricateXrange": data},
                "assemblyFile": assembly_of[sample]
            }
            for sample, data in per_sample.items()
        ]

        return {"plotData": plot_data}

    def get_table_data(self):
        """ Generates the JSON report with the gene lists for the table

        The gene names (stripped of single and double quotes) are collected
        for each sample and database. For every sample, one table cell is
        created per database with the number of genes and the gene list.

        Returns
        -------
        json_dic : dict
            Dictionary with a single ``tableRow`` key, holding a list with
            one JSON/dict object per sample.
        """

        json_dic = {"tableRow": []}
        genes_by_sample = {}
        logger.info("Generating JSON table data")

        # Collect the gene lists for each sample and database
        for record in self.storage.values():

            logger.debug("Retrieving sample if from: {}".format(
                record["infile"]))
            sample_id = re.match("(.*)_abr", record["log_file"]).group(1)

            by_database = genes_by_sample.setdefault(sample_id, {})
            gene_name = record["gene"].replace("'", "").replace('"', '')
            by_database.setdefault(record["database"], []).append(gene_name)

        # One row per sample, with one cell per database
        for sample, table_data in genes_by_sample.items():

            cells = [
                {
                    "table": "abricate",
                    "header": db,
                    "value": len(gene_list),
                    "geneList": gene_list
                }
                for db, gene_list in table_data.items()
            ]
            json_dic["tableRow"].append({"sample": sample, "data": cells})

        return json_dic

    def write_report_data(self):
        """Writes the plot and table data into the ``.report.json`` file
        """

        report = {}
        report.update(self.get_plot_data())
        report.update(self.get_table_data())

        with open(".report.json", "w") as json_report:
            json_report.write(json.dumps(report, separators=(",", ":")))

if __name__ == '__main__':

    @MainWrapper
    def main(abr_file):
        """Main executor of the process_abricate template.

        Parses the provided Abricate files and writes the JSON report.

        Parameters
        ----------
        abr_file : list
            List of paths to Abricate output files.
        """

        abr = AbricateReport(fls=abr_file)
        abr.write_report_data()

    # ABRICATE_FILES is only defined above when the file name ends with
    # ".command.sh"
    main(ABRICATE_FILES)


================================================
FILE: flowcraft/templates/process_assembly.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to process the output of assemblies from a single
sample from programs such as Spades or Skesa.
The main input is an assembly file produced by an assembler, which will then be
filtered according to user-specified parameters.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id``: Sample Identification string.
    - e.g.: ``'SampleA'``
- ``assembly``: Fasta file with the assembly.
    - e.g.: ``'contigs.fasta'``
- ``opts``: List of options for processing spades assembly.
    1. Minimum contig length.
        - e.g.: ``'150'``
    2. Minimum k-mer coverage.
        - e.g.: ``'2'``
    3. Maximum number of contigs per 1.5Mb.
        - e.g.: ``'100'``
- ``assembler``: The name of the assembler
    - e.g.: ``spades``

Generated output
----------------

(Values within ``${}`` are substituted by the corresponding variable.)

- ``'${sample_id}.assembly.fasta'`` : Fasta file with the filtered assembly.
    - e.g.: ``'Sample1.assembly.fasta'``
- ``${sample_id}.report.fasta`` : CSV file with the results of the filters for\
    each contig.
    - e.g.: ``'Sample1.report.csv'``

Code documentation
------------------

"""

__version__ = "1.0.1"
__build__ = "11042018"
__template__ = "process_assembly-nf"

import os
import json
import operator

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


# The input variables are only set when this file is executed under the name
# ".command.sh" (presumably the name Nextflow gives to the rendered template
# -- TODO confirm). The quoted placeholders are expected to have been
# replaced with the actual values before execution.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    ASSEMBLY_FILE = '$assembly'
    GSIZE = float('$gsize')
    # Turn a bracketed, comma-separated string into a list of stripped strings
    OPTS = [x.strip() for x in '$opts'.strip("[]").split(",")]
    ASSEMBLER = '$assembler'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("GSIZE: {}".format(GSIZE))
    logger.debug("OPTS: {}".format(OPTS))
    logger.debug("ASSEMBLER: {}".format(ASSEMBLER))


class Assembly:
    """Class that parses and filters a Fasta assembly file

    This class parses an assembly fasta file, collects a number
    of summary statistics and metadata from the contigs, filters
    contigs based on user-defined metrics and writes filtered assemblies
    and reports.

    Parameters
    ----------
    assembly_file : str
        Path to assembly file.
    min_contig_len : int
        Minimum contig length when applying the initial assembly filter.
    min_kmer_cov : int
        Minimum k-mer coverage when applying the initial assembly
        filter.
    sample_id : str
        Name of the sample for the current assembly.
    """

    def __init__(self, assembly_file, min_contig_len, min_kmer_cov,
                 sample_id):

        self.contigs = {}
        """
        dict: Dictionary storing data for each contig.
        """

        self.filtered_ids = []
        """
        list: List of filtered contig_ids.
        """

        self.min_gc = 0.05
        """
        float: Sets the minimum GC content on a contig.
        """

        self.sample = sample_id
        """
        str: The name of the sample for the assembly.
        """

        self.report = {}
        """
        dict: Will contain the filtering results for each contig.
        """

        self.filters = [
            ["length", ">=", min_contig_len],
            ["kmer_cov", ">=", min_kmer_cov]
        ]
        """
        list: Setting initial filters to check when parsing the assembly file.
        This can be later changed using the 'filter_contigs' method.
        """

        # Parse assembly and populate self.contigs
        self._parse_assembly(assembly_file)

        # Perform first contig filtering using min_contig_len, min_kmer_cov,
        # and gc content
        self.filter_contigs(*self.filters)

    @staticmethod
    def _parse_coverage(header_str):
        """Attempts to retrieve the coverage value from the header string.

        It splits the header by "_" and then screens the list backwards in
        search of the first float value. This will be interpreted as the
        coverage value. If it cannot find a float value, it returns None.
        This search methodology is based on the strings of assemblers
        like spades and skesa that put the mean kmer coverage for each
        contig in its corresponding fasta header.

        Parameters
        ----------
        header_str : str
            String

        Returns
        -------
        float or None
            The coverage value for the contig. None if it cannot find the
            value in the provided string.
        """

        for token in reversed(header_str.split("_")):
            try:
                return float(token)
            except ValueError:
                continue

        return None

    def _parse_assembly(self, assembly_file):
        """Parse an assembly fasta file.

        This is a Fasta parsing method that populates the
        :py:attr:`~Assembly.contigs` attribute with data for each contig in the
        assembly.

        The insertion of data on the self.contigs is done by the
        :py:meth:`Assembly._populate_contigs` method, which also calculates
        GC content and proportions.

        Headers that are not followed by any sequence line do not produce a
        contig entry. Consequently, an empty assembly file results in an
        empty :py:attr:`~Assembly.contigs` attribute.

        Parameters
        ----------
        assembly_file : str
            Path to the assembly fasta file.

        """

        # Temporary storage of sequence data
        seq_temp = []
        # Id counter for contig that will serve as key in self.contigs
        contig_id = 0
        # Initialize kmer coverage and header
        cov, header = None, None

        with open(assembly_file) as fh:

            logger.debug("Starting iteration of assembly file: {}".format(
                assembly_file))
            for line in fh:
                # Remove whitespace surrounding line and skip empty lines
                line = line.strip()
                if not line:
                    continue

                if line.startswith(">"):
                    # If a sequence has already been populated, save the
                    # previous contig information
                    if seq_temp:
                        # Use join() to convert string list into the full
                        # contig string. This is generally much more efficient
                        # than successively concatenating strings.
                        seq = "".join(seq_temp)

                        logger.debug("Populating contig with contig_id '{}', "
                                     "header '{}' and cov '{}'".format(
                                        contig_id, header, cov))
                        self._populate_contigs(contig_id, header, cov, seq)

                        # Reset temporary sequence storage
                        seq_temp = []
                        contig_id += 1

                    header = line[1:]
                    cov = self._parse_coverage(line)

                else:
                    seq_temp.append(line)

        # Populate last contig entry, but only when there is sequence data
        # for it. This mirrors the handling of the previous contigs and
        # avoids creating an empty contig for empty files or for a trailing
        # header without sequence.
        if seq_temp:
            logger.debug("Populating contig with contig_id '{}', "
                         "header '{}' and cov '{}'".format(
                            contig_id, header, cov))
            seq = "".join(seq_temp)
            self._populate_contigs(contig_id, header, cov, seq)

    def _populate_contigs(self, contig_id, header, cov, sequence):
        """Inserts data from a single contig into the contigs attribute.

        By providing a contig id, the original header, the coverage that
        is parsed from the header and the sequence, this method will
        populate the :py:attr:`~Assembly.contigs` attribute.

        Parameters
        ----------
        contig_id : int
            Arbitrary unique contig identifier.
        header : str
            Original header of the current contig.
        cov : float
            The contig coverage, parsed from the fasta header
        sequence : str
            The complete sequence of the contig.

        """

        # Get AT/GC/N counts and proportions.
        # Note that self._get_gc_content returns a dictionary with the
        # information on the GC/AT/N counts and proportions. This makes it
        # much easier to add to the contigs attribute using the ** notation.
        gc_kwargs = self._get_gc_content(sequence, len(sequence))
        logger.debug("Populate GC content with: {}".format(gc_kwargs))

        self.contigs[contig_id] = {
            "header": header,
            "sequence": sequence,
            "length": len(sequence),
            "kmer_cov": cov,
            **gc_kwargs
        }

    @staticmethod
    def _get_gc_content(sequence, length):
        """Get GC content and proportions.

        Only upper case ``A``, ``T``, ``G`` and ``C`` characters are counted
        as AT/GC. Everything else is accounted for in the ``n`` count.

        Parameters
        ----------
        sequence : str
            The complete sequence of the contig.
        length : int
            The length of the sequence contig.

        Returns
        -------
        x : dict
            Dictionary with the at/gc/n counts and proportions. When
            ``length`` is 0, all proportions are reported as 0.

        """

        # Get AT/GC/N counts
        at = sequence.count("A") + sequence.count("T")
        gc = sequence.count("G") + sequence.count("C")
        n = length - (at + gc)

        # Get AT/GC/N proportions. A zero length would otherwise raise a
        # ZeroDivisionError.
        if length:
            at_prop = at / length
            gc_prop = gc / length
            n_prop = n / length
        else:
            at_prop = gc_prop = n_prop = 0.0

        return {"at": at, "gc": gc, "n": n,
                "at_prop": at_prop, "gc_prop": gc_prop, "n_prop": n_prop}

    @staticmethod
    def _test_truth(x, op, y):
        """ Test the truth of a comparison between x and y using an operator.

        If you want to compare '100 > 200', this method can be called as::

            self._test_truth(100, ">", 200).

        Parameters
        ----------
        x : int
            Arbitrary value to compare in the left
        op : str
            Comparison operator. One of ``>``, ``<``, ``>=``, ``<=``,
            ``==`` or ``!=``.
        y : int
            Arbitrary value to compare in the right

        Returns
        -------
        x : bool
            The 'truthness' of the test

        Raises
        ------
        KeyError
            If ``op`` is not one of the supported operators.
        """

        ops = {
            ">": operator.gt,
            "<": operator.lt,
            ">=": operator.ge,
            "<=": operator.le,
            "==": operator.eq,
            "!=": operator.ne,
        }

        return ops[op](x, y)

    def filter_contigs(self, *comparisons):
        """Filters the contigs of the assembly using the given comparisons.

        The comparisons must be a list of three elements with the
        :py:attr:`~Assembly.contigs` key, operator and test value. For
        example, to filter contigs with a minimum length of 250, a comparison
        would be::

            self.filter_contigs(["length", ">=", 250])

        In addition to the provided comparisons, the minimum and maximum
        GC proportion filters (based on :py:attr:`~Assembly.min_gc`) are
        always applied.

        The filtered contig ids will be stored in the
        :py:attr:`~Assembly.filtered_ids` list.

        The result of the test for all contigs will be stored in the
        :py:attr:`~Assembly.report` dictionary: ``pass`` for contigs that
        pass all filters, or ``key/contig value/test value`` for the first
        filter that failed.

        Parameters
        ----------
        comparisons : list
            List with contig key, operator and value to test.

        """

        # Reset list of filtered ids
        self.filtered_ids = []
        self.report = {}

        gc_filters = [
            ["gc_prop", ">=", self.min_gc],
            ["gc_prop", "<=", 1 - self.min_gc]
        ]

        self.filters = list(comparisons) + gc_filters

        logger.debug("Filtering contigs using filters: {}".format(
            self.filters))

        # NOTE(review): a contig whose header yields no coverage value has
        # kmer_cov set to None, and ordering comparisons against None raise
        # a TypeError. The desired handling is a design decision and is left
        # unchanged here.
        for contig_id, contig in self.contigs.items():
            for key, op, value in self.filters:
                if not self._test_truth(contig[key], op, value):
                    # Record only the first filter that fails
                    self.filtered_ids.append(contig_id)
                    self.report[contig_id] = "{}/{}/{}".format(key,
                                                               contig[key],
                                                               value)
                    break
            else:
                self.report[contig_id] = "pass"

    def get_assembly_length(self):
        """Returns the length of the assembly, without the filtered contigs.

        Returns
        -------
        x : int
            Total length of the assembly.

        """

        # Set membership keeps the lookup cost independent of the number
        # of filtered contigs
        excluded = set(self.filtered_ids)

        return sum(vals["length"] for contig_id, vals in self.contigs.items()
                   if contig_id not in excluded)

    def write_assembly(self, output_file, filtered=True):
        """Writes the assembly to a new file.

        The ``filtered`` option controls whether the new assembly will be
        filtered or not. Each header is written as the sample name and the
        original header joined by an underscore.

        Parameters
        ----------
        output_file : str
            Name of the output assembly file.
        filtered : bool
            If ``True``, does not include filtered ids. If ``False``, all
            contigs are written.
        """

        logger.debug("Writing the filtered assembly into: {}".format(
            output_file))

        excluded = set(self.filtered_ids) if filtered else set()

        with open(output_file, "w") as fh:

            for contig_id, contig in self.contigs.items():
                if contig_id in excluded:
                    continue

                fh.write(">{}_{}\\n{}\\n".format(self.sample,
                                                 contig["header"],
                                                 contig["sequence"]))

    def write_report(self, output_file):
        """Writes a report with the test results for the current assembly

        One line is written per contig, with the contig id and the
        corresponding entry of the :py:attr:`~Assembly.report` attribute.

        Parameters
        ----------
        output_file : str
            Name of the output report file.

        """

        logger.debug("Writing the assembly report into: {}".format(
            output_file))
        with open(output_file, "w") as fh:

            for contig_id, vals in self.report.items():
                fh.write("{}, {}\\n".format(contig_id, vals))


@MainWrapper
def main(sample_id, assembly_file, gsize, opts, assembler):
    """Main executor of the process_spades template.

    Parses the assembly, checks its size against 80% and 150% of the
    expected genome size, checks the number of contigs, and then writes the
    processed assembly, the CSV report, ``.warnings``, ``.report.json`` and
    ``.status``.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    assembly_file : str
        Path to the assembly file generated by Spades.
    gsize : int
        Estimate of genome size (multiplied by 1000000 for the size checks).
    opts : list
        List of options for processing spades assembly. Three values, all
        converted to ``int``: minimum contig length, minimum k-mer coverage
        and maximum number of contigs.
    assembler : str
        Name of the assembler, for logging purposes

    """

    logger.info("Starting assembly file processing")
    warnings = []
    fails = ""

    min_contig_len, min_kmer_cov, max_contigs = [int(opt) for opt in opts]
    logger.debug("Setting minimum conting length to: {}".format(
        min_contig_len))
    logger.debug("Setting minimum kmer coverage: {}".format(min_kmer_cov))

    # Parse the spades assembly file and perform the first filtering.
    logger.info("Starting assembly parsing")
    assembly_obj = Assembly(assembly_file, min_contig_len, min_kmer_cov,
                            sample_id)

    with open(".warnings", "w") as warn_fh:

        def emit(message):
            # Every warning goes both to the log and to the .warnings file
            logger.warning(message)
            warn_fh.write(message)

        lower_bound = gsize * 1000000 * 0.8
        upper_bound = gsize * 1000000 * 1.5

        assembly_len = assembly_obj.get_assembly_length()
        logger.debug("Checking assembly length: {}".format(assembly_len))

        # When the first filtering leaves less than 80% of the expected
        # genome size, redo the filtering with the length filter only
        # (i.e. without the k-mer coverage filter).
        if assembly_len < lower_bound:

            logger.warning("Assembly size ({}) smaller than the minimum "
                           "threshold of 80% of expected genome size. "
                           "Applying contig filters without the k-mer "
                           "coverage filter".format(assembly_len))
            assembly_obj.filter_contigs(["length", ">=", min_contig_len])

            assembly_len = assembly_obj.get_assembly_length()
            logger.debug("Checking updated assembly length: "
                         "{}".format(assembly_len))

            if assembly_len < lower_bound:
                fails = ("Assembly size smaller than the minimum"
                         " threshold of 80% of expected genome size: {}"
                         ).format(assembly_len)
                emit(fails)

        if assembly_len > upper_bound:
            fails = ("Assembly size ({}) larger than the maximum"
                     " threshold of 150% of expected genome size."
                     ).format(assembly_len)
            emit(fails)

        n_contigs = len(assembly_obj.contigs)
        logger.debug("Checking number of contigs: {}".format(n_contigs))
        contig_threshold = (max_contigs * gsize) / 1.5

        if n_contigs > contig_threshold:
            contig_msg = ("The number of contigs ({}) exceeds the threshold "
                          "of {} contigs per 1.5Mb ({})").format(
                n_contigs, max_contigs, round(contig_threshold, 1))
            emit(contig_msg)
            warnings.append(contig_msg)

    # Write filtered assembly
    logger.debug("Renaming old assembly file to: {}.old".format(
        assembly_file))
    assembly_root = os.path.splitext(assembly_file)[0]
    assembly_obj.write_assembly("{}_proc.fasta".format(assembly_root))

    # Write report
    assembly_obj.write_report("{}.report.csv".format(sample_id))

    # Write json report
    with open(".report.json", "w") as json_report:

        table_cells = [
            {"header": "Contigs ({})".format(assembler),
             "value": len(assembly_obj.contigs),
             "table": "assembly",
             "columnBar": True},
            {"header": "Assembled BP ({})".format(assembler),
             "value": assembly_len,
             "table": "assembly",
             "columnBar": True},
        ]
        json_dic = {"tableRow": [{"sample": sample_id, "data": table_cells}]}

        # Optional sections are only present when there is something to say
        if warnings:
            json_dic["warnings"] = [{"sample": sample_id,
                                     "table": "assembly",
                                     "value": warnings}]

        if fails:
            json_dic["fail"] = [{"sample": sample_id,
                                 "table": "assembly",
                                 "value": [fails]}]

        json_report.write(json.dumps(json_dic, separators=(",", ":")))

    with open(".status", "w") as status_fh:
        status_fh.write("pass")


# Script entry point: forwards the module-level SAMPLE_ID, ASSEMBLY_FILE,
# GSIZE, OPTS and ASSEMBLER variables to main().
if __name__ == '__main__':

    main(SAMPLE_ID, ASSEMBLY_FILE, GSIZE, OPTS, ASSEMBLER)


================================================
FILE: flowcraft/templates/process_assembly_mapping.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to process the coverage report from the
:py:class:`assembly_mapping` process.

TODO: Better purpose

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : Sample Identification string.
    - e.g.: ``'SampleA'``
- ``assembly`` : Fasta assembly file.
    - e.g.: ``'SH10761A.assembly.fasta'``
- ``coverage`` : TSV file with the average coverage for each assembled contig.
    - e.g.: ``'coverage.tsv'``
- ``coverage_bp`` : TSV file with the coverage for each assembled bp.
    - e.g.: ``'coverage.tsv'``
- ``bam_file`` : BAM file with the alignment of reads to the genome.
    - e.g.: ``'sorted.bam'``
- ``opts`` : List of options for processing assembly mapping output.
    1. Minimum coverage for assembled contigs. Can be ``auto``.
        - e.g.: ``'auto'`` or ``'10'``
    2. Maximum number of contigs.
        - e.g.: '100'
- ``gsize``: Expected genome size.
    - e.g.: ``'2.5'``

Generated output
----------------
- ``${sample_id}_filtered.assembly.fasta`` : Filtered assembly file in Fasta \
    format.
    - e.g.: ``'SampleA_filtered.assembly.fasta'``
- ``filtered.bam`` : BAM file with the same filtering as the assembly file.
    - e.g.: ``filtered.bam``


Code documentation
------------------

"""

__version__ = "1.0.1"
__build__ = "09022018"
__template__ = "process_assembly_mapping-nf"

import os
import json
import shutil
import subprocess

from subprocess import PIPE
from collections import OrderedDict

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


def __get_version_samtools():
    """Reports the version of the Samtools executable found in the PATH.

    Runs ``samtools --version`` and takes the second token of the first
    output line. Any failure is logged at debug level and the version
    falls back to ``"undefined"``.

    Returns
    -------
    dict
        Dictionary with the ``program`` and ``version`` keys.
    """

    version = "undefined"

    try:
        proc = subprocess.Popen(["samtools", "--version"],
                                stdout=PIPE, stderr=PIPE)
        out = proc.communicate()[0]
        first_line = out.splitlines()[0]
        version = first_line.split()[1].decode("utf8")
    except Exception as e:
        # Best-effort lookup: keep the "undefined" placeholder
        logger.debug(e)

    return {"program": "Samtools", "version": version}


def __get_version_bowtie2():
    """Reports the version of the Bowtie2 executable found in the PATH.

    Runs ``bowtie2 --version`` and takes the last token of the first
    output line. Any failure is logged at debug level and the version
    falls back to ``"undefined"``.

    Returns
    -------
    dict
        Dictionary with the ``program`` and ``version`` keys.
    """

    version = "undefined"

    try:
        proc = subprocess.Popen(["bowtie2", "--version"],
                                stdout=PIPE, stderr=PIPE)
        out = proc.communicate()[0]
        first_line = out.splitlines()[0]
        version = first_line.split()[-1].decode("utf8")
    except Exception as e:
        # Best-effort lookup: keep the "undefined" placeholder
        logger.debug(e)

    return {"program": "Bowtie2", "version": version}


# Module-level inputs, only defined when this file runs under the name
# ".command.sh" (presumably the script name given by the workflow engine
# after rendering the template -- TODO confirm). The quoted placeholders
# are expected to have been substituted with the process inputs by then.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    ASSEMBLY_FILE = '$assembly'
    COVERAGE_FILE = '$coverage'
    COVERAGE_BP_FILE = '$coverage_bp'
    BAM_FILE = '$bam_file'
    # The options arrive as a single bracketed, comma separated string;
    # strip the brackets and split it into a list of trimmed strings.
    OPTS = [x.strip() for x in '$opts'.strip("[]").split(",")]
    GSIZE = float('$gsize')
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("ASSEMBLY_FILE: {}".format(ASSEMBLY_FILE))
    logger.debug("COVERAGE_FILE: {}".format(COVERAGE_FILE))
    logger.debug("COVERAGE_BP_FILE: {}".format(COVERAGE_BP_FILE))
    logger.debug("BAM_FILE: {}".format(BAM_FILE))
    # NOTE: the label says MIN_ASSEMBLY_COVERAGE but the whole OPTS list
    # (minimum coverage and maximum contigs) is logged here.
    logger.debug("MIN_ASSEMBLY_COVERAGE: {}".format(OPTS))
    logger.debug("GSIZE: {}".format(GSIZE))


def parse_coverage_table(coverage_file):
    """Parses a file with coverage information into objects.

    This function parses a whitespace separated file with two columns per
    line (contig name and integer coverage) and builds an ``OrderedDict``
    with the coverage of each contig, together with the sum of all
    coverage values. Blank lines are ignored.

    Parameters
    ----------
    coverage_file : str
        Path to TSV file containing the coverage results.

    Returns
    -------
    coverage_dict : OrderedDict
        Maps each contig name to a dictionary with its coverage,
        e.g.: ``{"contig_1": {"cov": 424}}``.
    total_cov : int
        Sum of coverage values across all contigs.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly two fields or if the
        coverage field is not an integer.
    """

    # Stores the correspondence between a contig and the corresponding coverage
    # e.g.: {"contig_1": {"cov": 424} }
    coverage_dict = OrderedDict()
    # Stores the total coverage
    total_cov = 0

    with open(coverage_file) as fh:
        for line in fh:
            fields = line.split()
            # Tolerate blank lines (e.g. a trailing newline at the end of
            # the file) instead of failing on the unpacking below
            if not fields:
                continue

            # Get contig and coverage
            contig, cov = fields
            cov = int(cov)
            coverage_dict[contig] = {"cov": cov}
            # Add total coverage
            total_cov += cov
            logger.debug("Processing contig '{}' with coverage '{}'"
                         "".format(contig, cov))

    return coverage_dict, total_cov


def filter_assembly(assembly_file, minimum_coverage, coverage_info,
                    output_file):
    """Writes a copy of an assembly without the low coverage contigs.

    Each Fasta record of the original assembly is copied to the output
    file only when the coverage registered for its header reaches the
    minimum coverage threshold.

    Parameters
    ----------
    assembly_file : str
        Path to original assembly file.
    minimum_coverage : int or float
        Minimum coverage required for a contig to pass the filter.
    coverage_info : OrderedDict or dict
        Dictionary containing the coverage information for each contig.
        The full header (without the leading ``>``) is used as key.
    output_file : str
        Path where the filtered assembly file will be generated.

    """

    # Tells whether the lines of the record being read are to be copied
    keep_record = False

    with open(assembly_file) as in_fh, open(output_file, "w") as out_fh:

        for fasta_line in in_fh:

            if fasta_line[:1] == ">":
                # A new record starts: decide its fate from its coverage
                contig_name = fasta_line.strip()[1:]
                contig_cov = coverage_info[contig_name]["cov"]
                keep_record = contig_cov >= minimum_coverage

            # Applies to the header line as well as the sequence lines
            if keep_record:
                out_fh.write(fasta_line)


def filter_bam(coverage_info, bam_file, min_coverage, output_bam):
    """Uses Samtools to filter a BAM file according to minimum coverage

    The contigs whose coverage reaches the minimum coverage are passed as
    regions to ``samtools view``, so that the BAM file receives the same
    filter that :py:func:`filter_assembly` applies to the assembly. When
    the view command exits with a return code of zero, the new BAM file
    is indexed with ``samtools index``.

    Parameters
    ----------
    coverage_info : OrderedDict or dict
        Dictionary containing the coverage information for each contig.
    bam_file : str
        Path to the BAM file.
    min_coverage : int
        Minimum coverage required for a contig to pass the filter.
    output_bam : str
        Path to the generated filtered BAM file.
    """

    def run_samtools(subcommand, cli):
        # Runs one samtools command, logs its output and return code, and
        # hands the return code back to the caller.
        logger.debug("Runnig samtools {} subprocess with command: "
                     "{}".format(subcommand, cli))

        proc = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()

        # Attempt to decode the output from bytes. If unsuccessful, coerce
        # to string
        try:
            err = err.decode("utf8")
            out = out.decode("utf8")
        except (UnicodeDecodeError, AttributeError):
            err = str(err)
            out = str(out)

        logger.info("Finished samtools {} subprocess with STDOUT:\\n"
                    "======================================\\n{}".format(
                        subcommand, out))
        logger.info("Fished samtools {} subprocesswith STDERR:\\n"
                    "======================================\\n{}".format(
                        subcommand, err))
        logger.info("Finished samtools {} with return code: {}".format(
            subcommand, proc.returncode))

        return proc.returncode

    # Contigs that will be kept
    kept_contigs = [name for name, vals in coverage_info.items()
                    if vals["cov"] >= min_coverage]

    view_cli = ["samtools", "view", "-bh", "-F", "4", "-o", output_bam,
                "-@", "1", bam_file] + kept_contigs

    # The index is only created when the view command succeeded
    if run_samtools("view", view_cli):
        return

    run_samtools("index", ["samtools", "index", output_bam])


def check_filtered_assembly(coverage_info, coverage_bp, minimum_coverage,
                            genome_size, contig_size, max_contigs,
                            sample_id):
    """Checks whether a filtered assembly passes a size threshold

    Given a minimum coverage threshold, this function evaluates whether an
    assembly will pass the minimum threshold of ``genome_size * 1e6 * 0.8``,
    which means 80% of the expected genome size or the maximum threshold
    of ``genome_size * 1e6 * 1.5``, which means 150% of the expected genome
    size. It will issue a warning if any of these thresholds is crossed.
    In the case of an assembly size below 80% it will return False.

    As a side effect, the ``.warnings`` and ``.report.json`` files are
    (re)written in the current working directory.

    Parameters
    ----------
    coverage_info : OrderedDict or dict
        Dictionary containing the coverage information for each contig.
    coverage_bp : dict
        Dictionary containing the per base coverage information for each
        contig. Used to determine the total number of base pairs in the
        final assembly.
    minimum_coverage : int
        Minimum coverage required for a contig to pass the filter.
    genome_size : int
        Expected genome size.
    contig_size : dict
        Dictionary with the length of each contig. Contig headers as keys
        and the corresponding length as values.
    max_contigs : int
        Maximum threshold for contig number. A warning is issued if this
        threshold is crossed.
    sample_id : str
        Id or name of the current sample

    Returns
    -------
    x : bool
        True if the filtered assembly size is higher than 80% of the
        expected genome size.

    """

    # Get size of assembly after filtering contigs below minimum_coverage
    assembly_len = sum(v for k, v in contig_size.items()
                       if coverage_info[k]["cov"] >= minimum_coverage)
    logger.debug("Assembly length after filtering for minimum coverage of"
                 " {}: {}".format(minimum_coverage, assembly_len))
    # Get the contigs that pass the filter and their number
    filtered_contigs = [k for k, v in coverage_info.items()
                        if v["cov"] >= minimum_coverage]
    ncontigs = len(filtered_contigs)
    logger.debug("Number of contigs: {}".format(ncontigs))
    logger.debug("Filtered contigs for minimum coverage of "
                 "{}: {}".format(minimum_coverage, filtered_contigs))
    # Get number of bp after filtering
    total_assembled_bp = sum(sum(coverage_bp[x]) for x in filtered_contigs
                             if x in coverage_bp)
    logger.debug("Total number of assembled base pairs:"
                 "{}".format(total_assembled_bp))

    warnings = []
    fails = []
    health = True

    with open(".warnings", "w") as warn_fh, \
            open(".report.json", "w") as json_report:

        logger.debug("Checking assembly size after filtering : {}".format(
            assembly_len))

        # If the filtered assembly size is above the 150% genome size
        # threshold, issue a warning
        if assembly_len > genome_size * 1e6 * 1.5:
            # The message used to say "smaller", the opposite of what this
            # branch detects
            warn_msg = ("Assembly size ({}) larger than the maximum"
                        " threshold of 150% of expected genome size."
                        ).format(assembly_len)
            logger.warning(warn_msg)
            warn_fh.write(warn_msg)
            fails.append("Large_genome_size_({})".format(assembly_len))

        # If the number of contigs in the filtered assembly size crosses the
        # max_contigs threshold, issue a warning
        logger.debug("Checking number of contigs: {}".format(ncontigs))
        contig_threshold = max_contigs * genome_size / 1.5
        if ncontigs > contig_threshold:
            # Report the configured max_contigs value rather than a fixed
            # number of contigs
            warn_msg = ("The number of contigs ({}) exceeds the threshold "
                        "of {} contigs per 1.5Mb ({})").format(
                ncontigs, max_contigs, round(contig_threshold, 1))
            logger.warning(warn_msg)
            warn_fh.write(warn_msg)
            warnings.append(warn_msg)

        # If the filtered assembly size falls below the 80% genome size
        # threshold, fail this check and return False
        if assembly_len < genome_size * 1e6 * 0.8:
            warn_msg = ("Assembly size smaller than the minimum"
                        " threshold of 80% of expected genome size: {}"
                        ).format(assembly_len)
            logger.warning(warn_msg)
            warn_fh.write(warn_msg)
            fails.append("Small_genome_size_({})".format(assembly_len))
            # The filter will not be applied in this case, so report the
            # numbers of the unfiltered assembly instead
            assembly_len = sum(contig_size.values())
            total_assembled_bp = sum(
                sum(coverage_bp[x]) for x in coverage_info
                if x in coverage_bp)
            logger.debug("Assembly length without coverage filtering: "
                         "{}".format(assembly_len))
            logger.debug("Total number of assembled base pairs without"
                         " filtering: {}".format(total_assembled_bp))

            health = False

        json_dic = {
            "plotData": [{
                "sample": sample_id,
                "data": {
                    "sparkline": total_assembled_bp
                }
            }]
        }

        if warnings:
            json_dic["warnings"] = [{
                "sample": sample_id,
                "table": "assembly",
                "value": warnings
            }]
        if fails:
            # ``fails`` is already a list of messages; it is emitted as a
            # flat list, the same way as ``warnings`` above
            json_dic["fail"] = [{
                "sample": sample_id,
                "table": "assembly",
                "value": fails
            }]

        json_report.write(json.dumps(json_dic, separators=(",", ":")))

    return health


def get_coverage_from_file(coverage_file):
    """Collects the coverage values of each contig from a coverage table.

    Each non-blank line is split on whitespace. The first field is taken
    as the contig header and the third field as an integer coverage value.
    Values are stored in the order they appear in the file. Blank lines
    are ignored.

    Parameters
    ----------
    coverage_file : str
        Path to the whitespace separated file with at least three columns
        per line.

    Returns
    -------
    contig_coverage : dict
        Maps each contig header to the list of its coverage values.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three fields.
    ValueError
        If the third field of a line is not an integer.
    """

    contig_coverage = {}

    with open(coverage_file) as fh:
        for line in fh:

            fields = line.split()

            # Tolerate blank lines (e.g. at the end of the file)
            if not fields:
                continue

            # Header is the first column and coverage the third one
            header = fields[0]
            coverage = int(fields[2])

            contig_coverage.setdefault(header, []).append(coverage)

    return contig_coverage


def evaluate_min_coverage(coverage_opt, assembly_coverage, assembly_size):
    """ Evaluates the minimum coverage threshold from the value provided in
    the coverage_opt.

    Parameters
    ----------
    coverage_opt : str or int or float
        If set to "auto" the threshold is determined automatically as 30%
        (roughly one third) of ``assembly_coverage / assembly_size``, with
        a minimum value of 10. Any other value is converted with ``int``
        and used as the threshold.
    assembly_coverage : int or float
        The total assembly coverage for a genome assembly. This value
        is retrieved by the `:py:func:parse_coverage_table` function.
    assembly_size : int
        The size of the genome assembly. This value is retrieved by the
        `py:func:get_assembly_size` function. When it is zero, the
        automatic threshold falls back to the minimum of 10.

    Returns
    -------
    x: int or float
        Minimum coverage threshold.

    Raises
    ------
    ValueError
        If ``coverage_opt`` is not "auto" and cannot be converted to int.

    """

    if coverage_opt == "auto":
        if assembly_size:
            # Get 30% (about 1/3) of the current assembly coverage
            min_coverage = (assembly_coverage / assembly_size) * .3
        else:
            # An empty assembly has no average coverage to derive the
            # threshold from; avoid the division by zero and let the
            # minimum of 10 below apply
            min_coverage = 0
        logger.info("Minimum assembly coverage automatically set to: "
                    "{}".format(min_coverage))
        # If the automatic coverage is lower than 10, change it to the
        # minimum of 10
        if min_coverage < 10:
            logger.info("Minimum assembly coverage cannot be set to lower"
                        " that 10. Setting to 10")
            min_coverage = 10
    else:
        min_coverage = int(coverage_opt)
        logger.info("Minimum assembly coverage manually set to: {}".format(
            min_coverage))

    return min_coverage


def get_assembly_size(assembly_file):
    """Measures a Fasta assembly, as a whole and contig by contig.

    Empty lines are ignored. The length of a sequence line is counted
    after stripping the surrounding whitespace.

    Parameters
    ----------
    assembly_file : str
        Path to assembly file.

    Returns
    -------
    assembly_size : int
        Size of the assembly in nucleotides
    contig_size : dict
        Length of each contig (contig name as key and length as value)

    """

    total_nucleotides = 0
    contig_size = {}
    current_contig = ""

    with open(assembly_file) as fh:
        for raw_line in fh:

            content = raw_line.strip()

            # Skip empty lines
            if not content:
                continue

            if raw_line.startswith(">"):
                # New record: the header without the leading ">" is the key
                current_contig = content[1:]
                contig_size[current_contig] = 0
                continue

            n_bases = len(content)
            total_nucleotides += n_bases
            contig_size[current_contig] += n_bases

    return total_nucleotides, contig_size


@MainWrapper
def main(sample_id, assembly_file, coverage_file, coverage_bp_file, bam_file,
         opts, gsize):
    """Main executor of the process_assembly_mapping template.

    Determines the minimum coverage threshold, checks whether applying it
    keeps the assembly above the acceptable size and then either filters
    the assembly and the BAM file, or copies the original ones to the
    output names. A ``.status`` file is written at the end.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    assembly_file : str
        Path to assembly file in Fasta format.
    coverage_file : str
        Path to TSV file with coverage information for each contig.
    coverage_bp_file : str
        Path to TSV file with coverage information for each base.
    bam_file : str
        Path to BAM file.
    opts : list
        List of options for processing assembly mapping. Two values:
        the minimum assembly coverage and the maximum number of contigs.
    gsize : int
        Expected genome size

    """

    min_assembly_coverage, max_contigs = opts

    logger.info("Starting assembly mapping processing")

    # Coverage per contig and its sum, plus the assembly size
    logger.info("Parsing coverage table")
    coverage_info, a_cov = parse_coverage_table(coverage_file)
    a_size, contig_size = get_assembly_size(assembly_file)
    logger.info("Assembly processed with a total size of '{}' and coverage"
                " of '{}'".format(a_size, a_cov))

    # Coverage of each base, used for the number of assembled bp
    logger.info("Parsing coverage per bp table")
    coverage_bp_data = get_coverage_from_file(coverage_bp_file)

    # Assess the minimum assembly coverage
    min_coverage = evaluate_min_coverage(min_assembly_coverage, a_cov, a_size)

    # Names of the output files
    assembly_root = os.path.splitext(assembly_file)[0]
    filtered_assembly = assembly_root + "_filt.fasta"
    filtered_bam = "filtered.bam"

    # The check returns True when the assembly filtered with min_coverage
    # stays above the 80% genome size threshold.
    logger.info("Checking filtered assembly")
    size_ok = check_filtered_assembly(
        coverage_info, coverage_bp_data, min_coverage, gsize, contig_size,
        int(max_contigs), sample_id)

    if not size_ok:
        # Filtering would drop the assembly below the acceptable length.
        # Copy the original files to the output names for compliance with
        # the output channel.
        copies = (
            (assembly_file, filtered_assembly),
            (bam_file, filtered_bam),
            (bam_file + ".bai", filtered_bam + ".bai"),
        )
        for origin, destination in copies:
            shutil.copy(origin, destination)
    else:
        # Filter assembly contigs based on the minimum coverage.
        logger.info("Filtered assembly passed minimum size threshold")
        logger.info("Writting filtered assembly")
        filter_assembly(assembly_file, min_coverage, coverage_info,
                        filtered_assembly)
        logger.info("Filtering BAM file according to saved contigs")
        filter_bam(coverage_info, bam_file, min_coverage, filtered_bam)

    with open(".status", "w") as status_fh:
        status_fh.write("pass")


# Script entry point: forwards the module-level template variables to
# main() in the order of its signature.
if __name__ == '__main__':

    main(SAMPLE_ID, ASSEMBLY_FILE, COVERAGE_FILE, COVERAGE_BP_FILE,
         BAM_FILE, OPTS, GSIZE)


================================================
FILE: flowcraft/templates/process_concoct.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------
This module is intended to process the output of concoct
 to generate a report in json format.

Expected input
--------------
The following variables are expected whether using NextFlow or the
:py:func:`main` executor.
- ``sample_id`` : Sample Identification string.
- ``cluster``: concoct cluster output.

"""

import json
import csv
import os
from itertools import groupby

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper


__version__ = "1.0.0"
__build__ = "22.05.2019"
__template__ = "concoct-nf"

logger = get_logger(__file__)

# Module-level inputs, only defined when this file runs under the name
# ".command.sh" (presumably the script name given by the workflow engine
# after rendering the template -- TODO confirm). The quoted placeholders
# are expected to have been substituted with the process inputs by then.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    CLUSTER = '$cluster'
    CONTIGS = '$contigs'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("CLUSTER: {}".format(CLUSTER))
    logger.debug("CONTIGS: {}".format(CONTIGS))


def parse_assembly(file):
    """
    Minimal fasta reader. Lines are grouped into alternating runs of
    header lines and sequence lines; the first header of each run is
    paired with the concatenation of the sequence lines that follow it.
    :param file: assembly file in fasta format
    :return: dictionary with the contig headers (without the leading ">")
    as keys and the contig sequences as values
    """

    all_seqs = {}

    with open(file, "r") as handle:
        runs = (run for _, run in
                groupby(handle, lambda line: line[0] == ">"))
        for header_run in runs:
            # The header must be read before advancing to the next run
            name = next(header_run)[1:].strip()
            sequence_run = next(runs)
            all_seqs[name] = "".join(
                chunk.strip() for chunk in sequence_run)

    return all_seqs


def parse_cluster_csv(file):
    """
    Simple csv parser for clustering file of concoct. The first row is
    treated as a header and skipped; in the remaining rows the first column
    is the contig and the second column is the cluster id. Empty rows are
    ignored and an empty file results in an empty dictionary.
    :param file: clustering csv file
    :return: dictionary containing the cluster id and the contigs in the cluster
    """

    clusters = {}

    # The context manager closes the file, which the bare open() call used
    # before never did. newline="" is the mode recommended by the csv module.
    with open(file, newline="") as handle:
        reader = csv.reader(handle, delimiter=',')
        # Skip header; the default avoids a StopIteration on empty files
        next(reader, None)
        for row in reader:
            if not row:
                continue
            clusters.setdefault(row[1], []).append(row[0])

    return clusters


def get_GC(sequence):
    """
    Computes the GC content of a nucleotide sequence. Upper and lower case
    bases are both counted.
    :param sequence: string with the nucleotide sequence
    :return: percentage of G and C bases rounded to two decimal places, or
    0.0 for an empty sequence
    """

    # An empty sequence has no bases to divide by
    if not sequence:
        return 0.0

    normalized = sequence.upper()
    gc_count = normalized.count("G") + normalized.count("C")

    return round(gc_count / len(sequence) * 100, 2)


def merge_data(contigs, clusters):
    """
    Summarise each concoct bin: number of contigs, genome size (length of
    the concatenated contigs) and GC content.

    :param contigs: dict with the sequences for the binned contigs
    :param clusters: dict with the cluster and respective sequence headers
    :return: dict with the statistics for each bin (cluster), using the
    cluster id converted to int as key
    """

    binning = {}

    for cluster_id, members in clusters.items():
        # All the contigs of the bin joined into a single sequence
        bin_sequence = "".join(contigs[name] for name in members)

        binning[int(cluster_id)] = {
            "Bin name": cluster_id,
            "Contig number": len(members),
            "Genome size": len(bin_sequence),
            "GC content": get_GC(bin_sequence),
        }

    return binning


@MainWrapper
def main(sample_id, cluster_file, contig_file):
    """
    Main executor of the concoct processing template. Builds a table with
    the statistics of each bin and writes it to the .report.json file.

    :param sample_id: sample identification string
    :param cluster_file: clustering csv file produced by concoct
    :param contig_file: assembly file in fasta format
    """

    seqs = parse_assembly(contig_file)
    clusters = parse_cluster_csv(cluster_file)
    bin_stats = merge_data(seqs, clusters)

    numeric_fields = ("Contig number", "Genome size", "GC content")

    # First row holds the column names, followed by one row per bin
    report_list = [["Bin name", "Contig number", "Genome size", "GC content %"]]

    for bin_id in sorted(bin_stats):
        stats = bin_stats[bin_id]
        print("{} : {}".format(bin_id, stats))
        report_list.append(
            [stats["Bin name"]] + [str(stats[field])
                                   for field in numeric_fields])

    # tsvData holds a single object, since there is only one element, and
    # this data type expects full tables in tsv format.
    # The web-app expects a list with all the values in the table. To
    # expand this to processes other than MaxBin2, the key below needs to
    # be reworked.
    report_json = {
        "tsvData": [{
            "sample": sample_id,
            "data": {"MaxBin2": report_list}
        }]
    }

    with open(".report.json", "w") as report_fh:
        report_fh.write(json.dumps(report_json))


# Script entry point: forwards the module-level template variables to main()
if __name__ == "__main__":
    main(SAMPLE_ID, CLUSTER, CONTIGS)


================================================
FILE: flowcraft/templates/process_mapping.py
================================================
#!/usr/bin/env python3

import re
import os
import json

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper


"""
Purpose
-------

This module is intended to process the output of the mapping process of a
single sample from the program Bowtie for the report component.
The main input is a log file produced by the mapper.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id``: Sample Identification string.
    - e.g.: ``'SampleA'``
- ``bowtie_log``: Log file from the mapper.
    - e.g.: ``'bowtie.log'``

Generated output
----------------
- ``.report.json``: Data structure for the report

Code documentation
------------------

"""

__version__ = "1.0.1"
__build__ = "10.09.2018"
__template__ = "remove_host-nf"

logger = get_logger(__file__)


# Module-level inputs, only defined when this file runs under the name
# ".command.sh" (presumably the script name given by the workflow engine
# after rendering the template -- TODO confirm). The quoted placeholders
# are expected to have been substituted with the process inputs by then.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    BOWTIE_LOG = '$bowtie_log'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("BOWTIE_LOG: {}".format(BOWTIE_LOG))


class Bowtie:
    """
    Class to parse and store the info in the bowtie log file.

    The log file is parsed on construction and the results are exposed
    through the ``n_reads``, ``align_0x``, ``align_1x``, ``align_mt1x`` and
    ``overall_rate`` attributes. Attributes for which nothing was found in
    the log keep their zero defaults.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    bowtie_log : str
        Path to the log file produced by bowtie.
    """

    def __init__(self, sample_id, bowtie_log):

        # str: The name of the sample.
        self.sample = sample_id

        # int: Total number of reads reported in the log.
        self.n_reads = 0

        # int: Sum of the "aligned 0 times" counts of the paired section.
        self.align_0x = 0

        # int: Sum of the "aligned exactly 1 time" counts.
        self.align_1x = 0

        # int: Sum of the "aligned >1 times" counts.
        self.align_mt1x = 0

        # float: Overall alignment rate, in percent.
        self.overall_rate = 0.0

        # Parse the log and populate the attributes above
        self.parse_log(bowtie_log)

    def set_n_reads(self, n_reads):
        """Store the total number of reads, converted to ``int``."""
        self.n_reads = int(n_reads)

    def set_align_0x(self, align_0x):
        """Store the number of reads that aligned 0 times."""
        self.align_0x = align_0x

    def set_align_1x(self, align_1x):
        """Store the number of reads that aligned exactly 1 time."""
        self.align_1x = align_1x

    def set_align_mt1x(self, align_mt1x):
        """Store the number of reads that aligned more than 1 time."""
        self.align_mt1x = align_mt1x

    def set_overall_rate(self, overall_rate):
        """Store the overall alignment rate."""
        self.overall_rate = overall_rate

    def parse_log(self, bowtie_log):
        """Parse a bowtie log file.

        Populates the ``n_reads``, ``align_0x``, ``align_1x``,
        ``align_mt1x`` and ``overall_rate`` attributes with data from the
        log file, using the ``set_*`` methods of this class.

        Only the paired-end section of the log is parsed. Alignment
        categories that do not appear in that section count as zero.

        NOTE: the doubled escape characters in the patterns below are
        intentional and must be kept as they are; this file is presumably
        rendered by the workflow engine's template mechanism before it is
        executed -- TODO confirm.

        Parameters
        ----------
        bowtie_log : str
            Path to the bowtie log file.

        """

        # Regexes - thanks to https://github.com/ewels/MultiQC/blob/master/multiqc/modules/bowtie2/bowtie2.py
        regexes = {
            'unpaired': {
                'unpaired_aligned_none': r"(\\d+) \\([\\d\\.]+%\\) aligned 0 times",
                'unpaired_aligned_one': r"(\\d+) \\([\\d\\.]+%\\) aligned exactly 1 time",
                'unpaired_aligned_multi': r"(\\d+) \\([\\d\\.]+%\\) aligned >1 times"
            },
            'paired': {
                'paired_aligned_none': r"(\\d+) \\([\\d\\.]+%\\) aligned concordantly 0 times",
                'paired_aligned_one': r"(\\d+) \\([\\d\\.]+%\\) aligned concordantly exactly 1 time",
                'paired_aligned_multi': r"(\\d+) \\([\\d\\.]+%\\) aligned concordantly >1 times",
                'paired_aligned_discord_one': r"(\\d+) \\([\\d\\.]+%\\) aligned discordantly 1 time",
                'paired_aligned_discord_multi': r"(\\d+) \\([\\d\\.]+%\\) aligned discordantly >1 times",
                'paired_aligned_mate_one': r"(\\d+) \\([\\d\\.]+%\\) aligned exactly 1 time",
                'paired_aligned_mate_multi': r"(\\d+) \\([\\d\\.]+%\\) aligned >1 times",
                'paired_aligned_mate_none': r"(\\d+) \\([\\d\\.]+%\\) aligned 0 times"
            }
        }

        # Missing parser for unpaired (not implemented in flowcraft yet)

        with open(bowtie_log, "r") as f:
            # Go through log file line by line
            for line in f:

                # Total reads
                total = re.search(r"(\\d+) reads; of these:", line)
                if total:
                    self.set_n_reads(total.group(1))

                # Paired end reads aka the pain
                paired = re.search(r"(\\d+) \\([\\d\\.]+%\\) were paired; of these:", line)
                if paired:
                    paired_numbers = {}

                    # Consume the indented lines that follow; they hold the
                    # per-category counts of the paired section.
                    line = f.readline()
                    while line.startswith('    '):
                        for k, r in regexes['paired'].items():
                            match = re.search(r, line)
                            if match:
                                paired_numbers[k] = int(match.group(1))
                        line = f.readline()

                    # A category absent from the log counts as 0 instead of
                    # aborting the whole parsing with a KeyError.
                    align_zero_times = (
                        paired_numbers.get('paired_aligned_none', 0) +
                        paired_numbers.get('paired_aligned_mate_none', 0))
                    if align_zero_times:
                        self.set_align_0x(align_zero_times)

                    align_one_time = (
                        paired_numbers.get('paired_aligned_one', 0) +
                        paired_numbers.get('paired_aligned_mate_one', 0))
                    if align_one_time:
                        self.set_align_1x(align_one_time)

                    align_more_than_one_time = (
                        paired_numbers.get('paired_aligned_multi', 0) +
                        paired_numbers.get('paired_aligned_mate_multi', 0))
                    if align_more_than_one_time:
                        self.set_align_mt1x(align_more_than_one_time)

                # Overall alignment rate. "line" may by now be the first
                # line after the paired section, so it is tested here too.
                overall = re.search(r"([\\d\\.]+)% overall alignment rate", line)
                if overall:
                    self.set_overall_rate(float(overall.group(1)))


@MainWrapper
def main(sample_id, bowite_log):
    """Main executor of the process_mapping template.

    Parses the bowtie log and writes the mapping summary to
    ``.report.json`` and the string ``pass`` to ``.status`` in the current
    working directory.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    bowite_log : str
        Path to the log file generated by bowtie. The parameter keeps its
        historical spelling so that existing callers are not affected.

    """

    logger.info("Starting mapping file processing")
    # Neither collection is populated by this template at the moment; the
    # corresponding report sections are therefore never written.
    warnings = []
    fails = ""

    bowtie_info = Bowtie(sample_id, bowite_log)

    logger.debug("Overall alignment rate: {}".format(
        bowtie_info.overall_rate))

    # (header, value) pairs of the report row, in display order
    row_values = [
        ("Reads", int(bowtie_info.n_reads)),
        ("Unmapped", int(bowtie_info.align_0x)),
        ("Mapped 1x", int(bowtie_info.align_1x)),
        ("Mapped >1x", int(bowtie_info.align_mt1x)),
        ("Overall alignment rate (%)", float(bowtie_info.overall_rate)),
    ]

    # The report is assembled before the output file is opened, so a
    # failure here does not leave an empty report file behind.
    json_dic = {
        "tableRow": [{
            "sample": sample_id,
            "data": [
                {"header": header,
                 "value": value,
                 "table": "mapping",
                 "columnBar": False}
                for header, value in row_values
            ]
        }],
    }

    if warnings:
        json_dic["warnings"] = [{
            "sample": sample_id,
            "table": "mapping",
            "value": warnings
        }]

    if fails:
        json_dic["fail"] = [{
            "sample": sample_id,
            "table": "mapping",
            "value": [fails]
        }]

    with open(".report.json", "w") as json_report:
        json_report.write(json.dumps(json_dic, separators=(",", ":")))

    with open(".status", "w") as status_fh:
        status_fh.write("pass")


# Script entry point: run main() with the inputs defined at module level.
if __name__ == '__main__':

    main(SAMPLE_ID, BOWTIE_LOG)

================================================
FILE: flowcraft/templates/process_metabat.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------
This module is intended to process the output of metaBAT
 to generate a report in json format.

Expected input
--------------
The following variables are expected whether using NextFlow or the
:py:func:`main` executor.
- ``sample_id`` : Sample Identification string.
- ``bins``: whitespace-separated list of the bin fasta files produced by metaBAT.

"""

import json
import csv
import os
from itertools import groupby

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper


# Template metadata: version, build stamp and name of the associated template.
# NOTE(review): the template name below mentions concoct although the module
# docstring describes metaBAT -- confirm it is intended.
__version__ = "1.0.0"
__build__ = "22.05.2019"
__template__ = "concoct-nf"

# Module-level logger used by the functions in this file.
logger = get_logger(__file__)

# The input constants are only defined when this file runs under a name that
# ends in ".command.sh". The quoted placeholders are presumably substituted
# by the workflow engine before execution -- TODO confirm.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    # The bins arrive as one whitespace-separated string, split into a list.
    BINS = '$bins'.split()
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("BINS: {}".format(BINS))


def parse_assembly(file):
    """
    Simple fasta parser.

    Lines found before the first header are ignored and a header that is
    not followed by any sequence line is stored with an empty sequence.
    When several header lines follow each other directly, only the first
    one of the run is used.

    :param file: assembly file in fasta format
    :return: dictionary mapping each contig header (without the leading
        ">" and surrounding whitespace) to its sequence
    """

    all_seqs = {}

    with open(file, "r") as handle:
        # Consecutive lines are grouped into alternating runs of header
        # lines and sequence lines.
        grouped = groupby(handle, key=lambda line: line.startswith(">"))
        for is_header, lines in grouped:
            if not is_header:
                # Sequence lines that are not preceded by any header
                continue
            contig_header = next(lines)[1:].strip()
            # The run after a header run holds its sequence lines; the
            # default covers a header at the very end of the file.
            _, seq_lines = next(grouped, (False, ()))
            all_seqs[contig_header] = "".join(s.strip() for s in seq_lines)

    return all_seqs


def get_cg(sequence):
    """
    Compute the GC content of a sequence.

    Only uppercase "G" and "C" characters are counted.

    :param sequence: nucleotide sequence string
    :return: percentage of G/C characters rounded to two decimals, or 0.0
        for an empty sequence
    """

    # An empty sequence would otherwise raise a ZeroDivisionError
    if not sequence:
        return 0.0

    gc_count = sum(1 for nucl in sequence if nucl in ('G', 'C'))

    return round(gc_count / len(sequence) * 100, 2)


def get_bin_stats(bin_file):
    """
    Collect summary statistics of a bin fasta file.

    Lines found before the first header are ignored and a header without
    sequence lines counts as a contig of length zero.

    :param bin_file: path to the fasta file of the bin
    :return: tuple of strings with the number of contigs, the total
        sequence length and the GC content (as computed by get_cg)
    """
    n_contigs = 0
    # Sequence chunks are collected and joined once at the end
    seq_chunks = []

    with open(bin_file, "r") as handle:
        # Consecutive lines are grouped into alternating runs of header
        # lines and sequence lines.
        grouped = groupby(handle, key=lambda line: line.startswith(">"))
        for is_header, _ in grouped:
            if not is_header:
                # Sequence lines that are not preceded by any header
                continue
            n_contigs += 1
            # The default covers a header at the very end of the file
            _, seq_lines = next(grouped, (False, ()))
            seq_chunks.extend(s.strip() for s in seq_lines)

    all_seq = "".join(seq_chunks)
    # A bin without any sequence has no meaningful GC content
    gc_content = get_cg(all_seq) if all_seq else 0.0

    return str(n_contigs), str(len(all_seq)), str(gc_content)

@MainWrapper
def main(sample_id, bins):
    """Main executor of the process_metabat template.

    Builds a table with one row per bin and writes it to ``.report.json``
    in the current working directory.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    bins : list or str
        Paths of the bin fasta files. A single path given as a string is
        accepted as well. The bin name reported for each file is the second
        dot-separated field of its path.

    """

    report_list = [["Bin name", "Contig number", "Genome size", "GC content %"]]

    # Every bin goes through the same code path. The former dedicated
    # single-bin branch handed the whole list to the fasta reader and
    # failed for every run that produced exactly one bin.
    # NOTE(review): "false_bin.fa" is presumably a placeholder for runs
    # without bins; it is still processed like any other file, as before
    # -- confirm whether it should be skipped instead.
    bin_files = [bins] if isinstance(bins, str) else bins

    for bin_file in bin_files:
        ncontigs, gsize, gc = get_bin_stats(bin_file)
        report_list.append([bin_file.split(".")[1], ncontigs, gsize, gc])

    # this tsvData is a single object since it only has one element
    # this data type expects full tables in tsv format
    report_json = {
        "tsvData": [{
            "sample": sample_id,
            "data": {}
        }]
    }

    # web-app expects a list with all the values in the table.
    #  To expand this to other processes other than MaxBin2, this line needs to be reworked
    report_json["tsvData"][0]["data"]["MaxBin2"] = report_list

    with open(".report.json", "w") as k:
        k.write(json.dumps(report_json))


# Script entry point: run main() with the inputs defined at module level.
if __name__ == "__main__":
    main(SAMPLE_ID, BINS)


================================================
FILE: flowcraft/templates/process_newick.py
================================================
#!/usr/bin/env python3

import os
import json
import dendropy

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper


"""
Purpose
-------

This module is intended to process the newick tree generated by
a process to generate a report. The newick tree will be
rooted at its midpoint.
 
 
Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

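- ``newick``: phylogenetic tree in newick format.
- ``label``: if ``'true'``, shortened display labels are generated for the
  taxon names of the tree.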

Generated output
----------------
- ``.report.json``: Data structure for the report

Code documentation
------------------

"""

# Template metadata: version, build stamp and name of the associated template.
__version__ = "1.0.2"
__build__ = "28.12.2018"
__template__ = "raxml-nf"

# Module-level logger used by the functions in this file.
logger = get_logger(__file__)


# The input constants are only defined when this file runs under a name that
# ends in ".command.sh". The quoted placeholders are presumably substituted
# by the workflow engine before execution -- TODO confirm.
if __file__.endswith(".command.sh"):
    NEWICK = '$newick'
    LABELS = '$label'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("NEWICK: {}".format(NEWICK))
    logger.debug("LABELS: {}".format(LABELS))


@MainWrapper
def main(newick, labels):
    """Main executor of the process_newick template.

    Loads the tree, re-roots it at its midpoint and writes it, together
    with the optional label mapping, to ``.report.json``. The string
    ``pass`` is written to ``.status``.

    Parameters
    ----------
    newick : str
        path to the newick file.
    labels : str
        When equal to ``'true'``, a mapping between the cleaned taxon names
        and shortened display names is added to the report. Any other value
        leaves the mapping empty.

    """

    def clean_name(raw_name):
        # Drops the rooting annotation and quotes and replaces spaces, so
        # that tree string and label keys are normalised in the same way.
        return raw_name.strip().replace("[&R] ", '').replace(' ', '_').replace("'", "")

    logger.info("Starting newick file processing")

    # load tree and midpoint root; the handle is closed once the tree has
    # been read
    with open(newick, 'r') as newick_fh:
        tree = dendropy.Tree.get(file=newick_fh, schema="newick")
    tree.reroot_at_midpoint()

    to_write_trees = clean_name(tree.as_string("newick"))

    # add labels to replace taxon names in phylocanvas
    labels_dict = {}

    if labels == 'true':

        original_labels = tree.update_taxon_namespace()

        for item in original_labels:

            original_name = clean_name(str(item))

            # if it's a reference sequence
            if '|' in original_name:
                new_name = original_name.split('|')[0]
            else:
                # in case it's a reversed complement sequence or a genebank reference
                new_name = original_name.replace("_R_", "").replace("gb_", "gb:").split('_')[0]

            labels_dict[original_name] = new_name

    # write report in json format
    json_dic = {
        "treeData": [{
            "trees": [
                to_write_trees
            ],
            "labels": [
                labels_dict
            ]
        }],
    }

    with open(".report.json", "w") as json_report:
        json_report.write(json.dumps(json_dic, separators=(",", ":")))

    with open(".status", "w") as status_fh:
        status_fh.write("pass")


# Script entry point: run main() with the inputs defined at module level.
if __name__ == '__main__':
    main(NEWICK, LABELS)


================================================
FILE: flowcraft/templates/process_tsv.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------
This module is intended to process the output in tsv
 to generate a report in json format.

Expected input
--------------
The following variables are expected whether using NextFlow or the
:py:func:`main` executor.
- ``sample_id`` : Sample Identification string.
- ``tsv``: tsv output.

"""

import json
import csv
import os

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

# Template metadata: version, build stamp and name of the associated template.
__version__ = "1.0.1"
__build__ = "05.10.2018"
__template__ = "maxbin2-nf"

# Module-level logger used by the functions in this file.
logger = get_logger(__file__)

# The input constants are only defined when this file runs under a name that
# ends in ".command.sh". The quoted placeholders are presumably substituted
# by the workflow engine before execution -- TODO confirm.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    FILE = '$tsv'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("FILE: {}".format(FILE))

@MainWrapper
def main(sample_id, tsv_file):
    """Main executor of the process_tsv template.

    Reads a tab-separated file and writes its rows to ``.report.json`` in
    the current working directory.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    tsv_file : str
        Path to the tab-separated file.

    """

    # The file is opened without newline translation, as recommended for
    # the csv module, and closed as soon as all rows have been read.
    with open(tsv_file, newline="") as tsv_fh:
        table_rows = list(csv.reader(tsv_fh, delimiter='\t'))

    # this tsvData could be a single object since it only has one element
    # this data type expects full tables in tsv format
    report_json = {
        "tsvData": [{
            "sample": sample_id,
            "data": {}
        }]
    }

    # web-app expects a list with all the values in the table.
    #  To expand this to other processes other than MaxBin2, this line needs to be reworked
    report_json["tsvData"][0]["data"]["MaxBin2"] = table_rows

    with open(".report.json", "w") as k:
        k.write(json.dumps(report_json))


# Script entry point: run main() with the inputs defined at module level.
if __name__ == "__main__":
    main(SAMPLE_ID, FILE)


================================================
FILE: flowcraft/templates/process_viral_assembly.py
================================================
#!/usr/bin/env python3

import os
import json
import operator
from itertools import groupby

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper


"""
Purpose
-------

This module is intended to process the output of the assembly process of a
single sample, produced by the program Spades or Megahit, for the report
component. The main input is a fasta file produced by the assembler.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id``: Sample Identification string.
    - e.g.: ``'SampleA'``
- ``assembly``: fasta file from the assembler.
    - e.g.: ``'spades.fasta'``
- ``min_size``: minimum contig size to be considered a complete ORF

Generated output
----------------
- ``.report.json``: Data structure for the report

Code documentation
------------------

"""

# Template metadata: version, build stamp and name of the associated template.
__version__ = "1.0.1"
__build__ = "11.09.2018"
__template__ = "viral_assembly-nf"

# Module-level logger used by the class and functions in this file.
logger = get_logger(__file__)

# The input constants are only defined when this file runs under a name that
# ends in ".command.sh". The quoted placeholders are presumably substituted
# by the workflow engine before execution -- TODO confirm.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    ASSEMBLY = '$assembly'
    MINSIZE = '$min_size'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("ASSEMBLY: {}".format(ASSEMBLY))
    logger.debug("MINSIZE: {}".format(MINSIZE))


class Assembly:
    """Class that parses and filters a Fasta assembly file

    This class parses an assembly fasta file, collects a number
    of summary statistics and metadata from the contigs, filters
    contigs based on user-defined metrics and writes filtered assemblies
    and reports.

    Parameters
    ----------
    assembly_file : str
        Path to assembly file.
    min_contig_len : int
        Minimum contig length when applying the initial assembly filter.
    min_kmer_cov : int
        Minimum k-mer coverage when applying the initial assembly
        filter.
    sample_id : str
        Name of the sample for the current assembly.
    min_size : int or str
        Minimum sequence length for a contig to be counted as a complete
        ORF. It is converted with ``int``.
    """

    def __init__(self, assembly_file, min_contig_len, min_kmer_cov,
                 sample_id, min_size):

        # dict: Dictionary storing data for each contig.
        self.contigs = {}

        # list: List of filtered contig_ids.
        self.filtered_ids = []

        # float: Sets the minimum GC content on a contig.
        self.min_gc = 0.05

        # str: The name of the sample for the assembly.
        self.sample = sample_id

        # int: number of complete ORFs in the assembly.
        self.nORFs = 0

        # dict: Will contain the filtering results for each contig.
        self.report = {}

        # list: Setting initial filters to check when parsing the assembly
        # file. This can be later changed using the 'filter_contigs' method.
        self.filters = [
            ["length", ">=", min_contig_len],
            ["kmer_cov", ">=", min_kmer_cov]
        ]

        # Parse assembly and populate self.contigs
        self._parse_assembly(assembly_file)

        # Gets the number of ORFs
        self.getORFs(assembly_file, min_size)

    def getORFs(self, assembly, min_size):
        """Count the contigs that are long enough to be a complete ORF.

        The result is stored in the :py:attr:`~Assembly.nORFs` attribute.
        Lines found before the first header are ignored and a header
        without sequence lines counts as a contig of length zero.

        Parameters
        ----------
        assembly : str
            Path to the assembly fasta file.
        min_size : int or str
            Minimum sequence length of a complete ORF.

        """

        threshold = int(min_size)
        orf_count = 0

        # Plain text mode: the former "rU" mode is rejected by recent
        # Python versions. The context manager closes the handle.
        with open(assembly, "r") as f_open:
            # Consecutive lines are grouped into alternating runs of
            # header lines and sequence lines.
            grouped = groupby(f_open, key=lambda line: line.startswith(">"))
            for is_header, _ in grouped:
                if not is_header:
                    # Sequence lines that are not preceded by any header
                    continue
                # The default covers a header at the very end of the file
                _, seq_lines = next(grouped, (False, ()))
                seq_length = sum(len(s.strip()) for s in seq_lines)
                if seq_length >= threshold:
                    orf_count += 1

        self.nORFs = orf_count

    @staticmethod
    def _parse_coverage(header_str):
        """Attempts to retrieve the coverage value from the header string.

        It splits the header by "_" and then screens the list backwards in
        search of the first float value. This will be interpreted as the
        coverage value. If it cannot find a float value, it returns None.
        This search methodology is based on the strings of assemblers
        like spades and skesa that put the mean kmer coverage for each
        contig in its corresponding fasta header.

        Parameters
        ----------
        header_str : str
            Fasta header line of the contig.

        Returns
        -------
        float or None
            The coverage value for the contig. None if it cannot find the
            value in the provided string.
        """

        cov = None
        for field in reversed(header_str.split("_")):
            try:
                cov = float(field)
                break
            except ValueError:
                continue

        return cov

    def _parse_assembly(self, assembly_file):
        """Parse an assembly fasta file.

        This is a Fasta parsing method that populates the
        :py:attr:`~Assembly.contigs` attribute with data for each contig in the
        assembly.

        The insertion of data on the self.contigs is done by the
        :py:meth:`Assembly._populate_contigs` method, which also calculates
        GC content and proportions.

        Entries without any sequence line are not stored, so an empty file
        results in an empty :py:attr:`~Assembly.contigs` dictionary.

        Parameters
        ----------
        assembly_file : str
            Path to the assembly fasta file.

        """

        # Temporary storage of sequence data
        seq_temp = []
        # Id counter for contig that will serve as key in self.contigs
        contig_id = 0
        # Initialize kmer coverage and header
        cov, header = None, None

        with open(assembly_file) as fh:

            logger.debug("Starting iteration of assembly file: {}".format(
                assembly_file))
            for line in fh:
                # Remove whitespace surrounding line for further processing
                line = line.strip()

                # Skip empty lines
                if not line:
                    continue

                if line.startswith(">"):
                    # If a sequence has already been populated, save the
                    # previous contig information
                    if seq_temp:
                        # Use join() to convert string list into the full
                        # contig string. This is generally much more efficient
                        # than successively concatenating strings.
                        seq = "".join(seq_temp)

                        logger.debug("Populating contig with contig_id '{}', "
                                     "header '{}' and cov '{}'".format(
                                        contig_id, header, cov))
                        self._populate_contigs(contig_id, header, cov, seq)

                        # Reset temporary sequence storage
                        seq_temp = []
                        contig_id += 1

                    header = line[1:]
                    cov = self._parse_coverage(line)

                else:
                    seq_temp.append(line)

            # Populate last contig entry. The same condition as inside the
            # loop is applied, so that an empty file or a trailing header
            # without sequence does not create a contig of length zero.
            if seq_temp:
                logger.debug("Populating contig with contig_id '{}', "
                             "header '{}' and cov '{}'".format(
                                contig_id, header, cov))
                seq = "".join(seq_temp)
                self._populate_contigs(contig_id, header, cov, seq)

    def _populate_contigs(self, contig_id, header, cov, sequence):
        """ Inserts data from a single contig into\
         :py:attr:`~Assembly.contigs`.

        By providing a contig id, the original header, the coverage that
        is parsed from the header and the sequence, this method will
        populate the :py:attr:`~Assembly.contigs` attribute.

        Parameters
        ----------
        contig_id : int
            Arbitrary unique contig identifier.
        header : str
            Original header of the current contig.
        cov : float
            The contig coverage, parsed from the fasta header
        sequence : str
            The complete sequence of the contig.

        """

        # Get AT/GC/N counts and proportions.
        # Note that self._get_gc_content returns a dictionary with the
        # information on the GC/AT/N counts and proportions. This makes it
        # much easier to add to the contigs attribute using the ** notation.
        gc_kwargs = self._get_gc_content(sequence, len(sequence))
        logger.debug("Populate GC content with: {}".format(gc_kwargs))

        self.contigs[contig_id] = {
            "header": header,
            "sequence": sequence,
            "length": len(sequence),
            "kmer_cov": cov,
            **gc_kwargs
        }

    @staticmethod
    def _get_gc_content(sequence, length):
        """Get GC content and proportions.

        Only uppercase characters are counted as A/T or G/C; every other
        character is accounted for in the "n" count.

        Parameters
        ----------
        sequence : str
            The complete sequence of the contig.
        length : int
            The length of the sequence contig.

        Returns
        -------
        x : dict
            Dictionary with the at/gc/n counts and proportions. All the
            proportions are 0.0 when ``length`` is zero.

        """

        # Get AT/GC/N counts
        at = sum(map(sequence.count, ["A", "T"]))
        gc = sum(map(sequence.count, ["G", "C"]))
        n = length - (at + gc)

        # Get AT/GC/N proportions, guarding against a zero length
        if length:
            at_prop = at / length
            gc_prop = gc / length
            n_prop = n / length
        else:
            at_prop = gc_prop = n_prop = 0.0

        return {"at": at, "gc": gc, "n": n,
                "at_prop": at_prop, "gc_prop": gc_prop, "n_prop": n_prop}

    @staticmethod
    def _test_truth(x, op, y):
        """ Test the truth of a comparison between x and y using an \
        ``operator``.

        If you want to compare '100 > 200', this method can be called as::

            self._test_truth(100, ">", 200).

        Parameters
        ----------
        x : int
            Arbitrary value to compare in the left
        op : str
            Comparison operator. One of ">", "<", ">=" and "<=".
        y : int
            Arbitrary value to compare in the right

        Returns
        -------
        x : bool
            The 'truthness' of the test
        """

        ops = {
            ">": operator.gt,
            "<": operator.lt,
            ">=": operator.ge,
            "<=": operator.le,
        }

        return ops[op](x, y)

    def filter_contigs(self, *comparisons):
        """Filters the contigs of the assembly according to user provided\
        comparisons.

        The comparisons must be a list of three elements with the
        :py:attr:`~Assembly.contigs` key, operator and test value. For
        example, to filter contigs with a minimum length of 250, a comparison
        would be::

            self.filter_contigs(["length", ">=", 250])

        In addition to the provided comparisons, every contig is tested
        against the minimum and maximum GC proportion derived from
        :py:attr:`~Assembly.min_gc`.

        The filtered contig ids will be stored in the
        :py:attr:`~Assembly.filtered_ids` list.

        The result of the test for all contigs will be stored in the
        :py:attr:`~Assembly.report` dictionary.

        Parameters
        ----------
        comparisons : list
            List with contig key, operator and value to test.

        """

        # Reset list of filtered ids
        self.filtered_ids = []
        self.report = {}

        gc_filters = [
            ["gc_prop", ">=", self.min_gc],
            ["gc_prop", "<=", 1 - self.min_gc]
        ]

        self.filters = list(comparisons) + gc_filters

        logger.debug("Filtering contigs using filters: {}".format(
            self.filters))

        for contig_id, contig in self.contigs.items():
            # A contig passes unless one of the tests below fails; the
            # first failing test is the one that gets reported.
            self.report[contig_id] = "pass"
            for key, op, value in self.filters:
                if not self._test_truth(contig[key], op, value):
                    self.filtered_ids.append(contig_id)
                    self.report[contig_id] = "{}/{}/{}".format(key,
                                                               contig[key],
                                                               value)
                    break

    def get_assembly_length(self):
        """Returns the length of the assembly, without the filtered contigs.

        Returns
        -------
        x : int
            Total length of the assembly.

        """

        return sum(
            vals["length"] for contig_id, vals in self.contigs.items()
            if contig_id not in self.filtered_ids)

    def write_assembly(self, output_file, filtered=True):
        """Writes the assembly to a new file.

        The ``filtered`` option controls whether the new assembly will be
        filtered or not.

        Parameters
        ----------
        output_file : str
            Name of the output assembly file.
        filtered : bool
            If ``True``, does not include filtered ids. If ``False``, all
            the contigs are written.
        """

        logger.debug("Writing the filtered assembly into: {}".format(
            output_file))
        with open(output_file, "w") as fh:

            for contig_id, contig in self.contigs.items():
                # Filtered contigs are only left out on request. Formerly
                # nothing at all was written when ``filtered`` was False.
                if filtered and contig_id in self.filtered_ids:
                    continue
                fh.write(">{}_{}\\n{}\\n".format(self.sample,
                                                 contig["header"],
                                                 contig["sequence"]))

    def write_report(self, output_file):
        """Writes a report with the test results for the current assembly

        Parameters
        ----------
        output_file : str
            Name of the output report file.

        """

        logger.debug("Writing the assembly report into: {}".format(
            output_file))
        with open(output_file, "w") as fh:

            for contig_id, vals in self.report.items():
                fh.write("{}, {}\\n".format(contig_id, vals))


@MainWrapper
def main(sample_id, assembly_file, minsize):
    """Main executor of the process_viral_assembly template.

    Parses the assembly, checks the number of complete ORFs and the
    assembly size against the expected size and writes the ``.warnings``,
    ``.report.json`` and ``.status`` files in the current working directory.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    assembly_file : str
        Path to the fasta file generated by the assembler. The assembler
        is reported as SPAdes when the path contains "spades" and as
        MEGAHIT otherwise.
    minsize : str
        Min contig size to be considered a complete ORF. It is also used
        as the expected genome size for the assembly size checks.

    """

    logger.info("Starting assembly file processing")
    # No warning is collected by this template at the moment, so the
    # warnings section of the report is never written.
    warnings = []
    fails = ""

    # Parse the assembly file. Both initial filter thresholds are set to 0.
    logger.info("Starting assembly parsing")
    assembly_obj = Assembly(assembly_file, 0, 0,
                            sample_id, minsize)

    if 'spades' in assembly_file:
        assembler = "SPAdes"
    else:
        assembler = "MEGAHIT"

    with open(".warnings", "w") as warn_fh:

        # Accepted assembly size window: 80% to 150% of the expected size
        t_80 = int(minsize) * 0.8
        t_150 = int(minsize) * 1.5

        assembly_len = assembly_obj.get_assembly_length()
        logger.debug("Checking assembly length: {}".format(assembly_len))

        # When more than one check fails, every message is written to the
        # warnings file and the last one is the one kept for the report.
        if assembly_obj.nORFs < 1:
            warn_msg = "No complete ORFs found."
            warn_fh.write(warn_msg)
            fails = warn_msg

        # No contig filter is re-applied by this template, hence the size
        # is checked a single time against the lower threshold.
        if assembly_len < t_80:

            warn_msg = "Assembly size smaller than the minimum" \
                       " threshold of 80% of expected genome size: {}".format(
                            assembly_len)
            logger.warning(warn_msg)
            warn_fh.write(warn_msg)
            fails = warn_msg

        if assembly_len > t_150:

            warn_msg = "Assembly size ({}) larger than the maximum" \
                       " threshold of 150% of expected genome size.".format(
                            assembly_len)
            logger.warning(warn_msg)
            warn_fh.write(warn_msg)
            fails = warn_msg

    # Write json report
    with open(".report.json", "w") as json_report:
        json_dic = {
            "tableRow": [{
                "sample": sample_id,
                "data": [
                    {"header": "Contigs ({})".format(assembler),
                     "value": len(assembly_obj.contigs),
                     "table": "assembly",
                     "columnBar": True},
                    {"header": "Assembled BP ({})".format(assembler),
                     "value": assembly_len,
                     "table": "assembly",
                     "columnBar": True},
                    {"header": "ORFs",
                     "value": assembly_obj.nORFs,
                     "table": "assembly",
                     "columnBar": False}
                ]
            }],
        }

        if warnings:
            json_dic["warnings"] = [{
                "sample": sample_id,
                "table": "assembly",
                "value": warnings
            }]

        if fails:
            json_dic["fail"] = [{
                "sample": sample_id,
                "table": "assembly",
                "value": [fails]
            }]

        json_report.write(json.dumps(json_dic, separators=(",", ":")))

    # NOTE(review): the status is "pass" even when a failure was recorded
    # above -- confirm this is intended.
    with open(".status", "w") as status_fh:
        status_fh.write("pass")


# Script entry point: run main() with the inputs defined at module level.
if __name__ == '__main__':

    main(SAMPLE_ID, ASSEMBLY, MINSIZE)


================================================
FILE: flowcraft/templates/skesa.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to execute Skesa on paired-end FastQ files.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : Sample Identification string.
    - e.g.: ``'SampleA'``
- ``fastq_pair`` : Pair of FastQ file paths.
    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``
- ``clear`` : If 'true', remove the input fastq files at the end of the
    component run, IF THE FILES ARE IN THE WORK DIRECTORY

Generated output
----------------

- ``${sample_id}_*.assembly.fasta`` : Main output of skesa with the assembly
    - e.g.: ``sample_1_skesa.fasta``
- ``clear`` : If 'true', remove the input fastq files at the end of the
    component run, IF THE FILES ARE IN THE WORK DIRECTORY

Code documentation
------------------

"""

# Template metadata: version, build stamp and name of the associated template.
__version__ = "1.0.2"
__build__ = "29062018"
__template__ = "skesa-nf"

import os
import re
import subprocess

from subprocess import PIPE

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


def __get_version_skesa():
    """Query the ``skesa`` executable for its version string.

    The version is extracted from the STDERR of ``skesa --version`` with a
    regular expression.

    Returns
    -------
    dict
        Dictionary with the keys ``program`` (always ``"skesa"``) and
        ``version``. The version is ``"undefined"`` when the executable
        cannot be run, its output cannot be decoded, or the output does not
        match the expected pattern.
    """

    version = "undefined"

    try:
        proc = subprocess.Popen(["skesa", "--version"],
                                stdout=PIPE, stderr=PIPE)
        stderr_text = proc.communicate()[1].decode("utf8")

        found = re.search("v((\\..*))-", stderr_text)
        if found is not None:
            version = found.group(1)

    except Exception as e:
        # Best effort only: a missing version must not abort the template
        logger.debug(e)
        version = "undefined"

    return {"program": "skesa", "version": version}


# Module-level parameters. This block only runs when the script file name
# ends with ".command.sh"; the quoted placeholders are presumably replaced
# with the actual values by the Nextflow template engine before execution
# (TODO confirm). The resulting values are logged at debug level.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    FASTQ_PAIR = '$fastq_pair'.split()
    CLEAR = '$clear'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
    logger.debug("CLEAR: {}".format(CLEAR))


def clean_up(fastq):
    """Remove temporary input fastq files.

    Each path is resolved with :py:func:`os.path.realpath`, so for symlinks
    it is the link target that gets removed. A file is only deleted when its
    resolved path matches the ``.../work/<2 chars>/<30 chars>/...`` pattern;
    every resolved path is logged, whether or not it is removed.

    Parameters
    ----------
    fastq : list
        List of fastq files.
    """

    # Lazy resolution: each path is resolved right before it is handled
    resolved_paths = (os.path.realpath(path) for path in fastq)

    for target in resolved_paths:
        logger.debug("Removing temporary fastq file path: {}".format(target))
        if re.match(".*/work/.{2}/.{30}/.*", target) is None:
            continue
        os.remove(target)


@MainWrapper
def main(sample_id, fastq_pair, clear):
    """Main executor of the skesa template.

    Runs skesa on a pair of FastQ files with its STDOUT redirected to the
    assembly file, optionally removes the input files and records the
    outcome in the ``.status`` file.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    fastq_pair : list
        Two element list containing the paired FastQ files.
    clear : str
        Can be either 'true' or 'false'. If 'true', the input fastq files will
        be removed at the end of the run, IF they are in the working directory

    Raises
    ------
    SystemExit
        With the skesa return code, when skesa finishes with a non-zero
        return code (after ``error`` is written to ``.status``).
    """

    logger.info("Starting skesa")

    # Determine output file
    if "_trim." in fastq_pair[0]:
        sample_id += "_trim"
    version = __get_version_skesa()["version"]
    output_file = "{}_skesa{}.fasta".format(sample_id, version.replace(".", ""))

    cli = [
        "skesa",
        "--fastq",
        "{},{}".format(fastq_pair[0], fastq_pair[1]),
        "--gz",
        "--use_paired_ends",
        "--cores",
        "${task.cpus}"
    ]

    logger.debug("Running Skesa subprocess with command: {}".format(cli))

    # The STDOUT of skesa is redirected to the output file. Keep the file
    # open until the subprocess has finished.
    with open(output_file, "w") as fh:
        p = subprocess.Popen(cli, stdout=fh, stderr=PIPE)
        stdout, stderr = p.communicate()

    # Attempt to decode STDERR output from bytes. If unsuccessful, coerce to
    # string
    try:
        stderr = stderr.decode("utf8")
    except (UnicodeDecodeError, AttributeError):
        stderr = str(stderr)
    # STDOUT is not piped (it goes to the output file), so communicate()
    # returns None for it; log its string form as before.
    stdout = str(stdout)

    logger.info("Finished Skesa subprocess with STDOUT:\\n"
                "======================================\\n{}".format(stdout))
    logger.info("Finished Skesa subprocess with STDERR:\\n"
                "======================================\\n{}".format(stderr))
    logger.info("Finished Skesa with return code: {}".format(
        p.returncode))

    # Remove input fastq files when clear option is specified.
    # The output file is created by this function before skesa starts, so its
    # existence alone does not prove success: also require a zero return
    # code, otherwise a failed run would destroy its own input.
    if clear == "true" and p.returncode == 0 and \
            os.path.exists(output_file):
        clean_up(fastq_pair)

    with open(".status", "w") as fh:
        if p.returncode != 0:
            fh.write("error")
            raise SystemExit(p.returncode)
        else:
            fh.write("pass")


# Script entry point: run the template with the module-level parameters
# assigned in the ".command.sh" block above.
if __name__ == '__main__':

    main(SAMPLE_ID, FASTQ_PAIR, CLEAR)


================================================
FILE: flowcraft/templates/spades.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to execute Spades on paired-end FastQ files.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : Sample Identification string.
    - e.g.: ``'SampleA'``
- ``fastq_pair`` : Pair of FastQ file paths.
    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``
- ``kmers`` : Setting for Spades kmers. Can be either ``'auto'``, \
    ``'default'`` or a user provided list.
    - e.g.: ``'auto'`` or ``'default'`` or ``'55 77 99 113 127'``
- ``opts`` : List of options for spades execution.
    1. The minimum number of reads to consider an edge in the de Bruijn \
    graph during the assembly.
        - e.g.: ``'5'``
    2. Minimum contigs k-mer coverage.
        - e.g.: ``['2' '2']``
- ``clear`` : If 'true', remove the input fastq files at the end of the
    component run, IF THE FILES ARE IN THE WORK DIRECTORY

Generated output
----------------

- ``contigs.fasta`` : Main output of spades with the assembly
    - e.g.: ``contigs.fasta``
- ``spades_status`` :  Stores the status of the spades run. If it was \
    successfully executed, it stores ``'pass'``. Otherwise, it stores the\
    ``STDERR`` message.
    - e.g.: ``'pass'``

Code documentation
------------------

"""

__version__ = "1.0.2"
__build__ = "29062018"
__template__ = "spades-nf"

import os
import sys
import re
import subprocess

from subprocess import PIPE

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


def __get_version_spades():
    """Query ``spades.py`` for its version string.

    The version is taken from the last whitespace-separated token of the
    STDOUT of ``spades.py --version``, with its first character dropped
    (presumably a leading "v" -- confirm against the SPAdes output).

    Returns
    -------
    dict
        Dictionary with the keys ``program`` (always ``"SPAdes"``) and
        ``version``. The version is ``"undefined"`` when anything goes wrong
        while running the executable or parsing its output.
    """

    try:
        proc = subprocess.Popen(["spades.py", "--version"],
                                stdout=PIPE, stderr=PIPE)
        raw_out = proc.communicate()[0]

        last_token = raw_out.strip().split()[-1]
        version = last_token[1:].decode("utf8")

    except Exception as e:
        # Best effort only: a missing version must not abort the template
        logger.debug(e)
        version = "undefined"

    return {"program": "SPAdes", "version": version}


# Module-level parameters. This block only runs when the script file name
# ends with ".command.sh"; the quoted placeholders are presumably replaced
# with the actual values by the Nextflow template engine before execution
# (TODO confirm). The resulting values are logged at debug level.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    FASTQ_PAIR = '$fastq_pair'.split()
    MAX_LEN = int('$max_len'.strip())
    KMERS = '$kmers'.strip()
    CLEAR = '$clear'
    DISABLE_RR = '$disable_rr'
    # The options arrive as a bracketed, comma separated list
    OPTS = [x.strip() for x in '$opts'.strip("[]").split(",")]
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
    logger.debug("MAX_LEN: {}".format(MAX_LEN))
    logger.debug("KMERS: {}".format(KMERS))
    logger.debug("OPTS: {}".format(OPTS))
    logger.debug("CLEAR: {}".format(CLEAR))
    logger.debug("DISABLE_RR: {}".format(DISABLE_RR))


def set_kmers(kmer_opt, max_read_len):
    """Returns a kmer list based on the provided kmer option and max read len.

    Parameters
    ----------
    kmer_opt : str
        The k-mer option. Can be either ``'auto'``, ``'default'`` or a
        sequence of integers separated by spaces and/or commas, e.g.
        ``'23 45 67'`` or ``'23, 45, 67'``. A single integer is also
        accepted.
    max_read_len : int
        The maximum read length of the current sample.

    Returns
    -------
    kmers : list
        List of k-mer values that will be provided to Spades. Integers for
        the ``'auto'`` option, strings when the values were provided by the
        user, and an empty list when SPAdes should choose by itself.

    """

    logger.debug("Kmer option set to: {}".format(kmer_opt))

    # Check if kmer option is set to auto
    if kmer_opt == "auto":

        if max_read_len >= 175:
            kmers = [55, 77, 99, 113, 127]
        else:
            kmers = [21, 33, 55, 67, 77]

        logger.debug("Kmer range automatically selected based on max read "
                     "length of {}: {}".format(max_read_len, kmers))
        return kmers

    # Accept commas as separators too, so that '23, 45, 67' and '23,45,67'
    # do not end up as empty k-mer entries or get silently ignored
    tokens = kmer_opt.replace(",", " ").split()

    # Check if manual kmers were specified. A lone token only counts when it
    # is a number, so keywords such as 'default' still fall through.
    if len(tokens) > 1 or (tokens and tokens[0].isdigit()):

        kmers = tokens
        logger.debug("Kmer range manually set to: {}".format(kmers))

    else:

        kmers = []
        logger.debug("Kmer range set to empty (will be automatically "
                     "determined by SPAdes)")

    return kmers


def clean_up(fastq):
    """Delete the temporary input fastq files.

    Paths are resolved through symlinks first, so it is the link source
    that is removed. Only resolved paths that match the
    ``.../work/<2 chars>/<30 chars>/...`` pattern are deleted; the resolved
    path is logged in every case.

    Parameters
    ----------
    fastq : list
        List of fastq files.
    """

    work_dir_pattern = ".*/work/.{2}/.{30}/.*"

    for fastq_path in fastq:
        real_path = os.path.realpath(fastq_path)
        logger.debug(
            "Removing temporary fastq file path: {}".format(real_path))
        in_work_dir = re.match(work_dir_pattern, real_path)
        if in_work_dir:
            os.remove(real_path)


@MainWrapper
def main(sample_id, fastq_pair, max_len, kmer, opts, clear, disable_rr):
    """Main executor of the spades template.

    Builds and runs the SPAdes command line, writes the outcome to the
    ``.status`` file, renames ``contigs.fasta`` to a sample specific name and
    optionally removes the input files.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    fastq_pair : list
        Two element list containing the paired FastQ files.
    max_len : int
        Maximum read length. This value is determined in
        :py:class:`templates.integrity_coverage`
    kmer : str
        Can be either ``'auto'``, ``'default'`` or a
        sequence of space separated integers, ``'23, 45, 67'``.
    opts : list
        Two element list of options for spades execution: the coverage
        cutoff and the minimum contigs k-mer coverage.
    clear : str
        Can be either 'true' or 'false'. If 'true', the input fastq files will
        be removed at the end of the run, IF they are in the working directory
    disable_rr : str
        Can either be 'true' or 'false'. If 'true', disables repeat resolution
        stage of assembling
    """

    logger.info("Starting spades")

    # NOTE: only the first option is used on the command line; the second
    # one is unpacked so that a malformed options list still fails early.
    min_coverage, min_kmer_coverage = opts

    logger.info("Setting SPAdes kmers")
    kmers = set_kmers(kmer, max_len)
    logger.info("SPAdes kmers set to: {}".format(kmers))

    cli = [
        "spades.py",
        "--careful",
        "--only-assembler",
        "--threads",
        "$task.cpus",
        "--cov-cutoff",
        min_coverage,
        "-o",
        "."
    ]

    # Add kmers, if any were specified. The flag and its value are passed as
    # separate arguments, as they would be after shell word splitting.
    if kmers:
        cli += ["-k", ",".join(str(x) for x in kmers)]

    # Add FastQ files
    cli += [
        "-1",
        fastq_pair[0],
        "-2",
        fastq_pair[1]
    ]

    # Disable RR?
    if disable_rr == 'true':
        cli += ['--disable-rr']

    logger.debug("Running SPAdes subprocess with command: {}".format(cli))

    p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()

    # Attempt to decode STDERR output from bytes. If unsuccessful, coerce to
    # string
    try:
        stderr = stderr.decode("utf8")
        stdout = stdout.decode("utf8")
    except (UnicodeDecodeError, AttributeError):
        stderr = str(stderr)
        stdout = str(stdout)

    logger.info("Finished SPAdes subprocess with STDOUT:\\n"
                "======================================\\n{}".format(stdout))
    logger.info("Finished SPAdes subprocess with STDERR:\\n"
                "======================================\\n{}".format(stderr))
    logger.info("Finished SPAdes with return code: {}".format(
        p.returncode))

    with open(".status", "w") as fh:
        if p.returncode != 0:
            fh.write("error")
            sys.exit(p.returncode)
        else:
            fh.write("pass")

    # Change the default contigs.fasta assembly name to a more informative one
    if "_trim." in fastq_pair[0]:
        sample_id += "_trim"
    # Get spades version for output name
    info = __get_version_spades()

    assembly_file = "{}_spades{}.fasta".format(
        sample_id, info["version"].replace(".", ""))
    os.rename("contigs.fasta", assembly_file)
    logger.info("Setting main assembly file to: {}".format(assembly_file))

    # Remove input fastq files when clear option is specified.
    # Only remove temporary input when the expected output exists.
    if clear == "true" and os.path.exists(assembly_file):
        clean_up(fastq_pair)


# Script entry point: run the template with the module-level parameters
# assigned in the ".command.sh" block above.
if __name__ == '__main__':
    main(SAMPLE_ID, FASTQ_PAIR, MAX_LEN, KMERS, OPTS, CLEAR, DISABLE_RR)


================================================
FILE: flowcraft/templates/split_fasta.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module intends to split a multifasta file into separate fasta files.

If no sequence is larger than min_contig_size, returns the original assembly.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : Sample Identification string.
    - e.g.: ``'SampleA'``
- ``fasta`` : A fasta file path.
    - e.g.: ``'SampleA.fasta'``
- ``min_contig_size`` : A minimum contig length
    - e.g.: ``'1000'``

Generated output
----------------

-  A fasta file per contig (given the minimum contig size)
"""

__version__ = "0.0.3"
__build__ = "19122018"
__template__ = "split_assembly-nf"

import os
from itertools import groupby
from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)

# Module-level parameters. This block only runs when the script file name
# ends with ".command.sh"; the quoted placeholders are presumably replaced
# with the actual values by the Nextflow template engine before execution
# (TODO confirm). The resulting values are logged at debug level.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    ASSEMBLY = '$assembly'
    # The minimum contig size is converted to an integer right away
    MIN_SIZE = int('$min_contig_size'.strip())
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("ASSEMBLY: {}".format(ASSEMBLY))
    logger.debug("MIN_SIZE: {}".format(MIN_SIZE))

@MainWrapper
def main(sample_id, assembly, min_size):
    """Main executor of the split_fasta template.

    Writes one fasta file per sequence whose length is at least ``min_size``.
    When no sequence qualifies, the original assembly is copied to
    ``<sample_id>.fasta`` instead.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    assembly : str
        Path to the assembly (multifasta) file.
    min_size : int
        Minimum contig size."""

    logger.info("Starting script")

    success = 0

    # Plain text mode already provides universal newlines; the "U" mode flag
    # is rejected by recent Python versions.
    with open(assembly) as f_open:

        # Alternating groups of header lines and sequence lines
        entry = (x[1] for x in groupby(f_open, lambda line: line[0] == ">"))

        for header in entry:

            header_str = next(header)[1:].strip()
            # A header at the very end of the file has no sequence group;
            # treat it as an empty sequence instead of failing
            seq = "".join(s.strip() for s in next(entry, ()))
            if len(seq) >= min_size:
                contig_name = sample_id + "_" + \
                    header_str.replace(" ", "_").replace("=", "_")
                with open(contig_name + '.fasta', "w") as output_file:
                    output_file.write(
                        ">" + contig_name + "\\n" + seq + "\\n")
                success += 1

        if success < 1:
            # The handle was consumed by the loop above: rewind it, otherwise
            # the copy of the original assembly would be empty
            f_open.seek(0)
            with open(sample_id + ".fasta", "w") as logfile:
                for x in f_open:
                    logfile.write(x)

    logger.info("{} sequences sucessfully splitted.".format(success))


# Script entry point: run the template with the module-level parameters
# assigned in the ".command.sh" block above.
if __name__ == '__main__':
    main(SAMPLE_ID, ASSEMBLY, MIN_SIZE)

================================================
FILE: flowcraft/templates/trimmomatic.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to execute trimmomatic on paired-end FastQ files.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : Sample Identification string.
    - e.g.: ``'SampleA'``
- ``fastq_pair`` : Pair of FastQ file paths.
    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``
- ``trim_range`` : Crop range detected using FastQC.
    - e.g.: ``'15 151'``
- ``opts`` : List of options for trimmomatic
    - e.g.: ``'["5:20", "3", "3", "55"]'``
    - e.g.: ``'[trim_sliding_window, trim_leading, trim_trailing, trim_min_length]'``
- ``phred`` : List of guessed phred values for each sample
    - e.g.: ``'[SampleA: 33, SampleB: 33]'``
- ``clear`` : If 'true', remove the input fastq files at the end of the
    component run, IF THE FILES ARE IN THE WORK DIRECTORY

Generated output
----------------

The generated output are output files that contain an object, usually a string.
(Values within ``${}`` are substituted by the corresponding variable.)

- ``${sample_id}_*P*``: Pair of paired FastQ files generated by Trimmomatic
    - e.g.: ``'SampleA_1_P.fastq.gz SampleA_2_P.fastq.gz'``
- ``trimmomatic_status``: Stores the status of the trimmomatic run. If it was\
    successfully executed, it stores 'pass'. Otherwise, it stores the \
    ``STDERR`` message.
    - e.g.: ``'pass'``

Code documentation
------------------

"""

# TODO: More control over read trimming
# TODO: Add option to remove adapters
# TODO: What to do when there is encoding failure

__version__ = "1.0.3"
__build__ = "29062018"
__template__ = "trimmomatic-nf"

import os
import re
import json
import fileinput
import subprocess
import tempfile

from subprocess import PIPE
from collections import OrderedDict

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


def __get_version_trimmomatic():
    """Query the Trimmomatic jar for its version string.

    Runs ``java -jar TRIM_PATH -version`` and uses the stripped STDOUT as the
    version.

    Returns
    -------
    dict
        Dictionary with the keys ``program`` (always ``"Trimmomatic"``) and
        ``version``. The version is ``"undefined"`` when anything goes wrong
        while running java or decoding its output.
    """

    try:
        proc = subprocess.Popen(["java", "-jar", TRIM_PATH, "-version"],
                                stdout=PIPE, stderr=PIPE)
        raw_out = proc.communicate()[0]
        version = raw_out.strip().decode("utf8")

    except Exception as e:
        # Best effort only: a missing version must not abort the template
        logger.debug(e)
        version = "undefined"

    return {"program": "Trimmomatic", "version": version}


# Module-level parameters. This block only runs when the script file name
# ends with ".command.sh"; the quoted placeholders are presumably replaced
# with the actual values by the Nextflow template engine before execution
# (TODO confirm). The resulting values are logged at debug level.
if __file__.endswith(".command.sh"):
    SAMPLE_ID = '$sample_id'
    FASTQ_PAIR = '$fastq_pair'.split()
    TRIM_RANGE = '$trim_range'.split()
    # The options arrive as a bracketed, comma separated list
    TRIM_OPTS = [x.strip() for x in '$opts'.strip("[]").split(",")]
    PHRED = '$phred'
    ADAPTERS_FILE = '$ad'
    CLEAR = '$clear'

    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
    logger.debug("TRIM_RANGE: {}".format(TRIM_RANGE))
    logger.debug("TRIM_OPTS: {}".format(TRIM_OPTS))
    logger.debug("PHRED: {}".format(PHRED))
    logger.debug("ADAPTERS_FILE: {}".format(ADAPTERS_FILE))
    logger.debug("CLEAR: {}".format(CLEAR))

# Hard-coded locations of the Trimmomatic jar and of its adapters directory
# (presumably paths inside the container image -- confirm).
TRIM_PATH = "/NGStools/Trimmomatic-0.36/trimmomatic.jar"
ADAPTERS_PATH = "/NGStools/Trimmomatic-0.36/adapters"


def parse_log(log_file):
    """Summarise the trimming statistics of one Trimmomatic log file.

    Only the last four whitespace-separated fields of every line are used,
    read as integers: read length after trimming, amount trimmed from the
    start, last surviving base and amount trimmed from the end.

    The returned :py:class:`OrderedDict` holds, in this order:

        - ``clean_len``: Total length after trimming.
        - ``total_trim``: Total trimmed base pairs.
        - ``total_trim_perc``: Total trimmed base pairs in percentage.
        - ``5trim``: Total base pairs trimmed at 5' end.
        - ``3trim``: Total base pairs trimmed at 3' end.
        - ``bad_reads``: Number of reads with length 0 after trimming.

    Parameters
    ----------
    log_file : str
        Path to trimmomatic log file.

    Returns
    -------
    x : :py:class:`OrderedDict`
        Object storing the trimming statistics.

    """

    clean_len = 0
    trimmed_5 = 0
    trimmed_3 = 0
    bad_reads = 0

    with open(log_file) as fh:

        for record in fh:
            values = list(map(int, record.split()[-4:]))
            surviving, head_cut, tail_cut = values[0], values[1], values[3]

            # A read with nothing left after trimming counts as bad
            if surviving == 0:
                bad_reads += 1

            trimmed_5 += head_cut
            trimmed_3 += tail_cut
            clean_len += surviving

    total_trim = trimmed_5 + trimmed_3
    total_len = clean_len + total_trim

    # Guard against an empty log, where there is nothing to divide by
    if total_len:
        trim_perc = round((total_trim / total_len) * 100, 2)
    else:
        trim_perc = 0

    return OrderedDict([
        ("clean_len", clean_len),
        ("total_trim", total_trim),
        ("total_trim_perc", trim_perc),
        ("5trim", trimmed_5),
        ("3trim", trimmed_3),
        ("bad_reads", bad_reads),
    ])


def write_report(storage_dic, output_file, sample_id):
    """Write the CSV summary and the JSON report of the trimming statistics.

    One CSV row is written to ``output_file`` and one JSON object is written
    to ``.report.json`` (in the current directory) per entry of
    ``storage_dic``.

    Parameters
    ----------
    storage_dic : dict or :py:class:`OrderedDict`
        Storage containing the trimming statistics. See :py:func:`parse_log`
        for its generation.
    output_file : str
        Path where the output file will be generated.
    sample_id : str
        Sample identifier used in the JSON report entries.
    """

    csv_header = ("Sample,Total length,Total trimmed,%,5end Trim,3end Trim,"
                  "bad_reads\\n")

    with open(output_file, "w") as fh, open(".report.json", "w") as json_rep:

        fh.write(csv_header)

        for sample, vals in storage_dic.items():
            # CSV row: sample name followed by every statistic, in order
            row_values = ",".join(str(x) for x in vals.values())
            fh.write("{},{}\\n".format(sample, row_values))

            table_entry = {
                "header": "Trimmed (%)",
                "value": vals["total_trim_perc"],
                "table": "qc",
                "columnBar": True,
            }
            json_dic = {
                "tableRow": [{"sample": sample_id, "data": [table_entry]}],
                "plotData": [{
                    "sample": sample_id,
                    "data": {"sparkline": vals["clean_len"]},
                }],
                "badReads": vals["bad_reads"],
            }
            json_rep.write(json.dumps(json_dic, separators=(",", ":")))


def trimmomatic_log(log_file, sample_id):
    """Parse the Trimmomatic log of one sample and write its reports.

    After parsing, the log file and the directory that contains it are
    removed, and the statistics are written with :py:func:`write_report`
    to ``trimmomatic_report.csv``.

    Parameters
    ----------
    log_file : str
        Path to the trimmomatic log file.
    sample_id : str
        Sample Identification string.
    """

    stats_by_sample = OrderedDict()
    stats_by_sample[sample_id] = parse_log(log_file)

    # The log lives in its own directory: delete the file first, then the
    # directory that held it (rmdir requires the directory to be empty)
    log_dir = os.path.dirname(log_file)
    os.remove(log_file)
    os.rmdir(log_dir)

    write_report(stats_by_sample, "trimmomatic_report.csv", sample_id)


def clean_up(fastq_pairs, clear):
    """Cleans the working directory of unwanted temporary files.

    Files in the current directory ending in ``_U.fastq.gz`` are always
    removed. The input fastq files are only removed when ``clear`` is
    ``"true"``, exactly two ``_trim.fastq.gz`` files exist in the current
    directory, and the resolved input path matches the
    ``.../work/<2 chars>/<30 chars>/...`` pattern.

    Parameters
    ----------
    fastq_pairs : list
        List of input fastq files.
    clear : str
        Can be either 'true' or 'false'.
    """

    # Remove unpaired fastq files, if any
    for fname in os.listdir("."):
        if fname.endswith("_U.fastq.gz"):
            os.remove(fname)

    # Expected output to assess whether it is safe to remove temporary input
    n_trimmed = sum(
        1 for fname in os.listdir(".") if fname.endswith("_trim.fastq.gz"))

    if clear != "true" or n_trimmed != 2:
        return

    for fastq_path in fastq_pairs:
        # Get real path of fastq files, following symlinks
        real_path = os.path.realpath(fastq_path)
        logger.debug(
            "Removing temporary fastq file path: {}".format(real_path))
        if re.match(".*/work/.{2}/.{30}/.*", real_path):
            os.remove(real_path)


def merge_default_adapters():
    """Merges the default adapters file in the trimmomatic adapters directory

    Every file found in ``ADAPTERS_PATH`` is concatenated into
    ``default_adapters.fasta`` in the current working directory. A line
    terminator is appended after every input line.

    Returns
    -------
    str
        Path with the merged adapters file.
    """

    adapter_files = []
    for name in os.listdir(ADAPTERS_PATH):
        adapter_files.append(os.path.join(ADAPTERS_PATH, name))

    merged_path = os.path.join(os.getcwd(), "default_adapters.fasta")

    with open(merged_path, "w") as out_fh:
        with fileinput.input(adapter_files) as source:
            for line in source:
                out_fh.write(line + "\\n")

    return merged_path


@MainWrapper
def main(sample_id, fastq_pair, trim_range, trim_opts, phred, adapters_file,
         clear):
    """ Main executor of the trimmomatic template.

    Builds and runs the Trimmomatic paired-end command line, parses the
    resulting trim log into the report files, cleans up temporary files and
    writes ``pass`` or ``fail`` to the ``.status`` file.

    Parameters
    ----------
    sample_id : str
        Sample Identification string.
    fastq_pair : list
        Two element list containing the paired FastQ files.
    trim_range : list
        Two element list containing the trimming range.
    trim_opts : list
        Four element list containing several trimmomatic options:
        [*SLIDINGWINDOW*; *LEADING*; *TRAILING*; *MINLEN*]
    phred : int
        Guessed phred score for the sample. The phred score is a generated
        output from :py:class:`templates.integrity_coverage`.
    adapters_file : str
        Path to adapters file. If not provided, or the path is not available,
        the default adapters from Trimmomatic will be used
    clear : str
        Can be either 'true' or 'false'. If 'true', the input fastq files will
        be removed at the end of the run, IF they are in the working directory
    """

    logger.info("Starting trimmomatic")

    # Create base CLI
    cli = [
        "java",
        "-Xmx{}".format("$task.memory"[:-1].lower().replace(" ", "")),
        "-jar",
        TRIM_PATH.strip(),
        "PE",
        "-threads",
        "$task.cpus"
    ]

    # If the phred encoding was detected, provide it
    try:
        # Check if the provided PHRED can be converted to int
        phred = int(phred)
        phred_flag = "-phred{}".format(str(phred))
        cli += [phred_flag]
    # Could not detect phred encoding. Do not add explicit encoding to
    # trimmomatic and let it guess
    except ValueError:
        pass

    # Add input samples to CLI
    cli += fastq_pair

    # Add output file names: a paired ("trim") and an unpaired ("U") file per
    # input file. The names are built from the sample_id argument, not from
    # the module-level constant, so the function honours its own parameter.
    output_names = []
    for idx, _ in enumerate(fastq_pair, 1):
        output_names.append("{}_{}_trim.fastq.gz".format(sample_id, idx))
        output_names.append("{}_{}_U.fastq.gz".format(sample_id, idx))
    cli += output_names

    if trim_range != ["None"]:
        cli += [
            "CROP:{}".format(trim_range[1]),
            "HEADCROP:{}".format(trim_range[0]),
        ]

    if os.path.exists(adapters_file):
        logger.debug("Using the provided adapters file '{}'".format(
            adapters_file))
    else:
        logger.debug("Adapters file '{}' not provided or does not exist. Using"
                     " default adapters".format(adapters_file))
        adapters_file = merge_default_adapters()

    cli += [
        "ILLUMINACLIP:{}:3:30:10:6:true".format(adapters_file)
    ]

    # Create the log file in a temporary dir to avoid issues when running on
    # a docker container in macOS
    logfile = os.path.join(tempfile.mkdtemp(prefix='tmp'),
                           "{}_trimlog.txt".format(sample_id))

    # Add trimmomatic options
    cli += [
        "SLIDINGWINDOW:{}".format(trim_opts[0]),
        "LEADING:{}".format(trim_opts[1]),
        "TRAILING:{}".format(trim_opts[2]),
        "MINLEN:{}".format(trim_opts[3]),
        "TOPHRED33",
        "-trimlog",
        logfile
    ]

    logger.debug("Running trimmomatic subprocess with command: {}".format(cli))

    p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()

    # Attempt to decode STDOUT and STDERR output from bytes. If unsuccessful,
    # coerce to string
    try:
        stderr = stderr.decode("utf8")
        stdout = stdout.decode("utf8")
    except (UnicodeDecodeError, AttributeError):
        stderr = str(stderr)
        stdout = str(stdout)

    logger.info("Finished trimmomatic subprocess with STDOUT:\\n"
                "======================================\\n{}".format(stdout))
    logger.info("Finished trimmomatic subprocess with STDERR:\\n"
                "======================================\\n{}".format(stderr))
    logger.info("Finished trimmomatic with return code: {}".format(
        p.returncode))

    trimmomatic_log(logfile, sample_id)

    if p.returncode == 0 and os.path.exists("{}_1_trim.fastq.gz".format(
            sample_id)):
        clean_up(fastq_pair, clear)

    # Check if trimmomatic ran successfully. If not, write the error message
    # to the status channel and exit.
    with open(".status", "w") as status_fh:
        if p.returncode != 0:
            status_fh.write("fail")
            return
        else:
            status_fh.write("pass")


# Script entry point: run the template with the module-level parameters
# assigned in the ".command.sh" block above.
if __name__ == '__main__':

    main(SAMPLE_ID, FASTQ_PAIR, TRIM_RANGE, TRIM_OPTS, PHRED, ADAPTERS_FILE,
         CLEAR)


================================================
FILE: flowcraft/templates/trimmomatic_report.py
================================================
#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to parse the results of the Trimmomatic log for a set
of one or more samples.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``log_files``: Trimmomatic log files.
    - e.g.: ``'Sample1_trimlog.txt Sample2_trimlog.txt'``


Generated output
----------------
- ``trimmomatic_report.csv`` : Summary report of the trimmomatic logs for\
    all samples

Code documentation
------------------

"""

__version__ = "1.0.0"
__build__ = "16012018"
__template__ = "trimmomatic_report-nf"

import os
import json

from collections import OrderedDict

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


# Module-level parameter. This block only runs when the script file name ends
# with ".command.sh"; the quoted placeholder is presumably replaced with the
# whitespace separated log file paths by the Nextflow template engine before
# execution (TODO confirm).
if __file__.endswith(".command.sh"):
    LOG_FILES = '$log_files'.split()


def parse_log(log_file):
    """Collect trimming statistics from a single Trimmomatic log file.

    For every line, the last four whitespace-separated fields are read as
    integers: read length after trimming, amount trimmed from the start,
    last surviving base and amount trimmed from the end.

    The statistics are returned in an :py:class:`OrderedDict` with the
    following keys, in this order:

        - ``clean_len``: Total length after trimming.
        - ``total_trim``: Total trimmed base pairs.
        - ``total_trim_perc``: Total trimmed base pairs in percentage.
        - ``5trim``: Total base pairs trimmed at 5' end.
        - ``3trim``: Total base pairs trimmed at 3' end.
        - ``bad_reads``: Number of reads with length 0 after trimming.

    Parameters
    ----------
    log_file : str
        Path to trimmomatic log file.

    Returns
    -------
    x : :py:class:`OrderedDict`
        Object storing the trimming statistics.

    """

    stat_names = ("clean_len", "total_trim", "total_trim_perc",
                  "5trim", "3trim", "bad_reads")
    stats = OrderedDict((name, 0) for name in stat_names)

    with open(log_file) as handle:

        for entry in handle:
            numbers = [int(token) for token in entry.split()[-4:]]
            kept = numbers[0]
            cut_start = numbers[1]
            cut_end = numbers[3]

            # Completely trimmed reads are counted as bad reads
            if kept == 0:
                stats["bad_reads"] += 1

            stats["clean_len"] += kept
            stats["5trim"] += cut_start
            stats["3trim"] += cut_end
            stats["total_trim"] += cut_start + cut_end

    total_len = stats["clean_len"] + stats["total_trim"]

    # An empty log has no bases at all: keep the percentage at 0
    stats["total_trim_perc"] = round(
        (stats["total_trim"] / total_len) * 100, 2) if total_len else 0

    return stats


def write_report(storage_dic, output_file, sample_id):
    """Write the CSV summary and the JSON report for the parsed samples.

    A header plus one row per entry of ``storage_dic`` is written to
    ``output_file``; one JSON object per entry is written to
    ``.report.json`` in the current directory.

    Parameters
    ----------
    storage_dic : dict or :py:class:`OrderedDict`
        Storage containing the trimming statistics. See :py:func:`parse_log`
        for its generation.
    output_file : str
        Path where the output file will be generated.
    sample_id : str
        Id or name of the current sample.
    """

    with open(output_file, "w") as csv_fh, \
            open(".report.json", "w") as json_fh:

        # Write header
        csv_fh.write("Sample,Total length,Total trimmed,%,5end Trim,"
                     "3end Trim,bad_reads\\n")

        # Write contents
        for sample in storage_dic:
            stats = storage_dic[sample]

            joined_stats = ",".join(map(str, stats.values()))
            csv_fh.write("{},{}\\n".format(sample, joined_stats))

            trimmed_cell = {"header": "trimmed",
                            "value": stats["total_trim_perc"],
                            "table": "qc",
                            "columnBar": True}
            sparkline = {"sample": sample_id,
                         "data": {"sparkline": stats["clean_len"]}}
            report = {
                "tableRow": [{"sample": sample_id, "data": [trimmed_cell]}],
                "plotData": [sparkline],
                "badReads": stats["bad_reads"],
            }
            json_fh.write(json.dumps(report, separators=(",", ":")))


@MainWrapper
def main(log_files):
    """ Main executor of the trimmomatic_report template.

    Each log file is parsed into the storage, removed from disk, and the
    combined statistics are then written to "trimmomatic_report.csv".
    The id derived from the last log file is passed on as the sample id
    of the report.

    Parameters
    ----------
    log_files : list
        List of paths to the trimmomatic log files.
    """

    log_suffix = "_trimlog.txt"
    log_storage = OrderedDict()

    for log in log_files:

        # Remove the suffix as a whole. str.rstrip would instead strip any
        # trailing character found in the suffix, truncating ids such as
        # "ecoli" to "ec".
        if log.endswith(log_suffix):
            log_id = log[:-len(log_suffix)]
        else:
            log_id = log

        # Populate storage of current sample
        log_storage[log_id] = parse_log(log)

        # Remove temporary trim log file
        os.remove(log)

    write_report(log_storage, "trimmomatic_report.csv", log_id)


# Run the report only when this file is executed as a script.
# NOTE(review): LOG_FILES is defined outside the visible part of this
# template; confirm where it is set.
if __name__ == '__main__':

    main(LOG_FILES)


================================================
FILE: flowcraft/tests/__init__.py
================================================


================================================
FILE: flowcraft/tests/broadcast_tests/empty_log.txt
================================================


================================================
FILE: flowcraft/tests/broadcast_tests/log_with_command.txt
================================================
Log with command
nextflow run file.nf -profile docker

================================================
FILE: flowcraft/tests/broadcast_tests/log_with_command_regex.txt
================================================
Log with command - different chars in path
/usr/local/bin/nextflow run /mnt/innuendo_storage/users/bgoncalves/jobs/2-3/test.nf -profile incd -resume

================================================
FILE: flowcraft/tests/broadcast_tests/log_without_command.txt
================================================
Test for log file without command

================================================
FILE: flowcraft/tests/data_pipelines.py
================================================
def _con(in_process, in_lane, out_process, out_lane):
    """Return one connection dict from an input process/lane to an output
    process/lane."""
    return {"input": {"process": in_process, "lane": in_lane},
            "output": {"process": out_process, "lane": out_lane}}


# Each entry pairs a pipeline string with a list of connections.
# NOTE(review): presumably the connections are the expected parser output
# for that string; confirm against the test that consumes this module.
pipelines = [
    ["A", [
        _con("__init__", 1, "A", 1),
    ]],
    ["A B", [
        _con("__init__", 1, "A", 1),
        _con("A", 1, "B", 1),
    ]],
    ["A B (C | D)", [
        _con("__init__", 1, "A", 1),
        _con("A", 1, "B", 1),
        _con("B", 1, "C", 2),
        _con("B", 1, "D", 3),
    ]],
    ["A B (C | D E F)", [
        _con("__init__", 1, "A", 1),
        _con("A", 1, "B", 1),
        _con("B", 1, "C", 2),
        _con("B", 1, "D", 3),
        _con("D", 3, "E", 3),
        _con("E", 3, "F", 3),
    ]],
    ["(A | B | C)", [
        _con("__init__", 0, "A", 1),
        _con("__init__", 0, "B", 2),
        _con("__init__", 0, "C", 3),
    ]],
    ["(A | B | C E (F | G))", [
        _con("__init__", 0, "A", 1),
        _con("__init__", 0, "B", 2),
        _con("__init__", 0, "C", 3),
        _con("C", 3, "E", 3),
        _con("E", 3, "F", 4),
        _con("E", 3, "G", 5),
    ]],
    ["(A (Z | X)| B | C E (F | G))", [
        _con("__init__", 0, "A", 1),
        _con("__init__", 0, "B", 2),
        _con("__init__", 0, "C", 3),
        _con("C", 3, "E", 3),
        _con("A", 1, "Z", 4),
        _con("A", 1, "X", 5),
        _con("E", 3, "F", 6),
        _con("E", 3, "G", 7),
    ]],
    ["(A (Z | X)| B(Y|H A) | C E (F | G))", [
        _con("__init__", 0, "A", 1),
        _con("__init__", 0, "B", 2),
        _con("__init__", 0, "C", 3),
        _con("C", 3, "E", 3),
        _con("A", 1, "Z", 4),
        _con("A", 1, "X", 5),
        _con("B", 2, "Y", 6),
        _con("B", 2, "H", 7),
        _con("H", 7, "A", 7),
        _con("E", 3, "F", 8),
        _con("E", 3, "G", 9),
    ]],
]


================================================
FILE: flowcraft/tests/pipeline_tests/pipe1.txt
================================================
A

================================================
FILE: flowcraft/tests/pipeline_tests/pipe2.txt
================================================
A B

================================================
FILE: flowcraft/tests/pipeline_tests/pipe3.txt
================================================
A B (
    C |
    D)

================================================
FILE: flowcraft/tests/pipeline_tests/pipe4.txt
================================================
A B (
    C |
    D E F)

================================================
FILE: flowcraft/tests/pipeline_tests/pipe5.txt
================================================
(A | B | C)

================================================
FILE: flowcraft/tests/pipeline_tests/pipe6.txt
================================================
(A | B | C E
    (F |
     G))

================================================
FILE: flowcraft/tests/pipeline_tests/pipe7.txt
================================================
(A
    (Z |
    X)|
B | C E
    (F |
    G))

================================================
FILE: flowcraft/tests/pipeline_tests/pipe8.txt
================================================
(A (
    Z |
    X)|
B(
    Y|
    H A) |
C E (
    F |
    G))

================================================
FILE: flowcraft/tests/test_assemblerflow.py
================================================
import os
import sys
import shutil
import pytest

import flowcraft.flowcraft as af


@pytest.fixture
def tmp():
    """Create a "temp" directory, hand its name to the test and delete the
    directory tree afterwards."""
    scratch_dir = "temp"

    os.mkdir(scratch_dir)
    yield scratch_dir
    shutil.rmtree(scratch_dir)


def test_check():
    """Building 'A B C' with the -c option must end with SystemExit."""
    sys.argv.append(1)

    cli_args = ["build", "-t 'A B C'", "-c", "-o teste.nf"]
    parsed = af.get_args(cli_args)

    with pytest.raises(SystemExit):
        af.build(parsed)


def test_check_invalid():
    """Building the pipeline string 'A B C()' with the -c option must end
    with SystemExit."""
    sys.argv.append(1)

    cli_args = ["build", "-t",  "'A B C()'", "-c", "-o teste.nf"]
    parsed = af.get_args(cli_args)

    with pytest.raises(SystemExit):
        af.build(parsed)


def test_build_file(tmp):
    """Build an 'integrity_coverage fastqc' pipeline into the scratch
    directory; the test passes when the build does not raise."""
    output_path = os.path.join(os.path.abspath(tmp), "teste.nf")
    sys.argv.append(1)

    cli_args = ["build", "-t", "integrity_coverage fastqc",
                "-o", output_path]
    af.build(af.get_args(cli_args))


def test_build_file_2(tmp):
    """Build with --pipeline-only and check the exact set of files and
    directories left in the scratch directory."""
    sys.argv.append(1)
    output_path = os.path.join(os.path.abspath(tmp), "teste.nf")

    cli_args = ["build", "-t integrity_coverage fastqc",
                "-o", output_path, "--pipeline-only"]
    af.build(af.get_args(cli_args))

    expected_entries = [
        ".forkTree.json",
        ".treeDag.json",
        "containers.config",
        "lib",
        "nextflow.config",
        "params.config",
        "resources.config",
        "teste.html",
        "teste.nf",
        "user.config",
    ]

    assert sorted(os.listdir(tmp)) == expected_entries


def test_build_recipe(tmp):
    """Build the 'innuca' recipe with --pipeline-only; the test passes when
    the build does not raise."""
    sys.argv.append(1)
    output_path = os.path.join(os.path.abspath(tmp), "teste.nf")

    cli_args = ["build", "-r", "innuca",
                "-o", output_path, "--pipeline-only"]
    af.build(af.get_args(cli_args))


def test_build_recipe_innuendo(tmp):
    """Build the 'innuendo' recipe with --pipeline-only; the test passes
    when the build does not raise."""
    sys.argv.append(1)
    output_path = os.path.join(os.path.abspath(tmp), "teste.nf")

    cli_args = ["build", "-r", "innuendo",
                "-o", output_path, "--pipeline-only"]
    af.build(af.get_args(cli_args))


================================================
FILE: flowcraft/tests/test_broadcast.py
================================================
import pytest
import os

import flowcraft.generator.utils as utils
from flowcraft.generator.error_handling import LogError


def test_empty_log():
    """An empty log file must make get_nextflow_filepath raise LogError."""
    log_path = os.path.join(
        os.getcwd(), "flowcraft/tests/broadcast_tests/empty_log.txt")

    with pytest.raises(LogError):
        utils.get_nextflow_filepath(log_path)


def test_no_path_in_log():
    """A log file without a nextflow command must make
    get_nextflow_filepath raise LogError."""
    log_path = os.path.join(
        os.getcwd(),
        "flowcraft/tests/broadcast_tests/log_without_command.txt")

    with pytest.raises(LogError):
        utils.get_nextflow_filepath(log_path)


def test_path_in_log():
    """A log file containing a nextflow command must yield a non-empty
    file path."""
    log_path = os.path.join(
        os.getcwd(), "flowcraft/tests/broadcast_tests/log_with_command.txt")

    nextflow_file = utils.get_nextflow_filepath(log_path)

    assert nextflow_file != ""


def test_regex_in_log():
    """A nextflow command with absolute paths and extra options in the log
    must still yield a non-empty file path."""
    log_path = os.path.join(
        os.getcwd(),
        "flowcraft/tests/broadcast_tests/log_with_command_regex.txt")

    nextflow_file = utils.get_nextflow_filepath(log_path)

    assert nextflow_file != ""


================================================
FILE: flowcraft/tests/test_engine.py
================================================
import os
import shutil
import pytest

import flowcraft.generator.engine as eg
import flowcraft.generator.process as pc
import flowcraft.generator.error_handling as eh

from flowcraft.generator.process_collector import collect_process_map

# Process map collected once at import time and passed to every
# NextflowGenerator built by the fixtures and tests of this module.
process_map = collect_process_map()


@pytest.fixture
def single_con():
    """Generator for a linear pipeline in lane 1:
    integrity_coverage followed by fastqc."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "fastqc", "lane": 1}},
    ]

    return eg.NextflowGenerator(connections, "teste.nf", process_map)


@pytest.fixture
def single_status():
    """Generator for a pipeline whose only requested process is skesa."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "skesa", "lane": 1}},
    ]

    return eg.NextflowGenerator(connections, "teste.nf", process_map)


@pytest.fixture
def single_con_fasta():
    """Generator for a pipeline whose only requested process is abricate."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "abricate", "lane": 1}},
    ]

    return eg.NextflowGenerator(connections, "teste.nf", process_map)


@pytest.fixture
def single_con_multi_raw():
    """Generator for a linear pipeline in lane 1:
    assembly_mapping followed by pilon."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "assembly_mapping", "lane": 1}},
        {"input": {"process": "assembly_mapping", "lane": 1},
         "output": {"process": "pilon", "lane": 1}},
    ]

    return eg.NextflowGenerator(connections, "teste.nf", process_map)


@pytest.fixture
def implicit_link():
    """Generator for a linear pipeline in lane 1: integrity_coverage,
    fastqc, spades and assembly_mapping."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "fastqc", "lane": 1}},
        {"input": {"process": "fastqc", "lane": 1},
         "output": {"process": "spades", "lane": 1}},
        {"input": {"process": "spades", "lane": 1},
         "output": {"process": "assembly_mapping", "lane": 1}},
    ]

    return eg.NextflowGenerator(connections, "teste.nf", process_map)


@pytest.fixture
def implicit_link_2():
    """Generator for a linear pipeline in lane 1: integrity_coverage,
    spades and assembly_mapping."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "spades", "lane": 1}},
        {"input": {"process": "spades", "lane": 1},
         "output": {"process": "assembly_mapping", "lane": 1}},
    ]

    return eg.NextflowGenerator(connections, "teste.nf", process_map)


@pytest.fixture
def single_fork():
    """Generator for a pipeline where integrity_coverage (lane 1) forks
    into spades (lane 2) and skesa (lane 3), each followed by abricate."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "spades", "lane": 2}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "skesa", "lane": 3}},
        {"input": {"process": "spades", "lane": 2},
         "output": {"process": "abricate", "lane": 2}},
        {"input": {"process": "skesa", "lane": 3},
         "output": {"process": "abricate", "lane": 3}},
    ]

    return eg.NextflowGenerator(connections, "teste.nf", process_map)


@pytest.fixture
def raw_forks():
    """Generator for a pipeline forking directly from the initial process
    (lane 0) into lanes 1 to 3: integrity_coverage followed by fastqc,
    patho_typing and seq_typing."""
    connections = [
        {"input": {"process": "__init__", "lane": 0},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "fastqc", "lane": 1}},
        {"input": {"process": "__init__", "lane": 0},
         "output": {"process": "patho_typing", "lane": 2}},
        {"input": {"process": "__init__", "lane": 0},
         "output": {"process": "seq_typing", "lane": 3}},
    ]

    return eg.NextflowGenerator(connections, "teste.nf", process_map)


@pytest.fixture
def multi_forks():
    """Yield a generator for a pipeline with forks at two levels.

    The initial process (lane 0) forks into lanes 1, 2 and 3; lane 1 forks
    again into lanes 4 and 5, and lane 3 into lanes 6 and 7. The pipeline
    file is placed in a ".temp" directory that is created for the test and
    removed once the fixture is finalized.
    """

    con = [{"input": {"process": "__init__", "lane": 0},
            "output": {"process": "integrity_coverage", "lane": 1}},
           {"input": {"process": "__init__", "lane": 0},
            "output": {"process": "seq_typing", "lane": 2}},
           {"input": {"process": "__init__", "lane": 0},
            "output": {"process": "integrity_coverage", "lane": 3}},
           {"input": {"process": "integrity_coverage", "lane": 3},
            "output": {"process": "check_coverage", "lane": 3}},
           {"input": {"process": "integrity_coverage", "lane": 1},
            "output": {"process": "spades", "lane": 4}},
           {"input": {"process": "integrity_coverage", "lane": 1},
            "output": {"process": "skesa", "lane": 5}},
           {"input": {"process": "check_coverage", "lane": 3},
            "output": {"process": "spades", "lane": 6}},
           {"input": {"process": "check_coverage", "lane": 3},
            "output": {"process": "skesa", "lane": 7}}]

    os.mkdir(".temp")
    # The generator is built inside the try block so that the directory is
    # removed even when its construction fails. pytest does not run the
    # code following a yield that was never reached, which would leave
    # ".temp" behind and make the next os.mkdir call fail.
    try:
        yield eg.NextflowGenerator(con, os.path.join(".temp", "teste.nf"),
                                   process_map)
    finally:
        shutil.rmtree(".temp")


def test_simple_init():
    """Every process of the process map can be built as a single-process
    pipeline, giving two processes with the requested one at index 1."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "{}", "lane": 1}},
    ]
    output_entry = connections[0]["output"]

    for process_name in process_map:

        # The same connection list is reused, only the target changes
        output_entry["process"] = process_name
        generator = eg.NextflowGenerator(
            connections, "teste/teste.nf", process_map,
            ignore_dependencies=True)

        observed = [len(generator.processes),
                    generator.processes[1].template]

        assert observed == [2, process_name]


def test_invalid_process():
    """Requesting a process named 'invalid' must end with SystemExit."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "invalid", "lane": 1}},
    ]

    with pytest.raises(SystemExit):
        eg.NextflowGenerator(connections, "teste.nf", process_map)


def test_connections_single_process_channels(single_con):
    """The process at index 1 gets input and output channels named after
    its template."""
    name = "integrity_coverage"
    process = single_con.processes[1]

    expected = ["{}_in_1_0".format(name), "{}_out_1_0".format(name)]

    assert [process.input_channel, process.output_channel] == expected


def test_connections_invalid():
    """Connecting spades to integrity_coverage must end with SystemExit."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "spades", "lane": 1}},
        {"input": {"process": "spades", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
    ]

    with pytest.raises(SystemExit):
        eg.NextflowGenerator(connections, "teste.nf", process_map)


def test_connections_ignore_type():
    """Connecting skesa to patho_typing is accepted: building the
    generator must not raise."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "skesa", "lane": 1}},
        {"input": {"process": "skesa", "lane": 1},
         "output": {"process": "patho_typing", "lane": 1}},
    ]

    eg.NextflowGenerator(connections, "teste.nf", process_map)


def test_build_header(single_con):
    """After building the header the template must not be empty."""
    single_con._build_header()

    assert single_con.template != ""


def test_connections_nofork(single_con):
    """A linear pipeline has an empty fork tree."""
    fork_tree = single_con._fork_tree

    assert fork_tree == {}


def test_connections_singlefork(single_fork):
    """A single fork from lane 1 is recorded as lanes 2 and 3."""
    fork_tree = single_fork._fork_tree

    assert fork_tree == {1: [2, 3]}


def test_connections_rawfork(raw_forks):
    """A fork from the initial lane 0 is recorded as lanes 1, 2 and 3."""
    fork_tree = raw_forks._fork_tree

    assert fork_tree == {0: [1, 2, 3]}


def test_connections_multiforks(multi_forks):
    """Forks at several levels are all recorded in the fork tree."""
    expected_tree = {
        0: [1, 2, 3],
        1: [4, 5],
        3: [6, 7],
    }

    assert multi_forks._fork_tree == expected_tree


def test_connections_no_fork_channel_update(single_con):
    """In a linear pipeline the process at index 1 has no forks."""
    process = single_con.processes[1]

    assert process.forks == []


def test_connections_fork_channel_update(single_fork):
    """In a forked pipeline the process at index 1 has forks set."""
    process = single_fork.processes[1]

    assert process.forks != []


def test_connections_channel_update(single_con):
    """The output channel of the process at index 1 is the input channel
    of the process at index 2."""
    upstream = single_con.processes[1]
    downstream = single_con.processes[2]

    assert upstream.output_channel == downstream.input_channel


def test_connections_channel_update_wfork(single_fork):
    """The main forks at positions 1 and 2 of the forking process are the
    input channels of the processes at indexes 2 and 3."""
    parent = single_fork.processes[1]
    first_child = single_fork.processes[2]
    second_child = single_fork.processes[3]

    fork_channels = [parent.main_forks[1], parent.main_forks[2]]
    child_inputs = [first_child.input_channel, second_child.input_channel]

    assert fork_channels == child_inputs


def test_connections_channel_update_wfork_2(single_fork):
    """The output channel of the process at index 3 is the input channel
    of the process at index 5."""
    upstream = single_fork.processes[3]
    downstream = single_fork.processes[5]

    assert upstream.output_channel == downstream.input_channel


def test_connections_channel_update_wfork_3(single_fork):
    """The output channel of the process at index 2 is the input channel
    of the process at index 4."""
    upstream = single_fork.processes[2]
    downstream = single_fork.processes[4]

    assert upstream.output_channel == downstream.input_channel


def test_set_channels_single_con_raw_fastq(single_con):
    """Setting the channels registers a single 'fastq' raw input whose
    raw fork is the input channel of integrity_coverage."""
    single_con._set_channels()

    first_type = list(single_con.main_raw_inputs.keys())[0]
    n_inputs = len(single_con.main_raw_inputs)
    first_forks = list(single_con.main_raw_inputs.values())[0]["raw_forks"]

    assert [first_type, n_inputs, first_forks] == \
        ["fastq", 1, ["integrity_coverage_in_1_0"]]


def test_set_channels_single_con_raw_fasta(single_con_fasta):
    """Setting the channels registers a single 'fasta' raw input whose
    raw fork is the input channel of abricate."""
    single_con_fasta._set_channels()

    first_type = list(single_con_fasta.main_raw_inputs.keys())[0]
    n_inputs = len(single_con_fasta.main_raw_inputs)
    first_forks = \
        list(single_con_fasta.main_raw_inputs.values())[0]["raw_forks"]

    assert [first_type, n_inputs, first_forks] == \
        ["fasta", 1, ["abricate_in_1_0"]]


def test_set_channels_multi_raw_input(single_con_multi_raw):
    """Setting the channels registers two raw inputs, 'fasta' and 'fastq',
    in that order."""
    single_con_multi_raw._set_channels()

    print(single_con_multi_raw.main_raw_inputs)

    input_types = list(single_con_multi_raw.main_raw_inputs.keys())
    n_inputs = len(single_con_multi_raw.main_raw_inputs)

    assert [input_types, n_inputs] == [["fasta", "fastq"], 2]


def test_set_channels_secondary_channels_nolink(single_con):
    """The SIDE_phred secondary channel of lane 1 has no end channels."""
    single_con._set_channels()

    phred_lane_1 = single_con.secondary_channels["SIDE_phred"][1]

    assert phred_lane_1["end"] == []


def test_set_channels_secondary_chanels_link(multi_forks):
    """SIDE_phred has no end channel in lane 1, while SIDE_max_len has
    one end channel in each of lanes 1 and 3."""
    multi_forks._set_channels()

    observed_ends = [
        multi_forks.secondary_channels["SIDE_phred"][1]["end"],
        multi_forks.secondary_channels["SIDE_max_len"][1]["end"],
        multi_forks.secondary_channels["SIDE_max_len"][3]["end"],
    ]
    expected_ends = [
        [],
        ["SIDE_max_len_4_5"],
        ["SIDE_max_len_6_7"],
    ]

    assert observed_ends == expected_ends


def test_set_secondary_inputs_raw_forks(raw_forks):
    """After setting the init process, the 'fastq' raw input forks into
    the input channels of the three processes linked to lane 0."""
    raw_forks._set_channels()
    raw_forks._set_init_process()

    expected_forks = [
        "integrity_coverage_in_0_0",
        "patho_typing_in_0_2",
        "seq_typing_in_0_3",
    ]

    assert raw_forks.main_raw_inputs["fastq"]["raw_forks"] == expected_forks


def test_set_secondary_inputs_multi_raw(single_con_multi_raw):
    """After setting the init process, the raw input types are 'fasta'
    and 'fastq'."""
    single_con_multi_raw._set_channels()
    single_con_multi_raw._set_init_process()

    input_types = sorted(list(single_con_multi_raw.main_raw_inputs.keys()))

    assert input_types == ["fasta", "fastq"]


def test_set_secondary_channels(multi_forks):
    """Check the output channel and the fork string stored in the context
    of the process at index 1 after setting the secondary channels."""
    multi_forks._set_channels()
    multi_forks._set_secondary_channels()

    process = multi_forks.processes[1]

    print(multi_forks.main_raw_inputs)

    expected_output = "_integrity_coverage_out_1_0"
    expected_forks = (
        "\n_integrity_coverage_out_1_0.into{ integrity_coverage_out_1_0;"
        "spades_in_1_4;skesa_in_1_5 }\n\n\nSIDE_max_len_1_1.set{"
        " SIDE_max_len_4_5 }\n"
    )

    observed = [process._context["output_channel"],
                process._context["forks"]]

    assert observed == [expected_output, expected_forks]


def test_set_secondary_channels_2(multi_forks):
    """Check the context output channel and the main forks of the process
    at index 4 after setting the secondary channels."""
    multi_forks._set_channels()
    multi_forks._set_secondary_channels()

    process = multi_forks.processes[4]

    expected_output = "_check_coverage_out_3_3"
    expected_forks = ["check_coverage_out_3_3",
                      "spades_in_3_6",
                      "skesa_in_3_7"]

    assert [process._context["output_channel"], process.main_forks] == \
        [expected_output, expected_forks]


def test_set_implicit_link(implicit_link):
    """Check the main forks of the process at index 2 of the implicit_link
    fixture after setting the secondary channels.

    NOTE(review): another function with this exact name is defined right
    after this one in the module, so this definition is overwritten at
    import time and pytest never collects it. Confirm that the expected
    value below is still valid before giving this test a unique name.
    """
    implicit_link._set_channels()
    implicit_link._set_secondary_channels()

    process = implicit_link.processes[2]
    expected_forks = ["fastqc_out_1_1", "_LAST_fastq_4"]

    assert process.main_forks == expected_forks


def test_set_implicit_link(implicit_link_2):
    """Check the main forks of the process at index 1 of the
    implicit_link_2 fixture after setting the secondary channels."""
    implicit_link_2._set_channels()
    implicit_link_2._set_secondary_channels()

    process = implicit_link_2.processes[1]
    expected_forks = ["integrity_coverage_out_1_0", "_LAST_fastq_1_3"]

    assert process.main_forks == expected_forks


def test_set_status_channels_multi(single_con):
    """With several status channels, the last StatusCompiler process mixes
    all of them in its compile channels."""
    single_con._set_channels()
    single_con._set_status_channels()

    # First StatusCompiler found when walking the processes backwards
    compilers = [proc for proc in single_con.processes[::-1]
                 if isinstance(proc, pc.StatusCompiler)]
    compiler = compilers[0]

    expected = "STATUS_integrity_coverage_1_1.mix(STATUS_fastqc2_1_2," \
               "STATUS_fastqc2_report_1_2)"

    assert compiler._context["compile_channels"] == expected


def test_set_status_channels_single(single_status):
    """With a single status channel, the compile channels of the last
    StatusCompiler process are that channel alone."""
    single_status._set_channels()
    single_status._set_status_channels()

    # First StatusCompiler found when walking the processes backwards
    compilers = [proc for proc in single_status.processes[::-1]
                 if isinstance(proc, pc.StatusCompiler)]
    compiler = compilers[0]

    assert compiler._context["compile_channels"] == "STATUS_skesa_1_1"


def test_set_compiler_channels(single_status):
    """Setting the compiler channels gives the last StatusCompiler process
    the single skesa status channel."""
    single_status.lane = 1
    single_status._set_channels()
    single_status._set_compiler_channels()

    # First StatusCompiler found when walking the processes backwards
    compilers = [proc for proc in single_status.processes[::-1]
                 if isinstance(proc, pc.StatusCompiler)]
    compiler = compilers[0]

    assert compiler._context["compile_channels"] == "STATUS_skesa_1_1"


def test_set_status_channels_no_status(single_status):
    """Without status channels no StatusCompiler process is added, so
    taking the first one raises IndexError."""
    single_status.processes[1].status_channels = []

    single_status._set_channels()
    single_status._set_status_channels()

    with pytest.raises(IndexError):
        [proc for proc in single_status.processes[::-1]
         if isinstance(proc, pc.StatusCompiler)][0]


def test_set_status_channels_duplicate_status(single_status):
    """Duplicated status channels must raise ProcessError when the status
    channels are set."""
    duplicated = ["A", "A"]
    single_status.processes[1].status_channels = duplicated

    single_status._set_channels()

    with pytest.raises(eh.ProcessError):
        single_status._set_status_channels()


def test_build(multi_forks):
    """A full build must leave a non-empty template."""
    multi_forks.build()

    assert multi_forks.template != ""


def test_resources_string(single_con):
    """The resources string carries the cpus and memory directives only,
    with the second argument appended to the process name."""
    directives = {
        "procA": {"cpus": 1, "memory": "'4GB'", "container": "img",
                  "version": "1"},
    }

    observed = single_con._get_resources_string(directives, 1)

    assert observed == '\n\t$procA_1.cpus = 1\n\t$procA_1.memory = \'4GB\''


def test_resources_string_2(single_con):
    """Without a memory directive the resources string carries the cpus
    directive only."""
    directives = {
        "procA": {"cpus": 1, "container": "img", "version": "1"},
    }

    observed = single_con._get_resources_string(directives, 1)

    assert observed == '\n\t$procA_1.cpus = 1'


def test_resources_string_3(single_con):
    """The resources string lists the directives of every process, one
    after the other."""
    directives = {
        "procA": {"cpus": 1, "memory": "'4GB'", "container": "img",
                  "version": "1"},
        "procB": {"memory": "{ 4.GB * task.attempt }"},
    }

    observed = single_con._get_resources_string(directives, 1)

    expected = '\n\t$procA_1.cpus = 1\n\t$procA_1.memory = \'4GB\'' \
               '\n\t$procB_1.memory = { 4.GB * task.attempt }'

    assert observed == expected


def test_container_string(single_con):
    """The container string joins the container and version directives as
    "container:version"."""
    directives = {
        "procA": {"cpus": 1, "memory": "4GB", "container": "img",
                  "version": "1"},
    }

    observed = single_con._get_container_string(directives, 2)

    assert observed == '\n\t$procA_2.container = "img:1"'


def test_container_string_2(single_con):
    """A container without a version directive gets the "latest" tag."""
    directives = {
        "procA": {"cpus": 1, "memory": "4GB", "container": "img",
                  "version": "1"},
        "procB": {"container": "img"},
    }

    observed = single_con._get_container_string(directives, 2)

    expected = '\n\t$procA_2.container = "img:1"\n\t' \
               '$procB_2.container = "img:latest"'

    assert observed == expected


def test_extra_inputs_1():
    """An extra_input directive on fastqc is stored in the extra_input
    attribute of the process at index 2."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "fastqc={'extra_input':'teste'}", "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)

    assert generator.processes[2].extra_input == "teste"


def test_extra_inputs_2():
    """An extra_input directive on abricate is stored in the extra_input
    attribute of the process at index 3."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "spades", "lane": 1}},
        {"input": {"process": "spades", "lane": 1},
         "output": {"process": "abricate={'extra_input':'teste'}",
                    "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)

    assert generator.processes[3].extra_input == "teste"


def test_extra_inputs_3():
    """Setting the channels registers the extra input 'teste' of fastqc
    with the 'fastq' input type and one EXTRA channel."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "fastqc={'extra_input':'teste'}", "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)
    generator._set_channels()

    first_key = list(generator.extra_inputs.keys())[0]
    input_type = generator.extra_inputs["teste"]["input_type"]
    channels = generator.extra_inputs["teste"]["channels"]

    assert [first_key, input_type, channels] == \
        ["teste", "fastq", ["EXTRA_fastqc_1_2"]]


def test_extra_inputs_default():
    """The 'default' extra input of abricate is registered under the
    'fasta' key, with the 'fasta' input type and one EXTRA channel."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "spades", "lane": 1}},
        {"input": {"process": "spades", "lane": 1},
         "output": {"process": "abricate={'extra_input':'default'}",
                    "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)
    generator._set_channels()

    first_key = list(generator.extra_inputs.keys())[0]
    input_type = generator.extra_inputs["fasta"]["input_type"]
    channels = generator.extra_inputs["fasta"]["channels"]

    assert [first_key, input_type, channels] == \
        ["fasta", "fasta", ["EXTRA_abricate_1_3"]]


def test_extra_inputs_invalid():
    """A 'default' extra input on fastqc must end with SystemExit when
    the channels are set."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "fastqc={'extra_input':'default'}",
                    "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)

    with pytest.raises(SystemExit):
        generator._set_channels()


def test_extra_inputs_invalid_2():
    """Using the same extra input name on spades and abricate must end
    with SystemExit when the channels are set."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "spades={'extra_input':'teste'}",
                    "lane": 1}},
        {"input": {"process": "spades", "lane": 1},
         "output": {"process": "abricate={'extra_input':'teste'}",
                    "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)

    with pytest.raises(SystemExit):
        generator._set_channels()


def test_run_time_directives():
    """A cpus directive given in the pipeline string ends up in the
    'fastqc2' directives of the process at index 2."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "fastqc={'cpus':'3'}", "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)
    fastqc_directives = generator.processes[2].directives["fastqc2"]

    assert fastqc_directives["cpus"] == "3"


def test_run_time_directives_full():
    """The cpus, memory, container and version directives given in the
    pipeline string all end up in the 'fastqc2' directives."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "fastqc={'cpus':'3','memory':'4GB',"
                               "'container':'img','version':'1'}",
                    "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)

    observed = [
        generator.processes[2].directives["fastqc2"]["cpus"],
        generator.processes[2].directives["fastqc2"]["memory"],
        generator.processes[2].directives["fastqc2"]["container"],
        generator.processes[2].directives["fastqc2"]["version"],
    ]

    assert observed == ["3", "4GB", "img", "1"]


def test_run_time_directives_invalid():
    """A truncated directive string must end with SystemExit."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "integrity_coverage", "lane": 1}},
        {"input": {"process": "integrity_coverage", "lane": 1},
         "output": {"process": "fastqc={'cpus'", "lane": 1}},
    ]

    with pytest.raises(SystemExit):
        eg.NextflowGenerator(connections, "teste.nf", process_map)


def test_not_automatic_dependency():
    """Requesting spades alone with auto_dependency disabled must end
    with SystemExit."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "spades", "lane": 1}},
    ]

    with pytest.raises(SystemExit):
        eg.NextflowGenerator(connections, "teste.nf", process_map,
                             auto_dependency=False)


def test_automatic_dependency():
    """Requesting spades alone puts integrity_coverage at index 1 of the
    processes."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "spades", "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)

    assert generator.processes[1].template == "integrity_coverage"


def test_automatic_dependency_2():
    """When spades is requested alone, the output channel of the process
    at index 1 is the input channel of the process at index 2."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "spades", "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)
    upstream = generator.processes[1]
    downstream = generator.processes[2]

    assert upstream.output_channel == downstream.input_channel


def test_automatic_dependency_3():
    """When spades is requested alone, the process at index 1 has no
    parent lane and the process at index 2 has parent lane 1."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "spades", "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)

    parent_lanes = [generator.processes[1].parent_lane,
                    generator.processes[2].parent_lane]

    assert parent_lanes == [None, 1]


def test_automatic_dependency_wfork():
    """With an initial fork, process 1 still uses ``integrity_coverage``.

    The fork sends ``__init__`` to ``spades`` (lane 1) and to
    ``integrity_coverage`` (lane 2).
    """
    connections = [
        {"input": {"process": "__init__", "lane": 0},
         "output": {"process": "spades", "lane": 1}},
        {"input": {"process": "__init__", "lane": 0},
         "output": {"process": "integrity_coverage", "lane": 2}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)

    assert generator.processes[1].template == "integrity_coverage"


def test_automatic_dependency_wfork_2():
    """After ``_set_channels`` the ``fastq`` raw input has two raw forks."""
    connections = [
        {"input": {"process": "__init__", "lane": 0},
         "output": {"process": "spades", "lane": 1}},
        {"input": {"process": "__init__", "lane": 0},
         "output": {"process": "integrity_coverage", "lane": 2}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)
    generator._set_channels()

    fastq_forks = generator.main_raw_inputs["fastq"]["raw_forks"]
    assert len(fastq_forks) == 2


def test_automatic_dependency_wfork_3():
    """In a ``reads_download`` fork, process 3 has parent lane 1.

    ``reads_download`` (lane 1) forks into ``skesa`` (lane 2) and ``spades``
    (lane 3); the check runs after ``_set_channels``.
    """
    connections = [
        {"input": {"process": "__init__", "lane": 0},
         "output": {"process": "reads_download", "lane": 1}},
        {"input": {"process": "reads_download", "lane": 1},
         "output": {"process": "skesa", "lane": 2}},
        {"input": {"process": "reads_download", "lane": 1},
         "output": {"process": "spades", "lane": 3}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)
    generator._set_channels()

    assert generator.processes[3].parent_lane == 1


def test_automatic_dependency_wfork_4():
    """In a ``reads_download`` fork, process 4 has parent lane 3.

    Same pipeline as the previous fork test; the check runs after
    ``_set_channels``.
    """
    connections = [
        {"input": {"process": "__init__", "lane": 0},
         "output": {"process": "reads_download", "lane": 1}},
        {"input": {"process": "reads_download", "lane": 1},
         "output": {"process": "skesa", "lane": 2}},
        {"input": {"process": "reads_download", "lane": 1},
         "output": {"process": "spades", "lane": 3}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)
    generator._set_channels()

    assert generator.processes[4].parent_lane == 3


def test_automatic_dependency_multi():
    """Exactly one ``integrity_coverage`` process exists in the pipeline.

    The pipeline is ``trimmomatic`` followed by ``spades`` on lane 1.
    """
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "trimmomatic", "lane": 1}},
        {"input": {"process": "trimmomatic", "lane": 1},
         "output": {"process": "spades", "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)

    templates = [proc.template for proc in generator.processes]
    assert templates.count("integrity_coverage") == 1


def test_automatic_dependency_non_raw():
    """For ``spades`` followed by ``pilon``, process 2 has parent lane 1."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "spades", "lane": 1}},
        {"input": {"process": "spades", "lane": 1},
         "output": {"process": "pilon", "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)

    assert generator.processes[2].parent_lane == 1


def test_patlas_compiler_channels():
    """Two channels feed ``patlas_consensus`` for a two-lane fork.

    The fork runs ``mash_screen`` and ``mapping_patlas``; the compiler entry
    is read after the channels and compiler channels are set.
    """
    connections = [
        {"input": {"process": "__init__", "lane": 0},
         "output": {"process": "mash_screen", "lane": 1}},
        {"input": {"process": "__init__", "lane": 0},
         "output": {"process": "mapping_patlas", "lane": 2}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)
    generator._set_channels()
    generator._set_compiler_channels()

    consensus_channels = generator.compilers["patlas_consensus"]["channels"]
    assert len(consensus_channels) == 2


def test_patlas_compiler_channels_2():
    """A single ``mash_screen`` gives ``patlas_consensus`` one channel."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "mash_screen", "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)
    generator._set_channels()
    generator._set_compiler_channels()

    consensus_channels = generator.compilers["patlas_consensus"]["channels"]
    assert len(consensus_channels) == 1


def test_patlas_compiler_channels_empty():
    """A ``trimmomatic``-only pipeline gives ``patlas_consensus`` no channels."""
    connections = [
        {"input": {"process": "__init__", "lane": 1},
         "output": {"process": "trimmomatic", "lane": 1}},
    ]

    generator = eg.NextflowGenerator(connections, "teste.nf", process_map)
    generator._set_channels()
    generator._set_compiler_channels()

    consensus_channels = generator.compilers["patlas_consensus"]["channels"]
    assert len(consensus_channels) == 0


================================================
FILE: flowcraft/tests/test_pipeline_parser.py
================================================
import os
import json

import flowcraft.generator.pipeline_parser as ps
from flowcraft.tests.data_pipelines import pipelines as pipes


def test_get_lanes():
    """``get_lanes`` returns the expected lanes for each raw fork string.

    Each case pairs a raw string with the list of lanes expected back. Per
    the expectations below, processes inside nested forks are not part of the
    result.
    """
    cases = [
        ("A | B)",
         [["A"], ["B"]]),
        ("A | B C D | E F)",
         [["A"], ["B", "C", "D"], ["E", "F"]]),
        ("A Z | B C (D | E) | G H)",
         [["A", "Z"], ["B", "C"], ["G", "H"]]),
        ("A | B (C | D) | E (E | F I ))",
         [["A"], ["B"], ["E"]]),
    ]

    for raw, lanes in cases:
        parsed = ps.get_lanes(raw)
        assert lanes == parsed


def test_linear_connection():
    """``linear_connection`` links consecutive processes on one lane."""
    lane = 1
    processes = ["A", "B", "C"]

    expected = [
        {"input": {"process": "A", "lane": lane},
         "output": {"process": "B", "lane": lane}},
        {"input": {"process": "B", "lane": lane},
         "output": {"process": "C", "lane": lane}},
    ]

    assert ps.linear_connection(processes, lane) == expected


def test_two_fork_connection():
    """A two-sink fork puts the sinks on the next two lanes.

    ``lane`` equals ``source_lane`` here, so the sinks land on
    ``source_lane + 1`` and ``source_lane + 2``.
    """
    source_lane = 1

    links = ps.fork_connection(
        source="A",
        sink=["B", "C"],
        source_lane=source_lane,
        lane=source_lane
    )

    expected = [
        {"input": {"process": "A", "lane": source_lane},
         "output": {"process": "B", "lane": source_lane + 1}},
        {"input": {"process": "A", "lane": source_lane},
         "output": {"process": "C", "lane": source_lane + 2}},
    ]

    assert links == expected


def test_two_fork_connection_mismatch_lane():
    """Sink lanes are counted from ``lane``, not from ``source_lane``.

    The inputs keep ``source_lane`` and the sinks get ``lane + 1`` and
    ``lane + 2``.
    """
    source_lane = 1
    lane = 3

    links = ps.fork_connection(
        source="A",
        sink=["B", "C"],
        source_lane=source_lane,
        lane=lane
    )

    expected = [
        {"input": {"process": "A", "lane": source_lane},
         "output": {"process": "B", "lane": lane + 1}},
        {"input": {"process": "A", "lane": source_lane},
         "output": {"process": "C", "lane": lane + 2}},
    ]

    assert links == expected


def test_multi_fork_connection():
    """A three-sink fork puts the sinks on the next three lanes."""
    source_lane = 1

    links = ps.fork_connection(
        source="A",
        sink=["B", "C", "D"],
        source_lane=source_lane,
        lane=source_lane
    )

    expected = [
        {"input": {"process": "A", "lane": source_lane},
         "output": {"process": "B", "lane": source_lane + 1}},
        {"input": {"process": "A", "lane": source_lane},
         "output": {"process": "C", "lane": source_lane + 2}},
        {"input": {"process": "A", "lane": source_lane},
         "output": {"process": "D", "lane": source_lane + 3}},
    ]

    assert links == expected


def test_linear_lane_connection():
    """A single lane list passed with ``lane=1`` is linked on lane 2."""
    links = ps.linear_lane_connection([["A", "B", "C"]], lane=1)

    expected = [
        {"input": {"process": "A", "lane": 2},
         "output": {"process": "B", "lane": 2}},
        {"input": {"process": "B", "lane": 2},
         "output": {"process": "C", "lane": 2}},
    ]

    assert links == expected


def test_linear_multi_lane_connection():
    """Two lane lists passed with ``lane=1`` are linked on lanes 2 and 3."""
    links = ps.linear_lane_connection([["A", "B"], ["C", "D"]], lane=1)

    expected = [
        {"input": {"process": "A", "lane": 2},
         "output": {"process": "B", "lane": 2}},
        {"input": {"process": "C", "lane": 3},
         "output": {"process": "D", "lane": 3}},
    ]

    assert links == expected


def test_get_source_lane():
    """``get_source_lane`` returns 1 for a sequence on the main lane.

    The queried sequence is ``integrity_coverage`` then
    ``fastqc_trimmomatic``.
    """
    links = [
        {'input': {'process': '__init__', 'lane': 1},
         'output': {'process': 'integrity_coverage', 'lane': 1}},
        {'input': {'process': 'integrity_coverage', 'lane': 1},
         'output': {'process': 'fastqc_trimmomatic', 'lane': 1}},
        {'input': {'process': 'fastqc_trimmomatic', 'lane': 1},
         'output': {'process': 'spades', 'lane': 2}},
        {'input': {'process': 'fastqc_trimmomatic', 'lane': 1},
         'output': {'process': 'skesa', 'lane': 3}},
    ]
    query = ["integrity_coverage", "fastqc_trimmomatic"]

    assert ps.get_source_lane(query, links) == 1


def test_get_source_lane_2():
    """``get_source_lane`` returns 2 for ``spades`` then ``pilon``.

    ``pilon`` appears on both lane 2 (after ``spades``) and lane 3 (after
    ``skesa``) in the connection list.
    """
    links = [
        {'input': {'process': '__init__', 'lane': 1},
         'output': {'process': 'integrity_coverage', 'lane': 1}},
        {'input': {'process': 'integrity_coverage', 'lane': 1},
         'output': {'process': 'fastqc_trimmomatic', 'lane': 1}},
        {'input': {'process': 'fastqc_trimmomatic', 'lane': 1},
         'output': {'process': 'spades', 'lane': 2}},
        {'input': {'process': 'fastqc_trimmomatic', 'lane': 1},
         'output': {'process': 'skesa', 'lane': 3}},
        {'input': {'process': 'spades', 'lane': 2},
         'output': {'process': 'pilon', 'lane': 2}},
        {'input': {'process': 'skesa', 'lane': 3},
         'output': {'process': 'pilon', 'lane': 3}},
    ]

    assert ps.get_source_lane(["spades", "pilon"], links) == 2


def test_parse_pipeline():
    """Each ``(pipeline, expected)`` pair in ``pipes`` parses as expected."""
    for pipeline, expected_links in pipes:
        parsed = ps.parse_pipeline(pipeline)
        assert parsed == expected_links


def test_parse_pipeline_file():
    """Parsing ``pipeN.txt`` (N = 1..8) matches the Nth entry of ``pipes``.

    The fixture files are located relative to this test module rather than
    the current working directory, so the test gives the same result from any
    directory.
    """
    # The ``pipeline_tests`` directory sits next to this test module.
    fixtures_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "pipeline_tests")

    for i in range(1, 9):

        p_path = os.path.join(fixtures_dir, "pipe{}.txt".format(i))
        # pipes is zero-indexed while the fixture files start at 1.
        expected = pipes[i - 1][1]
        res = ps.parse_pipeline(p_path)
        assert res == expected, \
            "unexpected parse result for {}: {}".format(p_path, res)


def test_unique_id_len():
    """``add_unique_identifiers`` appends a running ``_N`` to each process.

    Spaces are stripped from both sides before comparing.
    """
    cases = [
        ("A B C",
         "A_0 B_1 C_2"),
        ("A (B C (D | E)| B C (D | E))",
         "A_0 (B_1 C_2 (D_3 | E_4)| B_5 C_6 (D_7 | E_8))"),
        ("A (B C (D | E)| C (D | E))",
         "A_0 (B_1 C_2 (D_3 | E_4)| C_5 (D_6 | E_7))"),
        ("A (B C (D | E)| B (D | E))",
         "A_0 (B_1 C_2 (D_3 | E_4)| B_5 (D_6 | E_7))"),
    ]

    for pipeline_str, tagged_expected in cases:
        tagged, _ = ps.add_unique_identifiers(pipeline_str)
        assert tagged.replace(" ", "") == tagged_expected.replace(" ", "")

def test_remove_id():
    """``remove_unique_identifiers`` strips ``_N`` suffixes from link lists.

    For each pipeline string, the identifiers from ``add_unique_identifiers``
    are applied to a hand-written link list with suffixed names. The result
    is compared, as JSON, with the same links without suffixes.
    """
    linear_tagged = [
        {'input': {'process': '__init__', 'lane': 1},
         'output': {'process': 'A_0', 'lane': 1}},
        {'input': {'process': 'A_0', 'lane': 1},
         'output': {'process': 'B_1', 'lane': 1}},
        {'input': {'process': 'B_1', 'lane': 1},
         'output': {'process': 'C_2', 'lane': 1}},
    ]
    linear_plain = [
        {'input': {'process': '__init__', 'lane': 1},
         'output': {'process': 'A', 'lane': 1}},
        {'input': {'process': 'A', 'lane': 1},
         'output': {'process': 'B', 'lane': 1}},
        {'input': {'process': 'B', 'lane': 1},
         'output': {'process': 'C', 'lane': 1}},
    ]

    forked_tagged = [
        {'input': {'process': '__init__', 'lane': 1},
         'output': {'process': 'A_0', 'lane': 1}},
        {'input': {'process': 'A_0', 'lane': 1},
         'output': {'process': 'B_1', 'lane': 2}},
        {'input': {'process': 'A_0', 'lane': 1},
         'output': {'process': 'B_5', 'lane': 3}},
        {'input': {'process': 'B_1', 'lane': 2},
         'output': {'process': 'C_2', 'lane': 2}},
        {'input': {'process': 'B_5', 'lane': 3},
         'output': {'process': 'C_6', 'lane': 3}},
        {'input': {'process': 'C_2', 'lane': 2},
         'output': {'process': 'D_3', 'lane': 4}},
        {'input': {'process': 'C_2', 'lane': 2},
         'output': {'process': 'E_4', 'lane': 5}},
        {'input': {'process': 'C_6', 'lane': 3},
         'output': {'process': 'D_7', 'lane': 6}},
        {'input': {'process': 'C_6', 'lane': 3},
         'output': {'process': 'E_8', 'lane': 7}},
    ]
    forked_plain = [
        {'input': {'process': '__init__', 'lane': 1},
         'output': {'process': 'A', 'lane': 1}},
        {'input': {'process': 'A', 'lane': 1},
         'output': {'process': 'B', 'lane': 2}},
        {'input': {'process': 'A', 'lane': 1},
         'output': {'process': 'B', 'lane': 3}},
        {'input': {'process': 'B', 'lane': 2},
         'output': {'process': 'C', 'lane': 2}},
        {'input': {'process': 'B', 'lane': 3},
         'output': {'process': 'C', 'lane': 3}},
        {'input': {'process': 'C', 'lane': 2},
         'output': {'process': 'D', 'lane': 4}},
        {'input': {'process': 'C', 'lane': 2},
         'output': {'process': 'E', 'lane': 5}},
        {'input': {'process': 'C', 'lane': 3},
         'output': {'process': 'D', 'lane': 6}},
        {'input': {'process': 'C', 'lane': 3},
         'output': {'process': 'E', 'lane': 7}},
    ]

    cases = [
        ("A B C", linear_tagged, linear_plain),
        ("A (B C (D | E)| B C (D | E))", forked_tagged, forked_plain),
    ]

    for pipeline_str, tagged_links, plain_links in cases:
        _, identifiers = ps.add_unique_identifiers(pipeline_str)
        stripped = ps.remove_unique_identifiers(identifiers, tagged_links)
        assert json.dumps(stripped) == json.dumps(plain_links)

================================================
FILE: flowcraft/tests/test_process_details.py
================================================
import pytest

import flowcraft.generator.process_details as pd
import flowcraft.flowcraft as af

from flowcraft.generator.process_collector import collect_process_map
from flowcraft.generator.process_details import COLORS

# Result of collect_process_map(), evaluated once at import time and passed to
# pd.proc_collector by the tests below.
process_map = collect_process_map()


def test_color_print():
    """Smoke test: ``colored_print`` runs for every entry of ``COLORS``.

    The test passes as long as no call raises.
    """
    for color in COLORS:
        pd.colored_print("teste_msg", color)

    assert True


def test_long_list():
    """``proc_collector`` exits when called with ``build -L`` arguments."""
    parsed_args = af.get_args(["build", "-L"])
    pipeline = "fastqc trimmomatic"

    with pytest.raises(SystemExit):
        pd.proc_collector(process_map, parsed_args, pipeline)


def test_short_list():
    """``proc_collector`` exits when called with ``build -l`` arguments."""
    parsed_args = af.get_args(["build", "-l"])
    pipeline = "fastqc trimmomatic"

    with pytest.raises(SystemExit):
        pd.proc_collector(process_map, parsed_args, pipeline)


================================================
FILE: flowcraft/tests/test_processes.py
================================================
import os
import pytest

import flowcraft.generator.process as pc
import flowcraft.generator.error_handling as eh

from flowcraft.generator.components import assembly
from flowcraft.generator.components import assembly_processing as ap
from flowcraft.generator.components import reads_quality_control as readsqc

from flowcraft.generator.process_collector import collect_process_map

# Result of collect_process_map(), evaluated once at import time; iterated as
# (template, class) pairs by test_process_init below.
process_map = collect_process_map()


@pytest.fixture
def mock_process():
    """Fresh ``Process`` built from the ``integrity_coverage`` template."""
    process = pc.Process(template="integrity_coverage")
    return process


@pytest.fixture
def process_wchannels():
    """``integrity_coverage`` process with both main channels preset.

    The input channel is ``in_channel`` and the output channel is
    ``out_channel``.
    """
    process = pc.Process(template="integrity_coverage")
    process.input_channel, process.output_channel = \
        "in_channel", "out_channel"
    return process


@pytest.fixture
def mock_status():
    """``StatusCompiler`` built from the ``status_compiler`` template."""
    compiler = pc.StatusCompiler(template="status_compiler")
    return compiler

@pytest.fixture
def mock_patlas_compiler():
    """``StatusCompiler`` built from the ``patlas_consensus`` template."""
    compiler = pc.StatusCompiler(template="patlas_consensus")
    return compiler


@pytest.fixture
def mock_init():
    """``Init`` process built from the ``init`` template."""
    init_process = pc.Init(template="init")
    return init_process


def test_process_init():
    """Every class in ``process_map`` keeps the template name it was given."""
    for name, process_cls in process_map.items():
        instance = process_cls(template=name)
        assert instance.template == name


def test_set_correct_template(mock_process):
    """``_set_template("fastqc")`` leaves an existing ``_template_path``."""
    mock_process._set_template("fastqc")
    template_path = mock_process._template_path

    assert os.path.exists(template_path)


def test_set_wrong_template(mock_process):
    """``_set_template`` raises ``ProcessError`` for ``wrong_template``."""
    bad_name = "wrong_template"

    with pytest.raises(eh.ProcessError):
        mock_process._set_template(bad_name)


def test_template_render_empty(mock_process):
    """Reading ``template_str`` on a bare process raises ``ProcessError``.

    The fixture has had no channels set on it.
    """
    with pytest.raises(eh.ProcessError):
        _ = mock_process.template_str


def test_template_render(process_wchannels):
    """Smoke test: ``template_str`` is readable after ``set_channels``.

    The value itself is not inspected; the test passes if nothing raises.
    """
    process_wchannels.set_channels(pid=1)
    rendered = process_wchannels.template_str

    assert True


def test_main_channel_setup(mock_process):
    """``set_main_channel_names`` applies both suffixes and stores the lane."""
    mock_process.set_main_channel_names("input_suf", "output_suf", 1)

    observed = (
        mock_process.input_channel.endswith("input_suf"),
        mock_process.output_channel.endswith("output_suf"),
        mock_process.lane,
    )

    assert observed == (True, True, 1)


def test_main_raw_channel_self(mock_process):
    """Checks raw input channel retrieval when no input type is passed.

    The input type is taken from the process' own ``input_type`` attribute,
    set to ``fastq`` here.
    """
    mock_process.input_type = "fastq"
    channel_info = mock_process.get_user_channel("myChannel")

    expected = {"input_channel": "myChannel"}
    expected.update(mock_process.RAW_MAPPING["fastq"])

    assert channel_info == expected


def test_main_raw_channel_fastq(mock_process):
    """``get_user_channel`` with ``fastq`` merges in the ``fastq`` mapping."""
    channel_info = mock_process.get_user_channel("myChannel", "fastq")

    expected = {"input_channel": "myChannel"}
    expected.update(mock_process.RAW_MAPPING["fastq"])

    assert channel_info == expected


def test_main_raw_channel_fasta(mock_process):
    """``get_user_channel`` with ``fasta`` merges in the ``fasta`` mapping."""
    channel_info = mock_process.get_user_channel("myChannel", "fasta")

    expected = {"input_channel": "myChannel"}
    expected.update(mock_process.RAW_MAPPING["fasta"])

    assert channel_info == expected


def test_main_raw_channel_invalid(mock_process):
    """``get_user_channel`` returns ``None`` for the ``invalid`` input type."""
    channel_info = mock_process.get_user_channel("myChannel", "invalid")

    assert channel_info is None


def test_channels_setup(process_wchannels):
    """``set_channels`` builds the expected context on lane 1 with no forks.

    The ``pid`` entry is ``1_1`` and ``forks`` is an empty string.
    """
    process_wchannels.lane = 1
    process_wchannels.set_channels(pid=1)

    expected = dict(
        input_channel="in_channel",
        output_channel="out_channel",
        template=process_wchannels.template,
        pid="1_1",
        forks="",
    )

    assert process_wchannels._context == expected


def test_channels_setup_withforks(process_wchannels):
    """Preset ``forks`` entries are newline-joined in the context.

    The process is on lane 3, so the ``pid`` entry is ``3_1``.
    """
    process_wchannels.forks = ["A", "B"]
    process_wchannels.lane = 3

    process_wchannels.set_channels(pid=1)

    expected = dict(
        input_channel="in_channel",
        output_channel="out_channel",
        template=process_wchannels.template,
        pid="3_1",
        forks="A\nB",
    )

    assert process_wchannels._context == expected


def test_setup_one_raw_fork(process_wchannels):
    """One main fork renders a ``set`` statement in the context ``forks``."""
    process_wchannels.main_forks = ["A"]
    process_wchannels.lane = 1

    process_wchannels.set_channels(pid=1)

    expected = dict(
        input_channel="in_channel",
        output_channel="out_channel",
        template=process_wchannels.template,
        pid="1_1",
        forks="\nout_channel.set{ A }\n",
    )

    assert process_wchannels._context == expected


def test_setup_multiple_raw_forks(process_wchannels):
    """Two main forks render an ``into`` statement in the context ``forks``."""
    process_wchannels.main_forks = ["A", "B"]
    process_wchannels.lane = 3

    process_wchannels.set_channels(pid=1)

    expected = dict(
        input_channel="in_channel",
        output_channel="out_channel",
        template=process_wchannels.template,
        pid="3_1",
        forks="\nout_channel.into{ A;B }\n",
    )

    assert process_wchannels._context == expected


def test_channels_setup_status(process_wchannels):
    """Status channels become ``STATUS_<name>_<lane>_<pid>`` strings."""
    process_wchannels.status_channels = ["A", "B"]
    process_wchannels.lane = 3

    process_wchannels.set_channels(pid=1)

    expected_status = ["STATUS_A_3_1", "STATUS_B_3_1"]
    assert process_wchannels.status_strs == expected_status


def test_update_main_fork_noprevious(process_wchannels):
    """Checks the fork attributes when there are no previous main forks.

    After ``update_main_forks("A")`` the output channel gains a leading
    underscore. The old channel name and ``A`` become the main forks.
    """
    process_wchannels.set_channels(pid=1)
    process_wchannels.update_main_forks("A")

    observed = (
        process_wchannels.output_channel,
        process_wchannels.main_forks,
        process_wchannels.forks,
    )

    assert observed == (
        "_out_channel",
        ["out_channel", "A"],
        ["\n_out_channel.into{ out_channel;A }\n"],
    )


def test_secondary_channels_multisink(process_wchannels):
    """A secondary channel with two sinks is forked with ``into``."""
    process_wchannels.lane = 2
    process_wchannels.set_channels(pid=1)

    process_wchannels.set_secondary_channel("A", ["B", "C"])

    expected_forks = ["\nA_2_1.into{ B;C }\n"]
    assert process_wchannels.forks == expected_forks


def test_secondary_channels_singlesink(process_wchannels):
    """A secondary channel with one sink is bound with ``set``."""
    process_wchannels.lane = 2
    process_wchannels.set_channels(pid=1)

    process_wchannels.set_secondary_channel("A", ["B"])

    expected_forks = ["\nA_2_1.set{ B }\n"]
    assert process_wchannels.forks == expected_forks


def test_secondary_channels_duplicatesink(process_wchannels):
    """A sink listed twice is bound once with a single ``set``."""
    process_wchannels.lane = 1
    process_wchannels.set_channels(pid=1)

    process_wchannels.set_secondary_channel("A", ["B", "B"])

    expected_forks = ["\nA_1_1.set{ B }\n"]
    assert process_wchannels.forks == expected_forks


def test_status_init(mock_status):
    """The status compiler fixture reports the ``status_compiler`` template."""
    template_name = mock_status.template
    assert template_name == "status_compiler"


def test_status_channel_setup_empty(mock_status):
    """``set_compiler_channels`` raises ``ProcessError`` for an empty list."""
    no_channels = []

    with pytest.raises(eh.ProcessError):
        mock_status.set_compiler_channels(no_channels)


def test_status_channel_single(mock_status):
    """One compiler channel is stored unchanged in the context."""
    mock_status.set_compiler_channels(["A"])

    expected = dict(compile_channels="A")
    assert mock_status._context == expected


def test_status_channel_two(mock_status):
    """Two compiler channels are combined as ``A.mix(B)``."""
    mock_status.set_compiler_channels(["A", "B"])

    expected = dict(compile_channels="A.mix(B)")
    assert mock_status._context == expected


def test_status_channel_multiple(mock_status):
    """Three compiler channels are combined as ``A.mix(B,C)``."""
    mock_status.set_compiler_channels(["A", "B", "C"])

    expected = dict(compile_channels="A.mix(B,C)")
    assert mock_status._context == expected


def test_init_process(mock_init):
    """The init fixture reports the ``init`` template."""
    template_name = mock_init.template
    assert template_name == "init"


def test_init_raw_inputs_single(mock_init):
    """One raw input with one fork gives a ``set`` fork.

    The ``main_inputs`` context entry holds the channel string.
    """
    raw_inputs = {
        "fasta": {"channel": "rawChannel",
                  "raw_forks": ["A"],
                  "channel_str": "rawChannel.Channel"},
    }

    mock_init.set_raw_inputs(raw_inputs)

    observed = (mock_init.forks, mock_init._context["main_inputs"])
    assert observed == (["\nrawChannel.set{ A }\n"], "rawChannel.Channel")


def test_init_raw_inputs_multi_forks(mock_init):
    """One raw input with two forks gives an ``into`` fork."""
    raw_inputs = {
        "fastq": {"channel": "rawChannel",
                  "raw_forks": ["A", "B"],
                  "channel_str": "rawChannel.Channel"},
    }

    mock_init.set_raw_inputs(raw_inputs)

    observed = (mock_init.forks, mock_init._context["main_inputs"])
    assert observed == (["\nrawChannel.into{ A;B }\n"], "rawChannel.Channel")


def test_init_multi_raw_inputs(mock_init):
    """Two raw inputs give one fork each and newline-joined main inputs."""
    raw_inputs = {
        "fastq": {"channel": "rawChannel",
                  "raw_forks": ["A", "B"],
                  "channel_str": "rawChannel.Channel"},
        "fasta": {"channel": "otherChannel",
                  "raw_forks": ["C"],
                  "channel_str": "otherChannel.Channel"},
    }

    mock_init.set_raw_inputs(raw_inputs)

    expected_forks = ["\nrawChannel.into{ A;B }\n",
                      "\notherChannel.set{ C }\n"]
    expected_inputs = "rawChannel.Channel\notherChannel.Channel"

    observed = (mock_init.forks, mock_init._context["main_inputs"])
    assert observed == (expected_forks, expected_inputs)


def test_init_secondary_inputs(mock_init):
    """One secondary input is stored as-is in ``secondary_inputs``."""
    channel_def = "IN_genome_size = Channel.value(params.genomeSize)"

    mock_init.set_secondary_inputs({"genomeSize": channel_def})

    assert mock_init._context["secondary_inputs"] == channel_def


def test_init_multi_secondary_inputs(mock_init):
    """Several secondary inputs are newline-joined in the context."""
    secondary = {
        "genomeSize": "IN_genome_size = Channel.value(params.genomeSize)",
        "other": "Other",
    }

    mock_init.set_secondary_inputs(secondary)

    expected = "IN_genome_size = Channel.value(params.genomeSize)\nOther"
    assert mock_init._context["secondary_inputs"] == expected


def test_directive_update():
    """``update_attributes`` sets the ``version`` directive on ``spades``."""
    spades = assembly.Spades(template="spades")

    spades.update_attributes({"version": "3.9.0"})

    assert spades.directives["spades"]["version"] == "3.9.0"


def test_directive_update2():
    """``cpus`` and ``memory`` updates reach the ``fastqc2`` directives."""
    fastqc = readsqc.Fastqc(template="fastqc")

    fastqc.update_attributes({"cpus": "3", "memory": "4GB"})

    target = fastqc.directives["fastqc2"]
    assert (target["cpus"], target["memory"]) == ("3", "4GB")


def test_directive_update3():
    """All four directive updates reach the ``pilon`` directives."""
    pilon = ap.Pilon(template="pilon")
    updates = {"cpus": "3", "memory": "4GB",
               "container": "another", "version": "1.0"}

    pilon.update_attributes(updates)

    target = pilon.directives["pilon"]
    observed = (target["cpus"], target["memory"],
                target["container"], target["version"])
    assert observed == ("3", "4GB", "another", "1.0")


def test_directive_update4():
    """A braced ``memory`` expression is stored verbatim for ``trimmomatic``."""
    trimmomatic = readsqc.Trimmomatic(template="trimmomatic")
    updates = {"cpus": "3", "memory": "{4.GB*task.attempt}",
               "container": "another", "version": "1.0"}

    trimmomatic.update_attributes(updates)

    target = trimmomatic.directives["trimmomatic"]
    observed = (target["cpus"], target["memory"],
                target["container"], target["version"])
    assert observed == ("3", "{4.GB*task.attempt}", "another", "1.0")


def test_join_compiler(mock_patlas_compiler):
    """The ``join`` operator on two channels renders a join plus a map."""
    mock_patlas_compiler.set_compiler_channels(["A", "B"], operator="join")

    expected = {
        "compile_channels": "A.join(B).map{ ot -> [ ot[0], ot[1..-1] ] }",
    }
    assert mock_patlas_compiler._context == expected


def test_join_compiler_one_channel(mock_patlas_compiler):
    """The ``join`` operator on one channel leaves that channel unchanged."""
    mock_patlas_compiler.set_compiler_channels(["A"], operator="join")

    expected = {"compile_channels": "A"}
    assert mock_patlas_compiler._context == expected


================================================
FILE: flowcraft/tests/test_recipes.py
================================================
import pytest
import pkgutil

from argparse import Namespace

from flowcraft.generator import error_handling as eh
from flowcraft.generator import recipes
from flowcraft.generator import recipe


def test_empty_recipe():
    """Brewing a freshly constructed ``Recipe`` raises ``RecipeError``."""
    blank = recipe.Recipe()

    with pytest.raises(eh.RecipeError):
        blank.brew()


def test_empty_pipeline_str():
    """A recipe with only ``name`` set raises ``RecipeError`` on brew."""
    named_only = recipe.Recipe()
    named_only.name = "teste"

    with pytest.raises(eh.RecipeError):
        named_only.brew()


def test_basic_recipe():
    """Brewing a recipe with no directives leaves ``pipeline_str`` as is."""
    basic = recipe.Recipe()
    basic.name = "teste"
    basic.pipeline_str = "teste"

    basic.brew()

    assert basic.pipeline_str == "teste"


def test_recipe_wdirectives():
    """Brewing embeds both the params and directives of a component."""
    brewed = recipe.Recipe()
    brewed.name = "teste"
    brewed.pipeline_str = "componentA"
    brewed.directives = {
        "componentA": {
            "params": {"paramA": "val"},
            "directives": {"dirA": "val"},
        }
    }

    brewed.brew()

    assert '"params":{"paramA":"val"}' in brewed.pipeline_str
    assert '"dirA":"val"' in brewed.pipeline_str


def test_recipe_partial_directives():
    """Brewing embeds the params when only ``params`` is supplied."""
    brewed = recipe.Recipe()
    brewed.name = "teste"
    brewed.pipeline_str = "componentA"
    brewed.directives = {
        "componentA": {
            "params": {"paramA": "val"},
        }
    }

    brewed.brew()

    assert '"params":{"paramA":"val"}' in brewed.pipeline_str


def test_recipe_partial_directives2():
    """Brewing embeds the directive when only ``directives`` is supplied."""
    brewed = recipe.Recipe()
    brewed.name = "teste"
    brewed.pipeline_str = "componentA"
    brewed.directives = {
        "componentA": {
            "directives": {"dirA": "val"},
        }
    }

    brewed.brew()

    assert '"dirA":"val"' in brewed.pipeline_str


def test_component_str():
    """``_get_component_str`` includes directives passed as ``directives``."""
    brewed = recipe.Recipe()
    brewed.name = "teste"
    brewed.pipeline_str = "componentA"

    component = brewed._get_component_str(
        "componentA", directives={"dirA": "val"})

    assert '"dirA":"val"' in component


def test_component_str2():
    """``_get_component_str`` wraps ``params`` under a ``params`` key."""
    brewed = recipe.Recipe()
    brewed.name = "teste"
    brewed.pipeline_str = "componentA"

    component = brewed._get_component_str(
        "componentA", params={"paramA": "val"})
    # Echo the rendered string; pytest shows captured output on failure.
    print(component)

    assert '"params":{"paramA":"val"}' in component


def test_component_str3():
    """``_get_component_str`` includes both params and directives."""
    brewed = recipe.Recipe()
    brewed.name = "teste"
    brewed.pipeline_str = "componentA"

    component = brewed._get_component_str(
        "componentA",
        params={"paramA": "val"},
        directives={"dirA": "val"},
    )

    assert '"params":{"paramA":"val"}' in component
    assert '"dirA":"val"' in component


def test_brew_recipe():
    """``brew_recipe("innuca")`` returns something other than ``""``."""
    brewed = recipe.brew_recipe("innuca")

    assert brewed != ""


def test_bad_recipe_name():
    """``brew_recipe`` exits with ``SystemExit`` for ``bad_name``."""
    unknown = "bad_name"

    with pytest.raises(SystemExit):
        recipe.brew_recipe(unknown)


def test_all_recipes():
    """Instantiate every class found in each module of the recipes package.

    Modules are discovered with ``pkgutil.iter_modules`` and loaded with
    ``importlib.import_module``. The older
    ``importer.find_module(...).load_module(...)`` chain relies on finder and
    loader methods that are deprecated and gone from current Python releases.
    The test passes as long as no import or constructor raises.
    """
    # Local import keeps this block self-contained.
    import importlib

    prefix = "{}.".format(recipes.__name__)
    for _finder, modname, _ispkg in pkgutil.iter_modules(recipes.__path__,
                                                         prefix):

        _module = importlib.import_module(modname)

        # Every class object in the module namespace is instantiated,
        # including any class the module merely imports.
        _recipe_classes = [cls for cls in vars(_module).values() if
                           isinstance(cls, type)]

        for cls in _recipe_classes:
            cls()


def test_innuendo_recipe():
    """Smoke test: ``brew_innuendo`` runs with ``tasks=None``."""
    recipe.brew_innuendo(Namespace(tasks=None))


def test_innuendo_partial_recipe():
    """Smoke test: ``brew_innuendo`` runs with one task name.

    The task passed is ``integrity_coverage``.
    """
    recipe.brew_innuendo(Namespace(tasks="integrity_coverage"))


def test_list_recipes():
    """``list_recipes()`` with no argument ends with ``SystemExit``."""
    with pytest.raises(SystemExit):
        recipe.list_recipes()

def test_list_recipes_full():
    """``list_recipes(True)`` ends with ``SystemExit``."""
    full_listing = True

    with pytest.raises(SystemExit):
        recipe.list_recipes(full_listing)

================================================
FILE: flowcraft/tests/test_sanity.py
================================================
import pytest
from contextlib import contextmanager

try:
    import generator.pipeline_parser as ps
    from generator.error_handling import SanityError
except ImportError:
    import flowcraft.generator.pipeline_parser as ps
    from flowcraft.generator.error_handling import SanityError


@contextmanager
def not_raises(exception, msg):
    """Fail the current test if the wrapped block raises ``exception``.

    Parameters
    ----------
    exception : type or tuple of types
        Exception class (or classes) that must not escape the ``with`` body.
    msg : str
        Failure message handed to ``pytest.fail``.

    Exceptions that do not match ``exception`` propagate unchanged.
    """
    try:
        yield
    except exception:
        # pytest.fail() raises its own failure exception, so no leading
        # ``raise`` is needed; one would never be reached.
        pytest.fail(msg)

def test_empty_tasks():
    """``empty_tasks`` rejects blank and whitespace-only pipeline strings."""
    blank_pipelines = ("   ", "")

    for pipeline in blank_pipelines:
        with pytest.raises(SanityError):
            ps.empty_tasks(pipeline)


def test_no_brackets_fail():
    """``brackets_but_no_lanes`` rejects a lane token with no brackets."""
    bad_pipelines = ("A B C | D",)

    for pipeline in bad_pipelines:
        with pytest.raises(SanityError):
            ps.brackets_but_no_lanes(pipeline)


def test_number_of_forks_fail():
    """``brackets_insanity_check`` rejects unbalanced brackets."""
    bad_pipelines = (
        "A B (( C | D)",
        "A B ( C | D",
    )

    for pipeline in bad_pipelines:
        with pytest.raises(SanityError):
            ps.brackets_insanity_check(pipeline)


def test_lane_char_fail():
    """``lane_char_insanity_check`` rejects a doubled lane token ``||``."""
    bad_pipelines = ("A B (D || E)",)

    for pipeline in bad_pipelines:
        with pytest.raises(SanityError):
            ps.lane_char_insanity_check(pipeline)


def test_final_char_fail():
    """``final_char_insanity_check`` rejects strings ending in ``|``."""
    bad_pipelines = (
        "|",
        "A B |",
    )

    for pipeline in bad_pipelines:
        with pytest.raises(SanityError):
            ps.final_char_insanity_check(pipeline)


def test_fork_no_proc_fail():
    """``fork_procs_insanity_check`` rejects a fork with an empty side."""
    bad_pipelines = (
        "A B (|E)",
        "A B (E|)",
    )

    for pipeline in bad_pipelines:
        with pytest.raises(SanityError):
            ps.fork_procs_insanity_check(pipeline)


def test_double_fork_fail():
    """``start_proc_insanity_check`` rejects ``A B (( C | D ) E )``."""
    bad_pipelines = ("A B (( C | D ) E )",)

    for pipeline in bad_pipelines:
        with pytest.raises(SanityError):
            ps.start_proc_insanity_check(pipeline)


def test_close_token_ending_fail():
    """``late_proc_insanity_check`` rejects a process after a closed fork."""
    bad_pipelines = ("A B ( C | D ) E",)

    for pipeline in bad_pipelines:
        with pytest.raises(SanityError):
            ps.late_proc_insanity_check(pipeline)


def test_inner_forks_fail():
    """``inner_fork_insanity_checks`` rejects a fork with no lane token."""
    bad_pipelines = ("A B ( A D )",)

    for pipeline in bad_pipelines:
        with pytest.raises(SanityError):
            ps.inner_fork_insanity_checks(pipeline)


def test_string_pass_all():
    """Well-formed pipelines pass six individual sanity checks.

    Each string runs through all six checks inside one ``not_raises``
    context, so any ``SanityError`` fails the test with the offending
    pipeline in the message.
    """
    # NOTE: per the original author, the checks exercised here do not accept
    # strings with spaces (presumably around fork tokens - TODO confirm).
    good_pipelines = (
        "A B",
        "(A|B)",
        "A B (C|D)",
        "A B (D|E(F|G))",
        "A B (C|B)",
        "F T(S(P(P|M)|M(P|M(P| M)))|Sp)",
    )

    for pipeline in good_pipelines:
        failure_msg = "pipeline: {}".format(pipeline)
        with not_raises(SanityError, failure_msg):
            ps.brackets_insanity_check(pipeline)
            ps.lane_char_insanity_check(pipeline)
            ps.brackets_but_no_lanes(pipeline)
            ps.fork_procs_insanity_check(pipeline)
            ps.start_proc_insanity_check(pipeline)
            ps.late_proc_insanity_check(pipeline)


def test_string_spaces_pass_all():
    """Check that well-formed strings with spaces pass the inner fork check.

    None of the pipeline strings below may make
    ``ps.inner_fork_insanity_checks`` raise ``SanityError``.
    """

    # Unlike the other individual checks, this one is exercised with strings
    # that keep their spaces.
    well_formed_pipelines = (
        "A B",
        "(A | B)",
        "A B ( C | D)",
        "A B (D | E (F | G))",
        "A B ( C | B)",
        # spaces are important for this check
        "F T (S(P(P| M) |M(P|M(P| M)))|Sp)",
    )

    for pipeline in well_formed_pipelines:
        with not_raises(SanityError, "pipeline: {}".format(pipeline)):
            ps.inner_fork_insanity_checks(pipeline)


def test_string_pass_all_wrapper():
    """Check that well-formed pipeline strings pass the wrapper check.

    None of the pipeline strings below may make ``ps.insanity_checks``
    raise ``SanityError``.
    """

    well_formed_pipelines = (
        "A B",
        "(A | B)",
        "A B ( C | D)",
        "A B (D | E (F | G))",
        "A B ( C | B)",
    )

    for pipeline in well_formed_pipelines:
        with not_raises(SanityError, "pipeline: {}".format(pipeline)):
            ps.insanity_checks(pipeline)

================================================
FILE: requirements.txt
================================================
numpydoc

================================================
FILE: setup.py
================================================
import io

from setuptools import setup

import flowcraft

# The distribution version is taken from the package's own ``__version__``.
VERSION = flowcraft.__version__

# Decode the README explicitly instead of relying on the locale's default
# encoding, which can fail or garble the text on non-UTF-8 locales.
# ``io.open`` accepts ``encoding`` on both Python 2 and Python 3.
# Assumes README.md is stored as UTF-8 -- TODO confirm.
with io.open("README.md", encoding="utf-8") as fh:
    README = fh.read()

setup(
    name="flowcraft",
    version="{}".format(VERSION),
    packages=["flowcraft",
              "flowcraft.templates",
              "flowcraft.templates.flowcraft_utils",
              "flowcraft.generator",
              "flowcraft.generator.components",
              "flowcraft.generator.recipes"],
    package_dir={"flowcraft": "flowcraft"},
    # Non-python files shipped inside the ``flowcraft`` package.
    package_data={"flowcraft": ["nextflow.config",
                                "profiles.config",
                                "bin/*",
                                "lib/*",
                                "resources/*",
                                "generator/templates/*"]},
    data_files=[("", ["LICENSE"])],
    # NOTE(review): ``argparse`` ships with the standard library since
    # Python 2.7 / 3.2, so this requirement is likely redundant -- confirm
    # before removing.
    install_requires=[
        "pympler",
        "python-dateutil",
        "argparse",
        "jinja2",
        "requests"
    ],
    description="A Nextflow pipeline assembler for genomics. Pick your "
                "modules. Assemble them. Run the pipeline.",
    # The README is markdown, hence the explicit content type below.
    long_description=README,
    long_description_content_type="text/markdown",
    url="https://github.com/assemblerflow/flowcraft",
    author="Diogo N Silva",
    author_email="o.diogosilva@gmail.com",
    license="GPL3",
    # Installs the ``flowcraft`` command, bound to ``flowcraft.flowcraft:main``.
    entry_points={
        "console_scripts": [
            "flowcraft = flowcraft.flowcraft:main"
        ]
    }
)