Repository: yiluheihei/microbiomeMarker Branch: devel Commit: 66fc685c97be Files: 179 Total size: 680.9 KB Directory structure: gitextract_few3gk_j/ ├── .Rbuildignore ├── .gitattributes ├── .github/ │ ├── .gitignore │ ├── ISSUE_TEMPLATE/ │ │ └── issue_template.md │ └── workflows/ │ ├── check-bioc.yml │ └── pkgdown.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R/ │ ├── AllClasses.R │ ├── AllGenerics.R │ ├── DA-aldex.R │ ├── DA-all.R │ ├── DA-ancom.R │ ├── DA-ancombc.R │ ├── DA-comparing.R │ ├── DA-deseq2.R │ ├── DA-edgeR.R │ ├── DA-lefse.R │ ├── DA-limma-voom.R │ ├── DA-metagenomeSeq.R │ ├── DA-simple-statistic.R │ ├── DA-sl.R │ ├── DA-test-multiple-groups.R │ ├── DA-test-two-groups.R │ ├── abundances-methods.R │ ├── aggregate-taxa.R │ ├── assignment-methods.R │ ├── confounder.R │ ├── data.R │ ├── extract-methods.R │ ├── import-biobakery-lefse_in.R │ ├── import-dada2.R │ ├── import-picrust2.R │ ├── import-qiime2.R │ ├── lefse-utilities.R │ ├── microbiomeMarker.R │ ├── normalization.R │ ├── plot-abundance.R │ ├── plot-cladogram.R │ ├── plot-comparing.R │ ├── plot-effect-size.R │ ├── plot-heatmap.R │ ├── plot-postHocTest.R │ ├── plot-sl-roc.R │ ├── post-hoc-test.R │ ├── reexports.R │ ├── subset-marker.R │ ├── summarize-taxa.R │ ├── sysdata.rda │ ├── test-utilities.R │ ├── transform.R │ └── utilities.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── codecov.yml ├── data/ │ ├── caporaso.rda │ ├── cid_ying.rda │ ├── ecam.rda │ ├── enterotypes_arumugam.rda │ ├── kostic_crc.rda │ ├── oxygen.rda │ ├── pediatric_ibd.rda │ └── spontaneous_colitis.rda ├── data-raw/ │ ├── available_ranks.R │ └── data.R ├── inst/ │ ├── CITATION │ └── extdata/ │ ├── dada2_samdata.txt │ ├── dada2_seqtab.rds │ ├── dada2_taxtab.rds │ ├── picrust2_metadata.tsv │ ├── refseq.qza │ ├── sample-metadata.tsv │ ├── table.qza │ ├── taxonomy.qza │ └── tree.qza ├── man/ │ ├── abundances-methods.Rd │ ├── aggregate_taxa.Rd │ ├── assign-marker_table.Rd │ ├── 
assign-otu_table.Rd │ ├── compare_DA.Rd │ ├── confounder.Rd │ ├── data-caporaso.Rd │ ├── data-cid_ying.Rd │ ├── data-ecam.Rd │ ├── data-enterotypes_arumugam.Rd │ ├── data-kostic_crc.Rd │ ├── data-oxygen.Rd │ ├── data-pediatric_ibd.Rd │ ├── data-spontaneous_colitis.Rd │ ├── effect_size-plot.Rd │ ├── extract-methods.Rd │ ├── extract_posthoc_res.Rd │ ├── figures/ │ │ └── sticker.R │ ├── get_treedata_phyloseq.Rd │ ├── import_dada2.Rd │ ├── import_picrust2.Rd │ ├── import_qiime2.Rd │ ├── marker_table-class.Rd │ ├── marker_table-methods.Rd │ ├── microbiomeMarker-class.Rd │ ├── microbiomeMarker-package.Rd │ ├── microbiomeMarker.Rd │ ├── nmarker-methods.Rd │ ├── normalize-methods.Rd │ ├── phyloseq2DESeq2.Rd │ ├── phyloseq2edgeR.Rd │ ├── phyloseq2metagenomeSeq.Rd │ ├── plot.compareDA.Rd │ ├── plot_abundance.Rd │ ├── plot_cladogram.Rd │ ├── plot_heatmap.Rd │ ├── plot_postHocTest.Rd │ ├── plot_sl_roc.Rd │ ├── postHocTest-class.Rd │ ├── postHocTest.Rd │ ├── reexports.Rd │ ├── run_aldex.Rd │ ├── run_ancom.Rd │ ├── run_ancombc.Rd │ ├── run_deseq2.Rd │ ├── run_edger.Rd │ ├── run_lefse.Rd │ ├── run_limma_voom.Rd │ ├── run_marker.Rd │ ├── run_metagenomeseq.Rd │ ├── run_posthoc_test.Rd │ ├── run_simple_stat.Rd │ ├── run_sl.Rd │ ├── run_test_multiple_groups.Rd │ ├── run_test_two_groups.Rd │ ├── subset_marker.Rd │ ├── summarize_taxa.Rd │ ├── summary.compareDA.Rd │ └── transform_abundances.Rd ├── tests/ │ ├── testthat/ │ │ ├── _snaps/ │ │ │ ├── ancom.md │ │ │ ├── edgeR.md │ │ │ ├── lefse.md │ │ │ ├── limma-voom.md │ │ │ ├── multiple-groups-test.md │ │ │ └── two-group-test.md │ │ ├── data/ │ │ │ ├── ancom-zero.csv │ │ │ ├── ancom-zero_neg_lb.csv │ │ │ ├── data_tax_duplicate.rds │ │ │ └── generate_cladogram_annotation.rds │ │ ├── test-abundances.R │ │ ├── test-aldex.R │ │ ├── test-ancom.R │ │ ├── test-ancombc.R │ │ ├── test-assignment.R │ │ ├── test-barplot.R │ │ ├── test-comparing.R │ │ ├── test-confounder.R │ │ ├── test-edgeR.R │ │ ├── test-extract.R │ │ ├── test-import-picrust2.R │ │ 
├── test-import-qiime2.R │ │ ├── test-lefse-input.R │ │ ├── test-lefse.R │ │ ├── test-limma-voom.R │ │ ├── test-metagenomeSeq.R │ │ ├── test-microbiomeMaker-methods.R │ │ ├── test-microbiomeMarker-class.R │ │ ├── test-multiple-groups-test.R │ │ ├── test-normalization.R │ │ ├── test-sl.R │ │ ├── test-summarize-tax.R │ │ ├── test-transform.R │ │ ├── test-two-group-test.R │ │ ├── test-utilities.R │ │ ├── test_cladogram.R │ │ └── test_fix_duplicate_tax.R │ └── testthat.R └── vignettes/ ├── .gitignore ├── microbiomeMarker-vignette.Rmd └── vignette.bib ================================================ FILE CONTENTS ================================================ ================================================ FILE: .Rbuildignore ================================================ ^microbiomeMarker\.Rproj$ ^\.Rproj\.user$ ^test\.R$ ^README\.Rmd$ ^data-raw$ ^lefse$ ^dev_test$ ^\.github/workflows/R-CMD-check\.yaml$ ^\.github$ ^LICENSE\.md$ ^codecov\.yml$ ^_pkgdown\.yml$ ^docs$ ^pkgdown$ ^man/figures/micribiome.png$ ================================================ FILE: .gitattributes ================================================ * text=lf ================================================ FILE: .github/.gitignore ================================================ *.html ================================================ FILE: .github/ISSUE_TEMPLATE/issue_template.md ================================================ Please briefly describe your problem, what output actually happened, and what output you expect. Please provide a minimal reproducible example. For more details on how to make a great minimal reproducible example, see https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example and https://www.tidyverse.org/help/#reprex. 
``` Brief description of the problem # insert minimal reproducible example here ``` ================================================ FILE: .github/workflows/check-bioc.yml ================================================ ## Read more about the features of this GitHub Actions workflow ## at https://lcolladotor.github.io/biocthis/articles/biocthis.html#use_bioc_github_action ## ## For more details, check the biocthis developer notes vignette at ## https://lcolladotor.github.io/biocthis/articles/biocthis_dev_notes.html ## ## You can add this workflow to other packages using: ## > biocthis::use_bioc_github_action() ## ## Using GitHub Actions exposes you to many details about how R packages are ## compiled and installed in several operating systems. ## If you need help, please follow the steps listed at ## https://github.com/r-lib/actions#where-to-find-help ## ## If you found an issue specific to biocthis's GHA workflow, please report it ## with the information that will make it easier for others to help you. ## Thank you! ## Acronyms: ## * GHA: GitHub Action ## * OS: operating system on: push: pull_request: name: R-CMD-check-bioc ## These environment variables control whether to run GHA code later on that is ## specific to testthat, covr, and pkgdown. ## ## If you need to clear the cache of packages, update the number inside ## cache-version as discussed at https://github.com/r-lib/actions/issues/86. ## Note that you can always run a GHA test without the cache by using the word ## "/nocache" in the commit message. env: has_testthat: 'true' run_covr: 'true' run_pkgdown: 'true' has_RUnit: 'false' cache-version: 'cache-v1' run_docker: 'false' jobs: build-check: runs-on: ${{ matrix.config.os }} name: ${{ matrix.config.os }} (${{ matrix.config.r }}) container: ${{ matrix.config.cont }} ## Environment variables unique to this job. 
strategy: fail-fast: false matrix: config: - { os: ubuntu-latest, r: '4.2', bioc: '3.15', cont: "bioconductor/bioconductor_docker:RELEASE_3_15", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - { os: macOS-latest, r: '4.2', bioc: '3.15'} - { os: windows-latest, r: '4.2', bioc: '3.15'} ## Check https://github.com/r-lib/actions/tree/master/examples ## for examples using the http-user-agent env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} NOT_CRAN: true TZ: UTC GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: ## Set the R library to the directory matching the ## R packages cache step further below when running on Docker (Linux). - name: Set R Library home on Linux if: runner.os == 'Linux' run: | mkdir /__w/_temp/Library echo ".libPaths('/__w/_temp/Library')" > ~/.Rprofile ## Most of these steps are the same as the ones in ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml ## If they update their steps, we will also need to update ours. 
- name: Checkout Repository uses: actions/checkout@v2 ## R is already included in the Bioconductor docker images - name: Setup R from r-lib if: runner.os != 'Linux' uses: r-lib/actions/setup-r@master with: r-version: ${{ matrix.config.r }} http-user-agent: ${{ matrix.config.http-user-agent }} ## pandoc is already included in the Bioconductor docker images - name: Setup pandoc from r-lib if: runner.os != 'Linux' uses: r-lib/actions/setup-pandoc@master - name: Query dependencies run: | install.packages('remotes') saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) shell: Rscript {0} - name: Restore R package cache if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'" uses: actions/cache@v2 with: path: ${{ env.R_LIBS_USER }} key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-${{ hashFiles('.github/depends.Rds') }} restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2- - name: Cache R packages on Linux if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " uses: actions/cache@v2 with: path: /home/runner/work/_temp/Library key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-${{ hashFiles('.github/depends.Rds') }} restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2- - name: Install Linux system dependencies if: runner.os == 'Linux' run: | sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "20.04")), collapse = " "))') echo $sysreqs sudo -s eval "$sysreqs" - name: Install macOS system dependencies if: matrix.config.os == 'macOS-latest' run: | ## Enable installing XML from source if needed brew install libxml2 echo "XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV ## Required to install magick as noted at ## 
https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2 brew install imagemagick@6 ## For textshaping, required by ragg, and required by pkgdown brew install harfbuzz fribidi ## For installing usethis's dependency gert brew install libgit2 ## Required for tcltk brew install xquartz --cask - name: Install Windows system dependencies if: runner.os == 'Windows' run: | ## Edit below if you have any Windows system dependencies shell: Rscript {0} - name: Install BiocManager run: | message(paste('****', Sys.time(), 'installing BiocManager ****')) remotes::install_cran("BiocManager") shell: Rscript {0} - name: Set BiocVersion run: | BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE, force = TRUE) shell: Rscript {0} - name: Install dependencies pass 1 run: | ## Try installing the package dependencies in steps. First the local ## dependencies, then any remaining dependencies to avoid the ## issues described at ## https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html ## https://github.com/r-lib/remotes/issues/296 ## Ideally, all dependencies should get installed in the first pass. ## Set the repos source depending on the OS ## Alternatively use https://storage.googleapis.com/bioconductor_docker/packages/ ## though based on https://bit.ly/bioc2021-package-binaries ## the Azure link will be the main one going forward. 
gha_repos <- if( .Platform$OS.type == "unix" && Sys.info()["sysname"] != "Darwin" ) c( "AnVIL" = "https://bioconductordocker.blob.core.windows.net/packages/3.15/bioc", BiocManager::repositories() ) else BiocManager::repositories() ## For running the checks message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****')) install.packages(c("rcmdcheck", "BiocCheck"), repos = gha_repos) ## Pass #1 at installing dependencies ## This pass uses AnVIL-powered fast binaries ## details at https://github.com/nturaga/bioc2021-bioconductor-binaries ## The speed gains only apply to the docker builds. message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****')) remotes::install_local(dependencies = TRUE, repos = gha_repos, build_vignettes = FALSE, upgrade = TRUE) continue-on-error: true shell: Rscript {0} - name: Install dependencies pass 2 run: | ## Pass #2 at installing dependencies ## This pass does not use AnVIL and will thus update any packages ## that have since been updated in Bioconductor message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****')) remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE, force = TRUE) shell: Rscript {0} - name: Install BiocGenerics if: env.has_RUnit == 'true' run: | ## Install BiocGenerics BiocManager::install("BiocGenerics") shell: Rscript {0} - name: Install covr if: github.ref == 'refs/heads/master' && env.run_covr == 'true' && runner.os == 'Linux' run: | remotes::install_cran("covr") shell: Rscript {0} - name: Install pkgdown if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: | remotes::install_cran("pkgdown") shell: Rscript {0} - name: Session info run: | options(width = 100) pkgs <- installed.packages()[, "Package"] sessioninfo::session_info(pkgs, include_base = TRUE) shell: Rscript {0} - name: Run CMD check env: 
_R_CHECK_CRAN_INCOMING_: false DISPLAY: 99.0 run: | options(crayon.enabled = TRUE) rcmdcheck::rcmdcheck( args = c("--no-manual", "--no-vignettes", "--timings"), build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"), error_on = "warning", check_dir = "check" ) shell: Rscript {0} ## Might need to add this to the if: && runner.os == 'Linux' - name: Reveal testthat details if: env.has_testthat == 'true' run: find . -name testthat.Rout -exec cat '{}' ';' - name: Run RUnit tests if: env.has_RUnit == 'true' run: | BiocGenerics:::testPackage() shell: Rscript {0} - name: Run BiocCheck env: DISPLAY: 99.0 run: | BiocCheck::BiocCheck( dir('check', 'tar.gz$', full.names = TRUE), `quit-with-status` = TRUE, `no-check-R-ver` = TRUE, `no-check-bioc-help` = TRUE ) shell: Rscript {0} - name: Test coverage if: github.ref == 'refs/heads/master' && env.run_covr == 'true' && runner.os == 'Linux' run: | covr::codecov() shell: Rscript {0} - name: Install package if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: R CMD INSTALL . - name: Build and deploy pkgdown site if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: | git config --local user.name "$GITHUB_ACTOR" git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" Rscript -e "pkgdown::deploy_to_branch(new_process = FALSE)" shell: bash {0} ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE) ## at least once locally before this will work. This creates the gh-pages ## branch (erasing anything you haven't version controlled!) and ## makes the git history recognizable by pkgdown. - name: Upload check results if: failure() uses: actions/upload-artifact@master with: name: ${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-results path: check ## Note that DOCKER_PASSWORD is really a token for your dockerhub ## account, not your actual dockerhub account password. 
## This comes from ## https://seandavi.github.io/BuildABiocWorkshop/articles/HOWTO_BUILD_WORKSHOP.html#6-add-secrets-to-github-repo ## Check https://github.com/docker/build-push-action/tree/releases/v1 ## for more details. - uses: docker/build-push-action@v1 if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && runner.os == 'Linux' " with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} repository: yiluheihei/microbiomemarker tag_with_ref: true tag_with_sha: true tags: latest ================================================ FILE: .github/workflows/pkgdown.yaml ================================================ on: push: branches: - main - master tags: - '*' name: pkgdown jobs: pkgdown: runs-on: macOS-latest env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - uses: actions/checkout@v2 - uses: r-lib/actions/setup-r@v1 - uses: r-lib/actions/setup-pandoc@v1 - name: Install XQuartz # .onLoad failed in loadNamespace() for 'Cairo' run: brew install xquartz --cask - name: Query dependencies run: | install.packages('remotes') saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") shell: Rscript {0} - name: Restore R package cache uses: actions/cache@v2 with: path: ${{ env.R_LIBS_USER }} key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- - name: Install dependencies run: | remotes::install_deps(dependencies = TRUE) install.packages("pkgdown", type = "binary") shell: Rscript {0} - name: Install package run: R CMD INSTALL . 
- name: Deploy package run: | git config --local user.email "actions@github.com" git config --local user.name "GitHub Actions" Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' ================================================ FILE: .gitignore ================================================ .RData __MACOSX docs inst/doc .Rproj.user *.Rproj ================================================ FILE: DESCRIPTION ================================================ Package: microbiomeMarker Title: microbiome biomarker analysis toolkit Version: 1.13.2 Authors@R: person(given = "Yang", family = "Cao", role = c("aut", "cre"), email = "caoyang.name@gmail.com") Description: To date, a number of methods have been developed for microbiome marker discovery based on metagenomic profiles, e.g. LEfSe. However, each of these methods has its own advantages and disadvantages, and none of them is considered standard or universal. Moreover, different programs or software may be developed using different programming languages, even in different operating systems. Here, we have developed an all-in-one R package microbiomeMarker that integrates commonly used differential analysis methods as well as three machine learning-based approaches, including Logistic regression, Random forest, and Support vector machine, to facilitate the identification of microbiome markers. 
License: GPL-3 biocViews: Metagenomics, Microbiome, DifferentialExpression URL: https://github.com/yiluheihei/microbiomeMarker BugReports: https://github.com/yiluheihei/microbiomeMarker/issues Depends: R (>= 4.1.0) Imports: dplyr, phyloseq, magrittr, purrr, MASS, utils, ggplot2, tibble, rlang, stats, coin, ggtree, tidytree, methods, IRanges, tidyr, patchwork, ggsignif, metagenomeSeq, DESeq2, edgeR, BiocGenerics, Biostrings, yaml, biomformat, S4Vectors, Biobase, ComplexHeatmap, ANCOMBC, caret, limma, ALDEx2, multtest, plotROC, vegan, pROC, BiocParallel Encoding: UTF-8 RoxygenNote: 7.3.2 Roxygen: list(markdown = TRUE) Suggests: testthat, covr, glmnet, Matrix, kernlab, e1071, ranger, knitr, rmarkdown, BiocStyle, withr, microbiome VignetteBuilder: knitr Config/testthat/edition: 3 ================================================ FILE: LICENSE.md ================================================ GNU General Public License ========================== _Version 3, 29 June 2007_ _Copyright © 2007 Free Software Foundation, Inc. <>_ Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. ## Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. 
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: **(1)** assert copyright on the software, and **(2)** offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. 
If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. ## TERMS AND CONDITIONS ### 0. Definitions “This License” refers to version 3 of the GNU General Public License. “Copyright” also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. “The Program” refers to any copyrightable work licensed under this License. Each licensee is addressed as “you”. “Licensees” and “recipients” may be individuals or organizations. To “modify” a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a “modified version” of the earlier work or a work “based on” the earlier work. A “covered work” means either the unmodified Program or a work based on the Program. To “propagate” a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To “convey” a work means any kind of propagation that enables other parties to make or receive copies. 
Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays “Appropriate Legal Notices” to the extent that it includes a convenient and prominently visible feature that **(1)** displays an appropriate copyright notice, and **(2)** tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. ### 1. Source Code The “source code” for a work means the preferred form of the work for making modifications to it. “Object code” means any non-source form of a work. A “Standard Interface” means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The “System Libraries” of an executable work include anything, other than the work as a whole, that **(a)** is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and **(b)** serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A “Major Component”, in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The “Corresponding Source” for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. ### 2. Basic Permissions All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 
Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. ### 3. Protecting Users' Legal Rights From Anti-Circumvention Law No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. ### 4. Conveying Verbatim Copies You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. ### 5. Conveying Modified Source Versions You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: * **a)** The work must carry prominent notices stating that you modified it, and giving a relevant date. 
* **b)** The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to “keep intact all notices”. * **c)** You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. * **d)** If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an “aggregate” if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. ### 6. Conveying Non-Source Forms You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: * **a)** Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
* **b)** Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either **(1)** a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or **(2)** access to copy the Corresponding Source from a network server at no charge. * **c)** Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. * **d)** Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. * **e)** Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A “User Product” is either **(1)** a “consumer product”, which means any tangible personal property which is normally used for personal, family, or household purposes, or **(2)** anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, “normally used” refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. “Installation Information” for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. ### 7. Additional Terms “Additional permissions” are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: * **a)** Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or * **b)** Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or * **c)** Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or * **d)** Limiting the use for publicity purposes of names of licensors or authors of the material; or * **e)** Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or * **f)** Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered “further restrictions” within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. ### 8. Termination You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated **(a)** provisionally, unless and until the copyright holder explicitly and finally terminates your license, and **(b)** permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. ### 9. Acceptance Not Required for Having Copies You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. ### 10. Automatic Licensing of Downstream Recipients Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An “entity transaction” is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. ### 11. Patents A “contributor” is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's “contributor version”. A contributor's “essential patent claims” are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, “control” includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a “patent license” is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To “grant” such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either **(1)** cause the Corresponding Source to be so available, or **(2)** arrange to deprive yourself of the benefit of the patent license for this particular work, or **(3)** arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
“Knowingly relying” means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is “discriminatory” if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license **(a)** in connection with copies of the covered work conveyed by you (or copies made from those copies), or **(b)** primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. ### 12. 
No Surrender of Others' Freedom If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. ### 13. Use with the GNU Affero General Public License Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. ### 14. Revised Versions of this License The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. 
If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. ### 15. Disclaimer of Warranty THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. ### 16. Limitation of Liability IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. ### 17. 
Interpretation of Sections 15 and 16

If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee.

_END OF TERMS AND CONDITIONS_

## How to Apply These Terms to Your New Programs

If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms.

To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the “copyright” line and a pointer to where the full notice is found.

Copyright (C) 2020 Yang Cao

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode:

microbiomeMarker Copyright (C) 2020 Yang Cao This program comes with ABSOLUTELY NO WARRANTY; for details type 'show w'.
This is free software, and you are welcome to redistribute it under certain conditions; type 'show c' for details.

The hypothetical commands `show w` and `show c` should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an “about box”.

You should also get your employer (if you work as a programmer) or school, if any, to sign a “copyright disclaimer” for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <https://www.gnu.org/licenses/>.

The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <https://www.gnu.org/licenses/why-not-lgpl.html>.

================================================
FILE: NAMESPACE
================================================
# Generated by roxygen2: do not edit by hand

S3method(plot,compareDA)
S3method(summary,compareDA)
export("%>%")
export("marker_table<-")
export(abundances)
export(aggregate_taxa)
export(compare_DA)
export(confounder)
export(extract_posthoc_res)
export(import_biom)
export(import_dada2)
export(import_mothur)
export(import_picrust2)
export(import_qiime)
export(import_qiime2)
export(marker_table)
export(microbiomeMarker)
export(nmarker)
export(norm_clr)
export(norm_cpm)
export(norm_css)
export(norm_rarefy)
export(norm_rle)
export(norm_tmm)
export(norm_tss)
export(nsamples)
export(ntaxa)
export(otu_table)
export(otu_table2metagenomeSeq)
export(phyloseq2DESeq2)
export(phyloseq2edgeR)
export(phyloseq2metagenomeSeq)
export(plot_abundance)
export(plot_cladogram)
export(plot_ef_bar)
export(plot_ef_dot)
export(plot_heatmap)
export(plot_postHocTest)
export(plot_sl_roc)
export(postHocTest)
export(run_aldex)
export(run_ancom)
export(run_ancombc)
export(run_deseq2)
export(run_edger) export(run_lefse) export(run_limma_voom) export(run_marker) export(run_metagenomeseq) export(run_posthoc_test) export(run_simple_stat) export(run_sl) export(run_test_multiple_groups) export(run_test_two_groups) export(sample_data) export(sample_names) export(subset_marker) export(summarize_taxa) export(tax_table) export(taxa_names) export(transform_abundances) exportClasses(marker_table) exportClasses(microbiomeMarker) exportClasses(postHocTest) exportMethods("[") exportMethods(normalize) exportMethods(nsamples) exportMethods(ntaxa) exportMethods(otu_table) exportMethods(sample_data) exportMethods(sample_names) exportMethods(show) exportMethods(tax_table) exportMethods(taxa_names) importClassesFrom(IRanges,DataFrameList) importClassesFrom(phyloseq,otu_table) importClassesFrom(phyloseq,phyloseq) importClassesFrom(phyloseq,taxonomyTable) importFrom(ANCOMBC,ancombc) importFrom(Biobase,"pData<-") importFrom(Biobase,AnnotatedDataFrame) importFrom(Biobase,pData) importFrom(ComplexHeatmap,Heatmap) importFrom(ComplexHeatmap,HeatmapAnnotation) importFrom(DESeq2,"dispersions<-") importFrom(IRanges,DataFrameList) importFrom(biomformat,biom_data) importFrom(biomformat,read_biom) importFrom(dplyr,"%>%") importFrom(dplyr,arrange) importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) importFrom(dplyr,desc) importFrom(dplyr,everything) importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,group_modify) importFrom(dplyr,group_split) importFrom(dplyr,mutate) importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,slice) importFrom(dplyr,summarise) importFrom(dplyr,ungroup) importFrom(ggplot2,aes) importFrom(ggplot2,aes_) importFrom(ggplot2,aes_string) importFrom(ggplot2,annotate) importFrom(ggplot2,coord_equal) importFrom(ggplot2,element_blank) importFrom(ggplot2,element_text) importFrom(ggplot2,facet_wrap) importFrom(ggplot2,geom_boxplot) importFrom(ggplot2,geom_col) importFrom(ggplot2,geom_errorbar) importFrom(ggplot2,geom_point) 
importFrom(ggplot2,geom_rect) importFrom(ggplot2,ggplot) importFrom(ggplot2,guide_axis) importFrom(ggplot2,guide_legend) importFrom(ggplot2,guides) importFrom(ggplot2,labeller) importFrom(ggplot2,labs) importFrom(ggplot2,scale_shape_manual) importFrom(ggplot2,scale_x_continuous) importFrom(ggplot2,scale_y_discrete) importFrom(ggplot2,theme) importFrom(ggplot2,theme_bw) importFrom(ggtree,geom_cladelabel) importFrom(ggtree,geom_hilight) importFrom(ggtree,geom_point2) importFrom(ggtree,ggtree) importFrom(magrittr,"%>%") importFrom(metagenomeSeq,"normFactors<-") importFrom(metagenomeSeq,MRcounts) importFrom(metagenomeSeq,cumNorm) importFrom(metagenomeSeq,cumNormStatFast) importFrom(metagenomeSeq,newMRexperiment) importFrom(methods,setClass) importFrom(methods,setGeneric) importFrom(methods,setMethod) importFrom(phyloseq,"otu_table<-") importFrom(phyloseq,"sample_data<-") importFrom(phyloseq,"tax_table<-") importFrom(phyloseq,"taxa_are_rows<-") importFrom(phyloseq,"taxa_names<-") importFrom(phyloseq,`otu_table<-`) importFrom(phyloseq,import_biom) importFrom(phyloseq,import_mothur) importFrom(phyloseq,import_qiime) importFrom(phyloseq,merge_phyloseq) importFrom(phyloseq,nsamples) importFrom(phyloseq,ntaxa) importFrom(phyloseq,otu_table) importFrom(phyloseq,phy_tree) importFrom(phyloseq,phyloseq) importFrom(phyloseq,prune_samples) importFrom(phyloseq,prune_taxa) importFrom(phyloseq,rank_names) importFrom(phyloseq,rarefy_even_depth) importFrom(phyloseq,read_tree) importFrom(phyloseq,refseq) importFrom(phyloseq,sample_data) importFrom(phyloseq,sample_names) importFrom(phyloseq,sample_sums) importFrom(phyloseq,t) importFrom(phyloseq,tax_glom) importFrom(phyloseq,tax_table) importFrom(phyloseq,taxa_are_rows) importFrom(phyloseq,taxa_names) importFrom(phyloseq,taxa_sums) importFrom(phyloseq,transform_sample_counts) importFrom(plotROC,calc_auc) importFrom(plotROC,geom_roc) importFrom(plotROC,style_roc) importFrom(purrr,map_dbl) importFrom(purrr,pmap_chr) 
importFrom(purrr,pmap_dbl) importFrom(rlang,.data) importFrom(stats,TukeyHSD) importFrom(stats,anova) importFrom(stats,aov) importFrom(stats,coef) importFrom(stats,dnorm) importFrom(stats,drop1) importFrom(stats,fisher.test) importFrom(stats,formula) importFrom(stats,glm) importFrom(stats,kruskal.test) importFrom(stats,lm) importFrom(stats,median) importFrom(stats,model.matrix) importFrom(stats,model.tables) importFrom(stats,na.omit) importFrom(stats,p.adjust) importFrom(stats,pairwise.table) importFrom(stats,pf) importFrom(stats,pnorm) importFrom(stats,psignrank) importFrom(stats,pt) importFrom(stats,ptukey) importFrom(stats,pwilcox) importFrom(stats,qf) importFrom(stats,qtukey) importFrom(stats,quantile) importFrom(stats,relevel) importFrom(stats,residuals) importFrom(stats,sd) importFrom(stats,var) importFrom(stats,wilcox.test) importFrom(tidytree,treedata) importFrom(utils,read.delim) importFrom(vegan,cca) importFrom(yaml,read_yaml) importMethodsFrom(BiocGenerics,"sizeFactors<-") importMethodsFrom(BiocGenerics,counts) importMethodsFrom(BiocGenerics,normalize) importMethodsFrom(S4Vectors,mcols) importMethodsFrom(phyloseq,"otu_table<-") importMethodsFrom(phyloseq,nsamples) importMethodsFrom(phyloseq,ntaxa) importMethodsFrom(phyloseq,otu_table) importMethodsFrom(phyloseq,sample_data) importMethodsFrom(phyloseq,sample_names) importMethodsFrom(phyloseq,t) importMethodsFrom(phyloseq,tax_table) importMethodsFrom(phyloseq,taxa_names) ================================================ FILE: NEWS.md ================================================ # microbiomeMarker 1.3.2 + fix error on subgroup in lefse, #62, #55 # microbiomeMarker 1.3.1 (2022-05-26) + Development version on Bioconductor. # microbiomeMarker 1.2.1 (2022-05-26) + Confounder analysis. + Comparison of different methods. # microbiomeMarker 1.2.0 (2022-04-27) + Released on Bioconductor 3.15. 
# microbiomeMarker 1.1.2 (2022-03-07)

+ Development version on Bioconductor
+ Use 3rd version of testthat to fix test error (use `expect_snapshot()` rather than `expect_known_output`).
+ Add two new arguments in `plot_heatmap()` `scale_by_row` and `annotation_col` to improve heatmap visualization, #52.
+ Set slot `marker_table` to `NULL` if no marker was identified.
+ Add new import function `import_picrust2()` to import prediction functional table from PICRUSt2, and all DA functions support PICRUSt2 output data.
+ Keep color consistent between legend and plot in cladogram, #42.
+ Add a new argument `clade_label_font_size` in `plot_cladogram()` to specify font size of clade label, #49.

# microbiomeMarker 1.1.1 (2021-03-07)

+ Add a parameter `only_marker` in `plot_cladogram` to specify whether to only show the markers or all features in the cladogram.
+ Fix a bug in `run_test_multiple_groups()`, error group names for enrich groups (2021-10-12, #48).
+ Fix a bug in `plot_abundance()`, error var name of effect size in `marker_table` (2021-10-17, #47).

# microbiomeMarker 1.0.0 (2021-10-27)

+ Released on Bioconductor.

# microbiomeMarker 0.99.1 (2021-10-11)

+ Accepted by Bioconductor.

# microbiomeMarker 0.99.0 (2021-09-01)

+ Submitted to Bioconductor

================================================
FILE: R/AllClasses.R
================================================
# marker_table class ------------------------------------------------------

#' The S4 class for storing microbiome marker information
#'
#' This class inherits from `data.frame`. Rows represent the microbiome
#' markers and variables represent features of the markers.
#'
#' @name marker_table-class
#' @aliases marker_table-class
#' @field names,row.names a character vector, inherited from the input
#' data.frame
#' @field .data a list, each element corresponding to a column of the
#' input data.frame
#' @field .S3Class character, the S3 class `marker_table` inherited from:
#' "`data.frame`"
#' @author Yang Cao
#' @exportClass marker_table
setClass("marker_table", contains = "data.frame")

# Validator of marker_table: collects every violated constraint into a
# character vector (reported by setValidity()), or returns TRUE when valid.
validity_marker_table <- function(object) {
    msg <- NULL

    # a marker table must name its markers in a `feature` column
    if (!"feature" %in% names(object)) {
        msg <- c(
            msg,
            "marker table must contain variable `feature`: the name of marker"
        )
    }
    # an empty table (zero rows or zero columns) is not a valid marker table
    if (any(dim(object) == 0)) {
        msg <- c(msg, "marker table must have non-zero dimensions")
    }

    # non-empty msg means at least one constraint failed
    if (length(msg)) {
        return(msg)
    } else {
        return(TRUE)
    }
}

setValidity("marker_table", validity_marker_table)

################################################################################
# A class may be defined as the union of other classes; that is, as a virtual
# class defined as a superclass of several other classes. This is a way of
# dealing with the expected scenarios in which one or more of the slots is not
# available, in which case NULL will be used instead.
################################################################################

#' @importClassesFrom phyloseq taxonomyTable
#' @keywords internal
setClassUnion("marker_tableOrNULL", c("marker_table", "NULL"))

#' @keywords internal
setClassUnion("taxonomyTableOrNULL", c("taxonomyTable", "NULL"))

#' @keywords internal
setClassUnion("characterOrNULL", c("character", "NULL"))

#' @keywords internal
setClassUnion("numericOrNULL", c("numeric", "NULL"))

# microbiomeMarker class --------------------------------------------------

#' The main class for microbiomeMarker data
#'
#' `microbiomeMarker-class` is inherited from the [`phyloseq::phyloseq-class`]
#' by adding a custom slot `marker_table` (plus `norm_method` and
#' `diff_method`) to save the differential analysis
#' results.
And it provides a seamless interface with **phyloseq**, which makes
#' **microbiomeMarker** simple and easy to use. For more details see the
#' document of [`phyloseq::phyloseq-class`].
#' @name microbiomeMarker-class
#' @aliases microbiomeMarker-class
#' @importClassesFrom phyloseq phyloseq
#' @slot marker_table a data.frame, a [`marker_table-class`] object.
#' @slot norm_method character, method used to normalize the input `phyloseq`
#' object.
#' @slot diff_method character, method used for marker identification.
#' @seealso [`phyloseq::phyloseq-class`], [`marker_table-class`],
#' [summarize_taxa()]
#' @exportClass microbiomeMarker
#' @return a [`microbiomeMarker-class`] object.
`microbiomeMarker-class` <- setClass("microbiomeMarker",
    slots = c(
        marker_table = "marker_tableOrNULL",
        norm_method = "characterOrNULL",
        diff_method = "characterOrNULL"
    ),
    contains = "phyloseq",
    # all three custom slots default to NULL (no markers identified yet)
    prototype = list(
        marker_table = NULL,
        norm_method = NULL,
        diff_method = NULL
    )
)

#' Build microbiomeMarker-class objects
#'
#' This is the constructor to build the [`microbiomeMarker-class`] object,
#' don't use the `new()` constructor.
#' @param marker_table a [`marker_table-class`] object of the differential
#' analysis result.
#' @param norm_method character, method used to normalize the input `phyloseq`
#' object.
#' @param diff_method character, method used for microbiome marker
#' identification.
#' @param ... arguments passed to [phyloseq::phyloseq()]
#' @seealso [phyloseq::phyloseq()]
#' @name microbiomeMarker
#' @export
#' @return a [`microbiomeMarker-class`] object.
#' @examples
#' microbiomeMarker(
#'     marker_table = marker_table(data.frame(
#'         feature = c("speciesA", "speciesB"),
#'         enrich_group = c("groupA", "groupB"),
#'         ef_logFC = c(-2, 2),
#'         pvalue = c(0.01, 0.01),
#'         padj = c(0.01, 0.01),
#'         row.names = c("marker1", "marker2")
#'     )),
#'     norm_method = "TSS",
#'     diff_method = "DESeq2",
#'     otu_table = otu_table(matrix(
#'         c(4, 1, 1, 4),
#'         nrow = 2, byrow = TRUE,
#'         dimnames = list(c("speciesA", "speciesB"), c("sample1", "sample2"))
#'     ),
#'     taxa_are_rows = TRUE
#'     ),
#'     tax_table = tax_table(matrix(
#'         c("speciesA", "speciesB"),
#'         nrow = 2,
#'         dimnames = list(c("speciesA", "speciesB"), "Species")
#'     )),
#'     sam_data = sample_data(data.frame(
#'         group = c("groupA", "groupB"),
#'         row.names = c("sample1", "sample2")
#'     ))
#' )
microbiomeMarker <- function(marker_table = NULL,
    norm_method = NULL,
    diff_method = NULL,
    ...) {
    ps_slots <- list(...)
    ps_component_cls <- vapply(ps_slots, class, character(1))
    # otu_table and tax_table are the two mandatory phyloseq components
    if (!"otu_table" %in% ps_component_cls) {
        stop("otu_table is required")
    }
    if (!"taxonomyTable" %in% ps_component_cls) {
        stop("tax_table is required")
    }

    # set the rownames of marker_table as "marker1", "marker2", ...
    if (!is.null(marker_table)) {
        rownames(marker_table) <- paste0("marker", seq_len(nrow(marker_table)))
    }

    new(
        "microbiomeMarker",
        marker_table = marker_table,
        norm_method = norm_method,
        diff_method = diff_method,
        ...
    )
}

# validity for microbiomeMarker, at least contains two slots: otu_table,
# tax_table
#' @importMethodsFrom phyloseq taxa_names
validity_microbiomeMarker <- function(object) {
    msg <- NULL
    otu <- object@otu_table
    tax <- object@tax_table
    marker <- object@marker_table
    norm_method <- object@norm_method
    diff_method <- object@diff_method

    # summarized taxa
    if (is.null(tax)) {
        msg <- c(msg, "tax_table is required")
    }
    if (is.null(otu)) {
        msg <- c(msg, "otu_table is required")
    }
    # marker in marker_table must be contained in tax_table; the first column
    # of tax_table holds the (possibly summarized) feature names
    if (!is.null(marker) && !is.null(tax) &&
        !all(marker$feature %in% tax@.Data[, 1])) {
        msg <- c(msg, "marker in marker_table must be contained in tax")
    }
    if (!is.null(otu) && !is.null(tax) && nrow(otu) != nrow(tax)) {
        msg <- c(
            msg,
            "nrow of `otu_table` must be equal to the length of `tax_table()`"
        )
    }
    # markers are a subset of all features, so their count cannot exceed it
    if (!is.null(tax) && !is.null(marker) && nrow(marker) > nrow(tax)) {
        msg <- c(
            msg,
            paste0(
                "The number of different feature must be smaller than the",
                " total number of feature"
            )
        )
    }

    if (length(msg)) {
        return(msg)
    } else {
        return(TRUE)
    }
}

setValidity("microbiomeMarker", validity_microbiomeMarker)

# postHocTest class ------------------------------------------------------

#' The postHocTest Class, represents the result of post-hoc test result among
#' multiple groups
#'
#' @slot result a [`IRanges::DataFrameList-class`], each `DataFrame` consists
#' of five variables:
#' * `comparisons`: character, specify which two groups to test (the group names
#' are separated by "_)
#' * `diff_mean`: numeric, difference in mean abundances
#' * `pvalue`: numeric, p values
#' * `ci_lower` and `ci_upper`: numeric, lower and upper confidence interval of
#' difference in mean abundances
#' @slot abundance abundance of each feature in each group
#' @slot conf_level confidence level
#' @slot method method used for post-hoc test
#' @slot method_str method illustration
#' @name postHocTest-class
#' @aliases postHocTest-class
#' @author Yang Cao
#' @exportClass
postHocTest
#' @importClassesFrom IRanges DataFrameList
#' @return a [`postHocTest-class`] object.
setClass("postHocTest",
    slots = c(
        result = "DataFrameList",
        abundance = "data.frame",
        conf_level = "numeric",
        method = "character",
        method_str = "character"
    ),
    prototype = list(
        result = NULL,
        conf_level = NULL,
        method = NULL,
        method_str = NULL
    )
)

# validity for postHocTest
validity_postHocTest <- function(object) {
    msg <- NULL

    conf_level <- object@conf_level
    # conf_level is a probability: the check accepts the closed interval
    # [0, 1], and the message now states the same interval (previously the
    # message claimed the open interval "(0,1)" and was ungrammatical)
    if (!is.numeric(conf_level) || conf_level < 0 || conf_level > 1) {
        msg <- c(
            msg,
            "conf_level must be a numeric value in the range of [0, 1]"
        )
    }

    method <- object@method
    if (!method %in%
        c("tukey", "games_howell", "scheffe", "welch_uncorrected")) {
        msg <- c(
            msg,
            paste(
                "method must be one of tukey, games_howell, scheffe or",
                "welch_uncorrected"
            )
        )
    }

    # return the collected error messages, or TRUE when the object is valid
    if (length(msg)) {
        return(msg)
    } else {
        return(TRUE)
    }
}

setValidity("postHocTest", validity_postHocTest)

#' Build postHocTest object
#'
#' This function is used for create `postHocTest` object, and is only used for
#' developers.
#'
#' @param result a [`IRanges::SimpleDFrameList-class`] object.
#' @param abundance data.frame.
#' @param conf_level numeric, confidence level.
#' @param method character, method for posthoc test.
#' @param method_str character, illustrates which method is used for posthoc
#' test.
#' @return a [`postHocTest-class`] object.
#' @export
#' @examples
#' require(IRanges)
#' pht <- postHocTest(
#'     result = DataFrameList(
#'         featureA = DataFrame(
#'             comparisons = c(
#'                 "group2-group1",
#'                 "group3-group1",
#'                 "group3-group2"
#'             ),
#'             diff_mean = runif(3),
#'             pvalue = rep(0.01, 3),
#'             ci_lower = rep(0.01, 3),
#'             ci_upper = rep(0.011, 3)
#'         ),
#'         featureB = DataFrame(
#'             comparisons = c(
#'                 "group2-group1",
#'                 "group3-group1",
#'                 "group3-group2"
#'             ),
#'             diff_mean = runif(3),
#'             pvalue = rep(0.01, 3),
#'             ci_lower = rep(0.01, 3),
#'             ci_upper = rep(0.011, 3)
#'         )
#'     ),
#'     abundance = data.frame(
#'         featureA = runif(3),
#'         featureB = runif(3),
#'         group = c("group1", "group2", "group3")
#'     )
#' )
#' pht
postHocTest <- function(result,
    abundance,
    conf_level = 0.95,
    method = "tukey",
    method_str = paste(
        "Posthoc multiple comparisons of means: ",
        method
    )) {
    # thin wrapper around new(); validity is enforced by setValidity above
    new(
        "postHocTest",
        result = result,
        abundance = abundance,
        conf_level = conf_level,
        method = method,
        method_str = method_str
    )
}


================================================
FILE: R/AllGenerics.R
================================================
# marker_table class -----------------------------------------------------------

#' Build or access the marker_table
#'
#' This is the recommended function for both building and accessing microbiome
#' marker table ([`marker_table-class`]).
#' @param object an object among the set of classes defined by the
#' microbiomeMarker package that contain `marker_table`
#' @export
#' @rdname marker_table-methods
#' @return a [`marker_table-class`] object.
#' @examples
#' data(enterotypes_arumugam)
#' mm <- run_limma_voom(
#'     enterotypes_arumugam,
#'     "Enterotype",
#'     contrast = c("Enterotype 3", "Enterotype 2"),
#'     pvalue_cutoff = 0.05,
#'     p_adjust = "fdr"
#' )
#' marker_table(mm)
setGeneric(
    "marker_table",
    function(object) standardGeneric("marker_table")
)

# build marker_table from data.frame
#' @aliases marker_table,data.frame-method
#' @rdname marker_table-methods
setMethod("marker_table", "data.frame", function(object) {
    mt <- new("marker_table", object)
    # rename rows to "marker1", "marker2", ... for consistent marker ids
    row.names(mt) <- paste0("marker", seq_len(nrow(object)))
    mt
})

# access the marker_table of a microbiomeMarker-class object
#' @rdname marker_table-methods
#' @aliases marker_table,microbiomeMarker-method
setMethod("marker_table", "microbiomeMarker", function(object) {
    object@marker_table
})

# Assign marker_table -----------------------------------------------------

#' Assign marker_table to `object`
#'
#' This function replace the `marker_table` slot of `object` with `value`.
#'
#' @param object a [`microbiomeMarker-class`] object to modify.
#' @param value new value to replace the `marker_table` slot of `object`.
#' Either a `marker_table-class`, a `data.frame` that can be coerced
#' into `marker_table-class`.
#' @export
#' @rdname assign-marker_table
#' @aliases assign-marker_table marker_table<-
#' @return a [`microbiomeMarker-class`] object.
#' @examples
#' data(enterotypes_arumugam)
#' mm <- run_limma_voom(
#'     enterotypes_arumugam,
#'     "Enterotype",
#'     contrast = c("Enterotype 3", "Enterotype 2"),
#'     pvalue_cutoff = 0.1,
#'     p_adjust = "fdr"
#' )
#' mm_marker <- marker_table(mm)
#' mm_marker
#' marker_table(mm) <- mm_marker[1:2, ]
#' marker_table(mm)
"marker_table<-" <- function(object, value) {
    # coerce value (e.g. a data.frame) into a marker_table
    if (!inherits(value, "marker_table") && !is.null(value)) {
        value <- marker_table(value)
    }

    microbiomeMarker(
        marker_table = value,
        norm_method = object@norm_method,
        diff_method = object@diff_method,
        otu_table = object@otu_table,
        tax_table = object@tax_table,
        # fix: keep the sample metadata of the original object (it was
        # previously dropped whenever the marker_table was replaced);
        # consistent with how run_aldex() builds a microbiomeMarker
        sam_data = object@sam_data,
        phy_tree = object@phy_tree,
        refseq = object@refseq
    )
}

# microbiomeMarker class ------------------------------------------------------

# modified from the show method of phyloseq
# https://github.com/joey711/phyloseq/blob/master/R/show-methods.R#L47-L82
#' @rdname microbiomeMarker-class
#' @param object a `microbiomeMarker-class` object
#' @export
setMethod("show", "microbiomeMarker", function(object) {
    cat("microbiomeMarker-class inherited from phyloseq-class", fill = TRUE)
    norm <- object@norm_method
    if (!is.null(norm)) {
        # per-sample normalization stores the target value in the string
        if (grepl("per-sample normalized", norm)) {
            norm <- gsub(".*to ", "", norm)
            cat(
                "normalization: per-sample to value [",
                norm, "]",
                fill = TRUE
            )
        } else {
            cat(
                "normalization method: [", norm, "]",
                fill = TRUE
            )
        }
    }

    if (!is.null(object@diff_method)) {
        cat(
            "microbiome marker identity method: [",
            object@diff_method, "]",
            fill = TRUE
        )
    }

    if (!is.null(object@marker_table)) {
        cat(
            "marker_table() Marker Table: [", nrow(object@marker_table),
            "microbiome markers with", ncol(object@marker_table),
            "variables ]",
            fill = TRUE
        )
    } else {
        cat(
            "marker_table() Marker Table: [",
            "no microbiome markers were identified ]",
            fill = TRUE
        )
    }

    # print otu_table (always there).
    cat(
        "otu_table() OTU Table: [", ntaxa(otu_table(object)), "taxa and ",
        nsamples(otu_table(object)), "samples ]",
        fill = TRUE
    )

    # print Sample Data if there
    if (!is.null(sample_data(object, FALSE))) {
        cat(
            "sample_data() Sample Data: [", dim(sample_data(object))[1],
            "samples by ", dim(sample_data(object))[2],
            "sample variables ]",
            fill = TRUE
        )
    }

    # print tax Tab if there
    if (!is.null(tax_table(object, FALSE))) {
        cat(
            "tax_table() Taxonomy Table: [", dim(tax_table(object))[1],
            "taxa by", dim(tax_table(object))[2],
            "taxonomic ranks ]",
            fill = TRUE
        )
    }

    # print tree if there
    if (!is.null(phy_tree(object, FALSE))) {
        cat(
            "phy_tree() Phylogenetic Tree: [", ntaxa(phy_tree(object)),
            "tips and", phy_tree(object)$Nnode,
            "internal nodes ]",
            fill = TRUE
        )
    }

    # print refseq summary if there
    if (!is.null(refseq(object, FALSE))) {
        cat(
            "refseq() ", class(refseq(object))[1], ": [",
            ntaxa(refseq(object)),
            " reference sequences ]",
            fill = TRUE
        )
    }
})

# get the number of markers -----------------------------------------------

#' Get the number of microbiome markers
#' @param object a [`microbiomeMarker-class`] or [`marker_table-class`] object
#' @docType methods
#' @rdname nmarker-methods
#' @return an integer, the number of microbiome markers
#' @export
#' @examples
#' mt <- marker_table(data.frame(
#'     feature = c("speciesA", "speciesB"),
#'     enrich_group = c("groupA", "groupB"),
#'     ef_logFC = c(-2, 2),
#'     pvalue = c(0.01, 0.01),
#'     padj = c(0.01, 0.01),
#'     row.names = c("marker1", "marker2")
#' ))
#' nmarker(mt)
setGeneric("nmarker", function(object) standardGeneric("nmarker"))

#' @rdname nmarker-methods
#' @aliases nmarker,microbiomeMarker-method
setMethod("nmarker", "microbiomeMarker", function(object) {
    marker <- marker_table(object)
    # NULL marker_table means no markers were identified
    ifelse(is.null(marker), 0L, nrow(marker))
})

#' @rdname nmarker-methods
#' @aliases nmarker,marker_table-method
setMethod("nmarker", "marker_table", function(object) {
    ifelse(is.null(object), 0L, nrow(object))
})

# postHocTest class
-------------------------------------------------------

#' @rdname postHocTest-class
#' @aliases show, postHocTest-method
#' @param object a `postHocTest-class` object
#' @export
setMethod("show", "postHocTest", function(object) {
    cat("postHocTest-class object", fill = TRUE)
    result <- object@result
    # human-readable description of the five columns of each DataFrame
    var_mean <- c(
        "pair groups to test which separated by '-'",
        "difference in mean proportions",
        "post hoc test p values",
        "lower confidence interval",
        "upper confidence interval"
    )
    cat(
        "Pairwise test result of", length(result), " features, ",
        "DataFrameList object, each DataFrame has five variables:\n ",
        paste0(
            names(result[[1]]),
            c(" : ", ": ", " : ", " : ", " : "),
            var_mean,
            collapse = " ",
            "\n"
        )
    )
    cat(
        "Posthoc multiple comparisons of means",
        " using ", object@method, " method",
        fill = TRUE
    )
})


================================================
FILE: R/DA-aldex.R
================================================
#' Perform differential analysis using ALDEx2
#'
#' @param ps a [`phyloseq::phyloseq-class`] object
#' @param group character, the variable to set the group
#' @param taxa_rank character to specify taxonomic rank to perform
#' differential analysis on. Should be one of
#' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by
#' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or
#' "none" means perform differential analysis on the original taxa
#' (`taxa_names(phyloseq)`, e.g., OTU or ASV).
#' @param transform character, the methods used to transform the microbial
#' abundance. See [`transform_abundances()`] for more details. The
#' options include:
#' * "identity", return the original data without any transformation
#' (default).
#' * "log10", the transformation is `log10(object)`, and if the data contains
#' zeros the transformation is `log10(1 + object)`.
#' * "log10p", the transformation is `log10(1 + object)`.
#' @param norm the methods used to normalize the microbial abundance data.
See
#' [`normalize()`] for more details.
#' Options include:
#' * "none": do not normalize.
#' * "rarefy": random subsampling counts to the smallest library size in the
#' data set.
#' * "TSS": total sum scaling, also referred to as "relative abundance", the
#' abundances were normalized by dividing the corresponding sample library
#' size.
#' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference.
#' The scaling factor is then derived using a weighted trimmed mean over the
#' differences of the log-transformed gene-count fold-change between the
#' sample and the reference.
#' * "RLE", relative log expression, RLE uses a pseudo-reference calculated
#' using the geometric mean of the gene-specific abundances over all
#' samples. The scaling factors are then calculated as the median of the
#' gene counts ratios between the samples and the reference.
#' * "CSS": cumulative sum scaling, calculates scaling factors as the
#' cumulative sum of gene abundances up to a data-derived threshold.
#' * "CLR": centered log-ratio normalization.
#' * "CPM": pre-sample normalization of the sum of the values to 1e+06.
#' @param norm_para arguments passed to specific normalization methods
#' @param method test method, options include: "t.test" and "wilcox.test"
#' for two groups comparison, "kruskal" and "glm_anova" for multiple groups
#' comparison.
#' @param p_adjust method for multiple test correction, default `none`,
#' for more details see [stats::p.adjust].
#' @param pvalue_cutoff cutoff of p value, default 0.05.
#' @param mc_samples integer, the number of Monte Carlo samples to use for
#' underlying distributions estimation, 128 is usually sufficient.
#' @param denom character string, specify which features are used as the
#' denominator for the geometric mean calculation. Options are:
#' * "all", with all features.
#' * "iqlr", accounts for data with systematic variation and centers the
#' features on the set features that have variance that is between the lower
#' and upper quartile of variance.
#' * "zero", a more extreme case where there are many non-zero features in
#' one condition but many zeros in another. In this case the geometric mean
#' of each group is calculated using the set of per-group non-zero features.
#' * "lvha", with house keeping features.
#' @param paired logical, whether to perform paired tests, only worked for
#' method "t.test" and "wilcox.test".
#' @export
#' @references Fernandes, A.D., Reid, J.N., Macklaim, J.M. et al. Unifying the
#' analysis of high-throughput sequencing datasets: characterizing RNA-seq,
#' 16S rRNA gene sequencing and selective growth experiments by compositional
#' data analysis. Microbiome 2, 15 (2014).
#' @seealso [`ALDEx2::aldex()`]
#' @return a [`microbiomeMarker-class`] object.
#' @examples
#' data(enterotypes_arumugam)
#' ps <- phyloseq::subset_samples(
#'     enterotypes_arumugam,
#'     Enterotype %in% c("Enterotype 3", "Enterotype 2")
#' )
#' run_aldex(ps, group = "Enterotype")
run_aldex <- function(ps,
    group,
    taxa_rank = "all",
    transform = c("identity", "log10", "log10p"),
    norm = "none",
    norm_para = list(),
    method = c(
        "t.test", "wilcox.test",
        "kruskal", "glm_anova"
    ),
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    ),
    pvalue_cutoff = 0.05,
    mc_samples = 128,
    denom = c("all", "iqlr", "zero", "lvha"),
    paired = FALSE) {
    stopifnot(inherits(ps, "phyloseq"))
    ps <- check_rank_names(ps) %>%
        check_taxa_rank(taxa_rank)

    denom <- match.arg(denom, c("all", "iqlr", "zero", "lvha"))
    p_adjust <- match.arg(
        p_adjust,
        c(
            "none", "fdr", "bonferroni", "holm",
            "hochberg", "hommel", "BH", "BY"
        )
    )

    # trans method as argument test in ALDEx2::aldex
    method <- match.arg(
        method,
        c("t.test", "wilcox.test", "kruskal", "glm_anova")
    )
    # "t" covers the two-group tests, "kw" the multiple-group tests
    if (method %in% c("t.test", "wilcox.test")) {
        test_method <- "t"
    } else {
        test_method <- "kw"
    }

    # check whether group is valid, write a function
    sample_meta <- sample_data(ps)
    meta_nms <- names(sample_meta)
    if (!group %in% meta_nms) {
        stop(
            group, " are not contained in the `sample_data` of `ps`",
            call. = FALSE
        )
    }

    transform <- match.arg(transform, c("identity", "log10", "log10p"))

    # preprocess phyloseq object
    ps <- preprocess_ps(ps)
    ps <- transform_abundances(ps, transform = transform)

    # normalize the data
    norm_para <- c(norm_para, method = norm, object = list(ps))
    ps_normed <- do.call(normalize, norm_para)
    ps_summarized <- pre_ps_taxa_rank(ps_normed, taxa_rank)

    groups <- sample_meta[[group]]
    abd <- abundances(ps_summarized, norm = TRUE)
    # fix: `ifelse()` cannot return a function (it replicates its branches
    # with rep(), which fails on closures with "attempt to replicate an
    # object of type 'closure'"); use a plain if/else instead
    test_fun <- if (test_method == "t") aldex_t else aldex_kw
    test_para <- list(
        reads = abd,
        conditions = groups,
        method = method,
        mc_samples = mc_samples,
        denom = denom,
        p_adjust = p_adjust
    )
    # paired tests are only defined for the two-group methods
    if (test_method == "t") {
        test_para <- c(test_para, paired = paired)
    }
    test_out <- tryCatch(
        do.call(test_fun, test_para),
        error = function(e) e
    )

    # check whether counts are integers
    if (inherits(test_out, "error") &&
        conditionMessage(test_out) == "not all reads are integers") {
        warning(
            "Not all reads are integers, the reads are ceiled to integers.\n",
            " Raw reads is recommended from the ALDEx2 paper.",
            call. = FALSE
        )
        test_para$reads <- ceiling(abd)
        test_out <- do.call(test_fun, test_para)
    }

    # markers: features whose adjusted p value passes the cutoff
    sig_feature <- dplyr::filter(test_out, .data$padj <= pvalue_cutoff)
    marker <- return_marker(sig_feature, test_out)
    feature <- test_out$feature
    tax <- matrix(feature) %>% tax_table()
    row.names(tax) <- row.names(abd)

    mm <- microbiomeMarker(
        marker_table = marker,
        norm_method = get_norm_method(norm),
        diff_method = paste0("ALDEx2_", method),
        sam_data = sample_data(ps_summarized),
        otu_table = otu_table(abd, taxa_are_rows = TRUE),
        tax_table = tax
    )

    mm
}

# aldex t test, wilcox test
# In the original version of ALDEx2, each p value is corrected using the
# Benjamini-Hochberg method.
Here, we add a new argument `p_adjust` to
# make aldex support for other correction methods.
aldex_t <- function(reads,
    conditions,
    mc_samples,
    method = c("t.test", "wilcox.test"),
    denom = c("all", "iqlr", "zero", "lvha"),
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    ),
    paired = FALSE) {
    method <- match.arg(method, c("t.test", "wilcox.test"))
    # fix: the matched value was previously assigned to a misspelled variable
    # `demon`, so an invalid `denom` was silently ignored here
    denom <- match.arg(denom, c("all", "iqlr", "zero", "lvha"))
    p_adjust <- match.arg(
        p_adjust,
        c(
            "none", "fdr", "bonferroni", "holm",
            "hochberg", "hommel", "BH", "BY"
        )
    )

    conditions <- as.factor(conditions)
    # accept either a raw count matrix or a pre-computed aldex.clr object
    if (!inherits(reads, "aldex.clr")) {
        reads_clr <- ALDEx2::aldex.clr(
            reads = reads,
            conds = as.character(conditions),
            mc.samples = mc_samples,
            denom = denom
        )
        feature <- row.names(reads)
    } else {
        reads_clr <- reads
        feature <- row.names(reads@reads)
    }

    mc_instance <- reads_clr@analysisData
    # one data.frame per Monte Carlo instance, covering all samples
    mc_instance_ldf <- convert_instance(mc_instance, mc_samples)

    # one-sided p values (alternative = "greater") per instance
    if (method == "t.test") {
        pvalue <- purrr::map_dfc(
            mc_instance_ldf,
            t_fast,
            group = conditions,
            paired = paired
        )
    } else {
        pvalue <- purrr::map_dfc(
            mc_instance_ldf,
            wilcox_fast,
            group = conditions,
            paired = paired
        )
    }
    padj_greater <- purrr::map_dfc(
        pvalue,
        \(x) p.adjust(2 * x, method = p_adjust)
    )
    padj_less <- purrr::map_dfc(
        pvalue,
        \(x) p.adjust(2 * (1 - x), method = p_adjust)
    )

    # making this into a two-sided test
    pvalue_greater <- 2 * pvalue
    pvalue_less <- 2 * (1 - pvalue)

    # making sure the max p-value is 1
    pvalue_greater <- apply(pvalue_greater, c(1, 2), \(x) min(x, 1))
    pvalue_less <- apply(pvalue_less, c(1, 2), \(x) min(x, 1))

    # get the expected value of p value and adjusted p value
    e_pvalue <- cbind(rowMeans(pvalue_greater), rowMeans(pvalue_less)) |>
        apply(1, min)
    e_padj <- cbind(rowMeans(padj_greater), rowMeans(padj_less)) |>
        apply(1, min)

    # effect size
    ef <- ALDEx2::aldex.effect(
        reads_clr,
        include.sample.summary = FALSE,
        verbose = FALSE
    )
    # enrich group: columns 2:3 are the per-group win-condition abundances,
    # named "rab.win.<group>"
    cds <- gsub("rab.win.", "", names(ef)[2:3])
    ef <- ef$effect
    enrich_group <- ifelse(ef > 0, cds[1], cds[2])

    res <- data.frame(
        feature = feature,
        enrich_group = enrich_group,
        ef_aldex = ef,
        pvalue = e_pvalue,
        padj = e_padj
    )

    res
}

# aldex kruskal-wallis test and glm anova statistics
#' @importFrom stats kruskal.test glm drop1
aldex_kw <- function(reads,
    conditions,
    method = c("kruskal", "glm_anova"),
    mc_samples = 128,
    denom = c("all", "iqlr", "zero", "lvha"),
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    )) {
    method <- match.arg(method, c("kruskal", "glm_anova"))
    # fix: same misspelled `demon` assignment as in aldex_t; validate denom
    denom <- match.arg(denom, c("all", "iqlr", "zero", "lvha"))
    p_adjust <- match.arg(
        p_adjust,
        c(
            "none", "fdr", "bonferroni", "holm",
            "hochberg", "hommel", "BH", "BY"
        )
    )

    conditions <- as.factor(conditions)
    if (!inherits(reads, "aldex.clr")) {
        reads_clr <- ALDEx2::aldex.clr(
            reads = reads,
            # pass conds as character for consistency with aldex_t
            conds = as.character(conditions),
            mc.samples = mc_samples,
            denom = denom
        )
        feature <- row.names(reads)
    } else {
        reads_clr <- reads
        feature <- row.names(reads@reads)
    }

    mc_instance <- reads_clr@analysisData
    # convert mc_instance to a list of data frame, each element represents a mc
    # sample for all samples.
    mc_instance_ldf <- convert_instance(mc_instance, mc_samples)

    if (method == "kruskal") {
        pvalue <- purrr::map_dfc(
            mc_instance_ldf,
            function(x) {
                apply(
                    x, 1,
                    function(y) {
                        stats::kruskal.test(y, g = factor(conditions))[[3]]
                    }
                )
            }
        )
    } else {
        pvalue <- purrr::map_dfc(
            mc_instance_ldf,
            function(x) {
                apply(
                    x, 1,
                    function(y) {
                        # fix: spell out "Chisq"; "Chis" only worked through
                        # accidental partial argument matching in drop1()
                        stats::glm(as.numeric(y) ~ factor(conditions)) %>%
                            stats::drop1(test = "Chisq") %>%
                            purrr::pluck(5, 2)
                    }
                )
            }
        )
    }
    padj <- purrr::map_dfc(pvalue, p.adjust, method = p_adjust)
    # expected (mean over Monte Carlo instances) p value and adjusted p value
    e_pvalue <- rowMeans(pvalue)
    e_padj <- rowMeans(padj)

    # f statistic
    ef_F_statistic <- purrr::map_dfc(
        mc_instance_ldf,
        function(x) {
            apply(
                x, 1,
                function(y) {
                    summary(aov(y ~ factor(conditions)))[[1]]$`F value`[1]
                }
            )
        }
    ) %>%
        rowMeans()

    enrich_group <- get_aldex_kwglm_enrich_group(mc_instance_ldf, conditions)

    res <- data.frame(
        feature = feature,
        enrich_group = enrich_group,
        ef_F_statistic = ef_F_statistic,
        pvalue = e_pvalue,
        padj = e_padj
    )

    res
}

# enriched group for kw and glm anova: the group with the largest mean clr
# abundance, averaged over all Monte Carlo instances
get_aldex_kwglm_enrich_group <- function(mc_instance_ldf, conditions) {
    instance_split <- purrr::map(
        mc_instance_ldf,
        ~ split(data.frame(t(.x)), conditions)
    )
    instance_mean <- purrr::map(
        instance_split,
        ~ purrr::map_dfc(.x, colMeans)
    )
    instance_mean <- Reduce("+", instance_mean)
    max_idx <- apply(instance_mean, 1, which.max)
    enrich_group <- names(instance_mean)[max_idx]

    enrich_group
}

# Each element of mc instances of a clr object represents all instances of a
# sample, this function convert mc instances to list data frames where each
# element represents a mc instance for all samples
convert_instance <- function(mc_instance, mc_samples) {
    mc_instance_ldf <- purrr::map(
        seq.int(mc_samples),
        function(x) {
            res <- purrr::map_dfc(mc_instance, function(y) y[, x])
            names(res) <- names(mc_instance)
            res
        }
    )

    mc_instance_ldf
}

# fast test function modified from ALDEx2::t.fast
#' @importFrom stats pt
t_fast <- function(x, group, paired = FALSE) {
    grp1 <- group == unique(group)[1]
    grp2 <- group == unique(group)[2]
    n1 <- sum(grp1)
    n2 <- sum(grp2)

    if (paired) {
        # Order pairs for the mt.teststat function
        if (n1 != n2) stop("Cannot pair uneven groups.")
        idx1 <- which(grp1)
        idx2 <- which(grp2)
        paired_order <- unlist(
            lapply(
                seq_along(idx1),
                function(i) c(idx1[i], idx2[i])
            )
        )
        t <- multtest::mt.teststat(
            x[, paired_order],
            as.numeric(grp1)[paired_order],
            test = "pairt",
            nonpara = "n"
        )
        df <- length(idx1) - 1
    } else {
        t <- multtest::mt.teststat(x, as.numeric(grp1),
            test = "t", nonpara = "n"
        )
        # Welch-Satterthwaite degrees of freedom
        s1 <- apply(x[, grp1], 1, sd)
        s2 <- apply(x[, grp2], 1, sd)
        df <- ((s1^2 / n1 + s2^2 / n2)^2) /
            ((s1^2 / n1)^2 / (n1 - 1) + (s2^2 / n2)^2 / (n2 - 1))
    }

    # one-sided p value (alternative = "greater")
    return(pt(t, df = df, lower.tail = FALSE))
}

# wilcox.fast function replaces wilcox.test
#   * runs much faster
#   * uses exact distribution for ties!
#       * this differs from ?wilcox.test
#   * optional paired test
#       * equivalent to wilcox.test(..., correct = FALSE)
#       * uses multtest
#' @importFrom stats psignrank pnorm pwilcox wilcox.test
wilcox_fast <- function(x, group, paired = FALSE) {
    stopifnot(ncol(x) == length(group))
    grp1 <- group == unique(group)[1]
    grp2 <- group == unique(group)[2]
    n1 <- sum(grp1)
    n2 <- sum(grp2)

    # Check for ties in i-th Monte-Carlo instance
    xt <- t(x)
    if (paired) {
        any_ties <- any(
            apply(
                xt[grp1, ] - xt[grp2, ], 2,
                function(y) length(unique(y))
            ) != ncol(x) / 2
        )
    } else {
        any_ties <- any(
            apply(xt, 2, function(y) length(unique(y))) != ncol(x)
        )
    }

    # Ties trigger slower, safer wilcox.test function
    if (any_ties) {
        res <- apply(
            xt, 2,
            function(i) {
                wilcox.test(
                    i[grp1], i[grp2],
                    paired = paired,
                    alternative = "greater",
                    correct = FALSE
                )$p.value
            }
        )
        return(res)
    }

    if (paired) {
        if (n1 != n2) stop("Cannot pair uneven groups.")
        x_diff <- xt[grp1, ] - xt[grp2, ]
        # signed-rank statistic: sum of ranks of positive differences
        v <- apply(x_diff, 2, function(y) sum(rank(abs(y))[y > 0]))
        topscore <- (n1 * (n1 + 1)) / 2
        if (sum(grp1) < 50) {
            # as per wilcox test, use exact -- ASSUMES NO TIES!!
            res <- psignrank(v - 1, n1, lower.tail = FALSE)
        } else {
            # Use normal approximation
            v_std <- (v - topscore / 2) /
                sqrt(n1 * (n1 + 1) * (2 * n1 + 1) / 24)
            res <- pnorm(v_std, lower.tail = FALSE)
        }
    } else {
        w_std <- multtest::mt.teststat(x, as.numeric(grp1), test = "wilcoxon")
        if (sum(grp1) < 50 && sum(grp2) < 50) {
            # as per wilcox test, use exact -- ASSUMES NO TIES!!
            w_var <- sqrt((n1 * n2) * (n1 + n2 + 1) / 12)
            w <- w_std * w_var + (n1 * n2) / 2
            res <- pwilcox(w - 1, n1, n2, lower.tail = FALSE)
        } else {
            # Use normal approximation
            res <- pnorm(w_std, lower.tail = FALSE)
        }
    }

    res
}


================================================
FILE: R/DA-all.R
================================================
#' Find markers (differentially expressed metagenomic features)
#'
#' `run_marker` is a wrapper of all differential analysis functions.
#'
#' @param ps a [`phyloseq::phyloseq-class`] object
#' @param group character, the variable to set the group
#' @param da_method character to specify the differential analysis method. The
#' options include:
#' * "lefse", linear discriminant analysis (LDA) effect size (LEfSe) method,
#' for more details see [`run_lefse()`].
#' * "simple_t", "simple_welch", "simple_white", "simple_kruskal",
#' and "simple_anova", simple statistic methods; "simple_t", "simple_welch"
#' and "simple_white" for two groups comparison; "simple_kruskal", and
#' "simple_anova" for multiple groups comparison. For more details see
#' [`run_simple_stat()`].
#' * "edger", see [`run_edger()`].
#' * "deseq2", see [`run_deseq2()`].
#' * "metagenomeseq", differential expression analysis based on the
#' Zero-inflated Log-Normal mixture model or Zero-inflated Gaussian mixture
#' model using metagenomeSeq, see [`run_metagenomeseq()`].
#' * "ancom", see [`run_ancom()`].
#' * "ancombc", differential analysis of compositions of microbiomes with
#' bias correction, see [`run_ancombc()`].
#' * "aldex", see [`run_aldex()`].
#' * "limma_voom", see [`run_limma_voom()`].
#' * "sl_lr", "sl_rf", and "sl_svm", three supervised learning (SL) methods:
#' logistic regression (lr), random forest (rf), or support vector machine
#' (svm). For more details see [`run_sl()`].
#' @param taxa_rank character to specify taxonomic rank to perform
#' differential analysis on. Should be one of
#' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by
#' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or
#' "none" means perform differential analysis on the original taxa
#' (`taxa_names(phyloseq)`, e.g., OTU or ASV).
#' @param transform character, the methods used to transform the microbial
#' abundance. See [`transform_abundances()`] for more details. The
#' options include:
#' * "identity", return the original data without any transformation
#' (default).
#' * "log10", the transformation is `log10(object)`, and if the data contains
#' zeros the transformation is `log10(1 + object)`.
#' * "log10p", the transformation is `log10(1 + object)`.
#' @param norm the methods used to normalize the microbial abundance data. See
#' [`normalize()`] for more details.
#' Options include:
#' * "none": do not normalize.
#' * "rarefy": random subsampling counts to the smallest library size in the
#' data set.
#' * "TSS": total sum scaling, also referred to as "relative abundance", the
#' abundances were normalized by dividing the corresponding sample library
#' size.
#' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference.
#' The scaling factor is then derived using a weighted trimmed mean over
#' the differences of the log-transformed gene-count fold-change between
#' the sample and the reference.
#' * "RLE", relative log expression, RLE uses a pseudo-reference calculated
#' using the geometric mean of the gene-specific abundances over all
#' samples. The scaling factors are then calculated as the median of the
#' gene counts ratios between the samples and the reference.
#' * "CSS": cumulative sum scaling, calculates scaling factors as the
#' cumulative sum of gene abundances up to a data-derived threshold.
#' * "CLR": centered log-ratio normalization.
#' * "CPM": pre-sample normalization of the sum of the values to 1e+06.
#' @param norm_para arguments passed to specific normalization methods
#' @param p_adjust method for multiple test correction, default `none`,
#' for more details see [stats::p.adjust].
#' @param pvalue_cutoff numeric, p value cutoff, default 0.05.
#' @param ... extra arguments passed to the corresponding differential analysis
#' functions, e.g. [`run_lefse()`].
#' @return a [`microbiomeMarker-class`] object.
#' @details This function is only a wrapper of all differential analysis
#' functions, We recommend to use the corresponding function, since it has a
#' better default arguments setting.
#' @export
#' @seealso [`run_lefse()`],[`run_simple_stat()`],[`run_test_two_groups()`],
#' [`run_test_multiple_groups()`],[`run_edger()`],[`run_deseq2()`],
#' [`run_metagenomeseq`],[`run_ancom()`],[`run_ancombc()`],[`run_aldex()`],
#' [`run_limma_voom()`],[`run_sl()`]
run_marker <- function(ps,
    group,
    da_method = c(
        "lefse", "simple_t", "simple_welch",
        "simple_white", "simple_kruskal",
        "simple_anova", "edger", "deseq2",
        "metagenomeseq", "ancom", "ancombc",
        "aldex", "limma_voom",
        "sl_lr", "sl_rf", "sl_svm"
    ),
    taxa_rank = "all",
    transform = c("identity", "log10", "log10p"),
    norm = "none",
    norm_para = list(),
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    ),
    pvalue_cutoff = 0.05,
    ...) {
    stopifnot(inherits(ps, "phyloseq"))

    transform <- match.arg(transform, c("identity", "log10", "log10p"))
    da_method <- match.arg(
        da_method,
        c(
            "lefse", "simple_t", "simple_welch", "simple_white",
            "simple_kruskal", "simple_anova", "edger", "deseq2",
            "metagenomeseq", "ancom", "ancombc", "aldex", "limma_voom",
            "sl_lr", "sl_rf", "sl_svm"
        )
    )
    p_adjust <- match.arg(
        p_adjust,
        c(
            "none", "fdr", "bonferroni", "holm",
            "hochberg", "hommel", "BH", "BY"
        )
    )

    # group must be a variable of the sample meta data with >= 2 levels
    sample_meta <- sample_data(ps)
    if (!group %in% names(sample_meta)) {
        stop("`group` must in the field of sample meta data", call. = FALSE)
    }
    groups <- sample_meta[[group]]
    n_group <- length(unique(groups))
    if (n_group == 1) {
        stop("at least two groups required", call. = FALSE)
    }

    para <- c(
        list(...),
        ps = ps,
        group = group,
        taxa_rank = taxa_rank,
        transform = transform,
        norm = norm,
        norm_para = norm_para,
        p_adjust = p_adjust,
        pvalue_cutoff = pvalue_cutoff
    )

    test_fun <- switch(da_method,
        lefse = run_lefse,
        edger = run_edger,
        metagenomeseq = run_metagenomeseq,
        deseq2 = run_deseq2,
        ancom = run_ancom,
        ancombc = run_ancombc,
        aldex = run_aldex,
        limma_voom = run_limma_voom
    )

    # run_lefse uses `class` rather than `group`, and has no p_adjust/
    # pvalue_cutoff arguments
    if (da_method == "lefse") {
        para <- c(
            list(...),
            ps = ps,
            class = group,
            taxa_rank = taxa_rank,
            transform = transform,
            norm = norm,
            norm_para = norm_para
        )
    }

    if (da_method %in% c(
        "simple_t", "simple_welch", "simple_white",
        "simple_kruskal", "simple_anova"
    )) {
        # fix: the welch branch was misspelled `simple_wilch`, so
        # da_method = "simple_welch" silently fell through to NULL and the
        # default method of run_simple_stat was used instead
        test_method <- switch(da_method,
            simple_t = "t.test",
            simple_welch = "welch.test",
            simple_white = "white.test",
            simple_anova = "anova",
            simple_kruskal = "kruskal"
        )
        para <- c(para, method = test_method)
        test_fun <- run_simple_stat
    }

    if (da_method %in% c("sl_lr", "sl_rf", "sl_svm")) {
        # sl methods: map "sl_lr"/"sl_rf"/"sl_svm" to "LR"/"RF"/"SVM"
        sl_method <- gsub("sl_", "", da_method) %>%
            toupper()
        para <- c(
            list(...),
            ps = ps,
            group = group,
            taxa_rank = taxa_rank,
            transform = transform,
            norm = norm,
            norm_para = norm_para,
            method = sl_method
        )
        test_fun <- run_sl
    }

    mm <- do.call(test_fun, para)

    mm
}
================================================ FILE: R/DA-ancom.R ================================================ #' Perform differential analysis using ANCOM #' #' Perform significant test by comparing the pairwise log ratios between all #' features. #' #' @param ps a \code{\link[phyloseq]{phyloseq-class}} object. #' @param group character, the variable to set the group. #' @param confounders character vector, the confounding variables to be adjusted. #' default `character(0)`, indicating no confounding variable. #' @param taxa_rank character to specify taxonomic rank to perform #' differential analysis on. Should be one of #' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by #' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or #' "none" means perform differential analysis on the original taxa #' (`taxa_names(phyloseq)`, e.g., OTU or ASV). #' @param transform character, the methods used to transform the microbial #' abundance. See [`transform_abundances()`] for more details. The #' options include: #' * "identity", return the original data without any transformation. #' * "log10", the transformation is `log10(object)`, and if the data contains #' zeros the transformation is `log10(1 + object)`. #' * "log10p", the transformation is `log10(1 + object)`. #' @param norm the methods used to normalize the microbial abundance data. See #' [`normalize()`] for more details. #' Options include: #' * "none": do not normalize. #' * "rarefy": random subsampling counts to the smallest library size in the #' data set. #' * "TSS": total sum scaling, also referred to as "relative abundance", the #' abundances were normalized by dividing the corresponding sample library #' size. #' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference. #' The scaling factor is then derived using a weighted trimmed mean over #' the differences of the log-transformed gene-count fold-change between #' the sample and the reference. 
#' * "RLE", relative log expression, RLE uses a pseudo-reference calculated #' using the geometric mean of the gene-specific abundances over all #' samples. The scaling factors are then calculated as the median of the #' gene counts ratios between the samples and the reference. #' * "CSS": cumulative sum scaling, calculates scaling factors as the #' cumulative sum of gene abundances up to a data-derived threshold. #' * "CLR": centered log-ratio normalization. #' * "CPM": pre-sample normalization of the sum of the values to 1e+06. #' @param norm_para named `list`. other arguments passed to specific #' normalization methods. Most users will not need to pass any additional #' arguments here. #' @param p_adjust method for multiple test correction, default `none`, #' for more details see [stats::p.adjust]. #' @param pvalue_cutoff significance level for each of the statistical tests, #' default 0.05. #' @param W_cutoff lower bound for the proportion for the W-statistic, default #' 0.7. #' #' @details #' In an experiment with only two treatments, this tests the following #' hypothesis for feature \eqn{i}: #' #' \deqn{H_{0i}: E(log(\mu_i^1)) = E(log(\mu_i^2))} #' #' where \eqn{\mu_i^1} and \eqn{\mu_i^2} are the mean abundances for feature #' \eqn{i} in the two groups. #' #' The developers of this method recommend the following significance tests #' if there are 2 groups, use non-parametric Wilcoxon rank sum test #' [`stats::wilcox.test()`]. If there are more than 2 groups, use nonparametric #' [`stats::kruskal.test()`] or one-way ANOVA [`stats::aov()`]. #' #' @return a [microbiomeMarker-class] object, in which the `slot` of #' `marker_table` contains four variables: #' * `feature`, significantly different features. #' * `enrich_group`, the class of the differential features enriched. #' * `effect_size`, differential means for two groups, or F statistic for more #' than two groups. 
#' * `W`, the W-statistic, number of features that a single feature is tested
#' to be significantly different against.
#'
#' @references Mandal et al. "Analysis of composition of microbiomes: a novel
#' method for studying microbial composition", Microbial Ecology in Health
#' & Disease, (2015), 26.
#' @author Huang Lin, Yang Cao
#' @export
#' @examples
#' \donttest{
#' data(enterotypes_arumugam)
#' ps <- phyloseq::subset_samples(
#'     enterotypes_arumugam,
#'     Enterotype %in% c("Enterotype 3", "Enterotype 2")
#' )
#' run_ancom(ps, group = "Enterotype")
#' }
run_ancom <- function(ps,
    group,
    confounders = character(0),
    taxa_rank = "all",
    transform = c("identity", "log10", "log10p"),
    norm = "TSS",
    norm_para = list(),
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    ),
    pvalue_cutoff = 0.05,
    W_cutoff = 0.75) {
    stopifnot(inherits(ps, "phyloseq"))
    transform <- match.arg(transform, c("identity", "log10", "log10p"))
    p_adjust <- match.arg(
        p_adjust,
        c(
            "none", "fdr", "bonferroni", "holm",
            "hochberg", "hommel", "BH", "BY"
        )
    )
    ps <- check_rank_names(ps) %>%
        check_taxa_rank(taxa_rank)
    if (length(confounders)) {
        confounders <- check_confounder(ps, group, confounders)
    }

    # check whether group is valid, write a function
    meta <- sample_data(ps)
    groups <- meta[[group]]
    groups <- make.names(groups)
    if (!is.factor(groups)) {
        groups <- factor(groups)
    }
    sample_data(ps)[[group]] <- groups
    lvl <- levels(groups)
    n_lvl <- length(lvl)

    # without confounders: wilcoxon (2 groups) or kruskal-wallis (> 2 groups)
    # with confounders: ANOVA adjusting for the confounding variables
    if (!length(confounders)) {
        tfun <- ifelse(n_lvl > 2, stats::kruskal.test, stats::wilcox.test)
        fml <- paste("x ~ ", group)
    } else {
        tfun <- stats::aov
        fml <- paste("x ~ ", group, "+", paste(confounders, collapse = " + "))
    }

    # preprocess phyloseq object
    ps <- preprocess_ps(ps)
    ps <- transform_abundances(ps, transform = transform)

    # normalize the data
    norm_para <- c(norm_para, method = norm, object = list(ps))
    ps_normed <- do.call(normalize, norm_para)
    ps_summarized <- pre_ps_taxa_rank(ps_normed, taxa_rank)
    feature_table <- abundances(ps_summarized, norm = TRUE)

    # effect size: CLR mean_difference or aov f statistic
    feature_table_clr <- norm_clr(
        otu_table(feature_table, taxa_are_rows = TRUE)
    )
    feature_table_clr <- data.frame(t(feature_table_clr))
    ef <- vapply(
        feature_table_clr,
        calc_ef_md_f,
        FUN.VALUE = 0.0,
        group = groups
    )

    # enrich_group
    group_enriched <- vapply(
        feature_table_clr,
        get_ancom_enrich_group,
        FUN.VALUE = character(1),
        group = groups
    )

    # ANCOM requires log transformation
    feature_table <- log(as.matrix(feature_table) + 1)
    n_taxa <- nrow(feature_table)
    taxa_id <- row.names(feature_table)

    # Calculate the p-value for each pairwise comparison of taxa.
    # the first column of test_var_dat is the main var in the formula
    test_var_dat <- data.frame(groups)
    names(test_var_dat) <- group
    if (length(confounders)) {
        # bug fix: the previous `test_var_dat[[confounders]] <-
        # meta[[confounders]]` errors when more than one confounder is given
        # (`[[<-` requires a length-one index); copy the variables one by one
        for (cfd in confounders) {
            test_var_dat[[cfd]] <- meta[[cfd]]
        }
    }
    p <- calc_ancom_pmat(
        feature_table,
        test_var_dat,
        tfun,
        fml
    )

    # Multiple comparisons correction.
    p_adjusted <- vapply(
        data.frame(p),
        p.adjust,
        FUN.VALUE = numeric(n_taxa),
        method = p_adjust
    )

    # Calculate the W statistic of ANCOM.
    # For each taxon, count the number of q-values < pvalue_cutoff.
    W <- apply(p_adjusted, 2, function(x) sum(x < pvalue_cutoff))

    # Organize outputs
    out_comp <- data.frame(
        feature = taxa_id,
        enrich_group = group_enriched,
        ef = ef,
        W = W,
        row.names = NULL,
        check.names = FALSE
    )
    # Declare a taxon to be differentially abundant based on the quantile of W
    # statistic. We perform (n_taxa - 1) hypothesis testings on each taxon, so
    # the maximum number of rejections is (n_taxa - 1).
    sig_out <- out_comp[out_comp$W > W_cutoff * (n_taxa - 1), ]
    if (n_lvl == 2) {
        names(sig_out)[3] <- "ef_CLR_diff_mean"
    } else {
        names(sig_out)[3] <- "ef_CLR_F_statistic"
    }
    marker <- return_marker(sig_out, out_comp)
    tax <- matrix(taxa_id) %>% tax_table()
    row.names(tax) <- row.names(feature_table)

    mm <- microbiomeMarker(
        marker_table = marker,
        norm_method = get_norm_method(norm),
        diff_method = "ANCOM",
        otu_table = otu_table(feature_table, taxa_are_rows = TRUE),
        sam_data = sample_data(ps_normed),
        tax_table = tax
    )

    mm
}

#' Calculates pairwise pvalues between all features
#' @param feature_table matrix-like, logged feature table.
#' @param test_var_dat data.frame, variables data (sample meta data), the
#'   first column is the main (group) variable.
#' @param test the test function used to determine the p value of a log
#'   ratio, one of [stats::aov], [stats::wilcox.test], [stats::kruskal.test].
#' @param fml character, the formula passed to the test, e.g. "x ~ group".
#' @references
#' github/biocore/scikit-bio/blob/master/skbio/stats/composition.py#L811
#' @noRd
calc_ancom_pmat <- function(feature_table, test_var_dat, test, fml) {
    taxas <- row.names(feature_table)
    feature_table <- data.frame(t(feature_table))
    taxa_n <- ncol(feature_table)

    p <- matrix(NA, nrow = taxa_n, ncol = taxa_n)
    row.names(p) <- taxas
    colnames(p) <- taxas
    for (i in seq_len(taxa_n - 1)) {
        # log ratios of taxon i against all the following taxa (the table
        # is already log-transformed, so a difference is a log ratio)
        new_table <- -(feature_table[(i + 1):taxa_n] - feature_table[[i]])
        p[-(seq_len(i)), i] <- vapply(
            new_table,
            calc_ancom_p,
            FUN.VALUE = numeric(1),
            test_var_dat = test_var_dat,
            test = test,
            fml = fml
        )
    }

    # Complete the p-value matrix.
    # What we got from above iterations is a lower triangle matrix of p-values.
    p[upper.tri(p)] <- t(p)[upper.tri(p)]
    diag(p) <- 1 # let p-values on diagonal equal to 1
    p[is.na(p)] <- 1 # let p-values of NA equal to 1

    p
}

#' calculate the p value of a pair-wise log ratio
#' @param log_ratio a numeric vector, a pair-wise log ratio.
#' @param classes character vector, the same length with `log_ratio`.
#' @param test character, the test to determine the p value of log ratio,
#' one of "aov", "wilcox.test", "kruskal.test".
#' @param ... extra arguments passed to the test.
#' @noRd
calc_ancom_p <- function(log_ratio, test_var_dat, test, fml) {
    # first var is the target var (main var)
    group <- names(test_var_dat)[1]
    test_dat <- cbind(x = log_ratio, test_var_dat)
    fml <- stats::formula(fml)
    if (identical(test, stats::aov)) {
        # with confounders: ANOVA, keep only the p value of the main variable
        fit = test(fml, data = test_dat, na.action = na.omit)
        p = summary(fit)[[1]][group, "Pr(>F)"]
    } else {
        # wilcox.test/kruskal.test may warn about ties; only the p value is
        # needed here
        suppressWarnings(p <- test(fml, data = test_dat)$p.value)
    }

    p
}

#' Identify structural zeros
#' from "FrederickHuangLin/ANCOMBC/R/get_struc_zero.R"
#'
#' A taxon is declared a structural zero in a group when it is never
#' observed in that group (and, optionally with `neg_lb = TRUE`, when the
#' asymptotic lower bound of its presence proportion is <= 0).
#'
#' @param ps a phyloseq object.
#' @param group character, a single variable name of the sample meta data.
#' @param neg_lb logical, whether to also use the negative lower bound of the
#'   presence proportion to call structural zeros.
#' @return a data.frame of logicals, one column per group level.
#' @author Huang Lin, Yang Cao
#' @noRd
get_struc_zero <- function(ps, group, neg_lb) {
    stopifnot(inherits(ps, "phyloseq"))
    stopifnot(is.logical(neg_lb))
    stopifnot(length(group) == 1 & is.character(group))
    meta_tab <- sample_data(ps)
    check_var_in_meta(group, meta_tab)
    groups <- factor(meta_tab[[group]])

    feature_tab <- as(otu_table(ps), "matrix")
    # presence/absence indicator table: 1 when the taxon is observed
    present_tab <- feature_tab
    present_tab[is.na(present_tab)] <- 0
    present_tab[present_tab != 0] <- 1

    n_taxa <- nrow(feature_tab)
    n_group <- nlevels(groups)

    # p_hat: per-group proportion of samples in which the taxon is present
    # samp_size: per-group number of non-missing samples
    p_hat <- matrix(NA, nrow = n_taxa, ncol = n_group)
    rownames(p_hat) <- rownames(feature_tab)
    colnames(p_hat) <- levels(groups)
    samp_size <- p_hat
    for (i in seq_len(n_taxa)) {
        p_hat[i, ] <- tapply(
            present_tab[i, ],
            groups,
            function(x) mean(x, na.rm = TRUE)
        )
        samp_size[i, ] <- tapply(
            feature_tab[i, ],
            groups,
            function(x) length(x[!is.na(x)])
        )
    }

    # 95% asymptotic lower bound of the presence proportion
    p_hat_lo <- p_hat - 1.96 * sqrt(p_hat * (1 - p_hat) / samp_size)
    zero_ind <- p_hat == 0
    if (neg_lb) {
        zero_ind[p_hat_lo <= 0] <- TRUE
    }

    colnames(zero_ind) <- paste0(
        "structural_zero (", group, " = ", colnames(zero_ind), ")"
    )

    data.frame(zero_ind)
}

#' enrich group for ancom, rewrite this function in the later
#' split get_feature_enrich_group into two funcitons: enrich_group and
#' log max mean
#'
#' Returns the name of the group with the largest mean abundance of the
#' feature.
#' @noRd
get_ancom_enrich_group <- function(feature_abd, group) {
    abd_split <- split(feature_abd, group)
    abd_mean_group <- vapply(abd_split, mean, FUN.VALUE = 0.0)
    enrich_group <- names(abd_split)[which.max(abd_mean_group)]

    enrich_group
}

#' preprocess feature data using methods of ANCOM-II
#'
#' Five steps: (1) flag per-group outlier observations as NA using a
#' two-component normal mixture fitted by EM; (2) drop taxa with too many
#' zeros; (3) drop samples with small library size; (4) zero-out entries
#' for taxa that are structural zeros in a group; (5) return the cleaned
#' table, the matched meta data and the structural-zero indicators.
#' @noRd
#' @importFrom stats dnorm lm na.omit quantile residuals sd
preprocess_ancom <- function(feature_table,
    meta_data,
    sample_var,
    lib_cut,
    neg_lb,
    group = NULL,
    out_cut = 0.05,
    zero_cut = 0.90) {
    feature_table <- data.frame(feature_table, check.names = FALSE)
    meta_data <- data.frame(meta_data, check.names = FALSE)
    # Drop unused levels
    meta_data[] <- lapply(
        meta_data,
        function(x) if (is.factor(x)) factor(x) else x
    )
    # Match sample IDs between metadata and feature table
    sample_ID <- intersect(meta_data[, sample_var], colnames(feature_table))
    feature_table <- feature_table[, sample_ID]
    meta_data <- meta_data[match(sample_ID, meta_data[, sample_var]), ]

    # 1. Identify outliers within each taxon
    if (!is.null(group)) {
        groups <- meta_data[, group]
        z <- feature_table + 1 # Add pseudo-count (1)
        f <- log(z)
        f[f == 0] <- NA
        f <- colMeans(f, na.rm = TRUE)
        # per-sample effect, estimated from the group-adjusted sample means
        f_fit <- lm(f ~ groups)
        e <- rep(0, length(f))
        e[!is.na(groups)] <- residuals(f_fit)
        y <- t(t(z) - e)

        outlier_check <- function(x) {
            # Fitting the mixture model using the algorithm of Peddada, S.
            # Das, and JT Gene Hwang (2002)
            mu1 <- quantile(x, 0.25, na.rm = TRUE)
            mu2 <- quantile(x, 0.75, na.rm = TRUE)
            sigma1 <- quantile(x, 0.75, na.rm = TRUE) -
                quantile(x, 0.25, na.rm = TRUE)
            sigma2 <- sigma1
            pi <- 0.75
            n <- length(x)
            epsilon <- 100
            tol <- 1e-5
            # EM iterations: update component means/sds/mixing weight until
            # the parameter change falls below tol
            score <- pi * dnorm(x, mean = mu1, sd = sigma1) /
                ((1 - pi) * dnorm(x, mean = mu2, sd = sigma2))
            while (epsilon > tol) {
                grp1_ind <- (score >= 1)
                mu1_new <- mean(x[grp1_ind])
                mu2_new <- mean(x[!grp1_ind])
                sigma1_new <- sd(x[grp1_ind])
                if (is.na(sigma1_new)) sigma1_new <- 0
                sigma2_new <- sd(x[!grp1_ind])
                if (is.na(sigma2_new)) sigma2_new <- 0
                pi_new <- sum(grp1_ind) / n
                para <- c(mu1_new, mu2_new, sigma1_new, sigma2_new, pi_new)
                if (any(is.na(para))) break
                score <- pi_new * dnorm(x, mean = mu1_new, sd = sigma1_new) /
                    ((1 - pi_new) * dnorm(x, mean = mu2_new, sd = sigma2_new))
                epsilon <- sqrt(
                    (mu1 - mu1_new)^2 + (mu2 - mu2_new)^2 +
                        (sigma1 - sigma1_new)^2 + (sigma2 - sigma2_new)^2 +
                        (pi - pi_new)^2
                )
                mu1 <- mu1_new
                mu2 <- mu2_new
                sigma1 <- sigma1_new
                sigma2 <- sigma2_new
                pi <- pi_new
            }
            # only call outliers when the two components are well separated
            if (mu1 + 1.96 * sigma1 < mu2 - 1.96 * sigma2) {
                if (pi < out_cut) {
                    out_ind <- grp1_ind
                } else if (pi > 1 - out_cut) {
                    out_ind <- (!grp1_ind)
                } else {
                    out_ind <- rep(FALSE, n)
                }
            } else {
                out_ind <- rep(FALSE, n)
            }
            return(out_ind)
        }

        out_ind <- matrix(
            FALSE,
            nrow = nrow(feature_table),
            ncol = ncol(feature_table)
        )
        out_ind[, !is.na(groups)] <- t(apply(
            y, 1,
            function(i) {
                unlist(tapply(i, groups, function(j) outlier_check(j)))
            }
        ))
        # outlier entries are set missing so later steps ignore them
        feature_table[out_ind] <- NA
    }

    # 2. Discard taxa with zeros >= zero_cut
    zero_prop <- apply(
        feature_table, 1,
        function(x) sum(x == 0, na.rm = TRUE) / length(x[!is.na(x)])
    )
    taxa_del <- which(zero_prop >= zero_cut)
    if (length(taxa_del) > 0) {
        feature_table <- feature_table[-taxa_del, ]
    }

    # 3. Discard samples with library size < lib_cut
    lib_size <- colSums(feature_table, na.rm = TRUE)
    if (any(lib_size < lib_cut)) {
        subj_del <- which(lib_size < lib_cut)
        feature_table <- feature_table[, -subj_del]
        meta_data <- meta_data[-subj_del, ]
    }

    # 4. Identify taxa with structure zeros
    if (!is.null(group)) {
        groups <- factor(meta_data[, group])
        present_table <- as.matrix(feature_table)
        present_table[is.na(present_table)] <- 0
        present_table[present_table != 0] <- 1

        p_hat <- t(apply(
            present_table, 1,
            function(x) {
                unlist(tapply(x, groups, function(y) mean(y, na.rm = TRUE)))
            }
        ))
        samp_size <- t(apply(
            feature_table, 1,
            function(x) {
                unlist(tapply(x, groups, function(y) length(y[!is.na(y)])))
            }
        ))
        p_hat_lo <- p_hat - 1.96 * sqrt(p_hat * (1 - p_hat) / samp_size)
        struc_zero <- (p_hat == 0) * 1
        # Whether we need to classify a taxon into structural zero by its
        # negative lower bound?
        if (neg_lb) struc_zero[p_hat_lo <= 0] <- 1

        # Entries considered to be structural zeros are set to be 0s
        struc_ind <- struc_zero[, groups]
        feature_table <- feature_table * (1 - struc_ind)

        colnames(struc_zero) <- paste0(
            "structural_zero (", colnames(struc_zero), ")"
        )
    } else {
        struc_zero <- NULL
    }

    # 5. Return results
    res <- list(
        feature_table = feature_table,
        meta_data = meta_data,
        structure_zeros = struc_zero
    )

    res
}


================================================
FILE: R/DA-ancombc.R
================================================
#' Differential analysis of compositions of microbiomes with bias correction
#' (ANCOM-BC).
#'
#' Differential abundance analysis for microbial absolute abundance data. This
#' function is a wrapper of [`ANCOMBC::ancombc()`].
#'
#' @param ps a [`phyloseq::phyloseq-class`] object, which consists of a feature
#' table, a sample metadata and a taxonomy table.
#' @param group the name of the group variable in metadata. Specifying
#' `group` is required for detecting structural zeros and performing
#' global test.
#' @param confounders character vector, the confounding variables to be adjusted. #' default `character(0)`, indicating no confounding variable. #' @param contrast this parameter only used for two groups comparison while #' there are multiple groups. For more please see the following details. #' @param taxa_rank character to specify taxonomic rank to perform #' differential analysis on. Should be one of #' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by #' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or #' "none" means perform differential analysis on the original taxa #' (`taxa_names(phyloseq)`, e.g., OTU or ASV). #' @param transform character, the methods used to transform the microbial #' abundance. See [`transform_abundances()`] for more details. The #' options include: #' * "identity", return the original data without any transformation #' (default). #' * "log10", the transformation is `log10(object)`, and if the data contains #' zeros the transformation is `log10(1 + object)`. #' * "log10p", the transformation is `log10(1 + object)`. #' @param norm the methods used to normalize the microbial abundance data. See #' [`normalize()`] for more details. #' Options include: #' * "none": do not normalize. #' * "rarefy": random subsampling counts to the smallest library size in the #' data set. #' * "TSS": total sum scaling, also referred to as "relative abundance", the #' abundances were normalized by dividing the corresponding sample library #' size. #' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference. #' The scaling factor is then derived using a weighted trimmed mean over the #' differences of the log-transformed gene-count fold-change between the #' sample and the reference. #' * "RLE", relative log expression, RLE uses a pseudo-reference calculated #' using the geometric mean of the gene-specific abundances over all #' samples. 
The scaling factors are then calculated as the median of the #' gene counts ratios between the samples and the reference. #' * "CSS": cumulative sum scaling, calculates scaling factors as the #' cumulative sum of gene abundances up to a data-derived threshold. #' * "CLR": centered log-ratio normalization. #' * "CPM": pre-sample normalization of the sum of the values to 1e+06. #' @param norm_para named `list`. other arguments passed to specific #' normalization methods. Most users will not need to pass any additional #' arguments here. #' @param p_adjust method to adjust p-values by. Default is "holm". #' Options include "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", #' "fdr", "none". See [`stats::p.adjust()`] for more details. #' @param prv_cut a numerical fraction between 0 and 1. Taxa with prevalences #' less than `prv_cut` will be excluded in the analysis. Default #' is 0.10. #' @param lib_cut a numerical threshold for filtering samples based on library #' sizes. Samples with library sizes less than `lib_cut` will be excluded #' in the analysis. Default is 0, i.e. do not filter any sample. #' @param struc_zero whether to detect structural zeros. Default is FALSE. #' @param neg_lb whether to classify a taxon as a structural zero in the #' corresponding study group using its asymptotic lower bound. #' Default is FALSE. #' @param tol the iteration convergence tolerance for the E-M algorithm. #' Default is 1e-05. #' @param max_iter the maximum number of iterations for the E-M algorithm. #' Default is 100. #' @param conserve whether to use a conservative variance estimate of #' the test statistic. It is recommended if the sample size is small and/or #' the number of differentially abundant taxa is believed to be large. #' Default is FALSE. #' @param pvalue_cutoff level of significance. Default is 0.05. #' #' @details #' `contrast` must be a two length character or `NULL` (default). 
#' It is only required to set manually for two groups comparison when there
#' are multiple groups. The order determines the direction of comparison, the
#' first element is used to specify the reference group (control). This means
#' that, the first element is the denominator for the fold change, and the
#' second element is used as baseline (numerator for fold change). Otherwise,
#' users do not need to set this parameter (keep the default `NULL`), and if
#' there are two groups, the first level of groups will be set as the
#' reference group; if there are multiple groups, it will perform an
#' ANOVA-like testing to find markers which differ in any of the groups.
#'
#' @references
#' Lin, Huang, and Shyamal Das Peddada. "Analysis of compositions of microbiomes
#' with bias correction." Nature communications 11.1 (2020): 1-11.
#'
#' @seealso [`ANCOMBC::ancombc`]
#'
#' @importFrom ANCOMBC ancombc
#' @importFrom stats relevel
#' @export
#' @return a [`microbiomeMarker-class`] object.
#' @examples
#' data(enterotypes_arumugam)
#' ps <- phyloseq::subset_samples(
#'     enterotypes_arumugam,
#'     Enterotype %in% c("Enterotype 3", "Enterotype 2")
#' )
#' if (requireNamespace("microbiome", quietly = TRUE)) {
#'     run_ancombc(ps, group = "Enterotype")
#' } else {
#'     message("The 'microbiome' package is not installed, please install it to use this example")
#' }
run_ancombc <- function(ps,
    group,
    confounders = character(0),
    contrast = NULL,
    taxa_rank = "all",
    transform = c("identity", "log10", "log10p"),
    norm = "none",
    norm_para = list(),
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    ),
    prv_cut = 0.1,
    lib_cut = 0,
    struc_zero = FALSE,
    neg_lb = FALSE,
    tol = 1e-05,
    max_iter = 100,
    conserve = FALSE,
    pvalue_cutoff = 0.05) {
    stopifnot(inherits(ps, "phyloseq"))
    ps <- check_rank_names(ps) %>%
        check_taxa_rank(taxa_rank)
    if (length(confounders)) {
        confounders <- check_confounder(ps, group, confounders)
    }

    # if it contains missing values for any variable specified in the
    # formula, the corresponding sampling fraction estimate for this sample
    # will return NA since the sampling fraction is not estimable with the
    # presence of missing values.
    # remove this samples
    fml_char <- ifelse(length(confounders),
        paste(c(confounders, group), collapse = " + "),
        group)
    # fml_char <- paste(c(confounders, group), collapse = " + ")
    # fml <- stats::as.formula(paste("~", fml_char))
    # vars_fml <- all.vars(fml)
    for (var in c(confounders, group)) {
        ps <- remove_na_samples(ps, var)
    }

    # check whether group is valid, write a function
    meta <- sample_data(ps)
    meta_nms <- names(meta)
    groups <- meta[[group]]
    # make.names() so that the group levels are syntactically valid and
    # match the coefficient names returned by ancombc
    groups <- make.names(groups)
    if (!is.null(contrast)) {
        contrast <- make.names(contrast)
    }
    if (!is.factor(groups)) {
        groups <- factor(groups)
    }
    groups <- set_lvl(groups, contrast)
    sample_data(ps)[[group]] <- groups
    lvl <- levels(groups)
    n_lvl <- length(lvl)

    contrast <- check_contrast(contrast)
    transform <- match.arg(transform, c("identity", "log10", "log10p"))
    p_adjust <- match.arg(
        p_adjust,
        c(
            "none", "fdr", "bonferroni", "holm",
            "hochberg", "hommel", "BH", "BY"
        )
    )

    # set the reference level for pair-wise comparison from mutliple groups
    # if (!is.null(contrast) && n_lvl > 2) {
    #     groups <- relevel(groups, ref = contrast[1])
    #     sample_data(ps)[[group]] <- groups
    # }

    # preprocess phyloseq object
    ps <- preprocess_ps(ps)
    ps <- transform_abundances(ps, transform = transform)

    # normalize the data
    norm_para <- c(norm_para, method = norm, object = list(ps))
    ps_normed <- do.call(normalize, norm_para)
    ps_summarized <- pre_ps_taxa_rank(ps_normed, taxa_rank)
    # global (ANOVA-like) test only makes sense for > 2 groups
    global <- ifelse(n_lvl > 2, TRUE, FALSE)

    # ancombc differential abundance analysis
    if (taxa_rank == "all") {
        ancombc_taxa_rank <- rank_names(ps_summarized)[1]
    } else {
        ancombc_taxa_rank <- taxa_rank
    }
    ancombc_out <- ANCOMBC::ancombc(
        ps_summarized,
        tax_level = ancombc_taxa_rank,
        formula = fml_char,
        p_adj_method = p_adjust,
        prv_cut = prv_cut,
        lib_cut = lib_cut,
        group = group,
        struc_zero = struc_zero,
        neg_lb = neg_lb,
        tol = tol,
        max_iter = max_iter,
        conserve = conserve,
        alpha = pvalue_cutoff,
        global = global
    )

    # multiple-group comparison will be performed while the group
    # variable has > 2 levels
    keep_var <- c("W", "p_val", "q_val", "diff_abn")
    if (n_lvl > 2) {
        # ANCOM-BC global test to determine taxa that are differentially
        # abundant between three or more groups of multiple samples.
        # global result to marker_table
        if (is.null(contrast)) {
            mtab <- ancombc_out$res_global
        } else {
            # a two-group contrast within a multi-group design: keep only
            # the coefficient column of the experimental level
            exp_lvl <- paste0(group, contrast[2])
            ancombc_res <- ancombc_out$res
            mtab <- lapply(keep_var, function(x) ancombc_res[[x]][exp_lvl])
            mtab <- do.call(cbind, mtab)
        }
    } else {
        ancombc_out_res <- ancombc_out$res
        # drop intercept
        # in the previous version of ancombc (Bioc 3.15), taxa names are saved
        # as row names, while saved as the first column in the current version
        # remove intercept and taxa names, and save the rownames as taxa names
        ancombc_out_res <- lapply(
            ancombc_out_res,
            function(x) {
                new_x <- x[-1:-2]
                rownames(new_x) <- x[[1]]
                new_x
            }
        )
        mtab <- do.call(
            cbind,
            ancombc_out_res[c("W", "p_val", "q_val", "diff_abn")]
        )
    }
    names(mtab) <- keep_var

    # determine enrich group based on coefficients
    # drop taxa and intercept
    cf <- ancombc_out$res$lfc[-1:-2]
    if (n_lvl > 2) {
        if (!is.null(contrast)) {
            cf <- cf[exp_lvl]
            enrich_group <- ifelse(cf[[1]] > 0, contrast[2], contrast[1])
        } else {
            # the reference level has an implicit coefficient of 0
            cf <- cbind(0, cf)
            enrich_group <- lvl[apply(cf, 1, which.max)]
        }
    } else {
        enrich_group <- ifelse(cf[[1]] > 0, lvl[2], lvl[1])
    }

    # # enriched group
    enrich_abd <- get_ancombc_enrich_group(ps_summarized, ancombc_out, group)
    norm_abd <- enrich_abd$abd

    mtab <- cbind(feature = row.names(mtab), mtab, enrich_group)
    mtab_sig <- mtab[mtab$diff_abn, ]
    mtab_sig <- mtab_sig[c("feature", "enrich_group", "W", "p_val", "q_val")]
    names(mtab_sig) <- c("feature", "enrich_group", "ef_W", "pvalue", "padj")
    marker <- return_marker(mtab_sig, mtab)

    marker <- microbiomeMarker(
        marker_table = marker,
        norm_method = get_norm_method(norm),
        diff_method = "ancombc",
        sam_data = sample_data(ps_summarized),
        otu_table = otu_table(norm_abd, taxa_are_rows = TRUE),
        tax_table = tax_table(ps_summarized)
    )

    marker
}

#' Determine the group in which each feature is enriched, based on the
#' sampling-fraction-adjusted log abundances returned by ancombc, and return
#' both the adjusted abundances and the per-feature enriched group.
#' @noRd
get_ancombc_enrich_group <- function(ps, ancombc_out, group) {
    samp_frac <- ancombc_out$samp_frac
    # As shown in the ancombc vignette: if it contains missing values for any
    # variable specified in the formula, the corresponding sampling fraction
    # estimate for this sample will return NA since the sampling fraction is
    # not estimable with the presence of missing values.
    # Replace NA with 0
    samp_frac[is.na(samp_frac)] <- 0

    # Add pesudo-count (1) to avoid taking the log of 0
    log_abd <- log(abundances(ps, norm = TRUE) + 1)
    # Adjust the log observed abundances
    log_abd_adj <- sweep(log_abd, 2, samp_frac)
    groups <- sample_data(ps)[[group]]

    # remove groups with NA
    na_idx <- is.na(groups)
    log_abd_adj <- log_abd_adj[, !na_idx]
    groups <- groups[!na_idx]

    # mean absolute abundance
    abd_mean <- by(t(log_abd_adj), groups, colMeans)
    abd_mean <- do.call(cbind, abd_mean)
    idx_enrich <- apply(abd_mean, 1, which.max)
    group_enrich <- colnames(abd_mean)[idx_enrich]
    group_enrich <- data.frame(
        feature = rownames(abd_mean),
        enrich_group = group_enrich
    )

    list(abd = exp(log_abd_adj), group_enrich = group_enrich)
}


================================================
FILE: R/DA-comparing.R
================================================
# The module of comparing differential analysis is inspired from DAtest
# https://github.com/Russel88/DAtest
# If you use this function please cite the original paper:
# Russel, Jakob, et al. "DAtest: a framework for choosing differential abundance
# or expression method." BioRxiv (2018): 241802.

#' Comparing the results of differential analysis methods by Empirical power
#' and False Discovery Rate
#'
#' Calculating power, false discovery rates, false positive rates and auc (
#' area under the receiver operating characteristic (ROC) curve)
#' for various DA methods.
#' #' @param ps,group,taxa_rank main arguments of all differential analysis #' methods. `ps`: a [`phyloseq::phyloseq-class`] object; `group`, character, #' the variable to set the group, must be one of the var of the sample #' metadata; `taxa_rank`: character, taxonomic rank, please not that **since #' the abundance table is spiked in the lowest level, only #' `taxa_rank = "none"` is allowed**. #' @param methods character vector, differential analysis methods to be #' compared, available methods are "aldex", "ancom", "ancombc", "deseq2", #' "edger", "lefse", "limma_voom", "metagenomeseq", "simple_stat". #' @param args named list, which used to set the extra arguments of the #' differential analysis methods, so the names must be contained in `methods`. #' For more see details below. #' @param n_rep integer, number of times to run the differential analyses. #' @param effect_size numeric, the effect size for the spike-ins. Default 5. #' @param k numeric vector of length 3, number of features to spike in each #' tertile (lower, mid, upper), e.g. `k=c(5,10,15)` means 5 features spiked #' in low abundance tertile, 10 features spiked in mid abundance tertile and #' 15 features spiked in high abundance tertile. Default `NULL`, which will #' spike 2 percent of the total amount of features in each tertile (a total #' of 6 percent), but minimum c(5,5,5). #' @param relative logical, whether rescale the total number of individuals #' observed for each sample to the original level after spike-in. Default #' `TRUE`. #' @param BPPARAM [`BiocParallel::BiocParallelParam`] instance defining the #' parallel back-end. #' #' @details #' To make this function support for different arguments for a certain DA method #' `args` allows list of list of list e.g. `args = list(lefse = list(list(norm = "CPM"), list(norm = "TSS")))`, which specify to compare the different norm #' arguments for lefse analysis. 
#'
#' For `taxa_rank`, only `taxa_rank = "none"` is supported, if this argument is
#' not "none", it will be forced to "none" internally.
#'
#'
#' @return a `compareDA` object, which contains a two-length list of:
#' - `metrics`: `data.frame`, FPR, AUC and spike detection rate for each run.
#' - `mm`: differential analysis results.
#'
#' @importFrom phyloseq `otu_table<-`
#' @importFrom stats median
#' @export
compare_DA <- function(ps,
    group,
    taxa_rank = "none",
    methods,
    args = list(),
    n_rep = 20,
    effect_size = 5,
    k = NULL,
    relative = TRUE,
    BPPARAM = BiocParallel::SnowParam(progressbar = TRUE)) {
    stopifnot(inherits(ps, "phyloseq"))

    # check methods
    avlb_methods <- c("aldex", "ancom", "ancombc", "deseq2", "edger", "lefse",
        "limma_voom", "metagenomeseq", "simple_stat")
    out_methods <- setdiff(methods, avlb_methods)
    if (length(out_methods)) {
        stop("methods ", paste(out_methods, collapse = ", "),
            " not available. \n",
            "Please check your `methods`.\n",
            paste(strwrap(paste("Available methods:",
                paste(avlb_methods, collapse = ", ")),
                width = 0.9 * getOption("width")),
                collapse = paste("\n", space(nchar("Available methods:")))),
            ".\n",
            call. = FALSE)
    }
    # keep the caller's symbol for `ps`, used when deparsing the DA calls
    ps_var_name <- deparse(substitute(ps))

    # support for different arguments for a DA method, list of list of list
    # e.g. args = list(lefse = list(list(norm = "CPM"), list(norm = "TSS")))
    # different norm arguments for lefse analysis
    new_args <- generate_compare_args(methods, args)
    methods <- new_args$methods
    args <- new_args$args

    meta <- sample_data(ps)
    groups <- meta[[group]] |> factor()
    n_lvl <- nlevels(groups)
    if (n_lvl == 2) {
        if ("test_multiple_groups" %in% methods) {
            warning("There are two categories of interested variable ", group,
                ", method `test_multiple_groups` are dropped.")
            methods <- setdiff(methods, "test_multiple_groups")
        }
    } else if (n_lvl >= 3) {
        if ("test_two_groups" %in% methods) {
            warning("There are more than two categories of interested ",
                "variable ", group,
                ", method `test_two_groups` are dropped.")
            methods <- setdiff(methods, "test_two_groups")
        }
    } else {
        stop("Only one category of interested variable ", group, ".")
    }

    # taxa_rank must be "none": the spike-in is performed on the original
    # (lowest-level) abundance table
    if (taxa_rank != "none") {
        warning("since the abundance table is spiked in the lowest level, ",
            "`taxa_rank` was forced set as 'none'", call. = FALSE)
        taxa_rank <- "none"
    }

    count_tab <- otu_table(ps)
    features <- rownames(count_tab)

    # spike in, differential features
    n_feature <- nrow(count_tab)
    if (is.null(k)) {
        k <- rep(round(n_feature * 0.02), 3)
        if (sum(k) < 15) {
            k <- rep(5, 3)
        }
    }
    if (sum(k) == n_feature) {
        stop("Set to spike all features, can't calculate FDR or AUC",
            call. = FALSE)
    }
    if (sum(k) > n_feature) {
        stop("Set to spike more features than are present in the data",
            call. = FALSE)
    }
    if (sum(k) < 15 && sum(k) >= 10 && n_rep <= 10) {
        # FIX: removed a stray empty argument (",,") which made this
        # warning() call error with "argument is empty" at runtime
        warning("Few features are spiked, increase `k` or set `n_rep` to ",
            "more than 10 to ensure proper estimation of AUC and FPR",
            call. = FALSE)
    }
    if (sum(k) < 10 && sum(k) >= 5 && n_rep <= 20) {
        warning("Few features are spiked, increase `k` or set `n_rep` to ",
            "more than 20 to ensure proper estimation of AUC and FPR",
            call. = FALSE)
    }
    if (sum(k) < 5 && n_rep <= 50) {
        warning("Very few features are spiked, increase `k` set `n_rep` to ",
            "more than 50 to ensure proper estimation of AUC and FPR",
            call. = FALSE)
    }
    if (sum(k) > n_feature/2) {
        warning("Set to spike more than half of the features, ",
            "which might give unreliable estimates")
    }

    # shuffle predictor once per repetition
    rands <- lapply(seq_len(n_rep), \(x) sample(groups))

    # spikeins: one spiked count table + spiked feature set per repetition
    spikeds <- lapply(rands,
        \(x) spikein(count_tab, x, effect_size, k, relative))
    count_tabs <- lapply(spikeds, `[[`, 1)
    spiked_features <- lapply(spikeds, `[[`, 2)
    spiked_features <- rep(spiked_features, each = length(methods))

    # spiked phyloseq objects
    generate_spiked_ps <- function(spiked_count, rand, group, ps = ps) {
        otu_table(ps) <- otu_table(spiked_count, taxa_are_rows = TRUE)
        meta <- sample_data(ps)
        meta[[group]] <- rand
        sample_data(ps) <- meta
        ps
    }
    pss <- mapply(generate_spiked_ps, count_tabs, rands,
        MoreArgs = list(group = group, ps = ps))
    pss <- rep(pss, each = length(methods))

    # rep methods
    rep_methods <- rep(methods, n_rep)
    all_methods <- methods

    # every DA call shares the same group and taxa_rank
    for (item in names(args)) {
        args[[item]]$group <- group
        args[[item]]$taxa_rank <- taxa_rank
    }

    ## for ancombc, use bpmapply will raise an error:
    ## `argument "formula" is missing, with no default`. This error could be
    ## due to ANCOMBC package. We first run the ancombc method sequentially,
    ## and then run other methods parallelly, finally bind the results
    ancombc_md_idx <- grepl("ancombc", rep_methods, fixed = TRUE)
    if (any(ancombc_md_idx)) {
        ancombc_mds <- rep_methods[ancombc_md_idx]
        # NOTE(review): ancombc_md_idx is longer than `args`; the resulting
        # padded list still works because calc_da_metrics looks args up by
        # name — TODO confirm and consider subsetting by name instead
        ancombc_args <- args[ancombc_md_idx]
        ancombc_pss <- pss[ancombc_md_idx]
        ancombc_spiked_features <- spiked_features[ancombc_md_idx]
        methods <- rep_methods[!ancombc_md_idx]
        args <- args[!ancombc_md_idx]
        pss <- pss[!ancombc_md_idx]
        spiked_features <- spiked_features[!ancombc_md_idx]
    }

    # Run one DA method on one spiked data set and compute its performance
    # metrics (AUC, FPR, FDR, power/spike detection rate, run time).
    calc_da_metrics <- function(ps, method, args, features, spiked_features,
        ps_var_name, effect_size) {
        args <- args[[method]]
        args$ps <- ps
        fun <- paste0("run_", method)
        # remove number suffix for different args for a certain method
        fun <- gsub("(.*)_\\d+$", "\\1", fun)
        tm <- system.time(mm <- do.call(fun, args))
        marker <- data.frame(marker_table(mm))

        # pseudo pvalue of ancom: ancom markers carry a W statistic, not a
        # p value; derive a monotone pseudo p value from W so ROC works
        if (method == "ancom" && !is.null(marker)) {
            w <- marker$W
            cf <- 0.05 * min(w)
            # FIX: the pseudo p value was computed but never assigned
            marker$pvalue <- (1 / w) * cf
        }

        if (!is.null(marker)) {
            spiked <- rep("no", nrow(marker))
            spiked[marker$feature %in% spiked_features] <- "yes"
            marker$spiked <- spiked
        }

        # confusion matrix
        n_pos <- ifelse(is.null(marker), 0, nrow(marker))
        n_feature <- length(features)
        # FIX: negatives are the features NOT detected by the method
        # (was setdiff(features, spiked_features), which made false_neg
        # always 0); NULL$feature is NULL, so this is NULL-safe
        neg_feature <- setdiff(features, marker$feature)
        n_neg <- n_feature - n_pos
        # for effect_size = 1
        true_neg <- n_neg
        true_pos <- 0
        false_neg <- 0
        # for effect size != 1
        if (effect_size != 1) {
            true_pos <- ifelse(is.null(marker), 0,
                sum(marker$spiked == "yes"))
            false_neg <- sum(neg_feature %in% spiked_features)
        }
        false_pos <- n_pos - true_pos
        true_neg <- n_neg - false_neg

        # fpr
        fpr <- ifelse((false_pos + true_neg) != 0,
            false_pos / (false_pos + true_neg),
            0)
        # sdr (spike detection rate); sum(k) == length(spiked_features)
        sdr <- true_pos / length(spiked_features)
        # auc
        if (effect_size != 1) {
            test_roc <- NULL
            tryCatch(test_roc <- pROC::roc(
                as.numeric(marker$feature %in% spiked_features) ~
                    marker$pvalue,
                auc = TRUE, direction = ">", quiet = TRUE),
                error = function(e) NULL)
            auc <- ifelse(is.null(test_roc), 0.5, as.numeric(test_roc$auc))
        } else {
            auc <- 0.5
        }
        # fdr
        fdr <- ifelse(n_pos != 0, false_pos / n_pos, 0)

        # create a human-readable call string for this DA run
        cmd_args <- args
        cmd_args$ps <- ps_var_name
        cmd_args$fun <- fun
        # reorder: fun/ps/group/taxa_rank first, then the rest
        args_nms <- names(cmd_args)
        head_nms <- c("fun", "ps", "group", "taxa_rank")
        new_nms <- c(head_nms, setdiff(args_nms, head_nms))
        cmd_args <- cmd_args[new_nms]
        cmd_chr <- deparse1(as.call(cmd_args))
        cmd_chr <- gsub(paste0("\"(", fun, ")\""), "\\1", cmd_chr)

        metrics <- data.frame(auc = auc,
            fpr = fpr,
            fdr = fdr,
            power = sdr,
            method = method,
            call = cmd_chr,
            time_min = round(tm[3] / 60, 4))
        rownames(metrics) <- NULL

        list(metrics = metrics, mm = mm)
    }

    if (any(ancombc_md_idx)) {
        ancombc_out <- mapply(calc_da_metrics,
            ps = ancombc_pss,
            method = ancombc_mds,
            spiked_features = ancombc_spiked_features,
            MoreArgs = list(args = ancombc_args,
                features = features,
                ps_var_name = ps_var_name,
                effect_size = effect_size),
            SIMPLIFY = FALSE)
    }
    if (!all(ancombc_md_idx)) {
        bp_out <- BiocParallel::bpmapply(calc_da_metrics,
            ps = pss,
            method = methods,
            spiked_features = spiked_features,
            MoreArgs = list(args = args,
                features = features,
                ps_var_name = ps_var_name,
                effect_size = effect_size),
            BPPARAM = BPPARAM,
            SIMPLIFY = FALSE)
    }
    if (all(ancombc_md_idx)) {
        da_out <- ancombc_out
    } else {
        if (any(ancombc_md_idx)) {
            da_out <- c(ancombc_out, bp_out)
        } else {
            da_out <- bp_out
        }
    }
    # order the out, to the original order
    idx <- order(c(which(ancombc_md_idx), which(!ancombc_md_idx)))
    da_out <- da_out[idx]

    da_metrics <- do.call(rbind, lapply(da_out, `[[`, "metrics"))
    da_metrics$run <- rep(seq_len(n_rep), each = length(all_methods))
    mms <- lapply(da_out, `[[`, "mm")

    out <- list(metrics = da_metrics, mm = mms)
    class(out) <- "compareDA"

    out
}

#' Summary differential analysis
#' methods comparison results
#'
#' @param object an `compareDA` object, output from [`compare_DA()`].
#' @param sort character string specifying sort method. Possibilities are
#' "score" which is calculated as \eqn{(auc - 0.5) * power - fdr}, "auc" for
#' area under the ROC curve, "fpr" for false positive rate, "power" for
#' empirical power.
#' @param boot logical, whether use bootstrap for confidence limits of the
#' score, default `TRUE`. Recommended to be `TRUE` unless `n_rep` is larger
#' than 100 in [`compare_DA()`].
#' @param boot_n integer, number of bootstraps, default 1000L.
#' @param prob two length numeric vector, confidence limits for score, default
#' `c(0.05, 0.95)`.
#' @param ... extra arguments affecting the summary produced.
#' @return a `data.frame` containing measurements for differential analysis
#' methods:
#' - `call`: differential analysis commands.
#' - `auc`: area under curve of ROC.
#' - `fpr`: false positive rate.
#' - `power`: empirical power.
#' - `fdr`: false discovery rate.
#' - `score`: score which is calculated as \eqn{(auc - 0.5) * power - fdr}.
#' - `score_*`: confidence limits of score.
#' @export
summary.compareDA <- function(object,
    sort = c("score", "auc", "fpr", "power"),
    boot = TRUE,
    boot_n = 1000L,
    prob = c(0.05, 0.95),
    ...) {
    stopifnot(inherits(object, "compareDA"))
    sort <- match.arg(sort, c("score", "auc", "fpr", "power"))

    # medians of each metric per DA call (across runs)
    metrics <- object$metrics
    calls <- metrics$call
    new_metrics <- metrics[c('auc', 'fpr', 'power', 'fdr')]
    metric_med <- stats::aggregate(new_metrics,
        by = list(call = calls),
        FUN = median)
    # score
    metric_med$score <- (metric_med$auc - 0.5) * metric_med$power -
        metric_med$fdr

    # interval: per-run scores used for the confidence limits
    new_metrics$score <- (new_metrics$auc - 0.5) * new_metrics$power -
        new_metrics$fdr
    metrics$score <- (metrics$auc - 0.5) * metrics$power - metrics$fdr
    if (boot) {
        # resample runs within each call to stabilise the quantile estimate
        boots <- dplyr::group_by(metrics, call) |>
            dplyr::group_modify(~ .x[sample(rownames(.x), boot_n,
                replace = TRUE), ])
        score_cl <- stats::aggregate(score ~ call,
            data = boots,
            FUN = function(x) stats::quantile(x, probs = prob))
    } else {
        score_cl <- stats::aggregate(score ~ call,
            data = metrics,
            FUN = \(x) stats::quantile(x, probs = prob))
    }
    score_cl <- data.frame(call = score_cl$call,
        score_cl$score[, 1],
        score_cl$score[, 2])
    names(score_cl) <- c("call", paste0("score_", prob))
    out <- merge(metric_med, score_cl, by = "call")

    # reorder score decreasing
    out <- out[order(out$score,
        out[[paste0("score_", prob[1])]],
        out[[paste0("score_", prob[2])]],
        decreasing = TRUE), ]
    if (out$score[1] <= 0) {
        warning("Best score is <= 0.\n",
            "You might require to preprocessing your data or ",
            "re-run with a higher effect size.",
            call. = FALSE)
    }

    # mat <- vector("logical", nrow(out))
    # for (i in seq_len(nrow(out))) {
    #     mat[i] <- out$score[i] >= out[[paste0("score_", prob[1])]][1]
    # }
    # out$` ` <- " "
    # out[mat,]$` ` <- "*"

    # alternative sort orders requested by the user
    if (sort == "auc") {
        out <- out[order(out$auc, decreasing = TRUE), ]
    }
    if (sort == "fpr") {
        out <- out[order(out$fpr, decreasing = FALSE), ]
    }
    if (sort == "power") {
        out <- out[order(out$power, decreasing = TRUE), ]
    }

    out
}

# compare_DA <- function(...,
#                        n_rep = 20,
#                        effect_size = 5,
#                        k = NULL,
#                        n_core = parallel::detectCores() -1,
#                        check_core = TRUE,
#                        relative = TRUE,
#                        verbose = TRUE) {
#     if (check_core) {
#         if (check_core > 20) {
#             ANSWER <- readline(paste("You are about to run compareDA using",
#                                      n_core,
#                                      "cores. Enter y to proceed "))
#             if (ANSWER != "y") {
#                 stop("Process aborted")
#             }
#         }
#     }
#
#     exp_chrs <- list(...)
#     t_start <- proc.time()
#     calls <- lapply(exp_chrs, \(x) standardise_call(str2lang(x)))
#
#     # extract the ps object and target variable
#     pss <- lapply(calls, `[[`, "ps")
#     group <- lapply(calls, `[[`, "group")
#     # all the ps and target variable must be the same
#     if (sum(duplicated(pss)) != (length(pss) - 1)) {
#         stop("`ps` objects in DA analysis must be the same")
#     }
#     if (sum(duplicated(group)) != (length(group) - 1)) {
#         stop("`group` var in all DA analysis must be the same")
#     }
#
#     # DA methods comparison only support for taxa_rank = "none", since
#     # the abundance table is spiked in the lowest level
#     # full_paras <- lapply(calls, \(x) formals(match.fun(x[[1]])))
#     taxa_ranks <- lapply(calls, `[[`, "taxa_rank")
#     if (sum(duplicated(taxa_ranks)) != (length(group) - 1)) {
#         stop("`taxa_rank` objects in all DA analysis must be the same")
#     }
#     if (!is.null(taxa_ranks[[1]]) && taxa_ranks[[1]] != "none") {
#         stop("since the abundance table is spiked in the lowest level, ",
#              "`taxa_rank` must be 'none'",
#              call.
= FALSE) # } # # if (verbose) { # message("Comparing differential methods may take a long time") # message("Running on ", n_core, "cores") # } # # # differential analysis functions # funs <- lapply(calls, `[[`, 1) # funs_chr <- vapply(funs, as.character, FUN.VALUE = character(1)) # # ps <- eval(pss[[1]], envir = parent.frame()) # count_tab <- otu_table(ps) # features <- rownames(count_tab) # # # spike in differential features # if (is.null(k)) { # k <- rep(round(nrow(count_tab) * 0.02), 3) # if (sum(k) < 15) { # k <- rep(5, 3) # } # } # n_feature <- nrow(count_tab) # if (sum(k) == n_feature) { # stop("Set to spike all features, can't calculate FDR or AUC", # call. = FALSE) # } # if (sum(k) > n_feature) { # stop("Set to spike more features than are present in the data", # call. = FALSE) # } # if (sum(k) < 15 && sum(k) >= 10 && n_rep <= 10) { # warning("Few features are spiked, increase `k` or set `n_rep` to ", # "more than 10 to ensure proper estimation of AUC and FPR",, # call. = FALSE) # } # if (sum(k) < 10 && sum(k) >= 5 && n_rep <= 20) { # warning("Few features are spiked, increase `k` or set `n_rep` to ", # "more than 20 to ensure proper estimation of AUC and FPR", # call. = FALSE) # } # if (sum(k) < 5 && n_rep <= 50) { # warning("Very few features are spiked, increase `k` set `n_rep` to ", # "more than 50 to ensure proper estimation of AUC and FPR", # call. 
= FALSE) # } # if (sum(k) > n_feature/2) { # warning("Set to spike more than half of the features, ", # "which might give unreliable estimates") # } # # if(verbose) cat("Spikeing...\n") # # shuffle predictor # predictor <- sample_data(ps)[[group[[1]]]] # rands <- lapply(seq_len(n_rep), \(x) sample(predictor)) # # # spikeins # spikeds <- lapply(rands, # \(x) spikein(count_tab, x, effect_size, k, relative)) # count_tabs <- lapply(spikeds, `[[`, 1) # # if (verbose) { # cat(paste("Testing", length(exp_chrs), # "methods", n_rep, "times each ...\n")) # } # # # progress bar # # da_par <- paste(rep(seq_len(n_rep), each = 2), # # funs_chr, sep = "-") # cmds <- rep(exp_chrs, n_rep) # run_no <- rep(seq_len(n_rep), each = length(exp_chrs)) # pb <- utils::txtProgressBar(max = length(cmds), style = 3) # progress <- function(n) setTxtProgressBar(pb, n) # opts <- list(progress = progress) # # # config parallel # if (n_core == 1) { # foreach::registerDoSEQ() # } else { # cl <- parallel::makeCluster(n_core) # doSNOW::registerDoSNOW(cl) # on.exit(parallel::stopCluster(cl)) # } # # # run the DA analysis in parallel # res <- foreach::foreach(exp_chr = cmds, i = run_no, # .export = c("otu_table", "otu_table<-", funs_chr), # .options.snow = opts) %dopar% { # t1_sub <- proc.time() # # construct new ps with spiked feature abundance table # new_count_tab <- count_tabs[[i]] # otu_table(ps) <- otu_table(new_count_tab, taxa_are_rows = TRUE) # res_sub <- eval(str2expression(exp_chr), # list(ps = ps), # enclos = parent.frame()) # # run_time_sub <- (proc.time() - t1_sub)[3] # return(list(res_sub, run_time_sub)) # } # run_times <- lapply(res, `[[`, "run_time_sub") # da_res <- lapply(res, `[[`, "res_sub") # # n_da <- length(exp_chrs) # r <- NULL # final_res <- foreach::foreach(r = seq_len(n_rep)) %do% { # da_sub <- da_res[(1 + (r - 1) * n_da):(r * n_da)] # curr_cmds <- cmds[(1 + (r - 1) * n_da):(r * n_da)] # curr_spiked_features <- spikeds[[r]][[2]] # # insert spiked column # rsp <- NULL # da_sub 
<- foreach::foreach(rsp = seq_along(da_sub)) %do% { # tmp <- da_sub[[rsp]] # tmp_marker <- data.frame(marker_table(tmp)) # # # psedudo pvalue of ancom # if (grepl("ancom(", curr_cmds[rsp], fixed = TRUE)) { # w <- tmp_marker$W # cf <- 0.05 * min(w) # tmp_marker$pvalue <- (1 / w) * cf # } # # tmp_spiked <- rep("no", nrow(tmp_marker)) # tmp_spiked[tmp_marker$feature %in% curr_spiked_features] <- "yes" # tmp_marker$spiked <- tmp_spiked # return(tmp_marker) # } # # # confusion matrix # n_pos <- vapply(da_sub, nrow, FUN.VALUE = integer(1)) # neg_feature <- lapply(da_sub, \(x) setdiff(features, x$feature)) # n_neg <- n_feature - n_pos # vapply(neg_feature, length, integer(1)) # # for effect_size = 1 # true_neg <- n_neg # true_pos <- 0 # false_neg <- 0 # # for effect_size != 1 # if (effect_size != 1) { # true_pos <- vapply(da_sub, # \(x) sum(da_sub$spiked == "yes"), # FUN.VALUE = integer(1)) # false_neg <- vapply(neg_feature, # \(x) sum(x %in% curr_spiked_features), # FUN.VALUE = integer(1)) # } # false_pos <- n_pos - true_pos # true_neg <- n_neg - false_neg # # # FPR: false positive rate # fprs <- vapply(seq_along(da_sub), # \(x) ifelse((false_pos[x] + true_neg[x]) != 0, # false_pos[x] / (false_pos[x] + true_neg[x]), # 0), # FUN.VALUE = numeric(1)) # # # sdr: spike detection rate # sdrs <- vapply(true_pos, \(x) x / sum(k), FUN.VALUE = numeric(1)) # # # auc # aucs <- vapply(da_sub, \(x) { # if (effect_size != 1) { # test_roc <- NULL # spiked_idx <- as.numeric(x$feature %in% curr_spiked_features) # tryCatch( # test_roc <- pROC::roc(spiked_idx ~ x$pvalue, # auc = TRUE, # direction = ">", # quiet = TRUE), # error = function(e) NULL) # res <- ifelse(is.null(test_roc), # 0.5, # as.numeric(test_roc$auc)) # } else { # res <- 0.5 # } # # res # }, FUN.VALUE = numeric(1)) # # # fdrs # fdrs <- vapply(seq_along(da_sub), # \(x) ifelse(n_pos != 0, false_pos / n_pos, 0)) # # df_combine <- data.frame(call = curr_cmds, # AUC = aucs, # FPR = fprs, # FDR = fdrs, # Power = sdrs, # run = r) # 
#     rownames(df_combine) <- NULL
#
#     return(df_combine, da_sub)
# }
#
# out_res <- do.call(rbind, lapply(final_res, `[[`, 1))
# out_res_marker <- lapply(final_res, `[[`, 2)
#
# # running time
# run_secs <- (proc.time() - t_start)[3]
# if ((run_secs)/60/60 > 1) {
#     run_time <- paste(round((run_secs)/60/60, 2), "Hours")
# } else {
#     run_time <- paste(round((run_secs)/60,2),"Minutes")
# }
#
# out_detail <- data.frame(n_feature = nrow(count_tab),
#                          n_sample = ncol(count_tab),
#                          run_time = run_time,
#                          effect_size = effect_size,
#                          spiked = paste0(c("Low:","Mid:","High:"), k,
#                                          collapse = ", "))
# out_detail <- as.data.frame(t(out_detail))
# names(out_detail) <- NULL
#
# # run times
# run_times <- data.frame(DA = cmds,
#                         minites = round(unlist(run_times) / 60, 4))
#
# out <- list(res = out_res,
#             marker = out_res_marker,
#             detail = out_detail,
#             run_time = run_times)
#
# out
# }

# spike in features
#
# Multiply the counts of randomly chosen features (k[1]/k[2]/k[3] from the
# low/mid/high abundance tertiles) by `effect_size` in the "case" samples
# (second level of `predictor`), optionally rescaling each sample back to
# its original library size. Returns list(spiked count table, spiked feature
# names). effect_size == 1 means no multiplication (null spike).
spikein <- function(count_tab, predictor, effect_size = 2, k,
    relative = TRUE) {
    if (effect_size < 0) {
        stop("Effect size should be positive")
    }
    spike_method <- ifelse(effect_size == 1, "none", "mult")
    if (is.null(rownames(count_tab))) {
        rownames(count_tab) <- seq_len(nrow(count_tab))
    }
    count_tab <- as.data.frame(count_tab)
    # recode predictor as 0/1; samples with value 1 are the "cases"
    predictor <- as.numeric(as.factor(predictor)) - 1

    # Choose Features to spike: mean relative abundance across samples
    propcount <- sweep(count_tab, 2, colSums(count_tab), "/")
    count_abundances <- sort(rowSums(propcount)/ncol(propcount))

    # Only spike Features present in cases (except if predictor is numeric)
    case_count_tab <- count_tab[
        rowSums(count_tab[, predictor == 1]) > 0, predictor == 1]
    approved_count_abundances <- count_abundances[
        names(count_abundances) %in% row.names(case_count_tab)]

    # Which to spike in each tertile
    lower_tert <- names(approved_count_abundances[
        approved_count_abundances <
            quantile(approved_count_abundances,1/3)])
    mid_tert <- names(approved_count_abundances[
        approved_count_abundances >=
            quantile(approved_count_abundances,1/3) &
            approved_count_abundances <
                quantile(approved_count_abundances,2/3)])
    upper_tert <- names(approved_count_abundances[
        approved_count_abundances >=
            quantile(approved_count_abundances,2/3)])
    spike_features <- c(sample(lower_tert, k[1]),
        sample(mid_tert, k[2]),
        sample(upper_tert,k[3]))
    spike_feature_index <- which(row.names(count_tab) %in% spike_features)

    # Spike Features by multiplication
    old_sums <- colSums(count_tab)
    if (spike_method == "mult"){
        count_tab[spike_feature_index, predictor==1] <-
            count_tab[spike_feature_index, predictor==1] * effect_size
    }

    # Rescale to original sample sums
    new_sums <- colSums(count_tab)
    if (relative) {
        count_tab <- round(sweep(count_tab, 2, old_sums/new_sums, "*"))
    }

    list(count_tab, spike_features)
}

# from pryr: Standardise a function call
# (match positional arguments to parameter names; primitives have no
# formals, so they are returned unchanged)
standardise_call <- function(call, env = parent.frame()) {
    stopifnot(is.call(call))
    f <- eval(call[[1]], env)
    if (is.primitive(f)) {
        return(call)
    }
    return(match.call(f, call))
}

# To make compare_DA() support for different arguments for a certain DA method,
# args allows list of list of list
# e.g. args = list(lefse = list(list(norm = "CPM"), list(norm = "TSS"))),
# represents compare the different norm arguments for lefse analysis. So we
# need to flatten the args and extend methods for DA analysis:
# methods = c("lefse", "lefse"),
# args = list(list(norm = "CPM"), list(norm = "TSS"))
#
# For method with no args provided, set it to list(), e.g. list(ancom = list()).
generate_compare_args <- function(methods, args) {
    # check args: every name in `args` must be one of `methods`
    args_nms <- names(args)
    if (length(args)) {
        out_args <- setdiff(args_nms, methods)
        if (length(out_args)) {
            stop("names of `args` must be contained in `methods`.\n",
                paste(args[out_args], collapse = ", "),
                " in names of `args` ",
                "does not match DA methods",
                call. = FALSE)
        }
    }
    # create args list for each method (empty list for methods without args)
    method_no_args <- setdiff(methods, args_nms)
    for (i in seq_along(method_no_args)) {
        args[[method_no_args[i]]] <- list()
    }

    new_args <- list()
    n_arg <- vector("integer", length(args))
    for (i in seq_along(args)) {
        curr_arg <- args[i]
        # depth 4 == list of list of list (multiple arg sets for one method)
        if (purrr::pluck_depth(curr_arg) > 4) {
            stop("`args` could be 'list of list', ",
                "'list of list of list' to support for different arguments ",
                "for a certain DA method")
        }
        if (purrr::pluck_depth(curr_arg) == 4) {
            # flatten one level and suffix the method name: lefse_1, lefse_2
            curr_arg <- unlist(curr_arg, recursive = FALSE)
            names(curr_arg) <- paste(names(args)[i],
                seq_along(curr_arg),
                sep = "_")
        }
        new_args <- c(new_args, curr_arg)
        n_arg[i] <- length(curr_arg)
    }
    # expand `methods` so that it aligns one-to-one with the flattened args
    methods <- rep(methods, times = n_arg)
    methods_suffix <- lapply(n_arg, \(x) {
        if (x > 1) {
            as.character(paste0("_", seq_len(x)))
        } else {
            ""
        }
    }) |>
        unlist()
    methods <- paste(methods, methods_suffix, sep = "")

    return(list(methods = methods, args = new_args))
}

================================================
FILE: R/DA-deseq2.R
================================================
# In the vignette of DESeq2:
# The values in the matrix should be un-normalized counts or estimated counts
# of sequencing reads (for single-end RNA-seq) or fragments (for paired-end
# RNA-seq). The RNA-seq workflow describes multiple techniques for preparing
# such count matrices. It is important to provide count matrices as input for
# DESeq2’s statistical model (Love, Huber, and Anders 2014) to hold, as only
# the count values allow assessing the measurement precision correctly. The
# DESeq2 model internally corrects for library size, so transformed or
# normalized values such as counts scaled by library size should not be used
# as input.
# # DESeq2 contrast: https://github.com/tavareshugo/tutorial_DESeq2_contrasts # # reference source code: # from biocore/qiime/blob/master/qiime/support_files/R/DESeq2_nbinom.r # https://github.com/hbctraining/DGE_workshop/blob/master/schedule/1.5-day.md # # ## p value and logFC from LRT # From https://hbctraining.github.io/DGE_workshop/lessons/08_DGE_LRT.html # https://support.bioconductor.org/p/133804/#133856 # By default the Wald test is used to generate the results table, but DESeq2 # also offers the LRT which is used to identify any genes that show change in # expression across the different levels. The LRT is comparing the full model # to the reduced model to identify significant genes. The p-values are # determined solely by the difference in deviance between the ‘full’ and # "reduced" model formula (not log2 fold changes). # # The log2 fold change LRT results is calculated using Wald (two groups # comparison). # # # #' Perform DESeq differential analysis #' #' Differential expression analysis based on the Negative Binomial distribution #' using **DESeq2**. #' #' @param ps a [`phyloseq::phyloseq-class`] object. #' @param group character, the variable to set the group, must be one of #' the var of the sample metadata. #' @param confounders character vector, the confounding variables to be adjusted. #' default `character(0)`, indicating no confounding variable. #' @param contrast this parameter only used for two groups comparison while #' there are multiple groups. For more please see the following details. #' @param taxa_rank character to specify taxonomic rank to perform #' differential analysis on. Should be one of #' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by #' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or #' "none" means perform differential analysis on the original taxa #' (`taxa_names(phyloseq)`, e.g., OTU or ASV). 
#' @param transform character, the methods used to transform the microbial #' abundance. See [`transform_abundances()`] for more details. The #' options include: #' * "identity", return the original data without any transformation #' (default). #' * "log10", the transformation is `log10(object)`, and if the data contains #' zeros the transformation is `log10(1 + object)`. #' * "log10p", the transformation is `log10(1 + object)`. #' @param norm the methods used to normalize the microbial abundance data. See #' [`normalize()`] for more details. #' Options include: #' * "none": do not normalize. #' * "rarefy": random subsampling counts to the smallest library size in the #' data set. #' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference. #' The scaling factor is then derived using a weighted trimmed mean over the #' differences of the log-transformed gene-count fold-change between the #' sample and the reference. #' * "RLE", relative log expression, RLE uses a pseudo-reference calculated #' using the geometric mean of the gene-specific abundances over all #' samples. The scaling factors are then calculated as the median of the #' gene counts ratios between the samples and the reference. #' * "CSS": cumulative sum scaling, calculates scaling factors as the #' cumulative sum of gene abundances up to a data-derived threshold. #' @param norm_para arguments passed to specific normalization methods. Most #' users will not need to pass any additional arguments here. #' @param fitType,sfType,betaPrior,modelMatrixType,useT,minmu these seven #' parameters are inherited form [`DESeq2::DESeq()`]. #' - `fitType`, either "parametric", "local", "mean", or "glmGamPoi" for the #' type of fitting of dispersions to the mean intensity. #' - `sfType`, either "ratio", "poscounts", or "iterate" for the type of size #' factor estimation. We recommend to use "poscounts". 
#' - `betaPrior`, whether or not to put a zero-mean normal prior on the #' non-intercept coefficients. #' - `modelMatrixType`, either "standard" or "expanded", which describe how #' the model matrix, #' - `useT`, logical, where Wald statistics are assumed to follow a standard #' Normal. #' - `minmu`, lower bound on the estimated count for fitting gene-wise #' dispersion. #' #' For more details, see [`DESeq2::DESeq()`]. Most users will not need to #' set this arguments (just use the defaults). #' #' @param p_adjust method for multiple test correction, default `none`, for #' more details see [stats::p.adjust]. #' @param pvalue_cutoff pvalue_cutoff numeric, p value cutoff, default 0.05. #' @param ... extra parameters passed to [`DESeq2::DESeq()`]. #' #' @details #' **Note**: DESeq2 requires the input is raw counts (un-normalized counts), as #' only the counts values allow assessing the measurement precision correctly. #' For more details see the vignette of DESeq2 (`vignette("DESeq2")`). #' #' Thus, this function only supports "none", "rarefy", "RLE", "CSS", and #' "TMM" normalization methods. We strongly recommend using the "RLE" method #' (default normalization method in the DESeq2 package). The other #' normalization methods are used for expert users and comparisons among #' different normalization methods. #' #' For two groups comparison, this function utilizes the Wald test (defined by #' [`DESeq2::nbinomWaldTest()`]) for hypothesis testing. A Wald test statistic #' is computed along with a probability (p-value) that a test statistic at least #' as extreme as the observed value were selected at random. `contrasts` are #' used to specify which two groups to compare. The order of the names #' determines the direction of fold change that is reported. #' #' Likelihood ratio test (LRT) is used to identify the genes that significantly #' changed across all the different levels for multiple groups comparisons. 
#' The
#' LRT identified the significant features by comparing the full model to the
#' reduced model. It is testing whether a feature removed in the reduced
#' model explains a significant variation in the data.
#'
#' `contrast` must be a two length character or `NULL` (default). It is only
#' required to set manually for two groups comparison when there are multiple
#' groups. The order determines the direction of comparison, the first element
#' is used to specify the reference group (control). This means that, the first
#' element is the denominator for the fold change, and the second element is
#' used as baseline (numerator for fold change). Otherwise, users do not need
#' to set this parameter (leave it as the default `NULL`), and if there are
#' two groups, the first level of groups will be set as the reference group; if
#' there are multiple groups, it will perform an ANOVA-like testing to find
#' markers which differ in any of the groups.
#'
#' @export
#' @return a [`microbiomeMarker-class`] object.
#' @seealso [`DESeq2::results()`],[`DESeq2::DESeq()`]
#' @importFrom stats formula coef
#' @importFrom DESeq2 dispersions<-
#' @importMethodsFrom S4Vectors mcols
#' @importMethodsFrom BiocGenerics sizeFactors<- counts
#' @references
#' Love, Michael I., Wolfgang Huber, and Simon Anders. "Moderated estimation
#' of fold change and dispersion for RNA-seq data with DESeq2." Genome
#' biology 15.12 (2014): 1-21.
#' @examples
#' data(enterotypes_arumugam)
#' ps <- phyloseq::subset_samples(
#'     enterotypes_arumugam,
#'     Enterotype %in% c("Enterotype 3", "Enterotype 2")) %>%
#'     phyloseq::subset_taxa(Phylum %in% c("Firmicutes"))
#' run_deseq2(ps, group = "Enterotype")
run_deseq2 <- function(ps,
    group,
    confounders = character(0),
    contrast = NULL,
    taxa_rank = "all",
    norm = "RLE",
    norm_para = list(),
    transform = c("identity", "log10", "log10p"),
    # test = c("Wald", "LRT"),
    fitType = c("parametric", "local", "mean", "glmGamPoi"),
    sfType = "poscounts",
    betaPrior = FALSE,
    modelMatrixType,
    useT = FALSE,
    minmu = ifelse(fitType == "glmGamPoi", 1e-06, 0.5),
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    ),
    pvalue_cutoff = 0.05,
    ...) {
    # sanity checks on the taxonomic rank names and the requested rank
    ps <- check_rank_names(ps) %>%
        check_taxa_rank(taxa_rank)

    # only these normalization methods are supported for the DESeq2 method
    norm_methods <- c("none", "rarefy", "RLE", "CSS", "TMM")
    if (!norm %in% norm_methods) {
        stop(
            "`norm` must be one of 'none', 'rarefy', 'RLE', 'CSS', or 'TMM'",
            call. = FALSE
        )
    }
    if (length(confounders)) {
        confounders <- check_confounder(ps, group, confounders)
    }

    # groups: coerce to a factor with syntactically valid level names
    meta <- sample_data(ps)
    groups <- meta[[group]]
    groups <- make.names(groups)
    if (!is.null(contrast)) {
        contrast <- make.names(contrast)
    }
    if (!is.factor(groups)) {
        groups <- factor(groups)
    }
    # put the reference level first (honoring `contrast` if given)
    groups <- set_lvl(groups, contrast)
    sample_data(ps)[[group]] <- groups
    lvl <- levels(groups)
    n_lvl <- length(lvl)
    if (n_lvl < 2) {
        stop("Differential analysis requires at least two groups.")
    }

    # contrast, test method, name of effect size
    if (n_lvl == 2) { # two groups
        if (!is.null(contrast)) {
            warning(
                "`contrast` is ignored, you do not need to set it",
                call. = FALSE
            )
        }
        contrast_new <- c(group, lvl[2], lvl[1])
    } else {
        if (!is.null(contrast)) {
            if (!is.character(contrast) || length(contrast) != 2) {
                stop("`contrast` must be a two length character", call. = FALSE)
            }
            idx <- match(contrast, lvl, nomatch = 0L)
            if (!all(idx)) {
                stop(
                    "all elements of `contrast` must be contained in `groups`",
                    call. = FALSE
                )
            }
            contrast_new <- c(group, contrast[2], contrast[1])
        }
    }
    # LRT (ANOVA-like) for > 2 groups without a contrast, Wald otherwise
    test <- ifelse(n_lvl > 2 && is.null(contrast), "LRT", "Wald")
    ef_name <- ifelse(test == "Wald", "logFC", "F")

    fitType <- match.arg(fitType, c("parametric", "local", "mean", "glmGamPoi"))
    transform <- match.arg(transform, c("identity", "log10", "log10p"))
    p_adjust <- match.arg(
        p_adjust,
        c(
            "none", "fdr", "bonferroni", "holm",
            "hochberg", "hommel", "BH", "BY"
        )
    )
    if (!sfType %in% c("ratio", "poscounts", "iterate")) {
        stop("`sfType` muste be one of poscounts, ratio, or iterate")
    }

    # preprocess phyloseq object
    ps <- preprocess_ps(ps)
    ps <- transform_abundances(ps, transform = transform)

    # prenormalize the data
    norm_para <- c(norm_para, method = norm, object = list(ps))
    ps_normed <- do.call(normalize, norm_para)
    ps_summarized <- pre_ps_taxa_rank(ps_normed, taxa_rank)

    # design: confounders (if any) enter the model before the group term
    if (!length(confounders)) {
        dsg <- formula(paste("~", group))
    } else {
        dsg <- formula(paste(
            "~",
            paste(c(confounders, group), collapse = " + ")
        ))
    }
    dds_summarized <- phyloseq2DESeq2(
        ps_summarized,
        design = dsg
    )
    # reuse pre-computed size factors when the normalization produced them
    nf <- get_norm_factors(ps_normed)
    if (!is.null(nf)) {
        sizeFactors(dds_summarized) <- nf
    }

    # error: all gene-wise dispersion estimates are within 2 orders of
    # magnitude from the minimum value, which indicates that the counts are
    # not overdispersed
    #
    # If dispersion values are less than 1e-6 (minimal value is 1e-8),
    # it would be problematic to fit a dispersion trend in DESeq2.
    # The reason for a minimal value is that for a given row of the count
    # matrix, the maximum likelihood estimate can tend to 0 (and so we have a
    # rule to stop after 1e-8)
    # https://support.bioconductor.org/p/63845/
    # https://support.bioconductor.org/p/122757/
    # from biocore/qiime/blob/master/qiime/support_files/R/DESeq2_nbinom.r
    #
    # LRT is used to analyze all levels of a factor at once, and the
    # p values are determined solely by the difference in deviance between
    # the "full" and "reduced" model formula (not log2 fold changes). Only the
    # Wald method is used for pair-wise comparison. Thus, for pair-wise
    # comparison, we use the Wald test. Moreover, you can set the argument
    # `test` in `results()` when extracting the results from LRT, and it is
    # equivalent to the Wald test.
    #
    # However, even though there are fold changes present in the results of
    # LRT, they are not directly associated with the actual hypothesis test (
    # actually determined by the arguments name or contrast).
    if (test == "Wald") { # two groups comparison
        res_deseq <- try(
            DESeq2::DESeq(
                dds_summarized,
                test = test,
                fitType = fitType,
                sfType = sfType,
                quiet = TRUE,
                betaPrior = betaPrior,
                modelMatrixType = modelMatrixType,
                useT = useT,
                minmu = minmu,
                ...
            ),
            silent = TRUE
        )
        # fall back to simpler dispersion fits when the requested fit fails
        if (inherits(res_deseq, "try-error") && fitType != "local") {
            warning("data is not overdispered, try `fitType = 'local'`")
            res_deseq <- try(
                DESeq2::DESeq(
                    dds_summarized,
                    test = test,
                    fitType = "local",
                    sfType = sfType,
                    quiet = TRUE,
                    betaPrior = betaPrior,
                    modelMatrixType = modelMatrixType,
                    useT = useT,
                    minmu = minmu,
                    ...
                ),
                silent = TRUE
            )
        }
        if (inherits(res_deseq, "try-error") && fitType != "mean") {
            warning("data is not overdispered, try `fitType = 'mean'`")
            res_deseq <- try(
                DESeq2::DESeq(
                    dds_summarized,
                    test = test,
                    fitType = "mean",
                    sfType = sfType,
                    quiet = TRUE,
                    betaPrior = betaPrior,
                    modelMatrixType = modelMatrixType,
                    useT = useT,
                    minmu = minmu,
                    ...
                ),
                silent = TRUE
            )
        }
        if (inherits(res_deseq, "try-error")) {
            # last resort: use the gene-wise estimates as final estimates
            warning(
                "data is not overdispered, use gene-wise estimates ",
                "as final estimates"
            )
            dds_summarized <- DESeq2::estimateDispersionsGeneEst(dds_summarized)
            DESeq2::dispersions(dds_summarized) <-
                mcols(dds_summarized)$dispGeneEst
            dds_summarized <- DESeq2::nbinomWaldTest(
                dds_summarized,
                betaPrior = betaPrior,
                quiet = TRUE,
                modelMatrixType = modelMatrixType,
                useT = useT,
                minmu = minmu
            )
        } else {
            dds_summarized <- res_deseq
        }
        res <- DESeq2::results(
            object = dds_summarized,
            contrast = contrast_new,
            pAdjustMethod = p_adjust
        )
        # rename log2FoldChange to logFC, use base R rather than dplyr::rename
        names(res)[names(res) == "log2FoldChange"] <- "logFC"
    } else {
        dds_summarized <- DESeq2::DESeq(
            dds_summarized,
            test = test,
            fitType = fitType,
            sfType = sfType,
            quiet = TRUE,
            minmu = minmu,
            reduced = ~1,
            ...
        )
        res <- DESeq2::results(
            object = dds_summarized,
            pAdjustMethod = p_adjust
        )
    }

    # Why p value is NA?
    # By default, independent filtering is performed to select a set of genes
    # for multiple test correction which maximizes the number of adjusted p
    # values less than a given critical value alpha (by default 0.1).
    # The adjusted p-values for the genes which do not pass the filter
    # threshold are set to NA.
    # By default, results assigns a p-value of NA to genes containing count
    # outliers, as identified using Cook's distance.

    # normalized counts
    counts_normalized <- DESeq2::counts(dds_summarized, normalized = TRUE)

    # one way anova f statistic for LRT
    if (test == "LRT") {
        temp_count <- data.frame(t(counts_normalized))
        f_stat <- vapply(
            temp_count,
            calc_ef_md_f,
            FUN.VALUE = 0.0,
            group = groups
        )
        res[["F"]] <- f_stat
    }
    res <- data.frame(res)

    # enrich group
    if (test == "Wald") {
        enrich_group <- ifelse(res$logFC > 0, contrast_new[2], contrast_new[3])
    } else {
        cf <- coef(dds_summarized)
        # extract coef of interested var
        target_idx <- grepl(group, colnames(cf))
        cf <- cf[, target_idx]
        # the first coef is intercept, bind the coef of the reference group as
        # 0 (the first column)
        cf <- cbind(0, cf)
        enrich_idx <- apply(
            cf, 1,
            function(x) ifelse(any(is.na(x)), NA, which.max(x))
        )
        enrich_group <- lvl[enrich_idx]
        enrich_group <- enrich_group[match(row.names(res), row.names(cf))]
    }
    res$enrich_group <- enrich_group

    # order according to padj
    res_ordered <- res[order(res$padj), ]
    # filter sig feature
    padj <- res_ordered$padj
    res_ordered <- cbind(feature = row.names(res_ordered), res_ordered)
    # rownames in the form of marker*
    row.names(res_ordered) <- paste0("marker", seq_len(nrow(res_ordered)))
    # reorder columns: feature, enrich_group, other columns
    other_col <- setdiff(names(res_ordered), c("feature", "enrich_group"))
    res_ordered <- res_ordered[, c("feature", "enrich_group", other_col)]
    # NOTE(review): the rownames were already set above and a column reorder
    # does not change them, so this second assignment looks redundant
    row.names(res_ordered) <- paste0("marker", seq_len(nrow(res_ordered)))

    # only keep five variables: feature, enrich_group, effect_size (logFC),
    # pvalue, and padj
    keep_var <- c("feature", "enrich_group", ef_name, "pvalue", "padj")
    res_ordered <- res_ordered[keep_var]
    names(res_ordered)[3] <- paste0("ef_", ef_name)
    sig_res <- res_ordered[!is.na(padj) & padj < pvalue_cutoff, ]
    marker <- return_marker(sig_res, res_ordered)
    marker <- microbiomeMarker(
        marker_table = marker,
        # if no pre-calculated size factors, DESeq2 will calculate the
        # size factors internally, so norm method should be RLE
        norm_method = ifelse(is.null(nf), "RLE", get_norm_method(norm)),
        diff_method = paste0("DESeq2: ", test),
        sam_data = sample_data(ps_normed),
        tax_table = tax_table(ps_summarized),
        otu_table = otu_table(counts_normalized, taxa_are_rows = TRUE)
    )

    marker
}

#' Convert `phyloseq-class` object to `DESeqDataSet-class` object
#'
#' This function convert [`phyloseq::phyloseq-class`] to
#' [`DESeq2::DESeqDataSet-class`], which can then be tested using
#' [`DESeq2::DESeq()`].
#'
#' @param ps the [`phyloseq::phyloseq-class`] object to convert, which must
#' have a [`phyloseq::sample_data()`] component.
#' @param design a `formula` or `matrix`, the formula expresses how the counts
#' for each gene depend on the variables in colData. Many R formula are valid,
#' including designs with multiple variables, e.g., `~ group + condition`.
#' This argument is passed to [`DESeq2::DESeqDataSetFromMatrix()`].
#' @param ... additional arguments passed to
#' [`DESeq2::DESeqDataSetFromMatrix()`], Most users will not need to pass any
#' additional arguments here.
#' @export
#' @return a [`DESeq2::DESeqDataSet-class`] object.
#' @seealso [`DESeq2::DESeqDataSetFromMatrix()`],[`DESeq2::DESeq()`]
#' @examples
#' data(caporaso)
#' phyloseq2DESeq2(caporaso, ~SampleType)
phyloseq2DESeq2 <- function(ps, design, ...) {
    stopifnot(inherits(ps, "phyloseq"))
    # DESeq2 expects features in rows
    ps <- keep_taxa_in_rows(ps)

    # sample data
    samp <- sample_data(ps, errorIfNULL = FALSE)
    if (is.null(samp)) {
        stop(
            "`sample_data` of `ps` is required,",
            " for specifying experimental design.",
            call. = FALSE
        )
    }

    # count data
    ct <- as(otu_table(ps), "matrix")

    # deseq2 requires raw counts, means the counts must be integer
    dds <- tryCatch(
        DESeq2::DESeqDataSetFromMatrix(
            countData = ct,
            colData = data.frame(samp),
            design = design,
            ...
        ),
        error = function(e) e
    )
    # on the specific non-integer error, round the counts and retry
    if (inherits(dds, "error") &&
        conditionMessage(dds) == "some values in assay are not integers") {
        warning(
            "Some counts are non-integers, they are rounded to integers.\n",
            "Raw count is recommended for reliable results for deseq2 method.",
            call. = FALSE
        )
        dds <- DESeq2::DESeqDataSetFromMatrix(
            countData = round(ct),
            colData = data.frame(samp),
            design = design,
            ...
        )
    }

    dds
}

# Modified from `DESeq2::estimateFactorsForMatrix()` directly
# for `estimateSizeFactors`:
# `sizeFactors(estimateSizeFactors(dds, type = "poscounts"))` is identical to
# `sizeFactors(estimateSizeFactors(dds, geoMeans = geoMeans))`
#
# The original function of `DESeq2::estimateFactorsForMatrix()` does not
# stabilize size factors to have geometric mean of 1 while
# `type = "poscounts"`. This modified function is to make
# `estimateSizeFactorsForMatrix(counts(diagdds2), geoMeans = geoMeans)` equal
# to `estimateSizeFactorsForMatrix(counts(diagdds2), type = "poscounts")` by
# stabilizing size factors if `type = "poscounts"`.
estimateSizeFactorsForMatrix <- function(counts,
    locfunc = stats::median,
    geoMeans,
    controlGenes,
    type = c("ratio", "poscounts")) {
    type <- match.arg(type, c("ratio", "poscounts"))
    if (missing(geoMeans)) {
        incomingGeoMeans <- FALSE
        if (type == "ratio") {
            loggeomeans <- rowMeans(log(counts))
        } else if (type == "poscounts") {
            # "poscounts": zeros do not contribute to the geometric means;
            # all-zero genes get -Inf so they are excluded downstream
            lc <- log(counts)
            lc[!is.finite(lc)] <- 0
            loggeomeans <- rowMeans(lc)
            allZero <- rowSums(counts) == 0
            loggeomeans[allZero] <- -Inf
        }
    } else {
        incomingGeoMeans <- TRUE
        if (length(geoMeans) != nrow(counts)) {
            stop("geoMeans should be as long as the number of rows of counts")
        }
        loggeomeans <- log(geoMeans)
    }
    if (all(is.infinite(loggeomeans))) {
        stop(
            "every gene contains at least one zero ",
            "cannot compute log geometric means",
            call. = FALSE
        )
    }
    # per-sample size factor: location (median by default) of the log ratios
    # to the pseudo-reference, over usable genes only
    sf <- if (missing(controlGenes)) {
        apply(counts, 2, function(cnts) {
            exp(locfunc((log(cnts) - loggeomeans)[
                is.finite(loggeomeans) & cnts > 0
            ]))
        })
    } else {
        if (!(is.numeric(controlGenes) | is.logical(controlGenes))) {
            stop("controlGenes should be either a numeric or logical vector")
        }
        loggeomeansSub <- loggeomeans[controlGenes]
        apply(
            counts[controlGenes, , drop = FALSE], 2,
            function(cnts) {
                idx <- is.finite(loggeomeansSub) & cnts > 0
                exp(locfunc((log(cnts) - loggeomeansSub)[idx]))
            }
        )
    }
    if (incomingGeoMeans | type == "poscounts") {
        # stabilize size factors to have geometric mean of 1
        sf <- sf / exp(mean(log(sf)))
    }
    sf
}

================================================
FILE: R/DA-edgeR.R
================================================

#' Perform differential analysis using edgeR
#'
#' Differential expression analysis based on the Negative Binomial
#' distribution using **edgeR**.
#'
#' @param ps a [`phyloseq::phyloseq-class`] object.
#' @param group character, the variable to set the group, must be one of
#' the var of the sample metadata.
#' @param confounders character vector, the confounding variables to be
#' adjusted. default `character(0)`, indicating no confounding variable.
#' @param contrast this parameter only used for two groups comparison while
#' there are multiple groups. For more please see the following details.
#' @param taxa_rank character to specify taxonomic rank to perform
#' differential analysis on. Should be one of
#' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by
#' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or
#' "none" means perform differential analysis on the original taxa
#' (`taxa_names(phyloseq)`, e.g., OTU or ASV).
#' @param method character, used for differential analysis, please see details
#' below for more info.
#' @param transform character, the methods used to transform the microbial
#' abundance. See [`transform_abundances()`] for more details.
The
#' options include:
#' * "identity", return the original data without any transformation
#'   (default).
#' * "log10", the transformation is `log10(object)`, and if the data contains
#'   zeros the transformation is `log10(1 + object)`.
#' * "log10p", the transformation is `log10(1 + object)`.
#' @param norm the methods used to normalize the microbial abundance data. See
#' [`normalize()`] for more details.
#' Options include:
#' * "none": do not normalize.
#' * "rarefy": random subsampling counts to the smallest library size in the
#'   data set.
#' * "TSS": total sum scaling, also referred to as "relative abundance", the
#'   abundances were normalized by dividing the corresponding sample library
#'   size.
#' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference.
#'   The scaling factor is then derived using a weighted trimmed mean over
#'   the differences of the log-transformed gene-count fold-change between
#'   the sample and the reference.
#' * "RLE", relative log expression, RLE uses a pseudo-reference calculated
#'   using the geometric mean of the gene-specific abundances over all
#'   samples. The scaling factors are then calculated as the median of the
#'   gene counts ratios between the samples and the reference.
#' * "CSS": cumulative sum scaling, calculates scaling factors as the
#'   cumulative sum of gene abundances up to a data-derived threshold.
#' * "CLR": centered log-ratio normalization.
#' * "CPM": pre-sample normalization of the sum of the values to 1e+06.
#' @param norm_para arguments passed to specific normalization methods. Most
#' users will not need to pass any additional arguments here.
#' @param disp_para additional arguments passed to [`edgeR::estimateDisp()`]
#' used for dispersions estimation. Most users will not need to pass any
#' additional arguments here.
#' @param p_adjust method for multiple test correction, default `none`,
#' for more details see [stats::p.adjust].
#' @param pvalue_cutoff numeric, p value cutoff, default 0.05
#' @param ... extra arguments passed to the model. See [`edgeR::glmQLFit()`]
#' and [`edgeR::glmFit()`] for more details.
#' @return a [`microbiomeMarker-class`] object.
#'
#' @details
#' **Note** that edgeR is designed to work with actual counts. This means that
#' transformation is not required in any way before inputting them to edgeR.
#'
#' There are two test methods for differential analysis in **edgeR**,
#' likelihood ratio test (LRT) and quasi-likelihood F-test (QLFT). The QLFT
#' method is recommended as it allows stricter error rate control by
#' accounting for the uncertainty in dispersion estimation.
#'
#' `contrast` must be a character vector of length two, or `NULL` (default).
#' It only needs to be set manually for a two-group comparison when there are
#' multiple groups. The order determines the direction of the comparison: the
#' first element specifies the reference group (control) and is thus the
#' denominator of the fold change, while the second element is the group of
#' interest (the numerator of the fold change). Otherwise, users do not need
#' to set this parameter (leave it as the default `NULL`): if there are
#' two groups, the first level of the groups is used as the reference group;
#' if there are multiple groups, an ANOVA-like test is performed to find
#' markers that differ in any of the groups.
#'
#' @export
#' @seealso [`edgeR::glmFit()`],[`edgeR::glmQLFit()`],[`edgeR::estimateDisp()`]
#' ,[`normalize()`]
#' @author Yang Cao
#' @references
#' Robinson, Mark D., and Alicia Oshlack. "A scaling normalization method for
#' differential expression analysis of RNA-seq data." Genome biology 11.3
#' (2010): 1-9.
#'
#' Robinson, Mark D., Davis J. McCarthy, and Gordon K. Smyth. "edgeR: a
#' Bioconductor package for differential expression analysis of digital
#' gene expression data." Bioinformatics 26.1 (2010): 139-140.
#' @examples
#' data(enterotypes_arumugam)
#' ps <- phyloseq::subset_samples(
#'     enterotypes_arumugam,
#'     Enterotype %in% c("Enterotype 3", "Enterotype 2")
#' )
#' run_edger(ps, group = "Enterotype")
run_edger <- function(ps,
    group,
    confounders = character(0),
    contrast = NULL,
    taxa_rank = "all",
    method = c("LRT", "QLFT"),
    transform = c("identity", "log10", "log10p"),
    norm = "TMM",
    norm_para = list(),
    disp_para = list(),
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    ),
    pvalue_cutoff = 0.05,
    ...) {
    # sanity checks on the taxonomic rank names and the requested rank
    ps <- check_rank_names(ps) %>%
        check_taxa_rank(taxa_rank)

    transform <- match.arg(transform, c("identity", "log10", "log10p"))
    method <- match.arg(method, c("LRT", "QLFT"))
    p_adjust <- match.arg(
        p_adjust,
        c(
            "none", "fdr", "bonferroni", "holm",
            "hochberg", "hommel", "BH", "BY"
        )
    )
    if (length(confounders)) {
        confounders <- check_confounder(ps, group, confounders)
    }

    # groups: coerce to a factor with syntactically valid level names,
    # reference level first (honoring `contrast` if given)
    meta <- data.frame(sample_data(ps))
    groups <- meta[[group]]
    groups <- make.names(groups)
    if (!is.null(contrast)) {
        contrast <- make.names(contrast)
    }
    if (!is.factor(groups)) {
        groups <- factor(groups)
    }
    groups <- set_lvl(groups, contrast)
    lvl <- levels(groups)
    n_lvl <- length(lvl)

    # preprocess phyloseq object
    ps <- preprocess_ps(ps)
    ps <- transform_abundances(ps, transform = transform)
    norm_para <- c(norm_para, method = norm, object = list(ps))
    ps_normed <- do.call(normalize, norm_para)

    # summarize data and add norm.factors var to samples of DGEList
    ps_summarized <- pre_ps_taxa_rank(ps_normed, taxa_rank)
    dge_summarized <- phyloseq2edgeR(ps_summarized)

    nf <- get_norm_factors(ps_normed)
    if (!is.null(nf)) {
        dge_summarized$samples$norm.factors <- nf
    } else {
        # for TSS, CRL and rarefy (no norm factors is saved),
        # normalized the feature table using TMM method in edgeR
        # using the default arguments.
        dge_summarized <- edgeR::calcNormFactors(dge_summarized,
            method = "TMM")
    }

    # estimate dispersion
    # if (!length(confounders)) {
    #     model_data <- data.frame(group = groups)
    #     design <- stats::model.matrix(~ 0 + group, data = model_data)
    # } else {
    #     model_data <- data.frame(group = groups)
    #     model_data[confounders] <- meta[confounders]
    #     design <- stats::model.matrix(
    #         formula(paste(
    #             "~ + ",
    #             paste(c(confounders, "group"), collapse = " + "))),
    #         data = model_data
    #     )
    # }
    design <- create_design(groups, meta, confounders)
    disp_para <- c(disp_para, y = list(dge_summarized), design = list(design))
    dge_summarized <- do.call(edgeR::estimateDisp, disp_para)

    # differential expression
    # quasi-likelihood (QL) F-test is used as it reflects the uncertainty in
    # estimating the dispersion for each feature, and gives stricter error
    # rate control
    fit_fun <- ifelse(method == "LRT", edgeR::glmFit, edgeR::glmQLFit)
    test_fun <- ifelse(method == "LRT", edgeR::glmLRT, edgeR::glmQLFTest)
    # New statistical methods implemented in glmQLFit() to ensure accurate
    # estimation of the quasi-dispersion for data with small counts. However,
    # R may crash with the new method, so we fall back on
    # `legacy = TRUE`.
    if (method == "LRT") {
        fit <- fit_fun(dge_summarized, design, ...)
    } else {
        fit <- fit_fun(dge_summarized, design, legacy = TRUE, ...)
    }
    para_cf <- calc_coef(groups, design, contrast)
    lrt <- test_fun(fit, coef = para_cf)
    # lrt <- test_fun(fit, contrast = contrast_new)
    res <- edgeR::topTags(
        lrt,
        n = ntaxa(ps_summarized),
        adjust.method = p_adjust,
        sort.by = "PValue"
    )
    res <- res$table

    # harmonize the adjusted-p column name to `padj`
    if ("FDR" %in% names(res)) {
        res <- dplyr::rename(res, pvalue = "PValue", padj = "FDR")
    } else if ("FWER" %in% names(res)) {
        res <- dplyr::rename(res, pvalue = "PValue", padj = "FWER")
    } else {
        res <- dplyr::rename(res, pvalue = "PValue")
        res$padj <- res$pvalue
    }

    # normalized counts
    ef_nf <- dge_summarized$samples$lib.size *
        dge_summarized$samples$norm.factors
    ref_nf <- mean(ef_nf)
    counts_normalized <- sweep(
        as(otu_table(ps_summarized), "matrix"),
        MARGIN = 2,
        ef_nf,
        "/"
    ) * ref_nf
    row.names(counts_normalized) <- row.names(tax_table(ps_summarized))

    # enrich group: largest model coefficient (multi-group ANOVA-like case),
    # otherwise the sign of logFC
    if (n_lvl > 2) {
        if (is.null(contrast)) {
            cf <- fit$coefficients
            target_idx <- grepl("group", colnames(cf))
            cf <- cf[, target_idx]
            # the first coef is intercept, bind the reference group as 0
            # (the first column)
            cf <- cbind(0, cf)
            enrich_group <- lvl[apply(cf, 1, which.max)]
            # sort the enrich_group according to the DE of topTags
            de_idx <- match(row.names(res), row.names(cf))
            enrich_group <- enrich_group[de_idx]
        } else {
            enrich_group <- ifelse(res$logFC > 0, contrast[2], contrast[1])
        }
    } else {
        enrich_group <- ifelse(res$logFC > 0, lvl[2], lvl[1])
    }
    res$enrich_group <- enrich_group

    # edgeR::decideTestsDGE(), identify which genes are significantly
    # differentially expressed from an edgeR fit object containing p-values
    # and test statistics.
    # first two columns: feature enrich_group (write a function)
    res <- cbind(feature = row.names(res), res)
    other_col <- setdiff(names(res), c("feature", "enrich_group"))
    res <- res[, c("feature", "enrich_group", other_col)]
    row.names(res) <- paste0("marker", seq_len(nrow(res)))

    # var of effect size: named as ef_ of the actual effect size,
    # two groups: logFC, multiple groups: F for QLFT method, LR for LRT method
    if (n_lvl == 2) { # two groups
        ef_name <- "logFC"
    } else {
        if (!is.null(contrast)) { # two groups comparison from multiple groups
            ef_name <- "logFC"
        } else {
            ef_name <- ifelse(method == "LRT", "LR", "F")
        }
    }

    # only keep five variables: feature, enrich_group, effect_size (LR for LRT
    # F for QLFT), pvalue, and padj, write a function? select_marker_var
    # (effect_size = "")
    keep_var <- c("feature", "enrich_group", ef_name, "pvalue", "padj")
    res <- res[keep_var]
    names(res)[3] <- paste0("ef_", ef_name)

    sig_res <- res[res$padj < pvalue_cutoff & !is.na(res$padj), ]
    marker <- return_marker(sig_res, res)
    marker <- microbiomeMarker(
        marker_table = marker,
        norm_method = ifelse(is.null(nf), "TMM", get_norm_method(norm)),
        diff_method = paste("edgeR:", method),
        sam_data = sample_data(ps_normed),
        otu_table = otu_table(counts_normalized, taxa_are_rows = TRUE),
        tax_table = tax_table(ps_summarized)
    )

    marker
}

#' Convert phyloseq data to edgeR `DGEList` object
#'
#' This function convert [`phyloseq::phyloseq-class`] object to
#' [`edgeR::DGEList-class`] object, can then can be used to perform
#' differential analysis using the methods in **edgeR**.
#'
#' @param ps a [`phyloseq::phyloseq-class`] object.
#' @param ... optional, additional named arguments passed to
#' [`edgeR::DGEList()`]. Most users will not need to pass any additional
#' arguments here.
#' @return A [`edgeR::DGEList-class`] object.
#' @export
#' @examples
#' data(caporaso)
#' dge <- phyloseq2edgeR(caporaso)
phyloseq2edgeR <- function(ps, ...) {
    # edgeR expects features in rows
    ps <- keep_taxa_in_rows(ps)
    abd <- as(otu_table(ps), "matrix")
    # NOTE(review): the warning says the counts "are rounded to integers",
    # but `abd` is passed to DGEList unrounded below — confirm whether
    # `abd <- round(abd)` is intended here
    if (any(round(abd) != abd)) {
        warning(
            "Some counts are non-integers, they are rounded to integers.\n",
            "Raw count is recommended for reliable results for edger method.",
            call. = FALSE
        )
    }

    # tax_table: annotation information
    taxa <- tax_table(ps, FALSE)
    if (!is.null(taxa)) {
        taxa <- data.frame(as(taxa, "matrix"))
    }

    # sample_data: information on each sample
    samp <- sample_data(ps, FALSE)

    dge <- edgeR::DGEList(
        counts = abd,
        samples = samp,
        genes = taxa,
        ...
    )

    dge
}

================================================
FILE: R/DA-lefse.R
================================================

#' Liner discriminant analysis (LDA) effect size (LEFSe) analysis
#'
#' Perform Metagenomic LEFSe analysis based on phyloseq object.
#'
#' @param ps a \code{\link[phyloseq]{phyloseq-class}} object
#' @param group character, the column name to set the group
#' @param subgroup character, the column name to set the subgroup
#' @param taxa_rank character to specify taxonomic rank to perform
#' differential analysis on. Should be one of
#' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by
#' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or
#' "none" means perform differential analysis on the original taxa
#' (`taxa_names(phyloseq)`, e.g., OTU or ASV).
#' @param transform character, the methods used to transform the microbial
#' abundance. See [`transform_abundances()`] for more details. The
#' options include:
#' * "identity", return the original data without any transformation
#' (default).
#' * "log10", the transformation is `log10(object)`, and if the data contains
#' zeros the transformation is `log10(1 + object)`.
#' * "log10p", the transformation is `log10(1 + object)`.
#' @param norm the methods used to normalize the microbial abundance data. See
#' [`normalize()`] for more details.
#' Options include:
#' * "none": do not normalize.
#' * "rarefy": random subsampling counts to the smallest library size in the
#'   data set.
#' * "TSS": total sum scaling, also referred to as "relative abundance", the
#'   abundances were normalized by dividing the corresponding sample library
#'   size.
#' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference.
#'   The scaling factor is then derived using a weighted trimmed mean over the
#'   differences of the log-transformed gene-count fold-change between the
#'   sample and the reference.
#' * "RLE", relative log expression, RLE uses a pseudo-reference calculated
#'   using the geometric mean of the gene-specific abundances over all
#'   samples. The scaling factors are then calculated as the median of the
#'   gene counts ratios between the samples and the reference.
#' * "CSS": cumulative sum scaling, calculates scaling factors as the
#'   cumulative sum of gene abundances up to a data-derived threshold.
#' * "CLR": centered log-ratio normalization.
#' * "CPM": pre-sample normalization of the sum of the values to 1e+06.
#' @param norm_para named `list`. other arguments passed to specific
#' normalization methods. Most users will not need to pass any additional
#' arguments here.
#' @param kw_cutoff numeric, p value cutoff of kw test, default 0.05
#' @param wilcoxon_cutoff numeric, p value cutoff of wilcoxon test,
#' default 0.05
#' @param lda_cutoff numeric, lda score cutoff, default 2
#' @param bootstrap_n integer, the number of bootstrap iteration for LDA,
#' default 30
#' @param bootstrap_fraction numeric, the subsampling fraction value for each
#' bootstrap iteration, default `2/3`
#' @param multigrp_strat logical, for multiple group tasks, whether the test
#' is performed in a one-against-one (more strict) or in a one-against-all
#' setting, default `FALSE`.
#' @param strict multiple testing options: 0 for no correction (default), 1
#' for correction for independent comparisons, 2 for correction for dependent
#' comparisons.
#' @param sample_min integer, minimum number of samples per subclass for
#' performing wilcoxon test, default 10
#' @param only_same_subgrp logical, whether perform the wilcoxon test only
#' among the subgroups with the same name, default `FALSE`
#' @param curv logical, whether perform the wilcoxon test using the
#' Curtis's approach, default `FALSE`
#' @importFrom dplyr mutate filter arrange rowwise select
#' @importFrom purrr map_dbl pmap_dbl pmap_chr
#' @importFrom stats p.adjust
#' @importFrom phyloseq rank_names tax_glom
#' @export
#' @return a [microbiomeMarker-class] object, in which the `slot` of
#' `marker_table`
#' contains four variables:
#' * `feature`, significantly different features.
#' * `enrich_group`, the class of the differential features enriched.
#' * `lda`, logarithmic LDA score (effect size)
#' * `pvalue`, p value of kw test.
#' @author Yang Cao
#' @seealso [normalize]
#' @references Segata, Nicola, et al. Metagenomic biomarker discovery and
#' explanation. Genome biology 12.6 (2011): R60.
#' @examples
#' data(kostic_crc)
#' kostic_crc_small <- phyloseq::subset_taxa(
#'     kostic_crc,
#'     Phylum == "Firmicutes"
#' )
#' mm_lefse <- run_lefse(
#'     kostic_crc_small,
#'     wilcoxon_cutoff = 0.01,
#'     group = "DIAGNOSIS",
#'     kw_cutoff = 0.01,
#'     multigrp_strat = TRUE,
#'     lda_cutoff = 4
#' )
run_lefse <- function(ps,
    group,
    subgroup = NULL,
    taxa_rank = "all",
    transform = c("identity", "log10", "log10p"),
    norm = "CPM",
    norm_para = list(),
    kw_cutoff = 0.05,
    lda_cutoff = 2,
    bootstrap_n = 30,
    bootstrap_fraction = 2 / 3,
    wilcoxon_cutoff = 0.05,
    multigrp_strat = FALSE,
    strict = c("0", "1", "2"),
    sample_min = 10,
    only_same_subgrp = FALSE,
    curv = FALSE) {
    if (!inherits(ps, "phyloseq")) {
        stop("`ps` must be phyloseq object", call. = FALSE)
    }

    # check rank names and para taxa_rank
    ps <- check_rank_names(ps)
    ps <- check_taxa_rank(ps, taxa_rank)

    transform <- match.arg(transform, c("identity", "log10", "log10p"))
    strict <- match.arg(strict, c("0", "1", "2"))
    strict <- as.numeric(strict)

    # import input from the original lefse python script or galaxy,
    # will be dropped in the next release version
    summarized <- check_tax_summarize(ps)
    if (summarized && norm != "CPM") {
        stop(
            "`norm` must be a 'CPM' or 'none' while `ps` has been summarized",
            call. = FALSE
        )
    }

    # pre-processing, including: keep taxa in rows, filter taxa whose
    # abundance is zero, fix duplicated tax, transformation and normalization
    ps <- preprocess_ps(ps)
    # transformation
    ps <- transform_abundances(ps, transform = transform)
    # normalization
    norm_para <- c(norm_para, method = norm, object = list(ps))
    ps_normed <- do.call(normalize, norm_para)

    # class / subclass hierarchy used by the repeated wilcoxon test
    sample_meta <- sample_data(ps_normed)
    grp_info <- lefse_format_grp(sample_meta, group, subgroup = subgroup)
    grp <- grp_info$group
    subgrps <- grp_info$subgroup
    grp_hie <- grp_info$group_hie

    ps_summarized <- pre_ps_taxa_rank(ps_normed, taxa_rank)
    otus <- abundances(ps_summarized, norm = TRUE)

    # transform it for test
    otus_test <- as.data.frame(t(otus), stringsAsFactors = FALSE)
    feature <- tax_table(ps_summarized)@.Data[, 1]
    names(otus_test) <- feature

    # tax table
    tax <- matrix(feature) %>% tax_table()
    row.names(tax) <- row.names(otus)

    # kw rank sum test among classes
    kw_p <- purrr::map_dbl(otus_test, ~ kruskal.test(.x, grp)$p.value)

    # remove the taxa, while pvalue is na
    na_ind <- is.na(kw_p)
    if (sum(na_ind) >= 1) {
        otus_test <- otus_test[!na_ind]
        kw_p <- kw_p[!na_ind]
    }
    sig_ind <- kw_p <= kw_cutoff
    sig_otus <- otus_test[, sig_ind]

    # wilcox test is performed for each class, if there is no subclass
    features_nms <- names(sig_otus)
    wilcoxon_p <- purrr::map2_lgl(
        sig_otus, features_nms,
        ~ test_rep_wilcoxon(
            subgrps, grp_hie,
            .x, .y,
            wilcoxon_cutoff = wilcoxon_cutoff,
            multicls_strat = multigrp_strat,
            strict = strict,
            sample_min = sample_min,
            only_same_subcls = only_same_subgrp,
            curv = curv
        )
    )
    sig_otus <- sig_otus[, wilcoxon_p, drop = FALSE]

    # no feature survived the wilcoxon filtering: return an empty result
    if (ncol(sig_otus) == 0) {
        warning("No marker was identified", call. = FALSE)
        mm <- microbiomeMarker(
            marker_table = NULL,
            norm_method = get_norm_method(norm),
            diff_method = "lefse",
            otu_table = otu_table(otus, taxa_are_rows = TRUE), # normalized
            # new var norm_factor (if it is calculated in normalize)
            sam_data = sample_data(ps_normed),
            tax_table = tax
        )
        return(mm)
    }

    # mean abundance in each group
    otus_enriched_group <- get_feature_enrich_group(grp, sig_otus)

    # bootstrap iteration of lda
    ldas <- bootstap_lda(
        sig_otus,
        boot_n = bootstrap_n,
        class = grp,
        sample_fract = bootstrap_fraction
    )

    lefse_res <- data.frame(
        feature = names(sig_otus),
        enrich_group = otus_enriched_group$group,
        ef_lda = ldas,
        pvalue = kw_p[sig_ind][wilcoxon_p],
        stringsAsFactors = FALSE
    )

    lefse_sig <- filter(lefse_res, .data$ef_lda >= lda_cutoff) %>%
        arrange(.data$enrich_group, desc(.data$ef_lda))
    lefse_out <- return_marker(lefse_sig, lefse_res)
    lefse_out$padj <- lefse_out$pvalue

    mm <- microbiomeMarker(
        marker_table = lefse_out,
        norm_method = get_norm_method(norm),
        diff_method = "lefse",
        otu_table = otu_table(otus, taxa_are_rows = TRUE), # normalized
        # new var norm_factor (if it is calculated in normalize)
        sam_data = sample_data(ps_normed),
        tax_table = tax
    )

    mm
}

================================================
FILE: R/DA-limma-voom.R
================================================

#' Differential analysis using limma-voom
#'
#' @param ps a [`phyloseq::phyloseq-class`] object.
#' @param group character, the variable to set the group, must be one of
#' the var of the sample metadata.
#' @param confounders character vector, the confounding variables to be
#' adjusted. default `character(0)`, indicating no confounding variable.
#' @param contrast this parameter only used for two groups comparison while
#' there are multiple groups.
For more please see the following details. #' @param taxa_rank character to specify taxonomic rank to perform #' differential analysis on. Should be one of #' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by #' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or #' "none" means perform differential analysis on the original taxa #' (`taxa_names(phyloseq)`, e.g., OTU or ASV). #' @param transform character, the methods used to transform the microbial #' abundance. See [`transform_abundances()`] for more details. The #' options include: #' * "identity", return the original data without any transformation #' (default). #' * "log10", the transformation is `log10(object)`, and if the data contains #' zeros the transformation is `log10(1 + object)`. #' * "log10p", the transformation is `log10(1 + object)`. #' @param norm the methods used to normalize the microbial abundance data. See #' [`normalize()`] for more details. #' Options include: #' * "none": do not normalize. #' * "rarefy": random subsampling counts to the smallest library size in the #' data set. #' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference. #' The scaling factor is then derived using a weighted trimmed mean over the #' differences of the log-transformed gene-count fold-change between the #' sample and the reference. #' * "RLE", relative log expression, RLE uses a pseudo-reference calculated #' using the geometric mean of the gene-specific abundances over all #' samples. The scaling factors are then calculated as the median of the #' gene counts ratios between the samples and the reference. #' * "CSS": cumulative sum scaling, calculates scaling factors as the #' cumulative sum of gene abundances up to a data-derived threshold. #' @param norm_para arguments passed to specific normalization methods. Most #' users will not need to pass any additional arguments here. 
#' @param voom_span width of the smoothing window used for the lowess
#' mean-variance trend for [`limma::voom()`]. Expressed as a proportion
#' between 0 and 1.
#' @param p_adjust method for multiple test correction, default `none`,
#' for more details see [stats::p.adjust].
#' @param pvalue_cutoff cutoff of p value, default 0.05.
#' @param ... extra arguments passed to [`limma::eBayes()`].
#'
#' @export
#' @return a [`microbiomeMarker-class`] object.
#' @references Law, C. W., Chen, Y., Shi, W., & Smyth, G. K. (2014).
#' voom: Precision weights unlock linear model analysis tools for RNA-seq read
#' counts. Genome biology, 15(2), 1-17.
#'
#' @details
#' `contrast` must be a two length character or `NULL` (default). It is only
#' required to set manually for two groups comparison when there are multiple
#' groups. The order determines the direction of comparison, the first element
#' is used to specify the reference group (control). This means that, the first
#' element is the denominator for the fold change, and the second element is
#' used as baseline (numerator for fold change). Otherwise, users do not need
#' to set this parameter (leave it as the default `NULL`), and if there are
#' two groups, the first level of groups will be set as the reference group; if
#' there are multiple groups, it will perform an ANOVA-like testing to find
#' markers which differ in any of the groups.
#'
#' @examples
#' data(enterotypes_arumugam)
#' mm <- run_limma_voom(
#'     enterotypes_arumugam,
#'     "Enterotype",
#'     contrast = c("Enterotype 3", "Enterotype 2"),
#'     pvalue_cutoff = 0.01,
#'     p_adjust = "none"
#' )
#' mm
run_limma_voom <- function(ps,
    group,
    confounders = character(0),
    contrast = NULL,
    taxa_rank = "all",
    transform = c("identity", "log10", "log10p"),
    norm = "none",
    norm_para = list(),
    voom_span = 0.5,
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    ),
    pvalue_cutoff = 0.05,
    ...) {
    # validate input and arguments
    stopifnot(inherits(ps, "phyloseq"))
    ps <- check_rank_names(ps) %>%
        check_taxa_rank(taxa_rank)
    p_adjust <- match.arg(
        p_adjust,
        c(
            "none", "fdr", "bonferroni", "holm",
            "hochberg", "hommel", "BH", "BY"
        )
    )
    if (length(confounders)) {
        confounders <- check_confounder(ps, group, confounders)
    }

    meta <- sample_data(ps)
    # meta_nms <- names(meta)
    # if (!group %in% meta_nms) {
    #     stop(
    #         group, " are not contained in the `sample_data` of `ps`",
    #         call. = FALSE
    #     )
    # }

    # group labels are sanitized with make.names() so they can be used as
    # design-matrix column names; contrast is sanitized the same way so the
    # two stay comparable
    groups <- meta[[group]]
    groups <- make.names(groups)
    if (!is.null(contrast)) {
        contrast <- make.names(contrast)
    }
    if (!is.factor(groups)) {
        groups <- factor(groups)
    }
    # reorder factor levels so the reference (first element of contrast, if
    # given) comes first
    groups <- set_lvl(groups, contrast)
    lvl <- levels(groups)
    n_lvl <- length(lvl)

    transform <- match.arg(transform, c("identity", "log10", "log10p"))

    # preprocess phyloseq object: keep taxa in rows, drop all-zero taxa, then
    # transform and normalize
    ps <- preprocess_ps(ps)
    ps <- transform_abundances(ps, transform = transform)

    # normalize the data
    norm_para <- c(norm_para, method = norm, object = list(ps))
    ps_normed <- do.call(normalize, norm_para)
    # summarize to the requested taxonomic rank; voom is fed the
    # un-normalized counts (norm = FALSE), normalization enters via lib.size
    ps_summarized <- pre_ps_taxa_rank(ps_normed, taxa_rank)
    counts <- abundances(ps_summarized, norm = FALSE)
    # row.names(counts) <- tax_table(ps_summarized)[, 1]

    # design matrix (group + optional confounders)
    design <- create_design(groups, meta, confounders)

    # library size
    # NOTE(review): sample sums are taken from `ps` (preprocessed/transformed,
    # before rank summarization) and scaled by the stored norm factors when
    # available — confirm this matches the intended effective library size
    nf <- get_norm_factors(ps_normed)
    lib_size <- phyloseq::sample_sums(ps)
    if (!is.null(nf)) {
        lib_size <- nf * lib_size
    }

    voom_out <- limma::voom(
        counts,
        design = design,
        lib.size = lib_size,
        span = voom_span
    )
    fit_out <- limma::lmFit(voom_out, design = design)
    # coefficient(s) of interest: a single contrast for two-group (or
    # contrast-specified) comparison, all group coefficients otherwise
    para_cf <- calc_coef(groups, design, contrast)
    # fit_out <- limma::contrasts.fit(fit_out, coefficients = para_cf)
    # if (length(contrast_new) == n_lvl) {
    #     # warning: row names of contrasts don't match col names of coefficients
    #     fit_out <- limma::contrasts.fit(fit_out, contrast_new)
    # }
    test_out <- limma::eBayes(fit_out, ...)
    test_df <- limma::topTable(
        test_out,
        coef = para_cf,
        number = nrow(counts),
        adjust.method = p_adjust
    )
    counts_normed <- abundances(ps_summarized, norm = TRUE)

    if (n_lvl == 2 || !is.null(contrast)) {
        # two-group (or explicit contrast): sign of logFC gives the enriched
        # group; lvl[1] is the reference set by set_lvl()
        enrich_group <- ifelse(test_df$logFC > 0, lvl[2], lvl[1])
        ef <- test_df[["logFC"]]
        ef_name <- "ef_logFC"
    } else {
        # multi-group ANOVA-like test: the enriched group is the one with the
        # largest fitted group coefficient (reference level padded in as 0)
        cf <- fit_out$coefficients
        target_idx <- grepl("group", colnames(cf))
        cf <- cf[, target_idx]
        cf <- cbind(0, cf)
        enrich_group <- lvl[apply(cf, 1, which.max)]
        # reorder to match the row order of topTable output
        enrich_group <- enrich_group[match(row.names(test_df), row.names(cf))]
        ef <- test_df[["F"]]
        ef_name <- "ef_F_statistic"
    }
    # if (length(contrast_new) == n_lvl) {
    #     exp_lvl <- lvl[contrast_new == 1]
    #     ref_lvl <- lvl[contrast_new == -1]
    #     enrich_group <- ifelse(test_df$logFC > 0, exp_lvl, ref_lvl)
    # } else {
    #     cf <- fit_out$coefficients
    #     enrich_idx <- apply(cf, 1, which.max)
    #     enrich_group <- lvl[enrich_idx]
    #     enrich_group <- enrich_group[match(row.names(test_df), row.names(cf))]
    # }
    # if (length(contrast_new) == n_lvl) {
    #     ef <- test_df[["logFC"]]
    #     ef_name <- "ef_logFC"
    # } else {
    #     ef <- test_df[["F"]]
    #     ef_name <- "ef_F_statistic"
    # }

    # assemble the marker table; keep the full table and mark significant
    # features via return_marker()
    marker <- data.frame(
        feature = row.names(test_df),
        enrich_group = enrich_group,
        ef = ef,
        pvalue = test_df$P.Value,
        padj = test_df$adj.P.Val
    )
    names(marker)[3] <- ef_name
    sig_marker <- dplyr::filter(marker, .data$padj <= pvalue_cutoff)
    marker <- return_marker(sig_marker, marker)

    mm <- microbiomeMarker(
        marker_table = marker,
        norm_method = get_norm_method(norm),
        diff_method = "limma_voom",
        sam_data = sample_data(ps_normed),
        otu_table = otu_table(counts_normed, taxa_are_rows = TRUE),
        tax_table = tax_table(ps_summarized)
    )

    mm
}

================================================
FILE: R/DA-metagenomeSeq.R
================================================

# We recommend fitFeatureModel over fitZig. MRcoefs, MRtable and MRfulltable
# are useful summary tables of the model outputs. We currently recommend using
# the zero-inflated log-normal model as implemented in fitFeatureModel.
# # from biocore/qiime/blob/master/qiime/support_files/R/fitZIG.r # /xia-lab/MicrobiomeAnalystR/blob/master/R/general_anal.R#L505 # https://support.bioconductor.org/p/78230/ # Difference between fitFeatureModel and fitZIG in metagenomeSeq, # https://support.bioconductor.org/p/94138/. # # fitFeatureModel doesn't seem to allow for multiple comparisons. #' metagenomeSeq differential analysis #' #' Differential expression analysis based on the Zero-inflated Log-Normal #' mixture model or Zero-inflated Gaussian mixture model using metagenomeSeq. #' #' @param ps ps a [`phyloseq::phyloseq-class`] object. #' @param group character, the variable to set the group, must be one of #' the var of the sample metadata. #' @param confounders character vector, the confounding variables to be adjusted. #' default `character(0)`, indicating no confounding variable. #' @param taxa_rank character to specify taxonomic rank to perform #' differential analysis on. Should be one of `phyloseq::rank_names(ps)`, #' or "all" means to summarize the taxa by the top taxa ranks #' (`summarize_taxa(ps, level = rank_names(ps)[1])`), or "none" means perform #' differential analysis on the original taxa (`taxa_names(ps)`, e.g., #' OTU or ASV). #' @param contrast this parameter only used for two groups comparison while #' there are multiple groups. For more please see the following details. #' @param transform character, the methods used to transform the microbial #' abundance. See [`transform_abundances()`] for more details. The #' options include: #' * "identity", return the original data without any transformation #' (default). #' * "log10", the transformation is `log10(object)`, and if the data contains #' zeros the transformation is `log10(1 + object)`. #' * "log10p", the transformation is `log10(1 + object)`. #' @param norm the methods used to normalize the microbial abundance data. See #' [`normalize()`] for more details. #' Options include: #' * "none": do not normalize. 
#' * "rarefy": random subsampling counts to the smallest library size in the #' data set. #' * "TSS": total sum scaling, also referred to as "relative abundance", the #' abundances were normalized by dividing the corresponding sample library #' size. #' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference. #' The scaling factor is then derived using a weighted trimmed mean over the #' differences of the log-transformed gene-count fold-change between the #' sample and the reference. #' * "RLE", relative log expression, RLE uses a pseudo-reference calculated #' using the geometric mean of the gene-specific abundances over all #' samples. The scaling factors are then calculated as the median of the #' gene counts ratios between the samples and the reference. #' * "CSS": cumulative sum scaling, calculates scaling factors as the #' cumulative sum of gene abundances up to a data-derived threshold. #' * "CLR": centered log-ratio normalization. #' * "CPM": pre-sample normalization of the sum of the values to 1e+06. #' @param norm_para arguments passed to specific normalization methods. #' @param method character, which model used for differential analysis, #' "ZILN" (Zero-inflated Log-Normal mixture model)" or "ZIG" (Zero-inflated #' Gaussian mixture model). And the zero-inflated log-normal model is #' preferred due to the high sensitivity and low FDR. #' @param p_adjust method for multiple test correction, default `none`, #' for more details see [stats::p.adjust]. #' @param pvalue_cutoff numeric, p value cutoff, default 0.05 #' @param ... extra arguments passed to the model. more details see #' [`metagenomeSeq::fitFeatureModel()`] and [`metagenomeSeq::fitZig()`], #' e.g. `control` (can be setted using [`metagenomeSeq::zigControl()`]) for #' [`metagenomeSeq::fitZig()`]. 
#' #' @details #' metagnomeSeq provides two differential analysis methods, zero-inflated #' log-normal mixture model (implemented in #' [`metagenomeSeq::fitFeatureModel()`]) and zero-inflated Gaussian mixture #' model (implemented in [`metagenomeSeq::fitZig()`]). We recommend #' fitFeatureModel over fitZig due to high sensitivity and low FDR. Both #' [`metagenomeSeq::fitFeatureModel()`] and [`metagenomeSeq::fitZig()`] require #' the abundance profiles before normalization. #' #' For [`metagenomeSeq::fitZig()`], the output column is the coefficient of #' interest, and logFC column in the output of #' [`metagenomeSeq::fitFeatureModel()`] is analogous to coefficient. Thus, #' logFC is really just the estimate the coefficient of interest in #' [`metagenomeSeq::fitFeatureModel()`]. For more details see #' these question [Difference between fitFeatureModel and fitZIG #' in metagenomeSeq](https://support.bioconductor.org/p/94138/). #' #' `contrast` must be a two length character or `NULL` (default). It is only #' required to set manually for two groups comparison when there are multiple #' groups. The order determines the direction of comparison, the first element #' is used to specify the reference group (control). This means that, the first #' element is the denominator for the fold change, and the second element is #' used as baseline (numerator for fold change). Otherwise, users do required #' to concern this paramerter (set as default `NULL`), and if there are #' two groups, the first level of groups will set as the reference group; if #' there are multiple groups, it will perform an ANOVA-like testing to find #' markers which difference in any of the groups. #' #' Of note, [`metagenomeSeq::fitFeatureModel()`] is not allows for multiple #' groups comparison. #' #' @return a [`microbiomeMarker-class`] object. 
#' @export
#' @author Yang Cao
#' @importFrom stats model.matrix
#' @importFrom metagenomeSeq normFactors<- MRcounts
#' @importFrom Biobase pData<- pData
#' @references
#' Paulson, Joseph N., et al. "Differential abundance analysis for microbial
#' marker-gene surveys." Nature methods 10.12 (2013): 1200-1202.
#' @examples
#' data(enterotypes_arumugam)
#' ps <- phyloseq::subset_samples(
#'     enterotypes_arumugam,
#'     Enterotype %in% c("Enterotype 3", "Enterotype 2")
#' )
#' run_metagenomeseq(ps, group = "Enterotype")
run_metagenomeseq <- function(ps,
    group,
    confounders = character(0),
    contrast = NULL,
    taxa_rank = "all",
    transform = c("identity", "log10", "log10p"),
    norm = "CSS",
    norm_para = list(),
    method = c("ZILN", "ZIG"),
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    ),
    pvalue_cutoff = 0.05,
    ...) {
    # validate input and arguments
    ps <- check_rank_names(ps) %>%
        check_taxa_rank(taxa_rank)
    transform <- match.arg(transform, c("identity", "log10", "log10p"))
    method <- match.arg(method, c("ZILN", "ZIG"))
    p_adjust <- match.arg(
        p_adjust,
        c(
            "none", "fdr", "bonferroni", "holm",
            "hochberg", "hommel", "BH", "BY"
        )
    )
    if (length(confounders)) {
        confounders <- check_confounder(ps, group, confounders)
    }

    # sanitize group labels so they can serve as design-matrix column names
    meta <- sample_data(ps)
    groups <- meta[[group]]
    groups <- make.names(groups)
    if (!is.null(contrast)) {
        contrast <- make.names(contrast)
    }
    if (!is.factor(groups)) {
        groups <- factor(groups)
    }
    # reference level first (first element of contrast, if given)
    groups <- set_lvl(groups, contrast)
    lvl <- levels(groups)
    n_lvl <- length(lvl)
    if (n_lvl > 2 && method == "ZILN") {
        stop(
            "ZILN method do not allows for multiple groups comparison,\n",
            "please try set method = `ZIG`",
            call. = FALSE
        )
    }

    # preprocess phyloseq object: taxa in rows, drop all-zero taxa, then
    # transform and normalize
    ps <- preprocess_ps(ps)
    ps <- transform_abundances(ps, transform = transform)
    norm_para <- c(norm_para, method = norm, object = list(ps))
    ps_normed <- do.call(normalize, norm_para)

    # summarize to the requested rank and convert to an MRexperiment
    ps_summarized <- pre_ps_taxa_rank(ps_normed, taxa_rank)
    mgs_summarized <- phyloseq2metagenomeSeq(ps_summarized)

    # extract norm factors and set the norm factors of MRexperiment
    nf <- get_norm_factors(ps_normed)
    if (!is.null(nf)) {
        pData(mgs_summarized@expSummary$expSummary)$normFactors <- nf
    } else {
        # for TSS, CLR and rarefy: normalize the feature table using the CSS
        # method instead
        ct <- metagenomeSeq::MRcounts(mgs_summarized, norm = FALSE)
        fun_p <- select_quantile_func(ct)
        mgs_summarized <- metagenomeSeq::cumNorm(
            mgs_summarized,
            p = fun_p(mgs_summarized)
        )
    }
    # scaling factor for CSS-normalized counts, default 1000
    sl <- ifelse("sl" %in% names(norm_para), norm_para[["sl"]], 1000)
    counts_normalized <- metagenomeSeq::MRcounts(
        mgs_summarized,
        norm = TRUE,
        sl = sl
    )

    # mod <- model.matrix(~ 0 + groups)
    mod <- create_design(groups, meta, confounders)

    if (n_lvl == 2) {
        # FIX: the previous error handlers only built the message string with
        # paste0() and returned it; tryCatch's value was discarded, so on
        # failure `fit` was never assigned and users saw a confusing
        # "object 'fit' not found" error. Surface the intended message with
        # stop() instead.
        if (method == "ZILN") {
            fit <- tryCatch(
                metagenomeSeq::fitFeatureModel(mgs_summarized, mod, ...),
                error = function(e) {
                    stop(
                        "fitFeatureModel model failed to fit to your data! ",
                        "Consider fitZig model or further ",
                        "filtering your dataset!",
                        call. = FALSE
                    )
                }
            )
        } else {
            fit <- tryCatch(
                metagenomeSeq::fitZig(mgs_summarized, mod, ...),
                error = function(e) {
                    stop(
                        "fitZig model failed to fit to your data! ",
                        "Consider fitFeatureModel model or further ",
                        "filtering your dataset!",
                        call. = FALSE
                    )
                }
            )
        }

        # metagenomeSeq vignette: We recommend the user remove features based
        # on the number of estimated effective samples, please see
        # calculateEffectiveSamples. We recommend removing features with less
        # than the average number of effective samples in all features. In
        # essence, setting eff = .5 when using MRcoefs, MRfulltable, or
        # MRtable.
        res <- metagenomeSeq::MRcoefs(
            fit,
            number = ntaxa(ps_summarized),
            adjustMethod = p_adjust,
            group = 3,
            eff = 0.5
        )
        res <- dplyr::rename(
            res,
            pvalue = .data$pvalues,
            padj = .data$adjPvalues
        )
        # For fitZig, the output var is the coefficient of interest (effect
        # size), for fitFeatureModel, logFC is analogous to the coefficient of
        # fitZig (logFC is really just the estimate of the coefficient of
        # interest). Thus, rename the coefficient of interest to logFC for
        # fitZig; https://support.bioconductor.org/p/94138/
        if (method == "ZIG") {
            names(res)[2] <- "logFC"
        }
        ef_var <- "logFC"
        # positive logFC means enrichment in the non-reference level
        res$enrich_group <- ifelse(res[[ef_var]] > 0, lvl[2], lvl[1])
    } else {
        # multiple groups: fit ZIG, then test contrasts/moderated F via limma
        fit <- metagenomeSeq::fitZig(mgs_summarized, mod, ...)
        zigfit <- slot(fit, "fit")
        # warning: row names of contrasts don't match col names of coefficients
        para_cf <- calc_coef(groups, mod, contrast)
        new_fit <- limma::contrasts.fit(zigfit, coefficients = para_cf)
        new_fit <- limma::eBayes(new_fit)
        res <- limma::topTable(
            new_fit,
            number = Inf,
            adjust.method = p_adjust,
        )
        res <- dplyr::filter(res, .data$adj.P.Val <= pvalue_cutoff) %>%
            dplyr::rename(pvalue = .data$P.Value, padj = .data$adj.P.Val)
        # logFC when a specific two-group contrast is requested, F otherwise
        ef_var <- ifelse(is.null(contrast), "F", "logFC")
        # enrich group
        if (ef_var == "logFC") {
            enrich_group <- ifelse(res$logFC > 0, lvl[2], lvl[1])
        } else {
            # enriched group = largest fitted group coefficient (reference
            # level padded in as 0)
            cf <- zigfit$coefficients
            target_idx <- grepl("group", colnames(cf))
            cf <- cf[, target_idx]
            cf <- cbind(0, cf)
            enrich_group <- lvl[apply(cf, 1, which.max)]
            # sort the enrich_group according to the DE of topTags
            enrich_group <- enrich_group[match(row.names(res), row.names(cf))]
        }
        res$enrich_group <- enrich_group
    }

    res <- cbind(feature = row.names(res), res)
    res <- res[, c(
        "feature", "enrich_group",
        ef_var, "pvalue", "padj"
    )]
    row.names(res) <- paste0("marker", seq_len(nrow(res)))
    # rename the ef
    names(res)[3] <- ifelse(
        ef_var %in% c("logFC", "F"),
        paste0("ef_", ef_var),
        paste0("ef_", "coef")
    )
    sig_res <- res[res$padj < pvalue_cutoff & !is.na(res$padj), ]
    marker <- return_marker(sig_res, res)
    marker <- microbiomeMarker(
        marker_table = marker,
        norm_method = get_norm_method(norm),
        diff_method = paste0("metagenomeSeq: ", method),
        otu_table = otu_table(counts_normalized, taxa_are_rows = TRUE),
        sam_data = sample_data(ps_normed),
        tax_table = tax_table(ps_summarized)
    )

    marker
}

# This function is modified from `phyloseq::phyloseq_to_metagenomeSeq()`.
# There are two changes: 1) do not coerce count data to vanilla matrix of
# integers; 2) do not normalize the count.
#
#
#' Convert phyloseq data to MetagenomeSeq `MRexperiment` object
#'
#' The phyloseq data is converted to the relevant
#' [`metagenomeSeq::MRexperiment-class`] object, which can then be tested in
#' the zero-inflated mixture model framework in the metagenomeSeq package.
#'
#' @param ps [`phyloseq::phyloseq-class`] object for
#'   `phyloseq2metagenomeSeq()`, or [`phyloseq::otu_table-class`] object
#'   for `otu_table2metagenomeSeq()`.
#' @param ... optional, additional named arguments passed to
#'   [`metagenomeSeq::newMRexperiment()`]. Most users will not need to pass
#'   any additional arguments here.
#' @return A [`metagenomeSeq::MRexperiment-class`] object.
#' @seealso [`metagenomeSeq::fitTimeSeries()`],
#'   [`metagenomeSeq::fitLogNormal()`],[`metagenomeSeq::fitZig()`],
#'   [`metagenomeSeq::MRtable()`],[`metagenomeSeq::MRfulltable()`]
#' @export
#' @importFrom Biobase AnnotatedDataFrame
#' @importMethodsFrom phyloseq t
#' @examples
#' data(caporaso)
#' phyloseq2metagenomeSeq(caporaso)
phyloseq2metagenomeSeq <- function(ps, ...) {
    # Enforce orientation. Samples are columns
    if (!taxa_are_rows(ps)) {
        ps <- t(ps)
    }
    count <- as(otu_table(ps), "matrix")
    # Create sample annotation if possible
    # (sample_data(ps, FALSE) returns NULL instead of erroring when absent)
    if (!is.null(sample_data(ps, FALSE))) {
        adf <- AnnotatedDataFrame(data.frame(sample_data(ps)))
    } else {
        adf <- NULL
    }
    # Create taxa annotation if possible; always carry an OTUname column so
    # feature data exists even without a tax_table
    if (!is.null(tax_table(ps, FALSE))) {
        tdf <- AnnotatedDataFrame(
            data.frame(
                OTUname = taxa_names(ps),
                data.frame(tax_table(ps)),
                row.names = taxa_names(ps)
            )
        )
    } else {
        tdf <- AnnotatedDataFrame(
            data.frame(
                OTUname = taxa_names(ps),
                row.names = taxa_names(ps)
            )
        )
    }
    # Create MRexperiment
    mr_obj <- metagenomeSeq::newMRexperiment(
        counts = count,
        phenoData = adf,
        featureData = tdf,
        ...
    )

    mr_obj
}

#' @rdname phyloseq2metagenomeSeq
#' @export
otu_table2metagenomeSeq <- function(ps, ...) {
    stopifnot(inherits(ps, "otu_table"))
    # create a sample data with only one var "sample": sam1, sam2, ... so the
    # otu_table can be wrapped into a full phyloseq object and reuse
    # phyloseq2metagenomeSeq()
    sdf <- sample_data(data.frame(sample = paste0("sam", seq_len(ncol(ps)))))
    row.names(sdf) <- colnames(ps)
    ps <- phyloseq(
        ps,
        sdf
    )
    mgs <- phyloseq2metagenomeSeq(ps)

    mgs
}

# get enrich group of a feature for multiple groups comparison
# group_pairs and logFC_pairs are the same length
# Starting from all group labels, each pairwise logFC eliminates the "lower"
# member of its pair; the surviving label(s) are returned as enriched.
# NOTE(review): when logFC_pairs[i] > 0 the SECOND member of the pair is
# removed — confirm this sign convention against the caller's pair ordering.
get_mgs_enrich_group <- function(group_pairs, logFC_pairs) {
    all_groups <- unique(unlist(group_pairs))
    for (i in seq_along(group_pairs)) {
        group_low <- ifelse(
            logFC_pairs[i] > 0,
            group_pairs[[i]][2],
            group_pairs[[i]][1]
        )
        all_groups <- setdiff(all_groups, group_low)
    }

    all_groups
}

================================================
FILE: R/DA-simple-statistic.R
================================================

#' Simple statistical analysis of metagenomic profiles
#'
#' Perform simple statistical analysis of metagenomic profiles. This function
#' is a wrapper of `run_test_two_groups` and `run_test_multiple_groups`.
#'
#' @param ps a [`phyloseq::phyloseq-class`] object
#' @param group character, the variable to set the group
#' @param taxa_rank character to specify taxonomic rank to perform
#' differential analysis on. Should be one of
#' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by
#' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or
#' "none" means perform differential analysis on the original taxa
#' (`taxa_names(phyloseq)`, e.g., OTU or ASV).
#' @param transform character, the methods used to transform the microbial
#' abundance. See [`transform_abundances()`] for more details. The
#' options include:
#' * "identity", return the original data without any transformation
#' (default).
#' * "log10", the transformation is `log10(object)`, and if the data contains
#' zeros the transformation is `log10(1 + object)`.
#' * "log10p", the transformation is `log10(1 + object)`.
#' @param norm the methods used to normalize the microbial abundance data. See #' [`normalize()`] for more details. #' Options include: #' * "none": do not normalize. #' * "rarefy": random subsampling counts to the smallest library size in the #' data set. #' * "TSS": total sum scaling, also referred to as "relative abundance", the #' abundances were normalized by dividing the corresponding sample library #' size. #' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference. #' The scaling factor is then derived using a weighted trimmed mean over the #' differences of the log-transformed gene-count fold-change between the #' sample and the reference. #' * "RLE", relative log expression, RLE uses a pseudo-reference calculated #' using the geometric mean of the gene-specific abundances over all #' samples. The scaling factors are then calculated as the median of the #' gene counts ratios between the samples and the reference. #' * "CSS": cumulative sum scaling, calculates scaling factors as the #' cumulative sum of gene abundances up to a data-derived threshold. #' * "CLR": centered log-ratio normalization. #' * "CPM": pre-sample normalization of the sum of the values to 1e+06. #' @param norm_para arguments passed to specific normalization methods #' @param method test method, options include: "welch.test", "t.test" and #' "white.test" for two groups comparison, "anova"and "kruskal" for multiple #' groups comparison. #' @param p_adjust method for multiple test correction, default `none`, #' for more details see [stats::p.adjust]. #' @param pvalue_cutoff numeric, p value cutoff, default 0.05 #' @param diff_mean_cutoff,ratio_cutoff only used for two groups comparison, #' cutoff of different means and ratios, default `NULL` which means no effect #' size filter. #' @param eta_squared_cutoff only used for multiple groups comparison, numeric, #' cutoff of effect size (eta squared) default `NULL` which means no effect #' size filter. 
#' @param conf_level only used for two groups comparison, numeric, confidence
#' level of interval.
#' @param nperm integer, only used for two groups comparison, number of
#' permutations for white non parametric t test estimation
#' @param ... only used for two groups comparison, extra arguments passed to
#' [`t.test()`] or [`fisher.test()`].
#' @return a [`microbiomeMarker-class`] object.
#' @seealso [`run_test_two_groups()`],[`run_test_multiple_groups()`]
#' @export
#' @examples
#' data(enterotypes_arumugam)
#' ps <- phyloseq::subset_samples(
#'     enterotypes_arumugam,
#'     Enterotype %in% c("Enterotype 3", "Enterotype 2")
#' )
#' run_simple_stat(ps, group = "Enterotype")
run_simple_stat <- function(ps,
    group,
    taxa_rank = "all",
    transform = c("identity", "log10", "log10p"),
    norm = "TSS",
    norm_para = list(),
    method = c(
        "welch.test", "t.test", "white.test",
        "anova", "kruskal"
    ),
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    ),
    pvalue_cutoff = 0.05,
    diff_mean_cutoff = NULL,
    ratio_cutoff = NULL,
    eta_squared_cutoff = NULL,
    conf_level = 0.95,
    nperm = 1000,
    ...) {
    # validate arguments
    stopifnot(inherits(ps, "phyloseq"))
    transform <- match.arg(transform, c("identity", "log10", "log10p"))
    method <- match.arg(
        method,
        c("welch.test", "t.test", "white.test", "anova", "kruskal")
    )
    p_adjust <- match.arg(
        p_adjust,
        c(
            "none", "fdr", "bonferroni", "holm",
            "hochberg", "hommel", "BH", "BY"
        )
    )

    # group
    sample_meta <- sample_data(ps)
    if (!group %in% names(sample_meta)) {
        stop("`group` must in the field of sample meta data", call. = FALSE)
    }
    groups <- sample_meta[[group]]
    n_group <- length(unique(groups))
    if (n_group == 1) {
        stop("at least two groups required", call. = FALSE)
    }

    # dispatch on the number of groups; `missing()` is used (rather than
    # NULL checks) so a warning fires only when the caller explicitly set an
    # argument that is ignored by the chosen branch
    if (n_group == 2) {
        if (!method %in% c("welch.test", "t.test", "white.test")) {
            stop(
                "There are two groups here, please select welch.test, t.test, ",
                "or white.test for two groups comparison",
                call. = FALSE
            )
        }
        if (!missing(eta_squared_cutoff)) {
            warning(
                "`eta_squared_cutoff` is ignored since it is only used for ",
                "multiple groups comparison",
                call. = FALSE
            )
        }
        res <- run_test_two_groups(
            ps = ps,
            group = group,
            taxa_rank = taxa_rank,
            transform = transform,
            norm = norm,
            norm_para = norm_para,
            method = method,
            p_adjust = p_adjust,
            pvalue_cutoff = pvalue_cutoff,
            diff_mean_cutoff = diff_mean_cutoff,
            ratio_cutoff = ratio_cutoff,
            conf_level = conf_level,
            nperm = nperm,
            ...
        )
    } else {
        if (!method %in% c("anova", "kruskal")) {
            stop(
                "There are more than two groups, please select anova or ",
                "kruskal for multiple groups comparison",
                call. = FALSE
            )
        }
        # warn on explicitly-set two-group-only arguments
        if (!missing(diff_mean_cutoff)) {
            warning(
                "`diff_mean_cutoff` only worked for two groups comparison",
                call. = FALSE
            )
        }
        if (!missing(ratio_cutoff)) {
            warning(
                "`ratio_cutoff` only worked for two groups comparison",
                call. = FALSE
            )
        }
        if (!missing(nperm)) {
            warning(
                "`nperm` only worked for two groups comparison",
                call. = FALSE
            )
        }
        if (!missing(conf_level)) {
            warning(
                "`conf_level` only worked for two groups comparison",
                call. = FALSE
            )
        }
        res <- run_test_multiple_groups(
            ps = ps,
            group = group,
            taxa_rank = taxa_rank,
            transform = transform,
            norm = norm,
            norm_para = norm_para,
            method = method,
            p_adjust = p_adjust,
            pvalue_cutoff = pvalue_cutoff,
            effect_size_cutoff = eta_squared_cutoff
        )
    }

    res
}

================================================
FILE: R/DA-sl.R
================================================

#' Identify biomarkers using supervised leaning (SL) methods
#'
#' Identify biomarkers using logistic regression, random forest, or support
#' vector machine.
#'
#' @param ps a \code{\link[phyloseq]{phyloseq-class}} object.
#' @param group character, the variable to set the group.
#' @param taxa_rank character to specify taxonomic rank to perform
#' differential analysis on.
Should be one of #' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by #' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or #' "none" means perform differential analysis on the original taxa #' (`taxa_names(phyloseq)`, e.g., OTU or ASV). #' @param transform character, the methods used to transform the microbial #' abundance. See [`transform_abundances()`] for more details. The #' options include: #' * "identity", return the original data without any transformation #' (default). #' * "log10", the transformation is `log10(object)`, and if the data contains #' zeros the transformation is `log10(1 + object)`. #' * "log10p", the transformation is `log10(1 + object)`. #' @param norm the methods used to normalize the microbial abundance data. See #' [`normalize()`] for more details. #' Options include: #' * "none": do not normalize. #' * "rarefy": random subsampling counts to the smallest library size in the #' data set. #' * "TSS": total sum scaling, also referred to as "relative abundance", the #' abundances were normalized by dividing the corresponding sample library #' size. #' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference. #' The scaling factor is then derived using a weighted trimmed mean over the #' differences of the log-transformed gene-count fold-change between the #' sample and the reference. #' * "RLE", relative log expression, RLE uses a pseudo-reference calculated #' using the geometric mean of the gene-specific abundances over all #' samples. The scaling factors are then calculated as the median of the #' gene counts ratios between the samples and the reference. #' * "CSS": cumulative sum scaling, calculates scaling factors as the #' cumulative sum of gene abundances up to a data-derived threshold. #' * "CLR": centered log-ratio normalization. #' * "CPM": pre-sample normalization of the sum of the values to 1e+06. #' @param norm_para named `list`. 
other arguments passed to specific #' normalization methods. Most users will not need to pass any additional #' arguments here. #' @param nfolds the number of splits in CV. #' @param nrepeats the number of complete sets of folds to compute. #' @param sampling a single character value describing the type of additional #' sampling that is conducted after resampling (usually to resolve class #' imbalances). Values are "none", "down", "up", "smote", or "rose". For #' more details see [`caret::trainControl()`]. #' @param tune_length an integer denoting the amount of granularity in the #' tuning parameter grid. For more details see [`caret::train()`]. #' @param top_n an integer denoting the top `n` features as the biomarker #' according the importance score. #' @param method supervised learning method, options are "LR" (logistic #' regression), "RF" (rando forest), or "SVM" (support vector machine). #' @param ... extra arguments passed to the classification. e.g., `importance` #' for `randomForest::randomForest`. #' #' @details Only support two groups comparison in the current version. And the #' marker was selected based on its importance score. Moreover, The #' hyper-parameters are selected automatically by a grid-search based method #' in the N-time K-fold cross-validation. Thus, the identified biomarker based #' can be biased due to model overfitting for small datasets (e.g., with less #' than 100 samples). #' #' The argument `top_n` is used to denote the number of markers based on the #' importance score. There is no rule or principle on how to select `top_n`, #' however, usually it is very useful to try a different `top_n` and compare #' the performance of the marker predictions for the testing data. #' #' @return a [microbiomeMarker-class] object. 
#'
#' @seealso [`caret::train()`],[`caret::trainControl()`]
#'
#' @author Yang Cao
#' @export
#' @examples
#' data(enterotypes_arumugam)
#' # small example phyloseq object for test
#' ps_small <- phyloseq::subset_taxa(
#'     enterotypes_arumugam,
#'     Phylum %in% c("Firmicutes", "Bacteroidetes")
#' )
#'
#' set.seed(2021)
#' mm <- run_sl(
#'     ps_small,
#'     group = "Gender",
#'     taxa_rank = "Genus",
#'     nfolds = 2,
#'     nrepeats = 1,
#'     top_n = 15,
#'     norm = "TSS",
#'     method = "LR"
#' )
#' mm
run_sl <- function(ps,
    group,
    taxa_rank = "all",
    transform = c("identity", "log10", "log10p"),
    norm = "none",
    norm_para = list(),
    nfolds = 3,
    nrepeats = 3,
    sampling = NULL,
    tune_length = 5,
    top_n = 10,
    method = c("LR", "RF", "SVM"),
    ...) {
    # make sure the grouping variable is present in the sample metadata
    sample_meta <- sample_data(ps)
    meta_nms <- names(sample_meta)
    if (!group %in% meta_nms) {
        stop(
            group, " are not contained in the `sample_data` of `ps`",
            call. = FALSE
        )
    }
    ps <- check_rank_names(ps)
    ps <- check_taxa_rank(ps, taxa_rank)

    # In current version, sl just for two groups comparisons
    groups <- sample_meta[[group]]
    group_n <- length(unique(groups))
    if (group_n != 2) {
        stop(
            "Supervised method only support for two groups comparisons",
            call. = FALSE
        )
    }

    transform <- match.arg(transform, c("identity", "log10", "log10p"))
    method <- match.arg(method, choices = c("LR", "RF", "SVM"))
    # human-readable method name, stored in the returned microbiomeMarker
    full_method <- switch(method,
        LR = "logistic regression",
        RF = "random forest",
        SVM = "support vector machine"
    )
    # caret model codes backing the three supported methods
    train_method <- switch(method,
        LR = "glmnet",
        RF = "ranger",
        SVM = "svmLinear"
    )

    # preprocess phyloseq object
    ps <- preprocess_ps(ps)
    ps <- transform_abundances(ps, transform = transform)

    # normalization, write a function here
    # fitZig fitFeatureModel
    norm_para <- c(norm_para, method = norm, object = list(ps))
    ps_normed <- do.call(normalize, norm_para)

    # summarize data
    ps_summarized <- pre_ps_taxa_rank(ps_normed, taxa_rank)
    counts_tab <- abundances(ps_summarized, norm = TRUE)
    tax_tab <- as.data.frame(tax_table(ps_summarized))
    # in the animalcules, the counts were transferred as cpm, counts per million

    # number of markers must smaller than features
    feature_n <- nrow(tax_tab)
    if (feature_n < top_n) {
        stop(
            "There are ", feature_n, " features, ",
            "`top_n` must be smaller than number of features",
            call. = FALSE
        )
    }

    # transpose for modeling train
    counts_tab <- transpose_and_2df(counts_tab)
    colnames(counts_tab) <- tax_tab[, 1]

    # filter zero or near zero-variance predictors
    # https://topepo.github.io/caret/pre-processing.html#nzv
    # {stackoverflow}questions/47060233/stacking-models-from-different-packages

    # add target variable
    # to compute the varImp of svm model y must be a factor,
    # or there is a error: Error in y - mean(y, rm.na = TRUE): non-numeric
    # argument to binary operator
    counts_tab$y <- factor(groups)

    # set up classification model parameters
    fit_control <- caret::trainControl(
        method = "repeatedcv", # k fold cross validation
        number = nfolds,
        repeats = nrepeats,
        classProbs = TRUE,
        summaryFunction = caret::twoClassSummary, # is only for two classes
        sampling = sampling,
        savePredictions = TRUE
    )

    # grid search over tune_length candidate hyper-parameter settings,
    # selecting by ROC AUC
    model_fit <- caret::train(
        y ~ .,
        data = counts_tab,
        method = train_method,
        tuneLength = tune_length,
        trControl = fit_control,
        metric = "ROC",
        ...
    )

    imp_df <- caret::varImp(model_fit)$importance
    # remove backtick
    feature <- gsub("`", "", row.names(imp_df))
    imp_df <- data.frame(
        feature = feature,
        imp = imp_df[, 1]
    )
    # keep the top_n features ranked by importance score
    imp_df <- imp_df[order(imp_df$imp, decreasing = TRUE), ]
    marker <- imp_df[seq_len(top_n), ]

    # remove target variable
    counts_tab$y <- NULL

    # get the enrich_group
    marker_idx <- match(marker$feature, tax_tab[, 1])
    counts_tab_marker <- counts_tab[, marker_idx]
    enrich_group <- get_sl_enrich_group(
        counts_tab_marker,
        groups,
        sample_in_cols = FALSE
    )
    marker$enrich_group <- enrich_group
    marker <- marker[c("feature", "enrich_group", "imp")]
    names(marker) <- c("feature", "enrich_group", "ef_imp")

    ot <- otu_table(t(counts_tab), taxa_are_rows = TRUE)
    tt <- tax_table(ps_summarized)
    row.names(ot) <- row.names(tt)
    mm <- microbiomeMarker(
        marker_table = marker_table(marker),
        norm_method = get_norm_method(norm),
        diff_method = full_method,
        otu_table = ot,
        sam_data = sample_data(ps_summarized),
        tax_table = tt
    )

    mm
}

# determine the enriched group of each feature: the group in which its mean
# abundance is largest
get_sl_enrich_group <- function(counts_tab,
    groups,
    sample_in_cols = TRUE) {
    if (sample_in_cols) {
        counts_tab <- t(counts_tab)
    }
    counts_mean <- by(counts_tab, groups, colMeans)
    counts_mean <- do.call(cbind, counts_mean)
    idx_enrich <- apply(counts_mean, 1, which.max)
    group_enrich <- colnames(counts_mean)[idx_enrich]

    group_enrich
}



================================================
FILE: R/DA-test-multiple-groups.R
================================================
#' Statistical test for multiple groups
#'
#' @param ps a [`phyloseq::phyloseq-class`] object
#' @param group character, the variable to set the group
#' @param taxa_rank character to specify taxonomic rank to perform
#' differential analysis on. Should be one of
#' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by
#' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or
#' "none" means perform differential analysis on the original taxa
#' (`taxa_names(phyloseq)`, e.g., OTU or ASV).
#' @param transform character, the methods used to transform the microbial
#' abundance. See [`transform_abundances()`] for more details. The
#' options include:
#' * "identity", return the original data without any transformation
#' (default).
#' * "log10", the transformation is `log10(object)`, and if the data contains
#' zeros the transformation is `log10(1 + object)`.
#' * "log10p", the transformation is `log10(1 + object)`.
#' @param norm the methods used to normalize the microbial abundance data. See
#' [`normalize()`] for more details.
#' Options include:
#' * "none": do not normalize.
#' * "rarefy": random subsampling counts to the smallest library size in the
#' data set.
#' * "TSS": total sum scaling, also referred to as "relative abundance", the
#' abundances were normalized by dividing the corresponding sample library
#' size.
#' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference.
#' The scaling factor is then derived using a weighted trimmed mean over the #' differences of the log-transformed gene-count fold-change between the #' sample and the reference. #' * "RLE", relative log expression, RLE uses a pseudo-reference calculated #' using the geometric mean of the gene-specific abundances over all #' samples. The scaling factors are then calculated as the median of the #' gene counts ratios between the samples and the reference. #' * "CSS": cumulative sum scaling, calculates scaling factors as the #' cumulative sum of gene abundances up to a data-derived threshold. #' * "CLR": centered log-ratio normalization. #' * "CPM": pre-sample normalization of the sum of the values to 1e+06. #' @param norm_para arguments passed to specific normalization methods #' @param method test method, must be one of "anova" or "kruskal" #' @param p_adjust method for multiple test correction, default `none`, #' for more details see [stats::p.adjust]. #' @param pvalue_cutoff numeric, p value cutoff, default 0.05. #' @param effect_size_cutoff numeric, cutoff of effect size default `NULL` #' which means no effect size filter. The eta squared is used to measure the #' effect size for anova/kruskal test. #' @importFrom dplyr mutate bind_cols filter select #' @importFrom stats p.adjust #' @seealso [run_posthoc_test()],[`run_test_two_groups()`],[`run_simple_stat()`] #' @export #' @return a [`microbiomeMarker-class`] object. 
#' @examples
#' data(enterotypes_arumugam)
#' ps <- phyloseq::subset_samples(
#'     enterotypes_arumugam,
#'     Enterotype %in% c("Enterotype 3", "Enterotype 2", "Enterotype 1")
#' )
#' mm_anova <- run_test_multiple_groups(
#'     ps,
#'     group = "Enterotype",
#'     method = "anova"
#' )
run_test_multiple_groups <- function(ps,
    group,
    taxa_rank = "all",
    transform = c("identity", "log10", "log10p"),
    norm = "TSS",
    norm_para = list(),
    method = c("anova", "kruskal"),
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    ),
    pvalue_cutoff = 0.05,
    effect_size_cutoff = NULL) {
    stopifnot(inherits(ps, "phyloseq"))
    ps <- check_rank_names(ps)
    ps <- check_taxa_rank(ps, taxa_rank)

    p_adjust <- match.arg(
        p_adjust,
        c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY")
    )
    method <- match.arg(method, c("anova", "kruskal"))

    # preprocess phyloseq object
    ps <- preprocess_ps(ps)
    ps <- transform_abundances(ps, transform = transform)

    # normalize
    norm_para <- c(norm_para, method = norm, object = list(ps))
    ps_normed <- do.call(normalize, norm_para)

    # summarize
    ps_summarized <- pre_ps_taxa_rank(ps_normed, taxa_rank)
    # first taxonomic column holds the (summarized) feature names
    feature <- tax_table(ps_summarized)@.Data[, 1]
    abd_norm <- abundances(ps_summarized, norm = TRUE) %>%
        transpose_and_2df()

    sample_meta <- sample_data(ps_summarized)
    if (!group %in% names(sample_meta)) {
        stop("`group` must in the field of sample meta data")
    }
    groups <- sample_meta[[group]]

    if (method == "anova") {
        aov_df <- mutate(abd_norm, groups = groups)
        # separator "|" and some strings (such as "/", "-", "+") have a special
        # meaning in formula
        # replace this strings with ___(three underscores) before aov
        # (new_feature), and reset the names as `feature`
        names(aov_df) <- gsub("[-|+*//]", "___", names(aov_df))
        new_features <- setdiff(names(aov_df), "groups")
        # one "feature ~ groups" formula per feature
        formula_char <- paste(new_features, "~", "groups")
        pvalue <- purrr::map(
            formula_char,
            ~ aov(as.formula(.x), aov_df) %>%
                summary(.)
        ) %>%
            purrr::map_dbl(~ .x[[1]][["Pr(>F)"]][1])
    } else {
        pvalue <- purrr::map_dbl(abd_norm, ~ kruskal.test(.x, groups)$p.value)
    }
    # NA p values (e.g., from degenerate features) are treated as
    # non-significant
    pvalue[is.na(pvalue)] <- 1

    # p value correction for multiple comparisons
    padj <- p.adjust(pvalue, method = p_adjust)

    # eta squared as the effect size measure
    ef <- purrr::map_dbl(abd_norm, calc_etasq, groups)

    # mean abundances
    abd_means <- calc_mean(abd_norm, groups)
    row.names(abd_means) <- feature

    # enriched group: the group with the largest mean abundance
    group_enriched_idx <- apply(abd_means, 1, which.max)
    # NOTE(review): groups_uniq is computed but never used
    groups_uniq <- unique(groups)
    # recover the group names from the "<group>:mean_abundance" column names
    group_nms <- strsplit(names(abd_means), ":") %>%
        vapply(function(x) x[[1]], FUN.VALUE = "a")
    group_enriched <- group_nms[group_enriched_idx]

    res <- bind_cols(
        data.frame(
            enrich_group = group_enriched,
            pvalue = pvalue,
            padj = padj,
            ef_eta_squared = ef
        ),
        abd_means
    )

    # append feature
    res <- mutate(res, feature = feature) %>%
        select(.data$feature, .data$enrich_group, everything())
    row.names(res) <- paste0("feature", seq_len(nrow(res)))

    # filter: pvalue and effect size
    res_filtered <- filter(res, .data$padj <= pvalue_cutoff)
    if (!is.null(effect_size_cutoff)) {
        res_filtered <- filter(
            res_filtered,
            .data$ef_eta_squared >= effect_size_cutoff
        )
    }

    # summarized tax table
    tax <- matrix(feature) %>%
        tax_table()
    row.names(tax) <- colnames(abd_norm)

    # only keep five variables: feature, enrich_group, effect_size (diff_mean),
    # pvalue, and padj
    res <- res[, c(
        "feature", "enrich_group",
        "ef_eta_squared", "pvalue", "padj"
    )]
    res_filtered <- res_filtered[, c(
        "feature", "enrich_group",
        "ef_eta_squared", "pvalue", "padj"
    )]
    row.names(res_filtered) <- paste0("marker", seq_len(nrow(res_filtered)))

    marker <- return_marker(res_filtered, res)
    marker <- microbiomeMarker(
        marker_table = marker,
        norm_method = get_norm_method(norm),
        diff_method = method,
        sam_data = sample_data(ps_normed),
        otu_table = otu_table(t(abd_norm), taxa_are_rows = TRUE),
        tax_table = tax
    )

    marker
}

# calculate mean abundance of each feature in each group
#' @importFrom dplyr bind_cols
#' @noRd
calc_mean <- function(abd_norm, groups) {
    abd_norm_groups <- split(abd_norm, groups)
    abd_means <- purrr::map(abd_norm_groups, ~ colMeans(.x)) %>%
        bind_cols() %>%
        as.data.frame()
    row.names(abd_means) <- names(abd_norm)
    # columns named "<group>:mean_abundance"
    names(abd_means) <- paste(
        names(abd_norm_groups),
        "mean_abundance",
        sep = ":"
    )

    abd_means
}

#' calculate eta-squared measurement of effect size commonly used in multiple
#' group statistical analysis
#' @param feature numeric vector, abundance of a given feature
#' @param group vector in the same length with argument `feature`, groups of the
#' feature
#' @noRd
calc_etasq <- function(feature, group) {
    group_n <- table(group)
    # -1 flags a degenerate case
    if (any(group_n < 1)) {
        return(-1)
    }
    total_sum <- sum(feature)
    n <- length(feature)
    grand_mean <- total_sum / n
    total_ss <- sum((feature - grand_mean)^2)

    # between-group SS: sum over groups of (group sum)^2 / group size,
    # minus (grand total)^2 / n
    feature_split <- split(feature, group)
    between_group_ss <- purrr::map_dbl(
        feature_split,
        ~ sum(.x) * sum(.x) / length(.x)
    )
    between_group_ss <- sum(between_group_ss) - total_sum * total_sum / n
    # zero total variance cannot yield a meaningful eta squared: return -1
    etasq <- ifelse(total_ss == 0, -1, between_group_ss / total_ss)

    etasq
}



================================================
FILE: R/DA-test-two-groups.R
================================================
#' Statistical test between two groups
#'
#' @param ps a [`phyloseq::phyloseq-class`] object
#' @param group character, the variable to set the group
#' @param taxa_rank character to specify taxonomic rank to perform
#' differential analysis on. Should be one of
#' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by
#' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or
#' "none" means perform differential analysis on the original taxa
#' (`taxa_names(phyloseq)`, e.g., OTU or ASV).
#' @param transform character, the methods used to transform the microbial
#' abundance. See [`transform_abundances()`] for more details. The
#' options include:
#' * "identity", return the original data without any transformation
#' (default).
#' * "log10", the transformation is `log10(object)`, and if the data contains #' zeros the transformation is `log10(1 + object)`. #' * "log10p", the transformation is `log10(1 + object)`. #' @param norm the methods used to normalize the microbial abundance data. See #' [`normalize()`] for more details. #' Options include: #' * "none": do not normalize. #' * "rarefy": random subsampling counts to the smallest library size in the #' data set. #' * "TSS": total sum scaling, also referred to as "relative abundance", the #' abundances were normalized by dividing the corresponding sample library #' size. #' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference. #' The scaling factor is then derived using a weighted trimmed mean over the #' differences of the log-transformed gene-count fold-change between the #' sample and the reference. #' * "RLE", relative log expression, RLE uses a pseudo-reference calculated #' using the geometric mean of the gene-specific abundances over all #' samples. The scaling factors are then calculated as the median of the #' gene counts ratios between the samples and the reference. #' * "CSS": cumulative sum scaling, calculates scaling factors as the #' cumulative sum of gene abundances up to a data-derived threshold. #' * "CLR": centered log-ratio normalization. #' * "CPM": pre-sample normalization of the sum of the values to 1e+06. #' @param norm_para arguments passed to specific normalization methods #' @param method test method, must be one of "welch.test", "t.test" or #' "white.test" #' @param p_adjust method for multiple test correction, default `none`, #' for more details see [stats::p.adjust]. #' @param pvalue_cutoff numeric, p value cutoff, default 0.05 #' @param diff_mean_cutoff,ratio_cutoff cutoff of different means and ratios, #' default `NULL` which means no effect size filter. #' @param conf_level numeric, confidence level of interval. 
#' @param nperm integer, number of permutations for white non parametric t test
#' estimation
#' @param ... extra arguments passed to [t.test()] or [fisher.test()]
#' @importFrom phyloseq rank_names tax_glom
#' @importFrom dplyr select everything filter
#' @export
#' @author Yang Cao
#' @return a [`microbiomeMarker-class`] object.
#' @seealso [`run_test_multiple_groups()`],[`run_simple_stat`]
#' @examples
#' data(enterotypes_arumugam)
#' mm_welch <- run_test_two_groups(
#'     enterotypes_arumugam,
#'     group = "Gender",
#'     method = "welch.test"
#' )
#' mm_welch
run_test_two_groups <- function(ps,
    group,
    taxa_rank = "all",
    transform = c("identity", "log10", "log10p"),
    norm = "TSS",
    norm_para = list(),
    method = c("welch.test", "t.test", "white.test"),
    p_adjust = c(
        "none", "fdr", "bonferroni", "holm",
        "hochberg", "hommel", "BH", "BY"
    ),
    pvalue_cutoff = 0.05,
    diff_mean_cutoff = NULL,
    ratio_cutoff = NULL,
    conf_level = 0.95,
    nperm = 1000,
    ...) {
    stopifnot(inherits(ps, "phyloseq"))
    ps <- check_rank_names(ps)

    # ps_rank <- rank_names(ps)
    # if ("Picrust_trait" %in% ps_rank) {
    #     picrust_rank <- c("Picrust_trait", "Picrust_description")
    #     diff_rank <- setdiff(ps_rank, picrust_rank)
    #     if (length(diff_rank)) {
    #         stop("ranks of picrust2 functional profile must be one of ",
    #             paste(picrust_rank, collapse = ", "))
    #     }
    #     warning("para `taxa_rank` is not worked for picrust2 function profile ",
    #         "and is ignored")
    # } else {
    #     if (!check_rank_names(ps)) {
    #         stop(
    #             "ranks of `ps` must be one of ",
    #             paste(available_ranks, collapse = ", ")
    #         )
    #     }
    # }

    p_adjust <- match.arg(
        p_adjust,
        c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY")
    )

    method <- match.arg(method, c("welch.test", "t.test", "white.test"))

    # preprocess phyloseq object
    ps <- preprocess_ps(ps)
    ps <- transform_abundances(ps, transform = transform)

    # original abundance for white test
    # check taxa_rank
    ps <- check_taxa_rank(ps, taxa_rank)
    ps_orig_summarized <- pre_ps_taxa_rank(ps, taxa_rank)
    # if (taxa_rank == "all") {
    #     ps_orig_summarized <- summarize_taxa(ps)
    # } else if (taxa_rank == "none") {
    #     ps_orig_summarized <- extract_rank(ps, taxa_rank)
    # } else {
    #     ps_orig_summarized <- aggregate_taxa(ps, taxa_rank) %>%
    #         extract_rank(taxa_rank)
    # }
    otus <- abundances(ps_orig_summarized, norm = FALSE)
    abd <- transpose_and_2df(otus)

    # normalize, normalize first, then summarize
    norm_para <- c(norm_para, method = norm, object = list(ps))
    ps_normed <- do.call(normalize, norm_para)

    # if (taxa_rank == "all") {
    #     ps_summarized <- summarize_taxa(ps_normed)
    # } else if (taxa_rank == "none") {
    #     ps_summarized <- extract_rank(ps_normed, taxa_rank)
    # } else {
    #     ps_summarized <- aggregate_taxa(ps_normed, taxa_rank) %>%
    #         extract_rank(taxa_rank)
    # }
    ps_summarized <- pre_ps_taxa_rank(ps_normed, taxa_rank)
    abd_norm <- abundances(ps_summarized, norm = TRUE) %>%
        transpose_and_2df()

    sample_meta <- sample_data(ps_summarized)
    if (!group %in% names(sample_meta)) {
        stop("`group` must in the field of sample meta data")
    }
    groups <- sample_meta[[group]]
    # per-group normalized abundance tables (one data.frame per group)
    abd_norm_group <- split(abd_norm, groups)
    # used for permute statistic in white's non parametric t test method
    orig_abd_group <- split(abd, groups)

    if (method == "welch.test") {
        test_res <- run_t_test(abd_norm_group, conf_level = conf_level, ...)
    } else if (method == "t.test") {
        test_res <- run_t_test(
            abd_norm_group,
            conf_level,
            var_equal = TRUE,
            ...
        )
    } else if (method == "white.test") {
        test_res <- run_white_test(
            abd_norm_group[[1]],
            abd_norm_group[[2]],
            orig_abd_group[[1]],
            orig_abd_group[[2]],
            group_names = names(abd_norm_group),
            conf_level = conf_level,
            nperm = nperm,
            ...
        )
    }

    # first taxonomic column holds the (summarized) feature names
    feature <- tax_table(ps_summarized)@.Data[, 1]
    test_res[["feature"]] <- feature

    # ratio
    ratio <- purrr::pmap_dbl(abd_norm_group, ~ calc_ratio(.x, .y))
    test_res$ratio <- ratio

    # set the ci and ratio to 0, if both of the mean is 0
    test_res <- mutate(
        test_res,
        ci_lower = ifelse(.data$pvalue == 1, 0, .data$ci_lower),
        ci_upper = ifelse(.data$pvalue == 1, 0, .data$ci_upper)
    ) %>%
        select(.data$feature, .data$enrich_group, everything())

    # p value correction for multiple comparisons
    test_res$padj <- p.adjust(test_res$pvalue, method = p_adjust)
    row.names(test_res) <- paste0("feature", seq_len(nrow(test_res)))

    test_filtered <- filter(test_res, .data$padj <= pvalue_cutoff)
    if (!is.null(diff_mean_cutoff)) {
        test_filtered <- filter(
            test_filtered,
            abs(.data$ef_diff_mean) >= diff_mean_cutoff
        )
    }
    if (!is.null(ratio_cutoff)) {
        # keep features whose ratio deviates from 1 by at least the cutoff in
        # either direction
        test_filtered <- filter(
            test_filtered,
            .data$ratio >= ratio_cutoff |
                .data$ratio <= 1 / ratio_cutoff
        )
    }

    # summarized tax table
    tax <- matrix(feature) %>%
        tax_table()
    row.names(tax) <- row.names(otus)

    # only keep five variables: feature, enrich_group, effect_size (diff_mean),
    # pvalue, and padj
    test_res <- test_res[, c(
        "feature", "enrich_group",
        "ef_diff_mean", "pvalue", "padj"
    )]
    test_filtered <- test_filtered[, c(
        "feature", "enrich_group",
        "ef_diff_mean", "pvalue", "padj"
    )]
    # row.names(test_filtered) <- paste0("marker", seq_len(nrow(test_filtered)))

    marker <- return_marker(test_filtered, test_res)

    marker <- microbiomeMarker(
        marker_table = marker,
        norm_method = get_norm_method(norm),
        diff_method = method,
        sam_data = sample_data(ps_normed),
        otu_table = otu_table(t(abd), taxa_are_rows = TRUE),
        tax_table = tax
    )

    marker
}

# t test and welch test ---------------------------------------------------

#' run t test or welch test
#'
#' @param abd_group a two length list, each element represents the feature
#' abundance of a group
#' @param conf_level numeric, confidence level of the interval, default 0.95
#' @param var_equal a logical variable indicating
whether to treat the two
#' variances as being equal. If TRUE then the pooled variance is used to
#' estimate the variance otherwise the Welch (or Satterthwaite) approximation
#' to the degrees of freedom is used.
#' @param ... extra arguments passed to [t.test()]
#' @seealso [stats::t.test()]
#' @noRd
run_t_test <- function(abd_group, conf_level = 0.95, var_equal = FALSE, ...) {
    if (length(abd_group) != 2) {
        stop("welch test requires test between two groups")
    }

    # one t test per feature: the columns of the two per-group abundance
    # data frames are paired up
    t_res <- Map(
        function(x1, x2) {
            t.test(x1, x2, conf.level = conf_level, var.equal = var_equal, ...)
        },
        abd_group[[1]],
        abd_group[[2]]
    )

    # extract one numeric per test, keeping the feature names
    pick_dbl <- function(lst, f) vapply(lst, f, FUN.VALUE = numeric(1))

    # p values; undefined results (NA) are treated as non-significant
    p <- pick_dbl(t_res, function(tt) tt$p.value)
    p[is.na(p)] <- 1

    # per-group means and their difference (the effect size)
    mean_g1 <- pick_dbl(t_res, function(tt) tt$estimate[1])
    mean_g2 <- pick_dbl(t_res, function(tt) tt$estimate[2])
    diff_means <- mean_g1 - mean_g2

    # confidence interval of the difference in means
    ci_lower <- pick_dbl(t_res, function(tt) tt$conf.int[1])
    ci_upper <- pick_dbl(t_res, function(tt) tt$conf.int[2])

    group_names <- names(abd_group)
    res <- data.frame(p, mean_g1, mean_g2, diff_means, ci_lower, ci_upper)
    names(res) <- c(
        "pvalue",
        paste(group_names, "mean", sep = "_"),
        "ef_diff_mean", "ci_lower", "ci_upper"
    )

    # a feature is enriched in the group with the larger mean
    res$enrich_group <- group_names[
        apply(data.frame(mean_g1, mean_g2), 1, which.max)
    ]

    res
}

# white's non parametric t test -------------------------------------------

#' White's non-parametric t-test
#' @param norm_group1,norm_group2 a `data.frame`, normalized abundance of
#' group 1 and group 2
#' @param orig_group1,orig_group2 a `data.frame`, absolute abundance of
#' group 1 and group 2
#' @param group_names character vector, group names
#' @param conf_level numeric, confidence level of the interval, default 0.95
#' @param nperm number of permutations, default 1000
#' @param ...
extra arguments passed to [t.test()]
#' @noRd
run_white_test <- function(norm_group1,
    norm_group2,
    orig_group1,
    orig_group2,
    group_names,
    conf_level = 0.95,
    nperm = 1000,
    ...) {
    # observed per-feature two sample t statistics and mean differences
    two_sample_ts <- calc_twosample_ts(norm_group1, norm_group2)
    t_statistic <- purrr::map_dbl(two_sample_ts, ~ .x["t_static"])
    diff_means <- purrr::map_dbl(two_sample_ts, ~ .x["diff_means"])

    permute_p <- calc_permute_p(
        norm_group1,
        norm_group2,
        orig_group1,
        orig_group2,
        t_statistic,
        conf_level = conf_level,
        nperm = nperm
    )
    bootstrap_ci <- calc_bootstrap_ci(
        norm_group1,
        norm_group2,
        conf_level = conf_level,
        replicates = nperm
    )

    # sparse feature -----------------------------------------------------------
    # features whose total count is below the sample number in both groups are
    # re-tested with Fisher's exact test on pooled counts
    n1 <- nrow(orig_group1)
    n2 <- nrow(orig_group2)
    sparse_index1 <- purrr::map_dbl(orig_group1, sum) < n1
    sparse_index2 <- purrr::map_dbl(orig_group2, sum) < n2
    sparse_index <- which(sparse_index1 & sparse_index2)
    sparse_res <- calc_sparse_p(orig_group1, orig_group2, sparse_index, ...)
    # p value
    sparse_p <- purrr::map_dbl(sparse_res, ~ .x$p.value)
    # set the p value to 1 if the result is NA
    sparse_p[is.na(sparse_p)] <- 1
    # means of each group
    sparse_diff_means <- calc_sparse_diff_mean(
        orig_group1,
        orig_group2,
        sparse_index
    )
    # confidence interval
    sparse_ci <- purrr::map(sparse_res, ~ .x$conf.int)
    sparse_ci_lower <- purrr::map_dbl(sparse_ci, ~ .x[1])
    sparse_ci_upper <- purrr::map_dbl(sparse_ci, ~ .x[2])

    # overwrite the permutation/bootstrap results for the sparse features
    permute_p$pvalue_two_side[sparse_index] <- sparse_p
    diff_means[sparse_index] <- sparse_diff_means
    ci_lower <- bootstrap_ci$ci_lower
    ci_lower[sparse_index] <- sparse_ci_lower
    ci_upper <- bootstrap_ci$ci_upper
    ci_upper[sparse_index] <- sparse_ci_upper

    mean_g1 <- colMeans(norm_group1)
    mean_g2 <- colMeans(norm_group2)
    mean_names <- paste(group_names, "mean", sep = "_")

    res <- data.frame(
        permute_p$pvalue_two_side,
        mean_g1,
        mean_g2,
        diff_means,
        ci_lower,
        ci_upper
    )
    names(res) <- c(
        "pvalue", mean_names,
        "ef_diff_mean", "ci_lower", "ci_upper"
    )

    # enrich_group: the group with the larger mean
    means_df <- data.frame(mean_g1, mean_g2)
    group_enriched <- group_names[apply(means_df, 1, which.max)]
    res$enrich_group <- group_enriched

    res
}

#' permuted p values from Storey and Tibshirani(2003)
#'
#' @param t_statistic white non parametric t statistic
#' @noRd
calc_permute_p <- function(norm_group1,
    norm_group2,
    orig_group1,
    orig_group2,
    t_statistic,
    conf_level = 0.95,
    nperm = 1000) {
    # NOTE(review): conf_level is accepted for interface compatibility but is
    # not used in this function
    n1 <- nrow(norm_group1)
    n2 <- nrow(norm_group2)
    features_n <- length(norm_group1)

    # calculate p value -------------------------------------------------------
    # purrr::rerun() is deprecated; map over the permutation index instead
    permuted_res <- purrr::map(
        seq_len(nperm),
        ~ calc_permute_ts(norm_group1, norm_group2)
    )
    permuted_ts <- purrr::map_df(
        permuted_res,
        ~ .x %>% purrr::map(~ .x["t_static"])
    )

    if (n1 < 8 || n2 < 8) {
        # pool just the frequently observed ts
        cleaned_permuted_ttests <- permuted_ts
        group1_high_freq <- colSums(orig_group1) >= n1
        group2_high_freq <- colSums(orig_group2) >= n2
        high_freq_indices <- which(group1_high_freq | group2_high_freq)

        pvalue_one_side <- rep(0, features_n)
        pvalue_two_side <- rep(0, features_n)
        for (hf_index in high_freq_indices) {
            one_side <- 0
            two_side <- 0
            for (i in seq_len(nperm)) {
                for (hf_index2 in high_freq_indices) {
                    # one side
                    if (cleaned_permuted_ttests[i, hf_index2] >
                        t_statistic[hf_index]) {
                        one_side <- one_side + 1
                    }
                    # two side
                    if (abs(cleaned_permuted_ttests[i, hf_index2]) >
                        abs(t_statistic[hf_index])) {
                        two_side <- two_side + 1
                    }
                }
            }
            pvalue_one_side[hf_index] <-
                1 / (nperm * length(high_freq_indices)) * one_side
            pvalue_two_side[hf_index] <-
                1 / (nperm * length(high_freq_indices)) * two_side
        }
        # BUG FIX: this branch previously never defined pvalue_g_side and
        # pvalue_l_side, so constructing the result data.frame below failed
        # with "object 'pvalue_g_side' not found". The pooled one-sided p value
        # counts permuted t > observed t, i.e. the "greater" side; the "less"
        # side is not estimated by this pooled procedure (only the two-sided
        # p value is consumed by run_white_test).
        pvalue_g_side <- pvalue_one_side
        pvalue_l_side <- rep(NA_real_, features_n)
    } else {
        no <- calc_p_large_sample(permuted_ts, t_statistic)
        two_side_no <- purrr::map_dbl(no, ~ .x["two_side"])
        g_side_no <- purrr::map_dbl(no, ~ .x["g_side"])
        l_side_no <- purrr::map_dbl(no, ~ .x["l_side"])
        pvalue_two_side <- 1 / (nperm + 1) * (two_side_no + 1)
        pvalue_g_side <- 1 / (nperm + 1) * (g_side_no + 1)
        pvalue_l_side <- 1 / (nperm + 1) * (l_side_no + 1)
    }

    pvalue <- data.frame(
        pvalue_two_side = pvalue_two_side,
        pvalue_g_side = pvalue_g_side,
        pvalue_l_side = pvalue_l_side
    )

    pvalue
}

# calculate the permute p value, if number of samples in both groups are larger
# than 8: count how often each permuted statistic exceeds the observed one
calc_p_large_sample <- function(permuted_ts, t_statistic) {
    no <- purrr::map2(
        permuted_ts,
        t_statistic,
        ~ purrr::map_df(
            .x,
            function(i) {
                l_side <- 0
                g_side <- 0
                two_side <- 0
                if (i > .y) {
                    g_side <- g_side + 1
                }
                if (i < .y) {
                    l_side <- l_side + 1
                }
                if (abs(i) > abs(.y)) {
                    two_side <- two_side + 1
                }
                return(c(two_side = two_side, g_side = g_side, l_side = l_side))
            }
        )
    )
    purrr::map(no, colSums)
}

# bootstrap confidence interval of the per-feature difference in means
calc_bootstrap_ci <- function(norm_group1,
    norm_group2,
    conf_level = 0.95,
    replicates = 1000) {
    # BUG FIX: `replicates` was previously not forwarded, so the bootstrap
    # always ran with the helper's default of 1000 replicates
    diff_means <- purrr::map2_df(
        norm_group1,
        norm_group2,
        bootstrap_diff_mean_prop_single,
        replicates = replicates
    )
    # BUG FIX: the index bounds were ported from 0-based code; R indices are
    # 1-based, so the lower bound must be clamped to 1 (index 0 returns
    # numeric(0) and crashes map_dbl for small replicate counts) and the upper
    # bound to length(.x) rather than length(.x) - 1
    ci_lower <- purrr::map_dbl(
        diff_means,
        ~ .x[max(1, floor(0.5 * (1 - conf_level) * length(.x)))]
    )
    ci_upper <- purrr::map_dbl(
        diff_means,
        ~ .x[min(
            length(.x),
            ceiling((conf_level + 0.5 * (1.0 - conf_level)) * length(.x))
        )]
    )

    return(data.frame(ci_lower = ci_lower, ci_upper = ci_upper))
}

# bootstrap the difference in mean abundance of a single feature `replicates`
# times; returns the sorted differences (used to read off CI quantiles)
bootstrap_diff_mean_prop_single <- function(group1, group2, replicates = 1000) {
    # one bootstrap replicate: resample both groups with replacement and take
    # the difference of the resampled means
    bootstrap_one <- function(group1, group2) {
        n1 <- length(group1)
        n2 <- length(group2)
        choices1 <- sample.int(n1, n1, replace = TRUE)
        choices2 <- sample.int(n2, n2, replace = TRUE)
        sample_group1 <- group1[choices1]
        sample_group2 <- group2[choices2]
        diff_means <- mean(sample_group1) - mean(sample_group2)
        diff_means
    }
    diff_means <- replicate(
        replicates,
        bootstrap_one(group1, group2),
        simplify = TRUE
    )
    return(sort(diff_means))
}

# permute the group labels once and recompute the per-feature two sample
# t statistics
calc_permute_ts <- function(norm_group1, norm_group2) {
    n1 <- nrow(norm_group1)
    n2 <- nrow(norm_group2)
    samples_n <- n1 + n2
    perm <- sample.int(samples_n, samples_n)

    # permute the rows
    prop_group <- dplyr::bind_rows(norm_group1, norm_group2)
    prop_group_permute <- prop_group[perm, ]

    calc_twosample_ts(
        prop_group_permute[seq_len(n1), ],
        prop_group_permute[(n1 + 1):(n1 + n2), ]
    )
}

# Calculate two sample t statistic of all features
calc_twosample_ts <- function(norm_group1, norm_group2) {
    ts <- purrr::map2(
        norm_group1,
        norm_group2,
        calc_twosample_ts_single_feature
    )
    ts
}

#' Calculate two sample t statistic of a feature, return a two length vector:
#' two sample t static and difference means (effect size)
#' @importFrom stats var
#' @noRd
calc_twosample_ts_single_feature <- function(norm_group1, norm_group2) {
    n1 <- length(norm_group1)
    n2 <- length(norm_group2)
    mean_g1 <- sum(norm_group1) / n1
    var_g1 <- var(norm_group1)
    stderr_g1 <- var_g1 / n1
    mean_g2 <- sum(norm_group2) / n2
    var_g2 <- var(norm_group2)
    stderr_g2 <- var_g2 / n2
    diff_means <- mean_g1 - mean_g2
    denom <- sqrt(stderr_g1 + stderr_g2)
    if (denom == 0) {
        warning(
            "degenerate case: zero variance for both groups;",
            " variance set to 1e-6.",
            call. = FALSE
        )
        t_static <- diff_means / 1e-6
    } else {
        t_static <- diff_means / denom
    }
    return(c(t_static = t_static, diff_means = diff_means))
}

#' calculate p values for sparse data using fisher's exact test
#' @importFrom stats fisher.test
#' @noRd
calc_sparse_p <- function(orig_group1, orig_group2, sparse_index, ...) {
    cm_list <- create_contingency_matrix(
        orig_group1,
        orig_group2,
        sparse_index = sparse_index
    )
    purrr::map(cm_list, fisher.test, ...)
}

# create contingency matrix list for fisher exact test: one 2 x 2 table per
# sparse feature (feature count vs all other counts, in each group)
create_contingency_matrix <- function(orig_group1, orig_group2, sparse_index) {
    all1 <- sum(orig_group1)
    all2 <- sum(orig_group2)
    sparse_group1 <- orig_group1[sparse_index]
    sparse_group2 <- orig_group2[sparse_index]
    feature_abd1 <- colSums(sparse_group1)
    feature_abd2 <- colSums(sparse_group2)
    # dimnames are cosmetic labels only ("featrue" typo fixed)
    cm_list <- purrr::map2(
        feature_abd1,
        feature_abd2,
        ~ matrix(
            c(.x, all1 - .x, .y, all2 - .y),
            nrow = 2,
            dimnames = list(c("feature", "other"), c("group1", "group2"))
        )
    )

    cm_list
}

# difference of the two groups' proportions for each sparse feature
calc_sparse_diff_mean <- function(orig_group1, orig_group2, sparse_index) {
    all1 <- sum(orig_group1)
    all2 <- sum(orig_group2)
    sparse_group1 <- orig_group1[sparse_index]
    sparse_group2 <- orig_group2[sparse_index]
    feature_abd1 <- colSums(sparse_group1)
    feature_abd2 <- colSums(sparse_group2)
    purrr::map2_dbl(
        feature_abd1,
        feature_abd2,
        ~ (.x / all1 - .y / all2)
    )
}

# ratio ------------------------------------------------------------------------

#' ratio used for effect size
#'
#' @param abd1,abd2 numeric vector, abundance of a given feature of the group1
#' and group2
#' @param pseudocount numeric, pseudocount for unobserved data
#'
#' @return numeric ratio for a feature
#' @noRd
calc_ratio <- function(abd1, abd2, pseudocount = 0.5) {
    n1 <- length(abd1)
    n2 <- length(abd2)
    mean_g1 <- sum(abd1) / n1
    mean_g2 <- sum(abd2) / n2

    # avoid zero numerator/denominator by adding a scaled pseudocount
    if (mean_g1 == 0 || mean_g2 == 0) {
        pseudocount <- pseudocount / (mean_g1 + mean_g2)
        mean_g1 <- mean_g1 + pseudocount
        mean_g2 <- mean_g2 + pseudocount
    }
    res <- mean_g1 / mean_g2
    # both means zero yields NaN: report 0
    if (is.na(res)) {
        res <- 0
    }

    res
}



================================================
FILE: R/abundances-methods.R
================================================
# This function is inspired from microbiome::abundances

#' Extract taxa abundances
#'
#' Extract taxa abundances from phyloseq objects.
#'
#' @param object [`otu_table-class`], [`phyloseq-class`], or
#' [`microbiomeMarker-class`].
#' @param transform transformation to apply, the options inclulde: #' * "identity", return the original data without any transformation. #' * "log10", the transformation is `log10(object)`, and if the data contains #' zeros the transformation is `log10(1 + object)`. #' * "log10p", the transformation is `log10(1 + object)`. #' @param norm logical, indicating whether or not to return the normalized #' taxa abundances. #' @return abundance matrix with taxa in rows and samples in columns. #' @seealso [`otu_table-class`], [`phyloseq-class`], #' [`microbiomeMarker-class`],[`transform_abundances`] #' @docType methods #' @rdname abundances-methods #' @export #' @examples #' data(caporaso) #' abd <- abundances(caporaso) setGeneric( "abundances", function(object, transform = c("identity", "log10", "log10p"), norm = FALSE) { standardGeneric("abundances") } ) # otu_table object #' @aliases abundances, otu_table-method #' @rdname abundances-methods setMethod( abundances, "otu_table", function(object, transform = c("identity", "log10", "log10p"), norm = FALSE) { transform <- match.arg(transform, c("identity", "log10", "log10p")) obj_transed <- transform_abundances(object, transform = transform) abd <- as(otu_table(obj_transed), "matrix") if (norm) { nf <- get_norm_factors(object) if (!is.null(nf)) { abd <- sweep(abd, 2, nf, "/") } } abd } ) # phyloseq object #' @aliases abundances,phyloseq-method #' @rdname abundances-methods setMethod( abundances, "phyloseq", function(object, transform = c("identity", "log10", "log10p"), norm = FALSE) { transform <- match.arg(transform, c("identity", "log10", "log10p")) otu <- otu_table(object) if (norm) { nf <- get_norm_factors(object) if (!is.null(nf)) { attr(otu, "norm_factor") <- nf } } otu <- abundances(otu, transform = transform, norm = norm) otu } ) # microbiomeMarker object #' @aliases abundances,microbiomeMarker-method #' @rdname abundances-methods setMethod( abundances, "microbiomeMarker", function(object, transform = c("identity", 
"log10", "log10p")) { transform <- match.arg(transform, c("identity", "log10", "log10p")) otu <- otu_table(object) otu <- abundances(otu, transform = transform, norm = FALSE) otu } ) ================================================ FILE: R/aggregate-taxa.R ================================================ ## Note: This function is copied from package microbiome #' @title Aggregate Taxa #' @description Summarize phyloseq data into a higher phylogenetic level. #' @details This provides a convenient way to aggregate phyloseq OTUs #' (or other taxa) when the phylogenetic tree is missing. Calculates the #' sum of OTU abundances over all OTUs that map to the same higher-level #' group. Removes ambiguous levels from the taxonomy table. Returns a #' phyloseq object with the summarized abundances. #' @param x \code{\link{phyloseq-class}} object #' @param level Summarization level (from \code{rank_names(pseq)}) #' @param verbose verbose #' @return Summarized phyloseq object #' @examples #' data(caporaso) #' caporaso_phylum <- aggregate_taxa(caporaso, "Phylum") #' @importFrom phyloseq taxa_are_rows<- merge_phyloseq #' @export #' @references See citation('microbiome') #' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com} #' @keywords utilities aggregate_taxa <- function(x, level, verbose = FALSE) { if (!level %in% rank_names(x)) { stop( "The level argument should be one of the options given by rank_names(x): ", paste(rank_names(x), collapse = " / ") ) } # Check if the object is already at the given level inds <- all(tax_table(x)[, level] == tax_table(x)[, ncol(tax_table(x))]) inds <- which(inds) check1 <- length(inds) > 0 check2 <- !any(duplicated(tax_table(x)[, level])) if (check1 && check2) { return(x) } # Sanity checks for a phyloseq object. Required with some methods. 
if (!taxa_are_rows(x)) { x@otu_table <- otu_table(t(otu_table(x)), taxa_are_rows = TRUE) taxa_are_rows(x) <- TRUE } fill_na_taxa <- "Unknown" if (verbose) { print("Remove taxonomic information below the target level") } M <- as.matrix(tax_table(x)) inds2 <- match(level, colnames(M)) M <- M[, seq_len(inds2)] M[is.na(M)] <- fill_na_taxa # Ensure that the filled entries are unique inds <- which(M[, level] == fill_na_taxa) M[inds, seq_len(inds2)] <- fill_na_taxa unique <- apply(M, 1, function(x) { paste(x, collapse = "_") }) M <- cbind(M, unique = unique) x@tax_table <- tax_table(M) if (!nrow(tax_table(x)) == nrow(otu_table(x))) { stop("Taxonomic table and OTU table dimensions do not match.") } if (verbose) { print("Mark the potentially ambiguous taxa") } # Some genera for instance belong to multiple Phyla and perhaps these # are different # genera. For instance there is genus Clostridium in Tenericutes # and Firmicutes. # (GlobalPatterns data set) and even more families. tt <- tax_table(x) if (verbose) { print("-- split") } otus <- split(rownames(tt), as.character(tt[, "unique"])) ab <- matrix(NA, nrow = length(otus), ncol = nsamples(x)) rownames(ab) <- names(otus) colnames(ab) <- sample_names(x) if (verbose) { print("-- sum") } d <- abundances(x) ab <- t(vapply(otus, function(taxa) { as.numeric(colSums(matrix(d[taxa, ], ncol = nsamples(x)), na.rm = TRUE)) }, FUN.VALUE = unname(as.numeric(d[1, ])))) colnames(ab) <- colnames(d) rownames(ab) <- names(otus) if (verbose) { print("Create phyloseq object") } OTU <- otu_table(ab, taxa_are_rows = TRUE) x2 <- phyloseq(OTU) if (verbose) { print("Remove ambiguous levels") } ## First remove NA entries from the target level inds3 <- match(level, colnames(tt@.Data)) inds4 <- match("unique", colnames(tt@.Data)) taxtab <- tt@.Data[ which(!is.na(tt@.Data[, level])), c(seq_len(inds3), inds4) ] if (verbose) { print("-- unique") } tax <- unique(taxtab) if (verbose) { print("-- Rename the lowest level") } tax <- as.data.frame(tax) if 
(verbose) { print("-- rownames") } rownames(tax) <- tax[, "unique"] if (verbose) { print("-- taxa") } tax <- as.matrix(tax) if (verbose) { print("Convert to taxonomy table") } TAX <- tax_table(tax) if (verbose) { print("Combine OTU and Taxon matrix into Phyloseq object") } x2 <- merge_phyloseq(x2, TAX) # Then keep short names for those taxa where short names are unique tt <- tax_table(x2) uni <- names(which(table(as.vector(tt[, level])) == 1)) inds <- which(tt[, level] %in% uni) taxa <- tt[inds, level] tt[inds, "unique"] <- taxa rownames(tt)[inds] <- taxa ab <- abundances(x2) rnams <- rownames(ab) rnams[inds] <- taxa rownames(ab) <- rnams x2 <- phyloseq( otu_table(ab, taxa_are_rows = TRUE), tax_table(tt) ) if (verbose) { print("Add the metadata as is") } if (!is.null(x@sam_data)) { x2 <- phyloseq( otu_table(ab, taxa_are_rows = TRUE), tax_table(tt), sample_data(x) ) } x2 } ================================================ FILE: R/assignment-methods.R ================================================ #' Assign a new OTU table #' #' Assign a new OTU table in microbiomeMarker object #' @param x [`microbiomeMarker-class`] #' @param value [`otu_table-class`], [`phyloseq-class`], #' or [`microbiomeMarker-class`] #' @docType methods #' @name assign-otu_table #' @importMethodsFrom phyloseq otu_table<- #' @importClassesFrom phyloseq otu_table #' @return a [`microbiomeMarker-class`] object. 
NULL

#' @rdname assign-otu_table
#' @aliases otu_table<-,microbiomeMarker,otu_table-method
setMethod(
    "otu_table<-", c("microbiomeMarker", "otu_table"),
    function(x, value) {
        # rebuild the microbiomeMarker object, replacing only the otu_table
        # slot; every other slot is carried over from x unchanged
        microbiomeMarker(
            x@marker_table,
            x@norm_method,
            x@diff_method,
            otu_table = value,
            sam_data = x@sam_data,
            phy_tree = x@phy_tree,
            refseq = x@refseq,
            tax_table = x@tax_table
        )
    }
)

#' @rdname assign-otu_table
#' @aliases otu_table<-,microbiomeMarker,phyloseq-method
setMethod(
    "otu_table<-", c("microbiomeMarker", "phyloseq"),
    function(x, value) {
        # value is a phyloseq object: extract its otu_table component first
        microbiomeMarker(
            x@marker_table,
            x@norm_method,
            x@diff_method,
            otu_table = otu_table(value),
            sam_data = x@sam_data,
            phy_tree = x@phy_tree,
            refseq = x@refseq,
            tax_table = x@tax_table
        )
    }
)

#' @rdname assign-otu_table
#' @aliases otu_table<-,microbiomeMarker,microbiomeMarker-method
setMethod(
    "otu_table<-", c("microbiomeMarker", "microbiomeMarker"),
    function(x, value) {
        # value is another microbiomeMarker: reuse its otu_table slot
        microbiomeMarker(
            x@marker_table,
            x@norm_method,
            x@diff_method,
            otu_table = otu_table(value),
            sam_data = x@sam_data,
            phy_tree = x@phy_tree,
            refseq = x@refseq,
            tax_table = x@tax_table
        )
    }
)

================================================ FILE: R/confounder.R ================================================

## reference
# https://github.com/biomedbigdata/namco/blob/647d3108a281eb0e36af31c44f5bf38d0c70c07d/app/R/utils.R#L480-L561
# https://github.com/biomedbigdata/namco/blob/647d3108a281eb0e36af31c44f5bf38d0c70c07d/app/R/server/confounding_server.R#L71-L100
# vegan: https://fromthebottomoftheheap.net/slides/advanced-vegan-webinar-2020/advanced-vegan

## adjust confounders:
# deseq2, edger, voom, metagenomeSeq: ~ covariates + interested_var (condition)
# ancom: adj_formula
# ancombc: formula
# maAsLin fixed effect https://forum.biobakery.org/t/confounding-factors/154

#' Confounder analysis
#'
#' Confounding variables may mask the actual differential features. This
#' function utilizes constrained correspondence analysis (CCA) to measure the
#' confounding factors.
#'
#' @param ps a [`phyloseq::phyloseq-class`] object.
#' @param target_var character, the variable of interest
#' @param norm the methods used to normalize the microbial abundance data. See
#' [`normalize()`] for more details.
#' @param confounders the confounding variables to be measured, if `NULL`, all
#' variables in the meta data will be analyzed.
#' @param permutations the number of permutations, see [`vegan::anova.cca()`].
#' @param ... extra arguments passed to [`vegan::anova.cca()`].
#'
#' @return a `data.frame` contains three variables: confounder,
#' pseudo-F and p value.
#'
#' @examples
#' data(caporaso)
#' confounder(caporaso, "SampleType", confounders = "ReportedAntibioticUsage")
#'
#' @importFrom vegan cca
#' @importFrom phyloseq t taxa_are_rows
#' @importFrom stats anova
#' @export
confounder <- function(ps,
    target_var,
    norm = "none",
    confounders = NULL,
    permutations = 999,
    ...) {
    stopifnot(inherits(ps, "phyloseq"))
    abd <- otu_table(ps)
    abd <- normalize(abd, method = norm)
    # CCA expects samples in rows and taxa in columns
    # NOTE(review): when taxa are already in columns, abd stays an otu_table
    # (no coercion to plain matrix) -- confirm cca() accepts it in that path
    if (taxa_are_rows(abd)) {
        abd <- as(t(abd), "matrix")
    }
    meta <- data.frame(sample_data(ps))
    # resolve/validate the confounder set (helper defined elsewhere in pkg)
    confounders <- check_confounder(ps, target_var, confounders)
    confounders_meta <- meta[confounders]
    # constrained correspondence analysis on all candidate confounders
    cca_out <- cca(abd ~ ., data = confounders_meta)
    # permutation-based significance of each term
    cca_sig <- anova(cca_out, by = "terms", permutations = permutations, ...)
    cca_sig <- cca_sig[confounders, ]
    pseudo_F <- cca_sig$F
    pvalue <- cca_sig$`Pr(>F)`
    sig <- data.frame(
        confounder = row.names(cca_sig),
        pseudo_F = pseudo_F,
        pvalue = pvalue
    )
    sig
}

================================================ FILE: R/data.R ================================================

#' 16S rRNA data from "Moving pictures of the human microbiome"
#'
#' 16S read counts and phylogenetic tree file of 34 Illumina samples derived
#' from Moving Pictures of the Human Microbiome (Caporaso et al.) Group label:
#' gut, left palm, right palm, and tongue - indicating different sampled body
#' sites.
#' #' @format a [phyloseq::phyloseq] object #' @references #' Caporaso, et al. Moving pictures of the human microbiome. Genome Biol 12, #' R50 (2011). #' #' \url{https://doi.org/10.1186/gb-2011-12-5-r50} #' #' @source Data was downloaded from https://www.microbiomeanalyst.ca #' #' #' @name data-caporaso #' @aliases caporaso #' @docType data #' @author Yang Cao NA #' 16S rRNA data of 94 patients from CID 2012 #' #' Data from a cohort of 94 Bone Marrow Transplant patients previously published #' on in CID #' #' @format a [phyloseq::phyloseq] object #' @references #' Ying, et al. Intestinal Domination and the Risk of Bacteremia in Patients #' Undergoing Allogeneic Hematopoietic Stem Cell Transplantation, #' Clinical Infectious Diseases, Volume 55, Issue 7, 1 October 2012, #' Pages 905–914, #' #' \url{https://academic.oup.com/cid/article/55/7/905/428203} #' #' @source \url{https://github.com/ying14/yingtools2/tree/master/data} #' @name data-cid_ying #' @aliases cid_ying #' @docType data #' @author Yang Cao NA #' IBD stool samples #' #' 43 pediatric IBD stool samples obtained from the Integrative Human Microbiome #' Project Consortium (iHMP). Group label: CD and Controls. 
#'
#' @format a [`phyloseq::phyloseq-class`] object
#' @source \url{https://www.microbiomeanalyst.ca/MicrobiomeAnalyst/resources}
#' @name data-pediatric_ibd
#' @aliases pediatric_ibd
#' @docType data
#' @author Yang Cao
NA

#' Oxygen availability 16S dataset, of which taxa table has been summarized for
#' python lefse input
#'
#' A small subset of the HMP 16S dataset for finding biomarkers characterizing
#' different levels of oxygen availability in different body sites
#'
#' @format a [`phyloseq::phyloseq-class`] object
#' @source \url{http://huttenhower.sph.harvard.edu/webfm_send/129}
#' @name data-oxygen
#' @aliases oxygen
#' @docType data
#' @author Yang Cao
NA

#' This is a sample data from lefse python script, a 16S dataset for studying
#' the characteristics of the fecal microbiota in a mouse model of spontaneous
#' colitis.
#'
#' The dataset contains 30 abundance profiles (obtained processing the 16S reads
#' with RDP) belonging to 10 rag2 (control) and 20 truc (case) mice.
#'
#' @format a [`phyloseq::phyloseq-class`] object
#' @source \url{http://www.huttenhower.org/webfm_send/73}
#' @name data-spontaneous_colitis
#' @aliases spontaneous_colitis
#' @docType data
#' @author Yang Cao
NA

#' Enterotypes data of 39 samples
#'
#' The data contains 22 European metagenomes from Danish, French, Italian,
#' and Spanish individuals, and 13 Japanese and 4 American.
#'
#' @format a [`phyloseq::phyloseq-class`] object
#' @references
#' Arumugam, Manimozhiyan, et al. Enterotypes of the human gut microbiome.
#' nature 473.7346 (2011): 174-180.
#' @name data-enterotypes_arumugam
#' @aliases enterotypes_arumugam
#' @docType data
#' @author Yang Cao
NA

#' Data from a study on colorectal cancer (kostic 2012)
#'
#' The data from a study on colorectal cancer. Samples that had no `DIAGNOSIS`
#' attribute assigned and with fewer than 500 reads (counts) were removed, and
#' 191 samples remain (91 healthy and 86 Tumors).
#' #' @format a [`phyloseq::phyloseq-class`] object #' #' @references #' Kostic et al. Genomic analysis identifies association of Fusobacterium with #' colorectal carcinoma. Genome research, 2012, 22(2), 292-298. #' @name data-kostic_crc #' @aliases kostic_crc #' @docType data #' @author Yang Cao NA #' Data from Early Childhood Antibiotics and the Microbiome (ECAM) study #' #' The data from a subset of the Early Childhood Antibiotics and the #' Microbiome (ECAM) study, which tracked the microbiome composition and #' development of 43 infants in the United States from birth to 2 years of age, #' identifying microbiome associations with antibiotic exposure, delivery mode, #' and diet. #' #' @format a [`phyloseq::phyloseq-class`] object #' #' @references #' Bokulich, Nicholas A., et al. "Antibiotics, birth mode, and diet shape #' microbiome maturation during early life." Science translational medicine #' 8.343 (2016): 343ra82-343ra82. #' #' \url{https://github.com/FrederickHuangLin/ANCOM/tree/master/data} #' @name data-ecam #' @aliases ecam #' @docType data NA ================================================ FILE: R/extract-methods.R ================================================ #' Extract `marker_table` object #' #' Operators acting on `marker_table` to extract parts. #' #' @name [ #' @aliases [,marker_table,ANY,ANY,ANY-method #' @param x a [`marker_table`] object. #' @param i,j elements to extract. #' @param ... see [`base::Extract()`]. #' @param drop ignored now. #' @return a `marker_table` object. #' @export #' @rdname extract-methods #' @seealso [`base::Extract()`] setMethod("[", "marker_table", function(x, i, j, ...) 
{ newx <- marker_table(data.frame(x)[i, j, drop = FALSE]) newx }) ================================================ FILE: R/import-biobakery-lefse_in.R ================================================ #' @title Import function to read the tab-delimited input file of biobakery #' lefse #' #' @description For biobakey lefse, the input file must be a tab-delimited #' text, consists of a list of numerical features, the class vector and #' optionally the subclass and subject vectors. The features can be read counts #' directly or abundance floating-point values more generally, and the first #' field is the name of the feature. Class, subclass and subject vectors have a #' name (the first field) and a list of non-numerical strings. This function #' requires the features are organized in rows, although both column and row #' feature organization is accepted in biobakery lefse. #' #' @param file the file path of tab-delimited input file of biobakery lefse #' @param ranks_prefix character vector, prefix of taxonomic ranks to add, #' e.g. "p" for "Phylum", "g" for "Genus". #' @param meta_rows integer vector, set which rows represent the meta data, #' such as class, subclass and subject, default `1`. #' @param sep character, separator between different taxnomic ranks, #' default `|`. #' @noRd #' @return a [`phyloseq::phyloseq-class`] object. 
#' @examples #' # file <- system.file( #' # "extdata", #' # "hmp_small_aerobiosis.txt", #' # package = "microbiomeMarker" #' # ) #' # six level of taxonomic ranks, #' # meta data: row 1 represents class (oxygen_availability), #' # row 2 represents subclass (body_site), #' # row 3 represents subject (subject_id) #' # ps <- import_biobakery_lefse_in( #' # file, #' # ranks_prefix = c("k", "p", "c", "o", "f", "g"), #' # meta_rows = 1:3, #' # ) import_biobakery_lefse_in <- function(file, ranks_prefix, meta_rows = 1, sep = "|") { dat <- utils::read.delim(file, header = FALSE) # meta data of samples meta_nms <- dat[meta_rows, 1] sample_meta <- dat[meta_rows, -1] %>% t() %>% as.data.frame() %>% sample_data() colnames(sample_meta) <- meta_nms row.names(sample_meta) <- paste0("sa", seq_len(nrow(sample_meta))) # tax table tax <- dat[-meta_rows, 1, drop = FALSE] %>% as.matrix() %>% tax_table() # ensure the ranks_prefix is contained in available_ranks # and in descending order available_prefix <- get_available_prefix(available_ranks) if (!all(ranks_prefix %in% available_prefix)) { stop("all elements of ranks_prefix must be contained ", "in available_ranks" ) } tax_nms <- keep_prefix_desc(ranks_prefix, type = "ranks") %>% paste0(collapse = sep) colnames(tax) <- tax_nms row.names(tax) <- paste0("feature", seq_len(nrow(tax))) # otu table otu <- dat[-meta_rows, -1, drop = FALSE] %>% apply(2, as.numeric) %>% otu_table(taxa_are_rows = TRUE) row.names(otu) <- taxa_names(tax) colnames(otu) <- sample_names(sample_meta) ps <- phyloseq(otu, tax, sample_meta) %>% add_prefix_summarized(ranks_prefix, sep) ps } ================================================ FILE: R/import-dada2.R ================================================ # This function is modified from import_dada2.R in MicrobiotaProcess # https://github.com/YuLab-SMU/MicrobiotaProcess/blob/master/R/import_dada2.R #' Import function to read the the output of dada2 as phyloseq object #' #' Import the output of dada2 into phyloseq 
object #' #' @param seq_tab matrix-like, ASV table, the output of #' `dada2::removeBimeraDenovo`. #' @param tax_tab matrix, taxonomy table, the output of #' `dada2::assignTaxonomy` or `dada2::addSpecies`. #' @param sam_tab data.frame or [`phyloseq::sample_data-class`], sample data #' @param phy_tree [`ape::phylo`] class or character represents the path of #' the tree file #' @param keep_taxa_rows logical, whether keep taxa in rows or not in the #' `otu_table` of the returned `phyloseq` object, default `TRUE`. #' @details #' The output of the dada2 pipeline is a feature table of amplicon sequence #' variants (an ASV table): A matrix with rows corresponding to samples and #' columns to ASVs, in which the value of each entry is the number of times #' that ASV was observed in that sample. This table is analogous to the #' traditional OTU table. Conveniently, taxa names are saved as ASV1, ASV2, #' ..., in the returned phyloseq object. #' @importFrom phyloseq sample_data read_tree #' @importMethodsFrom phyloseq t #' @return [`phyloseq::phyloseq-class`] object hold the taxonomy info, #' sample metadata, number of reads per ASV. #' @export #' #' @examples #' seq_tab <- readRDS(system.file("extdata", "dada2_seqtab.rds", #' package = "microbiomeMarker" #' )) #' tax_tab <- readRDS(system.file("extdata", "dada2_taxtab.rds", #' package = "microbiomeMarker" #' )) #' sam_tab <- read.table(system.file("extdata", "dada2_samdata.txt", #' package = "microbiomeMarker" #' ), sep = "\t", header = TRUE, row.names = 1) #' ps <- import_dada2(seq_tab = seq_tab, tax_tab = tax_tab, sam_tab = sam_tab) #' ps import_dada2 <- function(seq_tab, tax_tab = NULL, sam_tab = NULL, phy_tree = NULL, keep_taxa_rows = TRUE) { # refseq refseq <- colnames(seq_tab) # set refseq and taxa names to ASV_1, ASV_2,... 
refseq_nm <- paste0("ASV", seq_along(refseq)) colnames(seq_tab) <- refseq_nm names(refseq) <- refseq_nm if (!is.null(tax_tab)) { if (!identical(refseq_nm, row.names(tax_tab))) { tax_tab <- tax_tab[match(refseq, row.names(tax_tab)), , drop = FALSE] } row.names(tax_tab) <- refseq_nm tax_tab <- tax_table(as.matrix(tax_tab)) } # refseq to XStringSet refseq <- Biostrings::DNAStringSet(refseq) if (!is.null(sam_tab)) { sam_tab <- sample_data(sam_tab) } if (!is.null(phy_tree) && inherits(phy_tree, "character")) { phy_tree <- read_tree(phy_tree) } asv_tab <- otu_table(seq_tab, taxa_are_rows = FALSE) ps <- phyloseq(asv_tab, tax_tab, sam_tab, phy_tree, refseq) if (keep_taxa_rows) { ps <- t(ps) } ps } ================================================ FILE: R/import-picrust2.R ================================================ #' Import function to read the output of picrust2 as phyloseq object #' #' Import the output of picrust2 into phyloseq object #' #' @param feature_tab character, file path of the prediction abundance table of #' functional feature. #' @param sam_tab character, file path of the sample meta data. #' @param trait character, options are picrust2 function traits (including #' "COG", "EC", "KO", "PFAM", "TIGRFAM", and "PHENO") and "PATHWAY". #' @importFrom utils read.delim #' @export #' @return [`phyloseq::phyloseq-class`] object. #' @details #' [PICRUSt2](https://huttenhower.sph.harvard.edu/picrust/) is a software for #' predicting abundances of functional profiles based on marker gene sequencing #' data. The functional profiles can be predicted from the taxonomic #' profiles using PICRUSt2. "Function" usually refers to gene families such as #' KEGG orthologs and Enzyme Classification numbers, but predictions can be #' made for any arbitrary trait. #' #' In the `phyloseq object`, the predicted function abundance profile is stored #' in `otu_table` slot. 
And the functional trait is saved in `tax_table` slot, #' if the descriptions of function features is not added to the predicted table, #' `tax_table` will have only one rank `Picrust_trait` to represent the function #' feature id, or if the desciptions are added one more rank #' `Picrust_description` will be added to represent the description of #' function feature. #' @examples #' sam_tab <- system.file( #' "extdata", "picrust2_metadata.tsv", #' package = "microbiomeMarker") #' feature_tab <- system.file( #' "extdata", "path_abun_unstrat_descrip.tsv.gz", #' package = "microbiomeMarker") #' ps <- import_picrust2(feature_tab, sam_tab, trait = "PATHWAY") #' ps import_picrust2 <- function(feature_tab, sam_tab = NULL, trait = c("PATHWAY", "COG", "EC", "KO", "PFAM", "TIGRFAM", "PHENO")) { trait <- match.arg( trait, c("COG", "EC", "KO", "PFAM", "TIGRFAM", "PHENO", "PATHWAY")) feature_tab <- read.delim(gzfile(feature_tab)) # extract meta data of trait and stored it as taxonomy table in phyloseq feature_tab_vars <- names(feature_tab) tax_var <- feature_tab_vars[1] tax_rank <- "Picrust_trait" desp <- "description" if (desp %in% names(feature_tab)) { tax_var <- c(tax_var, desp) tax_rank <- c(tax_rank, paste0("Picrust_", desp)) } tax_mat <- as.matrix(feature_tab[tax_var]) name_prefix <- ifelse(trait == "PATHWAY", "path", "func") n_feature <- nrow(feature_tab) feature_nms <- paste0(name_prefix, seq_len(n_feature)) row.names(tax_mat) <- tax_mat[, 1] colnames(tax_mat) <- tax_rank tax_tab <- tax_table(tax_mat) # sample data if (!is.null(sam_tab)) { sam_tab <- sample_data(read.delim(sam_tab)) } # set the names of the feature feature_tab <- feature_tab[setdiff(feature_tab_vars, tax_var)] row.names(feature_tab) <- tax_mat[, 1] colnames(feature_tab) <- rownames(sam_tab) ps <- phyloseq(otu_table(as.matrix(feature_tab), taxa_are_rows = TRUE), tax_tab, sam_tab) ps } ================================================ FILE: R/import-qiime2.R ================================================ 
# This function is modified from import_qiime2.R in MicrobiotaProcess # https://github.com/YuLab-SMU/MicrobiotaProcess/blob/master/R/import_qiime2.R # and https://github.com/jbisanz/qiime2R #' Import function to read the the output of dada2 as phyloseq object #' #' Import the qiime2 artifacts, including feature table, taxonomic table, #' phylogenetic tree, representative sequence and sample metadata into #' phyloseq object. #' #' @param otu_qza character, file path of the feature table from qiime2. #' @param taxa_qza character, file path of the taxonomic table from qiime2, #' default `NULL`. #' @param sam_tab character, file path of the sample metadata in tsv format, #' default `NULL`. #' @param refseq_qza character, file path of the representative sequences from #' qiime2, default `NULL`. #' @param tree_qza character, file path of the phylogenetic tree from #' qiime2, default `NULL`. #' @export #' @return [`phyloseq::phyloseq-class`] object. #' @examples #' otuqza_file <- system.file( #' "extdata", "table.qza", #' package = "microbiomeMarker" #' ) #' taxaqza_file <- system.file( #' "extdata", "taxonomy.qza", #' package = "microbiomeMarker" #' ) #' sample_file <- system.file( #' "extdata", "sample-metadata.tsv", #' package = "microbiomeMarker" #' ) #' treeqza_file <- system.file( #' "extdata", "tree.qza", #' package = "microbiomeMarker" #' ) #' ps <- import_qiime2( #' otu_qza = otuqza_file, taxa_qza = taxaqza_file, #' sam_tab = sample_file, tree_qza = treeqza_file #' ) #' ps import_qiime2 <- function(otu_qza, taxa_qza = NULL, sam_tab = NULL, refseq_qza = NULL, tree_qza = NULL) { feature_tab <- read_qza(otu_qza) if (!is.null(taxa_qza)) { taxa_tab <- read_qza(taxa_qza) taxa_tab <- subset_taxa_in_feature(taxa_tab, feature_tab) taxa_tab <- parse_q2taxonomy(taxa_tab) } else { taxa_tab <- NULL } if (!is.null(sam_tab)) { sam_tab <- read_q2sample_meta(sam_tab) } else { sam_tab <- NULL } if (!is.null(tree_qza)) { tree <- read_qza(tree_qza) } else { tree <- NULL } # check 
the row.names of feature table: DNA sequence or not # if is DNA, row.names(feature_tab) is set as refseq if (is_dna_seq(row.names(feature_tab))) { refseq <- row.names(feature_tab) refseq_nm <- paste0("OTU", seq_along(refseq)) names(refseq) <- refseq_nm # set the rownames of feature and taxa tab as OTU1, OTU2,... if (!is.null(taxa_tab)) { rownames(taxa_tab) <- refseq_nm } rownames(feature_tab) <- refseq_nm } else { refseq <- NULL } if (!is.null(refseq_qza)) { refseq <- read_qza(refseq_qza) } if (!is.null(refseq)) { refseq <- Biostrings::DNAStringSet(refseq) } else { refseq <- NULL } ps <- phyloseq(feature_tab, taxa_tab, sam_tab, tree, refseq) if (inherits(ps, "otu_table")) { warning("`otu_table` object is returned") } ps } #' Read the qza file output from qiime2 #' #' Import the qiime2 artifacts to R. #' @param file character, path of the input qza file. Only files in format of #' `BIOMV210DirFmt` (feature table), `TSVTaxonomyDirectoryFormat` (taxonomic #' table), `NewickDirectoryFormat` (phylogenetic tree ) and #' `DNASequencesDirectoryFormat` (representative sequences) are supported #' right now. #' @param temp character, a temporary directory in which the qza file will be #' decompressed to, default `tempdir()`. #' @importFrom yaml read_yaml #' @return [`phyloseq::otu_table-class`] object for feature table, #' [`phyloseq::taxonomyTable-class`] object for taxonomic table, #' [`ape::phylo`] object for phylogenetic tree, #' [`Biostrings::DNAStringSet-class`] for representative sequences of taxa. 
#' @noRd read_qza <- function(file, temp = tempdir()) { unzipped_file <- utils::unzip(file, exdir = temp) meta_file <- grep("metadata.yaml", unzipped_file, value = TRUE) metadata <- yaml::read_yaml(meta_file[1]) format <- metadata$format uuid <- metadata$uuid if (grepl("BIOMV", metadata$format)) { biom_file <- file.path(temp, uuid, "data/feature-table.biom") res <- read_q2biom(biom_file) } else if (format == "TSVTaxonomyDirectoryFormat") { taxa_file <- file.path(temp, uuid, "data/taxonomy.tsv") res <- read_q2taxa(taxa_file) } else if (format == "NewickDirectoryFormat") { tree_file <- file.path(temp, uuid, "data/tree.nwk") res <- read_tree(tree_file) } else if (format == "DNASequencesDirectoryFormat") { refseq_file <- file.path(temp, uuid, "data/dna-sequences.fasta") res <- Biostrings::readDNAStringSet(refseq_file) } else { stop( "Only files in format of 'BIOMV210DirFmt' ", "'TSVTaxonomyDirectoryFormat', ", "'NewickDirectoryFormat' and 'DNASequencesDirectoryFormat'", " are supported." ) } res } #' Read qiime2 feature table #' #' Import qiime2 feature table into otu_table object #' #' @param file character, file name of the biom file. #' @importFrom biomformat read_biom biom_data #' @return [`phyloseq::otu_table-class`] object. 
#' @noRd
read_q2biom <- function(file) {
    # parse the BIOM artifact and coerce its (possibly sparse) payload to a
    # dense matrix before wrapping it as a phyloseq otu_table
    biom_obj <- suppressWarnings(read_biom(file))
    counts <- as(biom_data(biom_obj), "matrix")
    otu_table(counts, taxa_are_rows = TRUE)
}

#' Read qiime2 taxa file
#' @keywords internal
#' @noRd
read_q2taxa <- function(file) {
    taxa_df <- utils::read.table(file, sep = "\t", header = TRUE)
    # drop the optional "Confidence" column; it is not part of the taxonomy
    taxa_df <- taxa_df[setdiff(names(taxa_df), "Confidence")]
    tax_table(as.matrix(taxa_df))
}

#' Read qiime2 sample meta data file
#' @keywords internal
#' @noRd
read_q2sample_meta <- function(file) {
    phyloseq::import_qiime_sample_data(file)
}

#' Subset taxa according to the taxa in feature table
#' @keywords internal
#' @noRd
subset_taxa_in_feature <- function(taxa_tab, feature_tab) {
    # reorder the taxonomy rows to follow the feature table, keyed on the
    # "Feature.ID" column, and use those ids as row names
    feature_ids <- row.names(feature_tab)
    keep <- match(feature_ids, taxa_tab[, "Feature.ID"])
    matched_tab <- taxa_tab[keep, , drop = FALSE]
    row.names(matched_tab) <- matched_tab[, "Feature.ID"]
    matched_tab
}

#' Parse qiime2 taxa in different taxonomic levels
#' @param taxa `tax_table` object.
#' @param sep character string containing a regular expression, separator
#' between different taxonomic levels, defaults to on compatible with both
#' GreenGenes and SILVA `; |;"`.
#' @param trim_rank_prefix logical whether remove leading characters from
#' taxonomic levels, e.g. k__ or D_0__, default `TRUE`.
#' @return [`phyloseq::taxonomyTable-class`] object.
#' @keywords internal #' @noRd parse_q2taxonomy <- function(taxa, sep = "; |;", trim_rank_prefix = TRUE) { taxa <- data.frame(taxa) if (trim_rank_prefix) { # remove leading characters from GG taxa$Taxon <- gsub("[kpcofgs]__", "", taxa$Taxon) # remove leading characters from SILVA taxa$Taxon <- gsub("D_\\d__", "", taxa$Taxon) } taxa <- tidyr::separate(taxa, .data$Taxon, c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = sep, fill = "right" ) taxa <- purrr::map_df(taxa, ~ ifelse(.x == "", NA_character_, .x)) %>% as.data.frame() rownames(taxa) <- taxa$Feature.ID taxa$Feature.ID <- NULL tax_table(as.matrix(taxa)) } #' check the row.names of feature table is DNA sequence or not #' This function is from #' {github}YuLab-SMU/MicrobiotaProcess/blob/master/R/import_qiime2.R#L169-L177 #' @keywords internal #' @noRd is_dna_seq <- function(names) { x <- unlist(strsplit(toupper(names[1]), split = "")) freq <- mean(x %in% c("A", "G", "T", "C", "N", "X", "-")) if (length(x) > 30 & freq > 0.9) { return(TRUE) } else { return(FALSE) } } ================================================ FILE: R/lefse-utilities.R ================================================ # enrich group of the feature --------------------------------------------- #' get the mean abundances of each class for each feature #' @noRd get_feature_enrich_group <- function(class, feature) { feature$class <- class feature_mean <- group_by(feature, class) %>% group_modify(~ purrr::map_df(.x, mean)) %>% ungroup() feature_enrich_index <- select(feature_mean, -class) %>% purrr::map_dbl(which.max) feature_enrich_group <- feature_mean$class[feature_enrich_index] names(feature_enrich_group) <- names(feature)[names(feature) != "class"] feature_max_mean <- purrr::map2_dbl( select(feature_mean, -class), feature_enrich_index, ~ .x[.y] ) feature_max_mean[feature_max_mean < 1] <- 1 return(list( group = feature_enrich_group, log_max_mean = log10(feature_max_mean) )) } # bootstrap iteration LDA 
# -------------------------------------------------

#' check bootstrap sample, not constant within class and not too few samples
#' in each class
#'
#' @param feature_abundance data.frame, significant feature abundance,
#'   where rows represent samples and columns represent features
#' @param sample_indx numeric vector, sample index be bootstraped
#' @param sample_min integer, min samples in each class
#' @param class character vector, class of all samples
#'
#' @noRd
check_bootstrap_sample <- function(feature_abundance,
    sample_indx,
    sample_min,
    class) {
    if ("class" %in% names(feature_abundance)) {
        feature_abundance$class <- NULL
    }
    feature_abundance <- feature_abundance[sample_indx, ]
    class_n <- length(unique(class))
    class <- class[sample_indx]
    # every class must be represented in the bootstrap sample
    if (length(unique(class)) < class_n) {
        return(FALSE)
    }
    for (cls in unique(class)) {
        if (sum(class == cls) < sample_min) {
            return(FALSE)
        }
        # sig feature smaller than min sample count
        cls_abundance <- feature_abundance[class == cls, ]
        # FIX: was `seq_along(ncol(cls_abundance))`, which is always `1` and
        # therefore only validated the first feature column
        for (i in seq_len(ncol(cls_abundance))) {
            unique_abd <- length(unique(cls_abundance[[i]]))
            if ((unique_abd <= sample_min && sample_min > 1) ||
                (unique_abd <= 1 && sample_min == 1)) {
                return(FALSE)
            }
        }
    }

    return(TRUE)
}

#' bootstrap iteration of samples for lda analysis
#' @noRd
bootstap_lda <- function(feature_abundance,
    boot_n,
    class,
    sample_fract,
    seed = 2020) {
    # Bioconductor not allows set.seed
    # seq_len() (rather than 1:boot_n) is safe for boot_n == 0
    ldas <- purrr::map(
        seq_len(boot_n),
        \(i) bootstap_lda_one(feature_abundance, class, sample_fract)
    ) %>%
        purrr::transpose() %>%
        purrr::map(~ do.call(bind_rows, .x)) %>%
        bind_rows()
    # signed log10 shrinkage of the mean LDA scores across iterations
    mean_lds <- colMeans(ldas)
    mean_lds <- sign(mean_lds) * log10(1 + abs(mean_lds))

    mean_lds
}

#' single bootstrap iteration: resample, validate, and compute pairwise LDA
#' @noRd
bootstap_lda_one <- function(feature_abundance, class, sample_fract) {
    sample_groups <- unique(class)
    class_count <- table(class)
    feature_abundance$class <- class
    # NOTE: group_by()/group_modify() inside preprocess_feature_all() moves
    # the grouping column `class` to the FIRST column of its result; the
    # `[-1]` in cal_pair_lda() relies on this ordering.
    feature_abundance <- preprocess_feature_all(feature_abundance, class)
    sample_n <- nrow(feature_abundance)
    random_n <- floor(sample_n * sample_fract)
    sample_min <- floor(
        min(class_count) * sample_fract * sample_fract * 0.5) %>%
        max(1)
    # class vs class
    pairs <- utils::combn(sample_groups, 2, simplify = FALSE) %>%
        purrr::map(sort, decreasing = TRUE)

    for (i in seq_len(1000)) {
        # random select samples using bootstrap method
        sample_indx <- sample(sample_n, random_n, replace = TRUE)
        is_checked <- check_bootstrap_sample(
            feature_abundance,
            sample_indx,
            sample_min,
            class
        )
        if (is_checked) {
            break
        }
    }
    if (!is_checked) {
        stop(
            "Too small samples in each class",
            " or the variance of feature abundances within a",
            " class too small (zero or near zero)",
            call. = FALSE
        )
    }

    lda <- purrr::map(
        pairs,
        ~ cal_pair_lda(feature_abundance, sample_indx, .x)
    )
    names(lda) <- purrr::map(pairs, paste, collapse = " -VS- ")

    lda
}

#' calculate lda score of single pair groups
#' @noRd
cal_pair_lda <- function(feature_abundance, sample_indx, pair) {
    sample_feature_abundance <- feature_abundance[sample_indx, ]
    # reference lefse.py in lefse
    lda_res <- suppressWarnings(
        MASS::lda(
            class ~ .,
            data = sample_feature_abundance,
            tol = 1.0e-10
        )
    )
    w <- lda_res$scaling[, 1]
    w_unit <- w / sqrt(sum(w^2))
    # `class` is the first column (see bootstap_lda_one), so `[-1]` drops it
    feature_remove_class <- sample_feature_abundance[-1]
    # not support subclass and subject argument in lefse
    ld <- as.matrix(feature_remove_class) %*% w_unit
    group1_indx <- sample_feature_abundance$class == pair[1]
    # FIX: was `ld[-group1_indx]`; negating a logical vector coerces it to
    # 0/-1 integer indices, which merely dropped the first element instead of
    # selecting the samples NOT in group 1
    effect_size <- abs(mean(ld[group1_indx]) - mean(ld[!group1_indx]))
    wfinal <- w_unit * effect_size
    lda_means <- lda_res$means
    lda_row_nms <- row.names(lda_means)
    feature_n <- ncol(lda_means)
    coeff <- ifelse(is.nan(wfinal), 0, abs(wfinal))
    res <- purrr::map(
        pair,
        function(x) {
            if (x %in% lda_row_nms) {
                # fixes #7, Since `pair` is a level, and `lda_means[pair[i], ]`
                # corced pair[i]` to numeric rather than use the corresponding
                # level of pair[i]
                ind <- match(x, lda_row_nms)
                lda_means[ind, ]
            } else {
                rep(0, feature_n)
            }
        }
    )
    names(res) <- pair
    feature <- names(feature_remove_class)
    # NOTE(review): upstream lefse.py averages (gm + coeff)/2; here the score
    # is gm + coeff * 0.5 — confirm against the reference implementation
    lda_score <- purrr::map_dbl(
        seq_along(feature),
        function(i) {
            gm <- abs(res[[1]][i] - res[[2]][i])
            return(gm + coeff[i] * 0.5)
        }
    )
    names(lda_score) <- feature

    lda_score
}

#' feature abundance preprocess
#' @noRd
preprocess_feature_all <- function(x, class) {
    res <- group_by(x, class) %>%
        group_modify(~ purrr::map_df(.x, preprocess_feature)) %>%
        ungroup()
    res
}

#' add small random noise to near-constant features within a class
#' @noRd
preprocess_feature <- function(x) {
    if (length(unique(x)) <= max(length(x) * 0.5, 4)) {
        x <- purrr::map_dbl(x, ~ abs(.x + rnorm(1, 0, max(.x * 0.05, 0.01))))
    }
    x
}

# wilcoxon test within the same class-------------------------------------------

#' wilcoxon test in each subclass
#'
#' @param subcls character vector, length equal to the number of samples
#' @param cls_hie list, length equal to the number of classes, class hierarchy
#' @param feats_abd numeric vector, abundance profile of a given feature
#' @param feats_name character, feature names
#' @param wilcoxon_cutoff the cutoff for the wilcoxon test, default 0.05
#' @param multicls_strat logical, for multiple class tasks, whether the test is
#' performed in a one-against one (more strict) or in a one-against all
#' setting, default `FALSE`.
#' @param strict multiple testing options, 0 for no correction (default), 1 for
#' independent comparisons, 2 for independent comparison
#' @param sample_min integer, minimum number of samples per subclass for
#' performing wilcoxon test, default 10
#' @param only_same_subcls logical, whether perform the wilcoxon test only
#' among the subclasses with the same name, default `FALSE`
#' @param curv logical, whether perform the wilcoxon test using the
#' Curtis's approach, defalt `FALSE`
#'
#' @noRd
test_rep_wilcoxon <- function(subcls, cls_hie,
    feats_abd,
    feats_name,
    strict = 0,
    wilcoxon_cutoff = 0.05,
    multicls_strat = FALSE,
    sample_min = 10,
    only_same_subcls = FALSE,
    curv = FALSE) {
    # Port of the subclass-wise (repeated) Wilcoxon test of lefse.py: a
    # feature is significant for a class pair only when every subclass
    # comparison is significant AND agrees in direction (sx < sy).
    if (!strict %in% c(0, 1, 2)) {
        stop("`strict` must be 0, 1 or 2")
    }
    cls_nms <- names(cls_hie)
    # all unordered pairs of classes
    pairs <- utils::combn(cls_nms, 2, simplify = FALSE)
    tot_ok <- 0
    all_diff <- list()
    for (pair in pairs) {
        # direction of the first significant comparison; later comparisons
        # must match it
        dir_cmp <- "not_set"
        subcls1 <- cls_hie[[pair[1]]]
        subcls1_n <- length(subcls1)
        subcls2 <- cls_hie[[pair[2]]]
        subcls2_n <- length(subcls2)
        # multiple tests
        # NOTE(review): the corrected cutoff overwrites `wilcoxon_cutoff`
        # inside the pair loop, so later pairs re-correct an already-corrected
        # cutoff — confirm against lefse.py, which recomputes per pair
        if (strict != 0) {
            wilcoxon_cutoff <- ifelse(
                strict == 2,
                wilcoxon_cutoff * subcls1_n * subcls2_n,
                1 - (1 - wilcoxon_cutoff)^(subcls1_n * subcls2_n)
            )
        }
        ok <- 0
        curv_sign <- 0
        first <- TRUE
        for (i in seq_along(subcls1)) {
            br <- FALSE
            for (j in seq_along(subcls2)) {
                # optionally only compare subclasses sharing the same name
                # once the class prefix is stripped
                if (only_same_subcls &&
                    gsub(pair[1], "", subcls1[i]) !=
                        gsub(pair[2], "", subcls2[j])) {
                    ok <- ok + 1
                    next
                }
                cls1_abd <- feats_abd[subcls == subcls1[i]]
                cls2_abd <- feats_abd[subcls == subcls2[j]]
                # too few samples: fall back to comparing medians only
                med_comp <- FALSE
                if (length(cls1_abd) < sample_min ||
                    length(cls2_abd) < sample_min) {
                    med_comp <- TRUE
                }
                sx <- stats::median(cls1_abd)
                sy <- stats::median(cls2_abd)
                # both groups constant and equal: trivially not significant
                if (cls1_abd[1] == cls2_abd[1] &&
                    length(unique(cls1_abd)) == 1 &&
                    length(unique(cls2_abd)) == 1
                ) {
                    tres <- FALSE
                    first <- FALSE
                } else if (!med_comp) {
                    x <- c(cls1_abd, cls2_abd)
                    y <- factor(
                        c(
                            rep(1, length(cls1_abd)),
                            rep(2, length(cls2_abd))
                        )
                    )
                    pv <- coin::wilcox_test(
                        x ~ y,
                        data = data.frame(x, y)
                    )
                    pv <- coin::pvalue(pv)
                    # two-sided comparison against twice the cutoff
                    tres <- pv < wilcoxon_cutoff * 2
                }
                # NOTE(review): when med_comp is TRUE and neither branch above
                # ran, `tres` may be unset; the curv branch below reads it —
                # confirm this path cannot be reached with curv = TRUE
                if (first) {
                    first <- FALSE
                    if (!curv && (med_comp || tres)) {
                        dir_cmp <- sx < sy
                    } else if (curv) {
                        dir_cmp <- NULL
                        if (med_comp || tres) {
                            curv_sign <- curv_sign + 1
                            dir_cmp <- sx < sy
                        }
                    } else {
                        br <- TRUE
                    }
                } else if (!curv && med_comp) {
                    # median-only comparison must agree with the established
                    # direction
                    if ((sx < sy) != dir_cmp || sx == sy) {
                        br <- TRUE
                    }
                } else if (curv) {
                    if (tres && is.null(dir_cmp)) {
                        curv_sign <- curv_sign + 1
                        dir_cmp <- sx < sy
                    }
                    if (tres && dir_cmp != (sx < sy)) {
                        br <- TRUE
                        curv_sign <- curv_sign - 1
                    }
                } else if (!tres || (sx < sy) != dir_cmp || sx == sy) {
                    br <- TRUE
                }
                if (br) {
                    break
                }
                ok <- ok + 1
            }
            if (br) {
                break
            }
        }
        # significant only if every subclass comparison passed (or, with
        # curv, the sign balance is positive)
        diff <- ifelse(
            curv,
            curv_sign > 0,
            ok == subcls1_n * subcls2_n
        )
        if (diff) tot_ok <- tot_ok + 1
        # one-against-one: any failing pair rejects the feature
        if (!diff && multicls_strat) {
            return(FALSE)
        }
        if (diff && !multicls_strat) all_diff <- c(all_diff, pair)
    }
    if (!multicls_strat) {
        # one-against-all: some class must differ from all the others
        tot_k <- length(cls_hie)
        for (k in names(cls_hie)) {
            nk <- 0
            for (a in all_diff) {
                if (k %in% a) nk <- nk + 1
            }
            if (nk == tot_k - 1) {
                return(TRUE)
            }
        }
        return(FALSE)
    }
    return(TRUE)
}

# format input ------------------------------------------------------------

#' format lefse input
#'
#' @param sample_meta a data.frame like object, sample metadata
#' @param cls character variable of class
#' @param subcls character variable of subclass, default `NULL`, no subclass
#'
#' @noRd
#'
#' @return a list, contains class, subclass, and class hierarchy
lefse_format_grp <- function(sample_meta, group, subgroup = NULL) {
    groups <- sample_meta[[group]]
    # NOTE(review): `group_nms` is computed but never used
    group_nms <- unique(groups)
    # without a subclass variable every sample gets a per-class dummy subclass
    if (is.null(subgroup)) {
        subgroups <- paste0(groups, "_subgrp")
    } else {
        subgroups <- paste(groups, sample_meta[[subgroup]], sep = "_")
    }
    # class hierarchy: unique subclasses of each class
    group_hie <- split(subgroups, groups) %>%
        purrr::map(unique)

    return(list(group = groups, subgroup = subgroups, group_hie = group_hie))
}

#' add missing levels, used for summarized taxa
#' @param feature feature data, a [phyloseq::otu_table-class]
#' @noRd
#' @description this function require the row names of
#' `feature` is the summarized taxa
#' @return a data frame, where taxa in rows
add_missing_levels <- function(feature) {
    if (!taxa_are_rows(feature)) {
        feature <- t(feature)
    }
    feature_nms <- row.names(feature)
    feature <- feature@.Data %>% data.frame()

    # the missing feature names: every prefix of each `|`-separated lineage
    feature_nms2 <- strsplit(feature_nms, "|", fixed = TRUE) %>%
        purrr::map(
            ~ Reduce(
                function(x, y) paste(x, y, sep = "|"),
                .x,
                accumulate = TRUE
            )
        )
    unq_nms <- unlist(feature_nms2) %>% unique()
    missing_nms <- setdiff(unq_nms, feature_nms)
    # NOTE(review): this early return yields a data.frame while the normal
    # path below returns an otu_table — confirm callers handle both
    if (length(missing_nms) == 0) {
        return(feature)
    }
    missing_nms_split <- strsplit(missing_nms, split = "|", fixed = TRUE)
    missing_mns_level <- lengths(missing_nms_split)
    missing_level_range <- range(missing_mns_level)

    # only sum the next level of tax, so we need first add the missing tax at
    # the most depth level
    for (i in missing_level_range[2]:missing_level_range[1]) {
        missing_nms_i <- missing_nms[missing_mns_level == i]
        taxs <- row.names(feature)
        # which existing taxa descend from each missing taxon
        indx <- purrr::map(
            missing_nms_i,
            ~ purrr::map_lgl(taxs, function(x) grepl(.x, x, fixed = TRUE))
        )
        # only sum the next level of tax
        feature_nms_level <- strsplit(taxs, split = "|", fixed = TRUE) %>%
            lengths()
        indx <- purrr::map(indx, ~ .x & feature_nms_level == (i + 1))
        # per-sample abundance of each missing taxon: sum of its children
        abd <- purrr::map_df(
            feature,
            ~ purrr::map_dbl(indx, function(x) sum(.x[x]))
        )
        feature <- rbind(feature, abd)
        row.names(feature) <- c(taxs, missing_nms_i)
    }

    otu_table(feature, taxa_are_rows = TRUE)
}

# check whether tax have level prefix, such as `p__`
check_tax_prefix <- function(taxa_nms) {
    prefix <- paste0(c("k", "p", "c", "o", "f", "g", "s"), "__")
    has_prefix <- purrr::map_lgl(
        prefix,
        ~ any(grepl(.x, taxa_nms, fixed = TRUE))
    )
    any(has_prefix)
}

# ======================================================================
# FILE: R/microbiomeMarker.R
# ======================================================================

#' microbiomeMarker: A package for microbiome biomarker discovery
#'
#' The microbiomeMarker package provides several methods to identify microbiome
#' biomarker, such as lefse, deseq2.
#'
#' @docType package
#' @name microbiomeMarker-package
#' @aliases microbiomeMarker-package
#' @importFrom dplyr %>% group_by summarise filter select bind_rows
#' group_split arrange slice mutate desc group_modify ungroup
#' @importFrom phyloseq sample_data otu_table taxa_are_rows
#' transform_sample_counts tax_table taxa_sums prune_taxa phyloseq
#' phy_tree ntaxa nsamples refseq
#' @importClassesFrom phyloseq phyloseq
#' @importFrom rlang .data
#' @importFrom methods setClass setGeneric setMethod
#' @keywords internal
NULL

# ======================================================================
# FILE: R/normalization.R
# ======================================================================

#' Normalize the microbial abundance data
#'
#' It is critical to normalize the feature table to eliminate any bias due to
#' differences in the sampling sequencing depth. This function implements
#' several widely-used normalization methods for microbial compositional data.
#'
#' @param object a matrix, data.frame, [phyloseq::phyloseq-class] or
#' [phyloseq::otu_table-class] object.
#' @param method the methods used to normalize the microbial abundance data.
#' Options includes:
#' * "none": do not normalize.
#' * "rarefy": random subsampling counts to the smallest library size in the
#' data set.
#' * "TSS": total sum scaling, also referred to as "relative abundance", the
#' abundances were normalized by dividing the corresponding sample library
#' size.
#' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference.
#' The scaling factor is then derived using a weighted trimmed mean over
#' the differences of the log-transformed gene-count fold-change between
#' the sample and the reference.
#' * "RLE", relative log expression, RLE uses a pseudo-reference calculated
#' using the geometric mean of the gene-specific abundances over all
#' samples. The scaling factors are then calculated as the median of the
#' gene counts ratios between the samples and the reference.
#' * "CSS": cumulative sum scaling, calculates scaling factors as the
#' cumulative sum of gene abundances up to a data-derived threshold.
#' * "CLR": centered log-ratio normalization.
#' * "CPM": pre-sample normalization of the sum of the values to 1e+06.
#' @param ... other arguments passed to the corresponding normalization
#' methods.
#' @seealso [edgeR::calcNormFactors()],[DESeq2::estimateSizeFactorsForMatrix()],
#' [metagenomeSeq::cumNorm()]
#' @importMethodsFrom BiocGenerics normalize
#' @importFrom phyloseq sample_data<-
#' @exportMethod normalize
#' @aliases normalize,phyloseq-method
#' @rdname normalize-methods
#' @return the same class with `object`.
#' @examples
#' data(caporaso)
#' normalize(caporaso, "TSS")
setMethod(
    "normalize", "phyloseq",
    function(object, method = "TSS", ...) {
        # normalize the otu table, then fold the result (and any norm
        # factors) back into the phyloseq object
        ot <- otu_table(object)
        ot_normed <- normalize(ot, method = method, ...)
        # record the norm factors in the sample data, if any were produced
        norm_factor <- attr(ot_normed, "norm_factor")
        if (!is.null(norm_factor)) {
            sample_data(object) <- cbind(
                sample_data(object),
                norm_factor = norm_factor
            )
        }
        # please note otu_table<- function will drop the norm_factor attribute
        otu_table(object) <- ot_normed
        object
    }
)

#' @importMethodsFrom BiocGenerics normalize
#' @aliases normalize,otu_table-method normalize
#' @rdname normalize-methods
setMethod(
    "normalize", "otu_table",
    function(object, method = "TSS", ...) {
        allowed <- c("none", "rarefy", "TSS", "TMM", "RLE", "CSS", "CLR", "CPM")
        # reject unknown methods up front
        if (!method %in% allowed) {
            stop(
                "`method` must be one of none, rarefy, TSS,",
                " TMM, RLE, CSS, CLR, or CPM",
                call. = FALSE
            )
        }
        # dispatch to the dedicated norm_* helper
        switch(method,
            none = object,
            rarefy = norm_rarefy(object, ...),
            TSS = norm_tss(object),
            TMM = norm_tmm(object, ...),
            RLE = norm_rle(object, ...),
            CSS = norm_css(object, ...),
            CLR = norm_clr(object),
            CPM = norm_cpm(object)
        )
    }
)

#' @importMethodsFrom BiocGenerics normalize
#' @aliases normalize,data.frame-method normalize
#' @rdname normalize-methods
setMethod(
    "normalize", "data.frame",
    function(object, method = "TSS", ...) {
        # delegate to the otu_table method, then convert back
        ot <- otu_table(object, taxa_are_rows = TRUE)
        ot_normed <- normalize(ot, method, ...)
        out <- as.data.frame(ot_normed)
        # carry over the norm factors dropped by the coercion above
        norm_factor <- attr(ot_normed, "norm_factor")
        if (!is.null(norm_factor)) {
            attr(out, "norm_factor") <- norm_factor
        }
        out
    }
)

#' @importMethodsFrom BiocGenerics normalize
#' @aliases normalize,matrix-method normalize
#' @rdname normalize-methods
setMethod(
    "normalize", "matrix",
    function(object, method = "TSS", ...) {
        # route through the data.frame method, then coerce back to a matrix
        df <- as.data.frame(object)
        df_normed <- normalize(df, method, ...)
        out <- as.matrix(df_normed)
        norm_factor <- attr(df_normed, "norm_factor")
        if (!is.null(norm_factor)) {
            attr(out, "norm_factor") <- norm_factor
        }
        out
    }
)

## Four normalization methods do not save the norm factor: value, rarefy,
## clr and tss; where three methods save the norm factor: css, rle, tmm.

#' Normalize feature table by rafefying such that all samples have the same
#' number of total counts (library size).
#'
#' For rarefying, reads in the different samples are randomly removed until
#' the same predefined number has been reached, to assure all samples have the
#' same library size. Rarefying normalization method is the standard in
#' microbial ecology. Please note that the authors of phyloseq do not advocate
#' using this rarefying a normalization procedure, despite its recent
#' popularity
#'
#' @param object a [phyloseq::phyloseq-class] or [phyloseq::otu_table-class]
#' object.
#' @param size,rng_seed,replace,trim_otus,verbose extra arguments passed to
#' [`phyloseq::rarefy_even_depth()`].
#' @export
#' @rdname normalize-methods
#' @aliases norm_rarefy
#' @importFrom phyloseq rarefy_even_depth sample_sums
#' @seealso [`phyloseq::rarefy_even_depth()`]
norm_rarefy <- function(object,
    size = min(sample_sums(object)),
    rng_seed = FALSE,
    replace = TRUE,
    trim_otus = TRUE,
    verbose = TRUE) {
    object_rarefied <- rarefy_even_depth(
        object,
        sample.size = size,
        rngseed = rng_seed,
        replace = replace,
        trimOTUs = trim_otus,
        verbose = verbose
    )
    # do not save the norm_factor
    # the norm factors can be calculated in the subsequently differential
    # analysis method, e.g. edgeR, DESeq
    object_rarefied
}

#' Total-Sum Scaling (TSS) method
#'
#' TSS simply transforms the feature table into relative abundance by dividing
#' the number of total reads of each sample.
#'
#' @param object object a [phyloseq::phyloseq-class] or
#' [phyloseq::otu_table-class] object
#' @export
#' @rdname normalize-methods
#' @aliases norm_tss
#' @importFrom phyloseq otu_table<-
norm_tss <- function(object) {
    otu <- otu_table(object)
    # FIX: library sizes are per-sample totals — columns when taxa are in
    # rows, rows otherwise. The previous version unconditionally used
    # colSums()/MARGIN = 2, which divided by per-taxon totals whenever
    # taxa_are_rows(object) was FALSE.
    if (taxa_are_rows(object)) {
        size <- colSums(otu)
        margin <- 2
    } else {
        size <- rowSums(otu)
        margin <- 1
    }
    otu_normed <- sweep(otu, MARGIN = margin, STATS = size, FUN = "/")
    otu_table(object) <- otu_table(
        otu_normed,
        taxa_are_rows = taxa_are_rows(object)
    )
    # do not save the norm_factor, the norm factors are calculated based on the
    # subsequently differential analysis method, e.g. edgeR, DESeq
    object
}

#' Cumulative-Sum Scaling (CSS) method
#'
#' CSS is based on the assumption that the count distributions in each sample
#' are equivalent for low abundant genes up to a certain threshold. Only the
#' segment of each sample’s count distribution that is relatively invariant
#' across samples is scaled by CSS
#'
#' @param object a [phyloseq::phyloseq-class] or [phyloseq::otu_table-class]
#' object.
#' @param sl The value to scale.
#' @importFrom phyloseq sample_data<-
#' @importFrom metagenomeSeq newMRexperiment cumNorm cumNormStatFast MRcounts
#' @seealso [metagenomeSeq::calcNormFactors()]
#' @export
#' @rdname normalize-methods
#' @aliases norm_css
norm_css <- function(object, sl = 1000) {
    # convert the input to an MRexperiment first
    if (inherits(object, "phyloseq")) {
        mgs <- phyloseq2metagenomeSeq(object)
    } else if (inherits(object, "otu_table")) {
        mgs <- otu_table2metagenomeSeq(object)
    }
    # cumNormStatFast requires counts of all samples at least have two
    # non zero features. Thus, if there are samples with only one non-zer
    # features, cumNormStat is taken to compute the pth quantile.
    counts <- as(otu_table(object), "matrix")
    quantile_fun <- select_quantile_func(counts)
    scaling <- metagenomeSeq::calcNormFactors(mgs, p = quantile_fun(mgs))
    # scale the factors and store them on the returned object
    set_nf(object, unlist(scaling) / sl)
}

#' Relative log expression (RLE) normalization
#'
#' RLE assumes most features are not differential and uses the relative
#' abundances to calculate the normalization factor.
#'
#' @param object a [phyloseq::phyloseq-class] or [phyloseq::otu_table-class]
#' object
#' @param locfunc a function to compute a location for a sample. By default,
#' the median is used.
#' @param type method for estimation: either "ratio"or "poscounts" (recommend).
#' @param geo_means default `NULL`, which means the geometric means of the
#' counts are used. A vector of geometric means from another count matrix can
#' be provided for a "frozen" size factor calculation.
#' @param control_genes default `NULL`, which means all taxa are used for size
#' factor estimation, numeric or logical index vector specifying the taxa
#' used for size factor estimation (e.g. core taxa).
#' @seealso [DESeq2::estimateSizeFactorsForMatrix()]
#' @export
#' @rdname normalize-methods
#' @aliases norm_rle
norm_rle <- function(object,
    locfunc = stats::median,
    type = c("poscounts", "ratio"),
    geo_means = NULL,
    control_genes = NULL) {
    stopifnot(class(object) %in% c("phyloseq", "otu_table"))
    type <- match.arg(type, c("poscounts", "ratio"))
    # `NULL` means "use the DESeq2 default": turn it into the missing
    # argument so estimateSizeFactorsForMatrix() applies its own default.
    # FIX: the previous `ifelse(is.null(x), substitute(), x)` silently
    # truncated a vector `geo_means`/`control_genes` to its first element,
    # because ifelse() returns a value with the shape of its test.
    if (is.null(geo_means)) {
        geo_means <- substitute()
    }
    if (is.null(control_genes)) {
        control_genes <- substitute()
    }
    otu <- as(otu_table(object), "matrix")
    nf <- estimateSizeFactorsForMatrix(
        otu,
        locfunc = locfunc,
        geoMeans = geo_means,
        controlGenes = control_genes,
        type = type
    )
    object_nf <- set_nf(object, nf)
    object_nf
}

# https://github.com/biobakery/Maaslin2/blob/master/R/utility_scripts.R #

#' TMM (trimmed mean of m-values) normalization
#'
#' TMM calculates the normalization factor using a robust statistics based on
#' the assumption that most features are not differential and should, in
#' average, be equal between the samples. The TMM scaling factor is calculated
#' as the weighted mean of log-ratios between each pair of samples, after
#' excluding the highest count OTUs and OTUs with the largest log-fold change.
#'
#' @param object a [phyloseq::phyloseq-class] or [phyloseq::otu_table-class]
#' object
#' @param ref_column column to use as reference
#' @param logratio_trim amount of trim to use on log-ratios
#' @param sum_trim amount of trim to use on the combined absolute levels
#' ("A" values)
#' @param do_weighting whether to compute the weights or not
#' @param Acutoff cutoff on "A" values to use before trimming
#' @seealso [edgeR::calcNormFactors()]
#' @export
#' @rdname normalize-methods
#' @aliases norm_tmm
norm_tmm <- function(object,
    ref_column = NULL,
    logratio_trim = 0.3,
    sum_trim = 0.05,
    do_weighting = TRUE,
    Acutoff = -1e10) {
    # edgeR computes one TMM scaling factor per sample (column)
    otu <- as(otu_table(object), "matrix")
    nf <- edgeR::calcNormFactors(
        otu,
        method = "TMM",
        refcolumn = ref_column,
        logratioTrim = logratio_trim,
        sumTrim = sum_trim,
        doWeighting = do_weighting,
        Acutoff = Acutoff
    )
    object_nf <- set_nf(object, nf)
    object_nf
}

#' CLR (centered log-ratio) normalization
#'
#' In CLR, the log-ratios are computed relative to the geometric mean of all
#' features.
#'
#' @param object a [phyloseq::phyloseq-class] or [phyloseq::otu_table-class]
#' object
#' @export
#' @rdname normalize-methods
#' @aliases norm_clr
norm_clr <- function(object) {
    otu <- as(otu_table(object), "matrix")
    # transform each sample (column) independently
    otu_norm <- apply(otu, 2, trans_clr)
    otu_table(object) <- otu_table(
        otu_norm,
        taxa_are_rows = taxa_are_rows(object)
    )
    # do not save the norm_factor, the norm factors are calculated based on the
    # subsequently differential analysis method, e.g. edgeR, DESeq
    object
}

# from joey711/shiny-phyloseq/blob/master/panels/paneldoc/Transform.md
gm_mean <- function(x, na.rm = TRUE) {
    # The geometric mean, with some error-protection bits.
    # Zeros and NAs are excluded from the log-sum, but the divisor stays
    # length(x), so zero counts still pull the geometric mean down.
    exp(sum(log(x[x > 0 & !is.na(x)]), na.rm = na.rm) / length(x))
}

trans_clr <- function(x, base = exp(1)) {
    # log-ratio relative to the geometric mean; non-finite results (from
    # zero counts) are reset to 0
    x <- log((x / gm_mean(x)), base)
    x[!is.finite(x) | is.na(x)] <- 0.0
    return(x)
}

#' Normalize the sum of values of each sample to million (counts per million)
#'
#' `norm_cpm`: This normalization method is from the original LEfSe algorithm,
#' recommended when very low values are present (as shown in the LEfSe galaxy).
#'
#' @param object a [phyloseq::phyloseq-class] or [phyloseq::otu_table-class]
#' @export
#' @rdname normalize-methods
#' @aliases norm_cpm
#' @importFrom phyloseq transform_sample_counts
norm_cpm <- function(object) {
    otu <- as(otu_table(object), "matrix") %>%
        as.data.frame()
    # whether the object is summarized
    hie <- check_tax_summarize(object)
    if (hie) {
        # summarized taxa contain `|`; the library size is the sum over the
        # top-level (single-rank) taxa only
        features <- row.names(otu)
        features_split <- strsplit(features, "|", fixed = TRUE)
        single_indx <- which(lengths(features_split) < 2)
        ## keep the counts of a sample identical with `normalization`
        ## if we norm the counts in two steps:
        ## 1. calculate scale size: norm_coef = normalization/lib_size;
        ## 2. multiple the scale size value * norm_coef
        ## the counts of a sample colSums(otu) may not equal to the argument
        ## normalization.
        ## e.g. normalization = 1e6, colSums(otu) = 999999
        ## Finally, the kruskal test may be inaccurate,
        ## e.g. https://github.com/yiluheihei/microbiomeMarker/issues/13
        ps_normed <- transform_sample_counts(
            object,
            function(x) x * 1e+06 / sum(x[single_indx])
        )
    } else {
        ps_normed <- transform_sample_counts(
            object,
            function(x) x * 1e+06 / sum(x)
        )
    }
    otu_normed <- data.frame(otu_table(ps_normed))
    # round samples whose values are numerically constant (coefficient of
    # variation below 1e-10); `mean(x)` doubles as a non-zero check
    otu_normed <- purrr::map_df(
        otu_normed,
        function(x) {
            if (mean(x) && stats::sd(x) / mean(x) < 1e-10) {
                return(round(x * 1e6) / 1e6)
            } else {
                return(x)
            }
        }
    )
    otu_normed <- as.data.frame(otu_normed)
    row.names(otu_normed) <- row.names(otu)
    colnames(otu_normed) <- colnames(otu)
    otu_table(object) <- otu_table(otu_normed, taxa_are_rows = TRUE)
    # do not save the norm_factor, the norm factors are calculated based on the
    # subsequently differential analysis method, e.g. edgeR, DESeq
    object
}

# set the norm factors of the object, if object is in phyloseq-class, add a
# var `norm_factor` in the sample_data to save the norm factors of each sample;
# if object in otu_table-class, add a attributes `norm_factor` to the object
# to save the norm factors.
#' @importFrom phyloseq sample_data<-
#' @noRd
set_nf <- function(object, nf) {
    # ensure norm factors from sample_data and attributes of otu_table are
    # identical
    names(nf) <- NULL
    if (inherits(object, "phyloseq")) {
        sample_data(object) <- cbind(sample_data(object), norm_factor = nf)
        # to keep accordance with otu_table,
        # we also add the attributes `norm_factor` to otu_table
        ot <- otu_table(object)
        attr(ot, "norm_factor") <- nf
        otu_table(object) <- ot
    } else if (inherits(object, "otu_table")) {
        attr(object, "norm_factor") <- nf
    } else {
        # NOTE(review): "phloseq" typo in this runtime message is preserved
        stop("object must be a `phloseq` or `otu_table` object")
    }
    object
}

#' Extract the normalization factors
#'
#' This function will be used to extract the normalization factors. After
#' dividing the observed feature table by normalization factors (eliminate
#' sequencing biases), we will obtain the normalized feature table.
#'
#' @param object a [`phyloseq::phyloseq-class`], [phyloseq::otu_table-class]
#' object.
#' @return a numeric vector with the length equal to the number of samples, or
#' `NULL` if the `object` has not been normalized.
#' @noRd
get_norm_factors <- function(object) {
    # phyloseq objects keep the factors in the sample data; otu_tables keep
    # them as an attribute
    if (inherits(object, "phyloseq")) {
        return(sample_data(object)$norm_factor)
    }
    attr(object, "norm_factor")
}

# Deprecated functions ----------------------------------------------------

# This function is deprecated

#' normalize the summarized feature
#' @param feature otu table or data.frame
#' @param normalization set the normalization value
#' @noRd
normalize_feature <- function(feature, normalization) {
    # coerce an otu_table to a taxa-in-rows data frame
    if (inherits(feature, "otu_table")) {
        if (!taxa_are_rows(feature)) {
            feature <- t(feature)
        }
        feature <- data.frame(feature@.Data)
    }
    if (is.null(normalization)) {
        return(feature)
    }

    # summarized (hierarchical) taxa contain `|`; the library size is then
    # the sum over the top-level taxa only
    taxa_split <- strsplit(row.names(feature), "\\|")
    if (any(lengths(taxa_split) > 1)) {
        top_level <- which(lengths(taxa_split) < 2)
        lib_sizes <- purrr::map_dbl(feature, ~ sum(.x[top_level]))
    } else {
        lib_sizes <- purrr::map_dbl(feature, sum)
    }
    scale_coefs <- normalization / lib_sizes
    scaled <- purrr::map2_df(
        feature,
        scale_coefs,
        function(x, y) {
            res <- x * y
            # round numerically-constant samples to suppress tiny float noise
            if (mean(res) && stats::sd(res) / mean(res) < 1e-10) {
                res <- round(res * 1e6) / 1e6
            }
            res
        }
    )
    # for row names setting, phyloseq requires otu_table and tax_table has the
    # same taxa
    scaled <- as.data.frame(scaled)
    row.names(scaled) <- row.names(feature)
    otu_table(scaled, taxa_are_rows = TRUE)
}

# ======================================================================
# FILE: R/plot-abundance.R
# ======================================================================

#' plot the abundances of markers
#'
#' @inheritParams plot_ef_bar
#' @param group character, the variable to set the group
#' @return a [`ggplot2::ggplot`] object.
#' @importFrom ggplot2 ggplot aes geom_boxplot theme_bw element_text
#' @export
#' @examples
#' data(enterotypes_arumugam)
#' mm <- run_limma_voom(
#'     enterotypes_arumugam,
#'     "Enterotype",
#'     contrast = c("Enterotype 3", "Enterotype 2"),
#'     pvalue_cutoff = 0.01,
#'     p_adjust = "none"
#' )
#' plot_abundance(mm, group = "Enterotype")
plot_abundance <- function(mm,
    label_level = 1,
    max_label_len = 60,
    markers = NULL,
    group) {
    stopifnot(inherits(mm, c("microbiomeMarker", "marker_table")))
    sample_meta <- sample_data(mm)
    sample_meta_nms <- names(sample_meta)
    if (!group %in% sample_meta_nms) {
        # FIX: message referenced a nonexistent `group_var` parameter
        stop(
            "`group` must be one of the sample-level variables",
            call. = FALSE
        )
    }
    marker <- marker_table(mm)

    # maker subset white a function here: replicate code in .plot_ef
    if (!is.null(markers)) {
        ind <- match(markers, marker$feature)
        ind_na <- is.na(ind)
        if (all(ind_na)) {
            stop(
                "all the elements of `markers` should",
                " be a contained in marker_table",
                call. = FALSE
            )
        }
        if (any(ind_na)) {
            # FIX: report the missing *marker names*; the original indexed the
            # marker_table (`marker[ind_na]`) and printed the wrong values
            warning(
                paste(markers[ind_na], collapse = " "),
                "are not contained in the `marker_table`"
            )
        }
        # FIX: drop unmatched entries; `marker[ind, ]` with NA indices
        # produced all-NA rows for markers absent from the table
        marker <- marker[ind[!ind_na], ]
        # reorder to keep in increase order of effect size
        marker <- marker[order(marker[[3]]), ]
    }
    abd <- as(otu_table(mm), "matrix")
    marker_abd <- abd[match(marker$feature, row.names(abd)), ] %>%
        as.data.frame()
    groups <- sample_meta[[group]]
    names(groups) <- names(marker_abd)
    marker_abd$feature <- row.names(marker_abd)
    # long format: one row per (feature, sample) abundance
    marker_abd <- tidyr::pivot_longer(
        marker_abd,
        -.data$feature,
        names_to = "sample",
        values_to = "abd"
    )
    marker_abd[[group]] <- groups[match(marker_abd$sample, names(groups))]

    p <- ggplot(
        marker_abd,
        aes(x = .data$abd, y = .data$feature, fill = .data[[group]])
    ) +
        geom_boxplot() +
        labs(x = "Abundance", y = NULL) +
        scale_x_continuous(expand = c(0, 0)) +
        scale_y_discrete(labels = function(x) {
            get_features_labels(x, label_level, max_label_len)
        }) +
        theme_bw()
    p
}

# ======================================================================
# FILE: R/plot-cladogram.R
# ======================================================================

## codes for cladogram plot are modified from microbiomeViz
## https://github.com/lch14forever/microbiomeViz

#' @title plot cladogram of microbiomeMarker results
#'
#' @param mm a [microbiomeMarker-class] object
#' @param color a color vector, used to highlight the clades of microbiome
#' biomarker. The values will be matched in order (usually alphabetical) with
#' the groups. If this is a named vector, then the colors will be matched
#' based on the names instead.
#' @param only_marker logical, whether show all the features or only
#' markers in the cladogram, default `FALSE`.
#' @param branch_size numeric, size of branch, default `0.2`
#' @param alpha alpha parameter for shading, default `0.2`
#' @param clade_label_level max level of taxa used to label the clade, other
#' level of taxa will be shown on the side.
#' @param clade_label_font_size font size of the clade label, default 4.
#' @param node_size_scale the parameter 'a' controlling node size:
#' `node_size=a*log(relative_abundance) + b`
##' @param node_size_offset the parameter 'b' controlling node size:
##' `node_size=a*log(relative_abundance) + b`
##' @param annotation_shape shape used for annotation, default `22`
##' @param annotation_shape_size size used for annotation shape, default `5`
##' @param group_legend_param,marker_legend_param a list specifying
##' extra parameters of group legend and marker legend, such as `direction` (
##' the direction of the guide), `nrow` (the desired number of rows of
##' legends). See [`ggplot2::guide_legend()`] for more details.
#' @return a ggtree object
#' @importFrom tidytree treedata
#' @importFrom ggplot2 geom_point theme element_blank geom_rect guides
#' guide_legend aes_ scale_shape_manual
#' @importFrom ggtree ggtree geom_hilight geom_point2 geom_cladelabel
#' @author Chenhao Li, Guangchuang Yu, Chenghao Zhu, Yang Cao
#' @seealso [ggtree::ggtree()]
#' @export
#' @references This function is modified from `clada.anno` from microbiomeViz.
#' @examples
#' data(kostic_crc)
#' kostic_crc_small <- phyloseq::subset_taxa(
#'     kostic_crc,
#'     Phylum %in% c("Firmicutes")
#' )
#' mm_lefse <- run_lefse(
#'     kostic_crc_small,
#'     wilcoxon_cutoff = 0.01,
#'     group = "DIAGNOSIS",
#'     kw_cutoff = 0.01,
#'     multigrp_strat = TRUE,
#'     lda_cutoff = 4
#' )
#' plot_cladogram(mm_lefse, color = c("darkgreen", "red"))
plot_cladogram <- function(mm,
    color,
    only_marker = FALSE,
    branch_size = 0.2,
    alpha = 0.2,
    node_size_scale = 1,
    node_size_offset = 1,
    clade_label_level = 4,
    clade_label_font_size = 4,
    annotation_shape = 22,
    annotation_shape_size = 5,
    group_legend_param = list(),
    marker_legend_param = list()) {
    # build the taxa tree from the (optionally marker-only) phyloseq object
    ps <- create_ps_from_mm(mm, only_marker = only_marker)
    tree <- get_treedata_phyloseq(ps) %>%
        generate_taxa_tree(size = branch_size)
    annotation <- generate_cladogram_annotation(
        mm@marker_table,
        color = color
    )
    # background highlight
    # join annotation rows with their tree nodes to obtain node id and level
    annotation_info <- dplyr::left_join(
        annotation,
        tree$data,
        by = c("node" = "label")
    ) %>%
        mutate(
            label = .data$node,
            id = .data$node.y,
            level = as.numeric(.data$node_class)
        )
    hilight_para <- dplyr::transmute(
        annotation_info,
        node = .data$id,
        fill = .data$color,
        alpha = alpha,
        extend = get_offset(.data$level)
    )
    # one geom_hilight layer per annotated clade, folded onto the tree
    hilights_g <- purrr::pmap(hilight_para, geom_hilight)
    tree <- purrr::reduce(hilights_g, `+`, .init = tree)

    # hilight legend
    # default: colors were matched for in alphabetical of groups, which requires
    # arrange hilights_df according to enrich_group
    hilights_df <- dplyr::distinct(
        annotation_info,
        .data$enrich_group,
        .data$color
    ) %>%
        arrange(.data$enrich_group)
    # dummy zero-area rectangle: only drawn to create the fill legend
    hilights_df$x <- 0
    hilights_df$y <- 1
    group_legend_param <- c(
        group_legend_param,
        list(
            title = NULL,
            order = 1,
            override.aes = list(fill = hilights_df$color)
        )
    )
    group_lgd <- do.call(guide_legend, group_legend_param)
    tree <- tree +
        geom_rect(
            aes_(xmin = ~x, xmax = ~x, ymax = ~y, ymin = ~y,
                fill = ~enrich_group),
            data = hilights_df,
            inherit.aes = FALSE
        ) +
        guides(fill = group_lgd)

    # set nodes color and size
    nodes_colors <- rep("white", nrow(tree$data))
    nodes_colors[annotation_info$id] <- annotation_info$color
    # node_size = a * log(relative_abundance) + b (see roxygen above)
    node_size <- node_size_scale * log(tree$data$abd) + node_size_offset
    tree$data$node_size <- node_size
    tree <- tree +
        geom_point2(aes(size = I(node_size)), fill = nodes_colors, shape = 21)

    ## add clade labels
    clade_label <- dplyr::transmute(
        annotation_info,
        node = .data$id,
        offset = get_offset(.data$level) - 0.4,
        angle = purrr::map_dbl(.data$id, get_angle, tree = tree) + 90,
        label = .data$label,
        fontsize = clade_label_font_size,
        barsize = 0,
        hjust = 0.5,
        level = .data$level
    ) %>%
        dplyr::arrange(desc(.data$level))
    # taxa below clade_label_level get short letter ids on the tree and a
    # guide legend on the side
    ind <- clade_label$level < clade_label_level
    short_label <- get_short_label_id(clade_label, clade_label_level)
    clade_label_para <- mutate(
        clade_label,
        label = c(.data$label[!ind], short_label),
        level = NULL
    )
    clade_label_g <- purrr::pmap(clade_label_para, geom_cladelabel)
    tree <- purrr::reduce(clade_label_g, `+`, .init = tree)

    ## add guide labels
    guide_label <- clade_label[ind, ] %>%
        mutate(
            label2 = paste0(short_label, ": ", .data$label),
            color = annotation_info$color[
                match(.data$label, annotation_info$label)]
        )
    # marker annotation, legend
    marker_legend_param <- c(
        marker_legend_param,
        list(
            p = tree,
            color = guide_label$color,
            label = guide_label$label2,
            shape = annotation_shape,
            size = annotation_shape_size
        )
    )
    p <- do.call(set_marker_annotation, marker_legend_param) +
        theme(legend.position = "right", legend.title = element_blank())
    p
}

#' Get short label id
#' @keywords internal
#' @noRd
get_short_label_id <- function(clade_label,
clade_label_level) { ind <- clade_label$level < clade_label_level unique_id <- get_unique_id(sum(ind)) short_label <- unique_id[seq_len(sum(ind))] short_label } #' Get unique id for short label annotation #' {so}/questions/21681785/repeating-vector-of-letters/21689613#21689613 #' @keywords internal #' @noRd get_unique_id <- function(n, depth = 1) { args <- lapply(seq_len(depth), FUN = function(x) letters) x <- do.call(expand.grid, args = list(args, stringsAsFactors = FALSE)) x <- x[, rev(names(x)), drop = FALSE] x <- do.call(paste0, x) if (n <= length(x)) { return(x[seq_len(n)]) } return(c(x, get_unique_id(n - length(x), depth = depth + 1))) } #' Generate tree data from phyloseq object #' @param ps a [`phyloseq::phyloseq-class`] object #' @param sep character, separate between different levels of taxa, default `|` #' @author Yang Cao #' @return a [`tidytree::treedata-class`] object #' @keywords internal get_treedata_phyloseq <- function(ps, sep = "|") { if (!taxa_are_rows(ps)) { stop("Requires taxa in rows of phyloseq") } taxa <- tax_table(ps) otu <- otu_table(ps) row.names(otu) <- taxa@.Data[, 1] taxa_nms <- row.names(otu) tree_table <- data.frame( taxa = taxa_nms, abd = rowMeans(otu), stringsAsFactors = FALSE ) %>% mutate( taxa = paste("r__Root", .data$taxa, sep = "|"), abd = .data$abd / max(.data$abd) * 100 ) taxa_split <- strsplit(tree_table$taxa, split = sep, fixed = TRUE) nodes <- purrr::map_chr(taxa_split, utils::tail, n = 1) # add root node nodes <- c("r__Root", nodes) ## data may not contain all the seven ranks of the taxa, such as ## enterotypes_arumugam only contains Phylum and Genus ranks taxa_deepest <- taxa_split[[which.max(lengths(taxa_split))]] prefix <- vector("character", length(taxa_deepest)) for (i in seq_along(taxa_deepest)) { if (!grepl("__$", taxa_deepest[i])) { prefix[i] <- gsub("(.*)__.*", "\\1", taxa_deepest[i]) } else { pos <- nchar(taxa_deepest[i]) - 2 prefix[i] <- substr(taxa_deepest[i], pos, pos) } } levels <- purrr::map_chr(nodes, ~ 
gsub("__.*$", "", .x)) %>% factor(levels = rev(prefix)) nodes_parent <- purrr::map_chr( taxa_split, ~ .x[length(.x) - 1] ) # root must be a parent node nodes_parent <- c("root", nodes_parent) ## tips comes first ? is_tip <- !nodes %in% nodes_parent index <- vector("integer", length(is_tip)) index[is_tip] <- seq_len(sum(is_tip)) index[!is_tip] <- (sum(is_tip) + 1):length(is_tip) edges <- cbind( parent = index[match(nodes_parent, nodes)], child = index ) edges <- edges[!is.na(edges[, 1]), ] # not label the tips node_label <- nodes[!is_tip] phylo <- structure( list( edge = edges, node.label = node_label, tip.label = nodes[is_tip], edge.length = rep(1, nrow(edges)), Nnode = length(node_label) ), class = "phylo" ) mapping <- data.frame( node = index, abd = c(100, tree_table$abd), node_label = nodes, stringsAsFactors = FALSE ) mapping$node_class <- levels tidytree::treedata(phylo = phylo, data = tibble::as_tibble(mapping)) } #' generate taxa hierarchy tree #' @noRd generate_taxa_tree <- function(treedata, size = 0.2, layout = "circular") { ggtree::ggtree(treedata, size = size, layout = layout) } #' generate annotaion data for cladogram plot #' @param marker data.frame #' @param color a color vector, used to highlight the clades of ggtree #' @param sep seprator between different of levels of taxa #' @noRd generate_cladogram_annotation <- function(marker, color, sep = "|") { enrich_group <- marker$enrich_group if (length(color) != length(unique(enrich_group))) { stop("the number of colors must be equal to ", "the number of enriched groups.") } feature <- marker$feature label <- strsplit(feature, split = sep, fixed = TRUE) %>% purrr::map_chr(utils::tail, n = 1) label_level <- lengths(strsplit(feature, sep, fixed = TRUE)) # may be no marker are identified enriched in some groups # drop the levels of this groups if the enrich_group is a factor if (inherits(enrich_group, "factor")) { enrich_group <- droplevels(enrich_group) } # named colors: set the colors based on the matched 
names to groups if (is.vector(color) && !is.null(names(color))) { if (!all(names(color) %in% enrich_group)) { stop("names of `color` muste be contained in enriched groups") } color <- color[match(enrich_group, names(color))] } else { # colors will be matched in order (usually alphabetical) with the groups names(color) <- sort(unique(enrich_group)) color <- color[match(enrich_group, names(color))] } annotation <- data.frame( node = label, color = color, enrich_group = enrich_group, stringsAsFactors = FALSE ) annotation } #' get clade background offset #' @noRd get_offset <- function(x) { (x * 0.2 + 0.2)^2 } #' get the mean angle of a clade #' @noRd get_angle <- function(tree, node) { if (length(node) != 1) { stop("The length of `node` must be 1") } tree_data <- tree$data sp <- tidytree::offspring(tree_data, node)$node sp2 <- c(sp, node) sp.df <- tree_data[match(sp2, tree_data$node), ] mean(range(sp.df$angle)) } #' set legend for multiple geom_cladelabel layers #' #' This function can be used to set the microbiome marker annotations #' #' @param p a ggtree object #' @param color a color vector #' @param label a character vector, with the same length with `color` #' @param shape shape of label, default `22` #' @param size size of shape, default `5` #' @param ... extra arguments passed to [ggplot2::guide_legend()], #' e.g. `ncol`, more details see [ggplot2::guide_legend()]. #' @seealso [ggplot2::guide_legend()] #' @return an updated `ggtree` object #' @importFrom ggplot2 geom_point aes_ scale_shape_manual guides guide_legend #' @noRd set_marker_annotation <- function(p, color, label, size = 5, shape = 22, ...) { dat <- data.frame( color = color, label = label, stringsAsFactors = FALSE ) # suppress warning: The shape palette can deal with a maximum of 6 discrete # values because more than 6 becomes difficult to discriminate; you have 18. # Consider specifying shapes manually if you must have them. 
# using scale_shape_manual p <- p + geom_point( data = dat, inherit.aes = FALSE, aes_(x = 0, y = 0, shape = ~label), size = 0, stroke = 0, ) + scale_shape_manual(values = rep(shape, nrow(dat)), limits = dat$label) + guides( shape = guide_legend( override.aes = list( size = size, shape = shape, fill = dat$color ), order = 2, ... ) ) p } ================================================ FILE: R/plot-comparing.R ================================================ #' Plotting DA comparing result #' #' @param x an `compareDA` object, output from [`compare_DA()`]. #' @param sort character string specifying sort method. Possibilities are #' "score" which is calculated as \eqn{(auc - 0.5) * power - fdr}, "auc" for #' area under the ROC curve, "fpr" for false positive rate, "power" for #' empirical power. #' @param ... extra arguments, just ignore it. #' #' @importFrom ggplot2 ggplot aes_string geom_point geom_boxplot facet_wrap #' labeller labs #' #' @return a [`ggplot2::ggplot`] object containing 4 subplots: "auc", "fdr", #' "power"and "score" plot. #' #' @export plot.compareDA <- function(x, sort = c("score", "auc", "fpr", "power"), ...) 
{ sort <- match.arg(sort, c("score", "auc", "fpr", "power")) metrics <- x$metrics calls <- metrics$call new_metrics <- metrics[c('auc', 'fpr', 'power', 'fdr')] metric_med <- stats::aggregate(new_metrics, by = list(call = calls), FUN = median) metric_med$score <- (metric_med$auc - 0.5) * metric_med$power - metric_med$fdr metric_med$method <- metrics$method[match(metric_med$call, metrics$call)] metric_med <- metric_med[order(metric_med$score, decreasing = TRUE), ] sort_metric_method <- function(df, sort) { is_desc <- ifelse(sort == "fdr", FALSE, TRUE) df <- df[order(df[[sort]], decreasing = is_desc), ] method_lvl <- df[["method"]] method_lvl } method_lvl <- sort_metric_method(metric_med, sort) metrics$method <- factor(metrics$method, levels = method_lvl) metrics$score <- (metrics$auc - 0.5) * metrics$power - metrics$fdr metrics <- tidyr::pivot_longer(metrics, cols = tidyr::one_of("score", "fdr", "auc", "power")) facet_labs <- c("Score", "Area Under the Curve", "Power", "False Discovery Rate") names(facet_labs) <- c("score", "auc", "power", "fdr") p <- ggplot(metrics, aes_string("method", "value")) + geom_point() + geom_boxplot() + facet_wrap(c("name"), ncol = 1, labeller = labeller(.rows = facet_labs)) + labs(x = NULL, y = NULL) p } ================================================ FILE: R/plot-effect-size.R ================================================ #' bar and dot plot of effect size of microbiomeMarker data #' #' bar and dot plot of effect size microbiomeMarker data. This function returns #' a `ggplot2` object that can be saved or further customized using **ggplot2** #' package. #' #' @param mm a [`microbiomeMarker-class`] object #' @param label_level integer, number of label levels to be displayed, default #' `1`, `0` means display the full name of the feature #' @param max_label_len integer, maximum number of characters of feature label, #' default `60` #' @param markers character vector, markers to display, default `NULL`, #' indicating plot all markers. 
#' @importFrom ggplot2 ggplot aes geom_col labs scale_x_continuous theme_bw
#' scale_y_discrete guide_axis
#' @return a ggplot object
#' @export
#' @rdname effect_size-plot
#' @aliases ef-barplot,ef-dotplot
#' @examples
#' data(enterotypes_arumugam)
#' mm <- run_limma_voom(
#'     enterotypes_arumugam,
#'     "Enterotype",
#'     contrast = c("Enterotype 3", "Enterotype 2"),
#'     pvalue_cutoff = 0.01,
#'     p_adjust = "none"
#' )
#' plot_ef_bar(mm)
plot_ef_bar <- function(mm,
    label_level = 1,
    max_label_len = 60,
    markers = NULL) {
    .plot_ef(mm, label_level, max_label_len, markers, "bar")
}

#' @rdname effect_size-plot
#' @export
plot_ef_dot <- function(mm,
    label_level = 1,
    max_label_len = 60,
    markers = NULL) {
    .plot_ef(mm, label_level, max_label_len, markers, "dot")
}

# plot of effect size: shared worker for plot_ef_bar/plot_ef_dot
.plot_ef <- function(mm,
    label_level = 1,
    max_label_len = 60,
    markers = NULL,
    type = c("bar", "dot")) {
    stopifnot(inherits(mm, c("microbiomeMarker", "marker_table")))
    type <- match.arg(type, c("bar", "dot"))
    marker <- marker_table(mm)
    # effect size names, the third var of marker_table: prefix with "ef_"
    orig_ef_nm <- gsub("ef_", "", names(marker)[3])
    names(marker)[3] <- "effect_size"

    # labels of x
    # effect size: lda for lefse, diff_mean for classical test, logFC for
    # metagenomeSeq, DESeq2, edgeR
    if (orig_ef_nm == "lda") {
        label_x <- "LDA score (log10)"
    } else if (orig_ef_nm == "diff_mean") {
        label_x <- "Differential means"
    } else if (orig_ef_nm == "logFC") {
        label_x <- "log2 Fold Change"
    } else if (orig_ef_nm == "eta_squared") {
        label_x <- "Eta squared"
    } else if (orig_ef_nm == "CLR_diff_mean") {
        label_x <- "CLR differential means"
    } else if (orig_ef_nm == "CLR_F_statistic") {
        label_x <- "CLR F statistic"
    } else if (orig_ef_nm == "W") {
        label_x <- "W"
    } else if (orig_ef_nm == "imp") {
        label_x <- "Importance score"
    } else if (orig_ef_nm == "LR") {
        label_x <- "Likelihood ratio statistic"
    } else if (orig_ef_nm == "F") {
        label_x <- "F statistic"
    } else {
        stop(
            "The effect size must be one of lda, diff_mean, eta_squared, ",
            "logFC, clr_diff_mean, clr_F_statistic, W, imp, LR or F"
        )
    }

    # maker subset
    if (!is.null(markers)) {
        ind <- match(markers, marker$feature)
        ind_na <- is.na(ind)
        if (all(ind_na)) {
            stop(
                "all the elements of `markers` should be a contained in ",
                "`marker_table`",
                call. = FALSE
            )
        }
        if (any(ind_na)) {
            warning(
                paste(marker[ind_na], collapse = " "),
                "are not contained in the `marker_table`"
            )
        }
        marker <- marker[ind, ]
    }

    # increase order in each group
    marker <- dplyr::arrange(
        data.frame(marker),
        .data$enrich_group,
        .data$effect_size
    )
    feat <- marker$feature
    # lock the y-axis ordering via factor levels
    marker$feature <- factor(feat, levels = feat)

    # NOTE(review): `any()` passes if either column is present; `all()` looks
    # intended here — confirm before changing
    nms_check <- any(c("feature", "enrich_group") %in% names(marker))
    if (!nms_check) {
        stop("`marker_table` must contains variable `feature` ",
            "and `enrich_group`")
    }

    if (type == "bar") {
        p <- ggplot(
            marker,
            aes(.data$effect_size, .data$feature, fill = .data$enrich_group)
        ) +
            geom_col() +
            labs(x = label_x, y = NULL, fill = "Enriched group") +
            scale_x_continuous(expand = c(0, 0)) +
            scale_y_discrete(labels = function(x) {
                get_features_labels(x, label_level, max_label_len)
            }) +
            theme_bw()
    } else {
        p <- ggplot(
            marker,
            aes(.data$effect_size, .data$feature, color = .data$enrich_group)
        ) +
            geom_point() +
            labs(
                x = label_x,
                y = NULL,
                color = "Enriched group"
            ) +
            scale_y_discrete(labels = function(x) {
                get_features_labels(x, label_level, max_label_len)
            }) +
            theme_bw()
        # scale dot size by -log10(padj) when adjusted p values are available
        if ("padj" %in% names(marker)) {
            marker$logp <- -log10(marker$padj)
            p <- p +
                geom_point(data = marker, aes(size = .data$logp)) +
                labs(size = "-log10(pvalue)")
        }
    }

    p
}

#' get the labels of markers which will be used in the barplot
#' @noRd
get_features_labels <- function(features, label_level, max_label_len) {
    purrr::map_chr(
        features,
        ~ get_feature_label(.x, label_level, max_label_len)
    )
}

#' get the label of a single feature: keep the last `label_level` ranks of the
#' lineage and abbreviate to at most `max_label_len` characters
#' @noRd
get_feature_label <- function(feature,
    label_level = 1,
    max_label_len = 60,
    sep = "|") {
    if (length(feature) != 1) {
        stop("`feature` muste be a character vector of length 1")
    }
    if (label_level == 0) {
        # 0 means keep the full lineage untouched
        feature <- feature
    } else {
        feature <- strsplit(feature, split = sep, fixed = TRUE) %>%
            unlist() %>%
            rev()
        feature_level <- length(feature)
        feature <- ifelse(
            label_level > feature_level,
            paste(rev(feature[seq_len(feature_level)]), collapse = sep),
            paste(rev(feature[seq_len(label_level)]), collapse = sep)
        )
    }
    feature_len <- nchar(feature)
    # abbreviate overly long labels to "<head>..<tail>"
    if (feature_len > max_label_len) {
        feature_letters <- unlist(strsplit(feature, ""))
        feature <- paste(
            paste(feature_letters[seq_len(max_label_len / 2 - 2)],
                collapse = ""),
            "..",
            paste(feature_letters[
                (feature_len - max_label_len / 2 + 3):feature_len],
                collapse = ""),
            sep = ""
        )
    }
    # replace "Unknown" label in the species level as "sp."
    feature <- replace_unknown_species(feature)

    feature
}

# replace "Unknown" label in the species level as "sp."
# NOTE(review): `taxa_lvl` is a length-1 list, so `n_lvl` is always 1 and
# `sp` is the full split vector; also `gsub()` is applied to `feature`, not
# `sp`, so "Unknown" is replaced at ANY rank, not only species — confirm
# whether that is the intended behavior before refactoring
replace_unknown_species <- function(feature, sep = "|") {
    species_hased <- grepl("s__", feature, fixed = TRUE)
    if (!species_hased) {
        return(feature)
    }
    taxa_lvl <- strsplit(feature, sep, fixed = TRUE)
    n_lvl <- length(taxa_lvl)
    sp <- taxa_lvl[[n_lvl]]
    sp <- gsub("Unknown", "sp.", feature, fixed = TRUE)
    taxa_lvl[[n_lvl]] <- sp
    feature <- paste(taxa_lvl, collapse = sep)

    feature
}


================================================
FILE: R/plot-heatmap.R
================================================
#' Heatmap of microbiome marker
#'
#' Display the microbiome marker using heatmap, in which rows represents the
#' marker and columns represents the samples.
#'
#' @inheritParams plot_abundance
#' @param transform transformation to apply, for more details see
#' [`transform_abundances()`]:
#' * "identity", return the original data without any transformation.
#' * "log10", the transformation is `log10(object)`, and if the data contains
#'   zeros the transformation is `log10(1 + object)`.
#' * "log10p", the transformation is `log10(1 + object)`.
#' @param cluster_marker,cluster_sample logical, controls whether to perform
#' clustering in markers (rows) and samples (cols), default `FALSE`.
#' @param sample_label logical, controls whether to show the sample labels in
#' the heatmap, default `FALSE`.
#' @param scale_by_row logical, controls whether to scale the heatmap by the
#' row (marker) values, default `FALSE`.
#' @param annotation_col assign colors for the top annotation using a named
#' vector, passed to `col` in [`ComplexHeatmap::HeatmapAnnotation()`].
#' @param ... extra arguments passed to [`ComplexHeatmap::Heatmap()`].
#' @export
#' @seealso [`transform_abundances`],[`ComplexHeatmap::Heatmap()`]
#' @importFrom ComplexHeatmap Heatmap HeatmapAnnotation
#' @return a [`ComplexHeatmap::Heatmap-class`] object.
#' @examples
#' data(kostic_crc)
#' kostic_crc_small <- phyloseq::subset_taxa(
#'     kostic_crc,
#'     Phylum %in% c("Firmicutes")
#' )
#' mm_lefse <- run_lefse(
#'     kostic_crc_small,
#'     wilcoxon_cutoff = 0.01,
#'     group = "DIAGNOSIS",
#'     kw_cutoff = 0.01,
#'     multigrp_strat = TRUE,
#'     lda_cutoff = 4
#' )
#' plot_heatmap(mm_lefse, group = "DIAGNOSIS")
plot_heatmap <- function(mm,
    transform = c("log10", "log10p", "identity"),
    cluster_marker = FALSE,
    cluster_sample = FALSE,
    markers = NULL,
    label_level = 1,
    max_label_len = 60,
    sample_label = FALSE,
    scale_by_row = FALSE,
    annotation_col = NULL,
    group,
    ...) {
    stopifnot(inherits(mm, c("microbiomeMarker", "marker_table")))
    stopifnot(is.logical(sample_label))
    stopifnot(is.logical(cluster_marker))
    stopifnot(is.logical(cluster_sample))
    transform <- match.arg(transform, c("log10", "log10p", "identity"))
    mm <- transform_abundances(mm, transform = transform)
    marker <- marker_table(mm)

    # marker subset — TODO: extract into a shared helper (same logic as in
    # .plot_ef)
    if (!is.null(markers)) {
        ind <- match(markers, marker$feature)
        ind_na <- is.na(ind)
        if (all(ind_na)) {
            stop(
                "all the elements of `markers`",
                " should be a contained in marker_table",
                call. = FALSE
            )
        }
        if (any(ind_na)) {
            warning(
                paste(marker[ind_na], collapse = " "),
                "are not contained in the `marker_table`"
            )
        }
        marker <- marker[ind, ]
        # reorder to keep in increase order of effect size
        marker <- marker[order(marker$effect_size), ]
    }

    abd <- as(otu_table(mm), "matrix")
    if (scale_by_row) {
        abd <- scale_rows(abd)
    }
    # rows of the heatmap follow the marker_table ordering
    marker_abd <- abd[match(marker$feature, row.names(abd)), ] %>%
        as.data.frame()
    # set the feature label
    labels <- get_features_labels(
        row.names(marker_abd),
        label_level,
        max_label_len
    )
    row.names(marker_abd) <- labels

    # reorder columns so samples of the same group sit together
    groups <- sample_data(mm)[[group]]
    group_lvl <- unique(groups)
    idx <- lapply(group_lvl, function(x) which(groups == x))
    marker_abd <- marker_abd[, unlist(idx)]
    column_nms <- rep(group_lvl, times = lengths(idx))
    if (!sample_label) {
        colnames(marker_abd) <- NULL
    }

    # legend title reflects the applied transformation/scaling
    if (transform == "identity") {
        lgd_title <- "Abundance"
    } else {
        lgd_title <- "Log10 Abundance"
    }
    if (scale_by_row) {
        lgd_title <- "Row Z-score"
    }

    if (!is.null(annotation_col)) {
        annotation_col <- list(Group = annotation_col)
    }
    p <- Heatmap(
        as.matrix(marker_abd),
        cluster_rows = cluster_marker,
        cluster_columns = cluster_sample,
        top_annotation = HeatmapAnnotation(
            Group = column_nms,
            col = annotation_col
        ),
        name = lgd_title,
        ...
    )
    p
}

#' Scale the heatmap by the row (marker) values: per-row z-score
#' @keywords internal
#' @noRd
scale_rows <- function(x) {
    m <- apply(x, 1, mean, na.rm = TRUE)
    s <- apply(x, 1, sd, na.rm = TRUE)
    return((x - m) / s)
}


================================================
FILE: R/plot-postHocTest.R
================================================
#' `postHocTest` plot
#'
#' Visualize the result of post-hoc test using ggplot2
#'
#' @param pht a [`postHocTest-class`] object
#' @param feature character, to plot the post-hoc test result of this feature
#' @param step_increase numeric vector with the increase in fraction of total
#' height for every additional comparison to minimize overlap, default `0.12`.
#' @name plot_postHocTest
#' @return a `ggplot` object
#' @importFrom dplyr filter
#' @importFrom ggplot2 ggplot aes labs geom_errorbar geom_point geom_boxplot
#' @export
#' @examples
#' data(enterotypes_arumugam)
#' ps <- phyloseq::subset_samples(
#'     enterotypes_arumugam,
#'     Enterotype %in% c("Enterotype 3", "Enterotype 2", "Enterotype 1")
#' ) %>%
#'     phyloseq::subset_taxa(Phylum == "Bacteroidetes")
#' pht <- run_posthoc_test(ps, group = "Enterotype")
#' plot_postHocTest(pht, feature = "p__Bacteroidetes|g__Alistipes")
plot_postHocTest <- function(pht, feature, step_increase = 0.12) {
    # reshape the abundance slot to long format: one row per (sample, feature)
    long_abd <- tidyr::pivot_longer(
        pht@abundance,
        -.data$group,
        names_to = "feat"
    )
    if (!is.null(feature)) {
        long_abd <- filter(long_abd, .data$feat %in% feature)
    }

    # positions and labels of the significance bars for the requested feature
    sig_annot <- get_sig_annotation(pht, step_increase = step_increase)
    feature_annot <- sig_annot[sig_annot$feature %in% feature, ]

    # boxplot of abundance by group with the significance bars layered on top
    box_plot <- suppressWarnings(
        ggplot(long_abd, aes(x = .data$group, y = .data$value)) +
            geom_boxplot() +
            ggsignif::geom_signif(
                data = feature_annot,
                aes(
                    xmin = .data$xmin,
                    xmax = .data$xmax,
                    annotations = .data$annotation,
                    y_position = .data$y_position),
                manual = TRUE,
                textsize = 3,
                vjust = 0.2
            ) +
            labs(x = NULL, y = "Abundance")
    )

    # mean difference and confidence interval for each pairwise comparison
    res_df <- as.data.frame(pht@result[[feature]])
    ci_plot <- ggplot(res_df, aes(x = .data$comparisons)) +
        geom_errorbar(
            aes(ymin = .data$ci_lower, ymax = .data$ci_upper),
            width = 0.2
        ) +
        geom_point(aes(y = .data$diff_mean)) +
        labs(x = NULL, y = "95% confidence intervals")

    # combine the two panels side by side
    patchwork::wrap_plots(box_plot + ci_plot)
}

#' get the annotation used for setting the location of the significance bars
#'
#' @param pht a [postHocTest-class] object.
#' @param step_increase numeric vector with the increase in fraction of total
#' height for every additional comparison to minimize overlap, default `0.12`.
#' @noRd
get_sig_annotation <- function(pht, step_increase = 0.12) {
    abd <- pht@abundance
    group <- abd$group
    # drop the group column so only feature abundances remain
    abd <- dplyr::mutate(abd, group = NULL)
    pht_res <- pht@result
    # one annotation data frame per feature, row-bound together
    sig_annotation <- purrr::map2_df(
        abd, as.list(pht_res),
        ~ get_sig_annotation_single(.x, .y,
            group = group,
            step_increase = step_increase
        )
    )
    sig_annotation$feature <- rep(names(abd), each = nrow(pht_res[[1]]))

    sig_annotation
}

#' get the annotation data frame of a single feature
#'
#' @param abd numeric vector, abundance of a given feature
#' @param pht_df `data.frame` or `DFrame`, post hoc test result of a given
#' feature
#' @param group character vector the same length with `abd`, the group of the
#' samples
#' @param step_increase numeric vector with the increase in fraction of total
#' height for every additional comparison to minimize overlap, default `0.12`.
#' @return a data frame with four variables, `start`, `end`, `y_position`,
#' and `annotation`.
#' @noRd
get_sig_annotation_single <- function(abd,
    pht_df,
    group,
    step_increase = 0.12) {
    if (inherits(pht_df, "DFrame")) {
        pht_df <- as.data.frame(pht_df)
    }
    # per-group maximum abundance, used as the base height of each bar
    y_max <- split(abd, group) %>%
        purrr::map(range) %>%
        purrr::map_dbl(2)
    y_range <- max(abd) - min(abd)
    comps <- strsplit(pht_df$comparisons, "-", fixed = TRUE)
    start <- purrr::map_chr(comps, 1)
    end <- purrr::map_chr(comps, 2)
    y_max <- purrr::map2_dbl(start, end, ~ max(y_max[.x], y_max[.y]))
    # NOTE(review): seq_len(3) hard-codes three comparisons (i.e. exactly
    # three groups); more groups would yield more rows in pht_df — confirm
    y_pos <- purrr::map_dbl(
        seq_len(3),
        ~ y_max[.x] + y_range * step_increase * (.x - 1)
    )
    annotate_df <- data.frame(
        xmin = start,
        xmax = end,
        y_position = y_pos,
        annotation = pvalue2siglevel(pht_df$pvalue),
        stringsAsFactors = FALSE
    )

    annotate_df
}

#' covert p value to significance level
#'
#' <= 0.001 "\\*\\*\\*", <= 0.01 "\\*\\*", <=0.05 "\\*", > 0.05 "NS."
#' NOTE(review): the first assignment coerces `p` to character, so the later
#' comparisons are lexicographic string comparisons; they happen to work for
#' typical decimal-formatted p values but are fragile (e.g. scientific
#' notation) — consider computing the labels before any assignment
#' @noRd
#'
pvalue2siglevel <- function(p) {
    p[p <= 0.001] <- "***"
    p[p <= 0.01 & p > 0.001] <- "**"
    p[p > 0.01 & p <= 0.05] <- "*"
    p[p > 0.05] <- "NS."

    p
}


================================================
FILE: R/plot-sl-roc.R
================================================
#' ROC curve of microbiome marker from supervised learning methods
#'
#' Show the ROC curve of the microbiome marker calculated by `run_sl`.
#'
#' @param mm a [microbiomeMarker-class] object.
#' @param group,nfolds,nrepeats,tune_length,... same with the `run_sl()`.
#'
#' @importFrom plotROC geom_roc style_roc calc_auc
#' @importFrom ggplot2 coord_equal annotate
#' @return a [`ggplot2::ggplot`] object.
#' @seealso [`run_sl()`]
#' @export
#' @examples
#' data(enterotypes_arumugam)
#' # small example phyloseq object for test
#' ps_s <- phyloseq::subset_taxa(
#'     enterotypes_arumugam,
#'     Phylum %in% c("Firmicutes", "Bacteroidetes")
#' )
#'
#' set.seed(2021)
#' mm <- run_sl(
#'     ps_s,
#'     group = "Gender",
#'     taxa_rank = "Genus",
#'     nfolds = 2,
#'     nrepeats = 1,
#'     top_n = 15,
#'     norm = "TSS",
#'     method = "LR",
#' )
#' plot_sl_roc(mm, group = "Gender")
plot_sl_roc <- function(mm,
    group,
    nfolds = 3,
    nrepeats = 3,
    tune_length = 5,
    ...) {
    # sl method: map the stored method description back to the caret model
    diff_method <- mm@diff_method
    diff_method <- switch(diff_method,
        "logistic regression" = "LR",
        "random forest" = "RF",
        "support vector machine" = "SVM"
    )
    train_method <- switch(diff_method,
        LR = "glmnet",
        RF = "ranger",
        SVM = "svmLinear"
    )
    # keep only the counts of the identified markers
    count_tab <- as(otu_table(mm), "matrix")
    marker <- marker_table(mm)
    count_marker <- count_tab[rownames(count_tab) %in% marker$feature, ]

    # re-train with repeated CV, saving predictions for the ROC curve
    fitControl <- caret::trainControl(
        method = "repeatedcv",
        number = nfolds,
        repeats = nrepeats,
        classProbs = TRUE,
        summaryFunction = caret::twoClassSummary,
        savePredictions = TRUE
    )
    dat <- transpose_and_2df(count_marker)
    dat$y <- factor(sample_data(mm)[[group]])
    fit <- caret::train(
        y ~ .,
        data = dat,
        method = train_method,
        trControl = fitControl,
        tuneLength = tune_length,
        metric = "ROC",
        ...
    )
    # recode observed classes to 0/1 (second factor level = positive class)
    prob_pred <- as.numeric(fit$pred$obs)
    prob_pred[prob_pred == 1] <- 0
    prob_pred[prob_pred == 2] <- 1
    df_roc <- data.frame(
        m = fit$pred[, colnames(fit$pred) == levels(fit$pred$obs)[2]],
        d = prob_pred,
        stringsAsFactors = FALSE
    )
    p <- ggplot(df_roc, aes(m = .data$m, d = .data$d)) +
        geom_roc(n.cuts = 0) +
        coord_equal() +
        style_roc()
    # annotate the AUC computed from the plotted curve
    p <- p +
        annotate(
            "text",
            x = 0.75, y = 0.25,
            label = paste("AUC =", round((calc_auc(p))$AUC, 2))
        )

    p
}


================================================
FILE: R/post-hoc-test.R
================================================
# post hoc test -----------------------------------------------------------

#' Post hoc pairwise comparisons for multiple groups test.
#'
#' Multiple group test, such as anova and Kruskal-Wallis rank sum test, can be
#' used to uncover the significant feature among all groups. Post hoc tests are
#' used to uncover specific mean differences between pair of groups.
#'
#' @param ps a [`phyloseq::phyloseq-class`] object
#' @param group character, the variable to set the group
#' @param transform character, the methods used to transform the microbial
#' abundance. See [`transform_abundances()`] for more details. The
#' options include:
#' * "identity", return the original data without any transformation
#'   (default).
#' * "log10", the transformation is `log10(object)`, and if the data contains
#'   zeros the transformation is `log10(1 + object)`.
#' * "log10p", the transformation is `log10(1 + object)`.
#' @param norm the methods used to normalize the microbial abundance data. See
#' [`normalize()`] for more details.
#' Options include:
#' * a integer, e.g. 1e6 (default), indicating pre-sample normalization of
#'   the sum of the values to 1e6.
#' * "none": do not normalize.
#' * "rarefy": random subsampling counts to the smallest library size in the
#'   data set.
#' * "TSS": total sum scaling, also referred to as "relative abundance", the
#'   abundances were normalized by dividing the corresponding sample library
#'   size.
#' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference. #' The scaling factor is then derived using a weighted trimmed mean over the #' differences of the log-transformed gene-count fold-change between the #' sample and the reference. #' * "RLE", relative log expression, RLE uses a pseudo-reference calculated #' using the geometric mean of the gene-specific abundances over all #' samples. The scaling factors are then calculated as the median of the #' gene counts ratios between the samples and the reference. #' * "CSS": cumulative sum scaling, calculates scaling factors as the #' cumulative sum of gene abundances up to a data-derived threshold. #' * "CLR": centered log-ratio normalization. #' @param norm_para arguments passed to specific normalization methods #' @param conf_level confidence level, default 0.95 #' @param method one of "tukey", "games_howell", "scheffe", "welch_uncorrected", #' defining the method for the pairwise comparisons. See details for more #' information. 
#' @return a [postHocTest-class] object
#' @seealso [postHocTest-class], [run_test_multiple_groups()]
#' @importFrom IRanges DataFrameList
#' @importFrom dplyr mutate
#' @export
#' @examples
#' data(enterotypes_arumugam)
#' ps <- phyloseq::subset_samples(
#'     enterotypes_arumugam,
#'     Enterotype %in% c("Enterotype 3", "Enterotype 2", "Enterotype 1")
#' ) %>%
#'     phyloseq::subset_taxa(Phylum == "Bacteroidetes")
#' pht <- run_posthoc_test(ps, group = "Enterotype")
#' pht
run_posthoc_test <- function(ps,
    group,
    transform = c("identity", "log10", "log10p"),
    norm = "TSS",
    norm_para = list(),
    conf_level = 0.95,
    method = c(
        "tukey", "games_howell",
        "scheffe", "welch_uncorrected"
    )) {
    stopifnot(inherits(ps, "phyloseq"))
    ps <- check_rank_names(ps)
    transform <- match.arg(transform, c("identity", "log10", "log10p"))
    method <- match.arg(
        method,
        c("tukey", "games_howell", "scheffe", "welch_uncorrected")
    )

    # preprocess phyloseq object: drop zero-count taxa/samples, then
    # transform, normalize and summarize all taxonomic levels
    ps <- preprocess_ps(ps)
    ps <- transform_abundances(ps, transform = transform)
    norm_para <- c(norm_para, method = norm, object = list(ps))
    ps_norm <- do.call(normalize, norm_para)
    ps_summarized <- summarize_taxa(ps_norm)

    # abundance data.frame: one column per feature, one row per sample
    abd_norm <- transpose_and_2df(otu_table(ps_summarized))
    feature <- tax_table(ps_summarized)@.Data[, 1]
    names(abd_norm) <- feature
    groups <- sample_data(ps_summarized)[[group]]

    # run the selected post-hoc test feature by feature
    result <- switch(method,
        tukey = purrr::map(
            abd_norm,
            calc_tukey_test,
            groups, conf_level
        ),
        games_howell = purrr::map(
            abd_norm,
            calc_games_howell_test,
            groups, conf_level
        ),
        scheffe = purrr::map(
            abd_norm,
            calc_scheffe_test,
            groups, conf_level
        ),
        welch_uncorrected = purrr::map(
            abd_norm,
            calc_welch_uncorrected_test,
            groups, conf_level
        )
    )

    # rename column diff_means to diff_mean, keeping it right after
    # the comparisons column
    result <- purrr::map(result, ~ dplyr::mutate(
        .x,
        diff_mean = .data$diff_means,
        .after = .data$comparisons,
        .keep = "unused"
    ))
    abundance <- abd_norm
    abundance$group <- groups

    postHocTest(
        result = DataFrameList(result),
        abundance = abundance,
        conf_level = conf_level,
        method = method,
        method_str = paste("Posthoc multiple comparisons of means:", method)
    )
}

#' Extract results from a posthoc test
#'
#' This function extracts the results of posthoc test.
#'
#' @param object a [`postHocTest-class`] object.
#' @param features either `NULL` extracts results of all features, or a
#' character vector to specify the test results of which features are
#' extracted.
#' @export
#' @return a [`IRanges::SimpleDFrameList-class`] object.
#' @examples
#' require(IRanges)
#' pht <- postHocTest(
#'     result = DataFrameList(
#'         featureA = DataFrame(
#'             comparisons = c("group2-group1",
#'                 "group3-group1",
#'                 "group3-group2"),
#'             diff_mean = runif(3),
#'             pvalue = rep(0.01, 3),
#'             ci_lower = rep(0.01, 3),
#'             ci_upper = rep(0.011, 3)
#'         ),
#'         featureB = DataFrame(
#'             comparisons = c("group2-group1",
#'                 "group3-group1",
#'                 "group3-group2"),
#'             diff_mean = runif(3),
#'             pvalue = rep(0.01, 3),
#'             ci_lower = rep(0.01, 3),
#'             ci_upper = rep(0.011, 3)
#'         )
#'     ),
#'     abundance = data.frame(
#'         featureA = runif(3),
#'         featureB = runif(3),
#'         group = c("group1", "group2", "group3")
#'     )
#' )
#' extract_posthoc_res(pht, "featureA")[[1]]
extract_posthoc_res <- function(object, features = NULL) {
    stopifnot(inherits(object, "postHocTest"))
    res <- object@result
    # NULL means "all features"; otherwise subset the DataFrameList by name
    if (!is.null(features)) {
        res <- res[features]
    }

    res
}

#' Tukey Post-hoc Tests
#' @param obs numeric vector, relative abundance of a feature
#' @param groups character vector, the same length of the argument `obs`
#' @param conf_level confidence level, default 0.95
#' @importFrom stats aov TukeyHSD
#' @importFrom dplyr mutate
#' @noRd
calc_tukey_test <- function(obs, groups, conf_level = 0.95) {
    df <- data.frame(obs = obs, groups = groups)
    fit <- aov(obs ~ groups, df)
    tukey_groups <- TukeyHSD(fit, "groups", conf.level = conf_level)$groups
    # reshape TukeyHSD output into the common post-hoc result layout
    res <- data.frame(comparisons = row.names(tukey_groups), tukey_groups) %>%
        mutate(
            comparisons = .data$comparisons,
            diff_means = .data$diff,
            pvalue = .data$p.adj,
            ci_lower = .data$lwr,
            ci_upper = .data$upr,
            .keep = "none"
        )

    res
}

# https://github.com/kassambara/rstatix/blob/master/R/games_howell_test.R
#' Games Howell Post-hoc Tests
#' @importFrom stats var pairwise.table ptukey qtukey
#' @inheritParams calc_tukey_test
#' @noRd
calc_games_howell_test <- function(obs, groups, conf_level = 0.95) {
    groups <- factor(groups)
    groups_n <- length(levels(groups))
    # pairwise comparison needs at least two groups (was `== 1` with a
    # garbled message)
    if (groups_n < 2) {
        stop("The number of groups must be at least 2")
    }

    # Statistics for games howell tests
    grp_sizes <- tapply(obs, groups, length)
    grp_means <- tapply(obs, groups, mean)
    grp_vars <- tapply(obs, groups, var)

    # Helper functions
    get_mean_diff <- function(i, j) {
        grp_means[i] - grp_means[j]
    }
    get_weltch_sd <- function(i, j) {
        # guard against zero variance to avoid zero standard errors
        vn1 <- grp_vars[i] / grp_sizes[i]
        vn2 <- grp_vars[j] / grp_sizes[j]
        if (vn1 == 0) {
            vn1 <- 1e-6
        }
        if (vn2 == 0) {
            vn2 <- 1e-6
        }
        sqrt(vn1 + vn2)
    }
    get_degree_of_freedom <- function(i, j) {
        # Welch-Satterthwaite degrees of freedom
        vn1 <- grp_vars[i] / grp_sizes[i]
        vn2 <- grp_vars[j] / grp_sizes[j]
        if (vn1 == 0) {
            vn1 <- 1e-6
        }
        if (vn2 == 0) {
            vn2 <- 1e-6
        }
        A <- (vn1 + vn2)^2
        B <- (vn1^2) / (grp_sizes[i] - 1)
        C <- (vn2^2) / (grp_sizes[j] - 1)
        A / (B + C)
    }
    # flatten the lower-triangle matrix returned by pairwise.table() into a
    # named vector of pairwise values; as.vector() walks the columns in the
    # same order the comparison names are built (the previous hard-coded
    # c(table[, 1], table[, 2]) failed for more than 3 groups)
    correct_pairwise_table <- function(table) {
        comparisons <- purrr::map(
            colnames(table),
            ~ paste(row.names(table), "-", .x, sep = "")
        ) %>%
            unlist()
        res <- as.vector(table)
        names(res) <- comparisons
        res <- res[!is.na(res)]

        res
    }

    mean_diff_table <- pairwise.table(
        get_mean_diff, levels(groups),
        p.adjust.method = "none"
    )
    mean_diff <- correct_pairwise_table(mean_diff_table)
    weltch_sd <- pairwise.table(
        get_weltch_sd, levels(groups),
        p.adjust.method = "none"
    ) %>%
        correct_pairwise_table()
    df <- pairwise.table(
        get_degree_of_freedom, levels(groups),
        p.adjust.method = "none"
    ) %>%
        correct_pairwise_table()

    # studentized range statistic and confidence interval
    t <- abs(mean_diff) / weltch_sd
    p <- ptukey(t * sqrt(2), groups_n, df, lower.tail = FALSE)
    se <- weltch_sd * sqrt(0.5)
    q <- qtukey(p = conf_level, groups_n, df = df)
    conf_high <- mean_diff + q * se
    conf_low <- mean_diff - q * se

    res <- data.frame(
        comparisons = names(mean_diff),
        diff_means = mean_diff,
        pvalue = p,
        ci_lower = conf_low,
        ci_upper = conf_high
    )

    res
}

# https://github.com/AndriSignorell/DescTools/blob/master/R/Tests.r#L3755:1
#' scheffe Post-hoc Tests
#' @inheritParams calc_tukey_test
#' @importFrom stats aov model.tables pf qf
#' @noRd
calc_scheffe_test <- function(obs, groups, conf_level = 0.95) {
    x <- aov(obs ~ groups)
    mm <- model.tables(x, "means")
    if (is.null(mm$n)) {
        stop("no factors in the fitted model")
    }
    tab <- mm$tables[["groups"]]
    MSE <- sum(x$residuals^2) / x$df.residual
    group_means <- as.vector(tab)
    nms <- names(tab)
    group_size <- mm$n[["groups"]]
    # balanced designs report a single replication count; recycle it
    if (length(group_size) < length(group_means)) {
        group_size <- rep.int(group_size, length(group_means))
    }
    contrasts <- Contrasts(nms)
    diff_means <- apply(contrasts * group_means, 2, sum)
    sscoeff <- apply(contrasts * contrasts / group_size, 2, sum)

    dferr <- x$df.residual
    dfgrp <- length(x$residuals) - dferr - 1
    # Scheffe's S statistic follows dfgrp * F(dfgrp, dferr)
    pvalue <- pf(
        diff_means^2 / (MSE * sscoeff * dfgrp),
        df1 = dfgrp,
        df2 = dferr,
        lower.tail = FALSE
    )
    critvalue <- dfgrp * qf(1 - conf_level, dfgrp, dferr, lower.tail = FALSE)
    ci_lower <- diff_means - sqrt(critvalue) * sqrt(MSE * sscoeff)
    ci_upper <- diff_means + sqrt(critvalue) * sqrt(MSE * sscoeff)

    res <- data.frame(
        comparisons = names(diff_means),
        diff_means = diff_means,
        pvalue = pvalue,
        ci_lower = ci_lower,
        ci_upper = ci_upper
    )

    res
}

# build the matrix of all pairwise -1/+1 contrasts for the given levels,
# one column per comparison named "levJ-levI"
Contrasts <- function(levs) {
    k <- length(levs)
    M <- data.frame(levs = levs)
    for (i in seq_len(k - 1)) {
        for (j in (i + 1):k) {
            con <- rep(0, k)
            con[i] <- -1
            con[j] <- 1
            nm <- paste(levs[j], levs[i], sep = "-")
            M[[nm]] <- con
        }
    }
    row.names(M) <- levs

    return(M[-1])
}

#' welch's uncorrected Post-hoc Tests
#' @inheritParams calc_tukey_test
#' @noRd
calc_welch_uncorrected_test <- function(obs, groups, conf_level = 0.95) {
    group_means <- tapply(obs, groups, mean)
    nms <- names(group_means)
    contrasts <- Contrasts(nms)
    diff_means <- apply(contrasts * group_means, 2, sum)

    obs_split <- split(obs, groups)
    comparisons <- names(diff_means)
    comparison_groups <- strsplit(comparisons, "-", fixed = TRUE)
    # one Welch t test (no multiple-testing correction) per pair
    welch_res <- purrr::map(
        comparison_groups,
        ~ t.test(obs_split[[.x[1]]], obs_split[[.x[2]]],
            conf.level = conf_level)
    )
    pvalue <- purrr::map_dbl(welch_res, ~ .x$p.value)
    ci_lower <- purrr::map_dbl(welch_res, ~ .x$conf.int[1])
    ci_upper <- purrr::map_dbl(welch_res, ~ .x$conf.int[2])

    res <- data.frame(
        comparisons = comparisons,
        diff_means = diff_means,
        pvalue = pvalue,
        ci_lower = ci_lower,
        ci_upper = ci_upper
    )

    res
}

## FILE: R/reexports.R

#' @importMethodsFrom phyloseq ntaxa
#' @importFrom phyloseq ntaxa
#' @export
#' @exportMethod ntaxa
phyloseq::ntaxa

#' @importMethodsFrom phyloseq nsamples
#' @importFrom phyloseq nsamples
#' @export
#' @exportMethod nsamples
phyloseq::nsamples

#' @importMethodsFrom phyloseq otu_table
#' @importFrom phyloseq otu_table
#' @export
#' @exportMethod otu_table
phyloseq::otu_table

#' @importMethodsFrom phyloseq sample_data
#' @importFrom phyloseq sample_data
#' @export
#' @exportMethod sample_data
phyloseq::sample_data

#' @importMethodsFrom phyloseq tax_table
#' @importFrom phyloseq tax_table
#' @export
#' @exportMethod tax_table
phyloseq::tax_table

#' @importMethodsFrom phyloseq taxa_names
#' @importFrom phyloseq taxa_names
#' @export
#' @exportMethod taxa_names
phyloseq::taxa_names

#' @export
#' @exportMethod sample_names
#' @importFrom phyloseq sample_names
#' @importMethodsFrom phyloseq sample_names
phyloseq::sample_names

#' @importFrom magrittr %>%
#' @export
magrittr::`%>%`

#' @importFrom phyloseq import_qiime
#' @export
phyloseq::import_qiime

#' @importFrom phyloseq import_mothur
#' @export
phyloseq::import_mothur

#' @importFrom phyloseq import_biom
#' @export
phyloseq::import_biom

## FILE:
## R/subset-marker.R

#' Subset microbiome markers
#'
#' Subset markers based on an expression related to the columns and values
#' within the `marker_table` slot of `mm`.
#'
#' @param mm a [`microbiomeMarker-class`] or [`marker_table-class`] object.
#' @param ... the subsetting expression passed to [`base::subset()`].
#' @return a subset object in the same class with `mm`.
#' @export
#' @examples
#' data(enterotypes_arumugam)
#' mm <- run_limma_voom(
#'     enterotypes_arumugam,
#'     "Enterotype",
#'     contrast = c("Enterotype 3", "Enterotype 2"),
#'     pvalue_cutoff = 0.01,
#'     p_adjust = "none"
#' )
#' subset_marker(mm, pvalue < 0.005)
subset_marker <- function(mm, ...) {
    # nothing to subset if no marker was identified
    if (is.null(marker_table(mm))) {
        warning("No marker_table in `mm`")
        return(mm)
    }
    marker <- data.frame(marker_table(mm))
    marker_new <- subset(marker, ...)
    # a bare marker_table is returned as-is; for a microbiomeMarker the
    # subsetted markers are written back into the object
    if (inherits(mm, "marker_table")) {
        return(marker_table(marker_new))
    } else {
        marker_table(mm) <- marker_new
        return(mm)
    }
}

## FILE: R/summarize-taxa.R

#' Summarize taxa into a taxonomic level within each sample
#'
#' Provides summary information of the representation of a taxonomic levels
#' within each sample.
#'
#' @param ps a \code{\link[phyloseq]{phyloseq-class}} object.
#' @param level taxonomic level to summarize, default the top level rank of the
#' `ps`.
#' @param absolute logical, whether return the absolute abundance or
#' relative abundance, default `FALSE`.
#' @param sep a character string to separate the taxonomic levels.
#' @return a [`phyloseq::phyloseq-class`] object, where each row represents a
#' taxa, and each col represents the taxa abundance of each sample.
#' @export
#' @examples
#' data(enterotypes_arumugam)
#' summarize_taxa(enterotypes_arumugam)
summarize_taxa <- function(ps,
    level = rank_names(ps)[1],
    absolute = TRUE,
    sep = "|") {
    # return ps if it has been summarized
    summarized <- check_tax_summarize(ps)
    if (summarized) {
        otu_summarized <- otu_table(ps) %>%
            add_missing_levels()
        tax_summarized <- row.names(otu_summarized) %>%
            matrix() %>%
            tax_table()
        row.names(tax_summarized) <- row.names(otu_summarized)
        return(phyloseq(otu_summarized, tax_summarized, sample_data(ps)))
    }

    # ensure every taxon name carries its rank prefix (k__, p__, ...)
    if (!has_prefix(ps)) {
        ps <- add_prefix(ps)
    }

    ps_ranks <- rank_names(ps)
    if (!level %in% ps_ranks) {
        stop("`level` must be in the ranks of `ps` (rank_names(ps))")
    }
    ind <- match(level, ps_ranks)
    levels <- ps_ranks[ind:length(ps_ranks)]
    # summarize every rank from `level` down and stack the results
    # (argument named exactly: the original `rank = .x` relied on
    # partial matching of `rank_name`)
    res <- purrr::map(
        levels,
        ~ .summarize_taxa_level(
            ps,
            rank_name = .x,
            absolute = absolute,
            sep = sep
        )
    )
    tax_nms <- purrr::map(res, row.names) %>%
        unlist()
    res <- bind_rows(res)
    row.names(res) <- tax_nms

    otu_summarized <- otu_table(res, taxa_are_rows = TRUE)
    tax_summarized <- row.names(otu_summarized) %>%
        matrix() %>%
        tax_table()
    row.names(tax_summarized) <- row.names(otu_summarized)
    row.names(otu_summarized) <- row.names(tax_summarized)
    # To ensure the rank of the summarized object is valid (one of "domain"
    # "kingdom" "phylum" "class" "order" "family" "genus" "species"),
    # set it (column names of tax_summarized) as the top level rank in the ps
    # object.
    colnames(tax_summarized) <- ps_ranks[1]

    return(phyloseq(otu_summarized, tax_summarized, sample_data(ps)))
}

#' extract prefix of names of the taxonomic ranks
#' @noRd
extract_prefix <- function(ranks) {
    if (inherits(ranks, "phyloseq")) {
        ranks <- rank_names(ranks)
    }
    tolower(substr(ranks, 1, 1))
}

#' Summarize the taxa for the specific rank
#' @noRd
.summarize_taxa_level <- function(ps, rank_name, absolute = TRUE, sep = "|") {
    if (!absolute) {
        ps <- transform_sample_counts(ps, function(x) x / sum(x))
    }
    otus <- otu_table(ps)
    otus_extend <- slot(otus, ".Data") %>%
        tibble::as_tibble()
    taxas <- tax_table(ps)@.Data %>%
        tibble::as_tibble()
    ranks <- setdiff(available_ranks, "Summarize")
    rank_level <- match(rank_name, ranks)
    select_ranks <- intersect(ranks[seq_len(rank_level)], rank_names(ps))
    # lineage string from the top rank down to `rank_name`, e.g. "k__A|p__B"
    consensus <- taxas[, select_ranks] %>%
        purrr::pmap_chr(paste, sep = sep)
    otus_extend$consensus <- consensus
    # sum the counts of all otus sharing the same lineage
    taxa_summarized <- group_split(otus_extend, consensus) %>%
        purrr::map(.sum_consensus)
    taxa_summarized <- do.call(rbind, taxa_summarized)
    # filter taxa of which abundance is zero
    ind <- rowSums(taxa_summarized) != 0
    taxa_summarized <- taxa_summarized[ind, ]

    taxa_summarized
}

#' sum all otus which belongs to the same taxa
#' @noRd
.sum_consensus <- function(x) {
    consensus <- unique(x$consensus)
    if (length(consensus) != 1) {
        stop("consensus in the same group must be the same")
    }
    x$consensus <- NULL
    res <- as.data.frame(t(colSums(x)))
    row.names(res) <- consensus

    return(res)
}

# check whether taxa has prefix or not
has_prefix <- function(ps) {
    # a prefixed name has "__" at positions 2-3, e.g. "k__Bacteria"
    sample_tax <- tax_table(ps)[1, 1]
    if (substr(sample_tax, 2, 3) == "__") {
        return(TRUE)
    } else {
        return(FALSE)
    }
}

# add ranks prefix, e.g k__, p__, only worked for unsummarized data
add_prefix <- function(ps) {
    tax <- as(tax_table(ps), "matrix") %>%
        as.data.frame()
    lvl <- colnames(tax)
    prefix <- get_prefix(lvl)
    # paste each rank's prefix onto the corresponding column
    tax_new <- mapply(
        function(x, y) paste0(x, y),
        prefix, tax,
        SIMPLIFY = FALSE
    )
    tax_new <- do.call(cbind, tax_new)
    row.names(tax_new) <- row.names(tax)
    colnames(tax_new) <- lvl
    tax_table(ps) <- tax_new

    ps
}

## FILE: R/test-utilities.R

# transpose otu_table and then convert it to data.frame
#' @importMethodsFrom phyloseq t
transpose_and_2df <- function(ot) {
    t(ot) %>% as.data.frame()
}

## FILE: R/transform.R

#' Transform the taxa abundances in `otu_table` sample by sample
#'
#' Transform the taxa abundances in `otu_table` sample by sample, which means
#' the counts of each sample will be transformed individually.
#'
#' @param object [`otu_table-class`], [`phyloseq-class`], or
#' [`microbiomeMarker-class`].
#' @param transform transformation to apply, the options include:
#' * "identity", return the original data without any transformation.
#' * "log10", the transformation is `log10(object)`, and if the data contains
#' zeros the transformation is `log10(1 + object)`.
#' * "log10p", the transformation is `log10(1 + object)`.
#' @importFrom phyloseq t otu_table<-
#' @return A object matches the class of argument `object` with the transformed
#' `otu_table`.
#' @export
#' @seealso [`abundances()`]
#' @examples
#' data(oxygen)
#' x1 <- transform_abundances(oxygen)
#' head(otu_table(x1), 10)
#' x2 <- transform_abundances(oxygen, "log10")
#' head(otu_table(x2), 10)
#' x3 <- transform_abundances(oxygen, "log10p")
#' head(otu_table(x3), 10)
transform_abundances <- function(object,
    transform = c("identity", "log10", "log10p")) {
    transform <- match.arg(transform, c("identity", "log10", "log10p"))
    otu <- as(otu_table(object), "matrix")
    if (transform == "identity") {
        abd <- otu
    } else if (transform == "log10") {
        abd <- transform_log10(otu)
    } else {
        abd <- transform_log10p(otu)
    }
    # write the transformed matrix back, preserving the orientation
    otu_table(object) <- otu_table(abd, taxa_are_rows = taxa_are_rows(object))

    object
}

# the data is transformed using log10(1 + x) if the data contains zeroes
transform_log10 <- function(x) {
    if (min(x) == 0) {
        warning("OTU table contains zeroes. Using log10(1 + x) instead.")
        x_norm <- log10(1 + x)
    } else {
        x_norm <- log10(x)
    }

    x_norm
}

# the data is transformed using log10(1 + x)
transform_log10p <- function(x) {
    log10(1 + x)
}

## FILE: R/utilities.R

#' check whether tax abundance table is summarized or not
#' @noRd
check_tax_summarize <- function(ps) {
    taxa <- row.names(otu_table(ps))
    # whether taxa is separated by `|`,
    # may be required to add extra separate strings in the future
    has_separate <- any(grepl("[|]", taxa))

    has_separate
}

# whether picrust functional profile
is_picrust2 <- function(ps) {
    "Picrust_trait" %in% rank_names(ps)
}

#' check whether all names of taxonomic ranks include in available_ranks
#' @noRd
check_rank_names <- function(ps) {
    ps_ranks <- rank_names(ps)
    if (is_picrust2(ps)) {
        # picrust2 profiles may only carry these two pseudo-ranks
        picrust_rank <- c("Picrust_trait", "Picrust_description")
        diff_rank <- setdiff(ps_ranks, picrust_rank)
        if (length(diff_rank)) {
            stop(
                "ranks of picrust2 functional profile must be one of ",
                paste(picrust_rank, collapse = ", ")
            )
        }
    } else {
        if (!all(ps_ranks %in% available_ranks)) {
            stop(
                "ranks of taxonomic profile must be one of ",
                paste(available_ranks, collapse = ", ")
            )
        }
    }

    invisible(ps)
}

#' capitalize the first letter, lower-case the rest
#' @noRd
upper_firstletter <- function(x) {
    paste(
        toupper(substr(x, 1, 1)),
        tolower(substr(x, 2, nchar(x))),
        sep = ""
    )
}

#' add prefix of taxonomic ranks for summarized data construct from original
#' lefse (galaxy lefse or python app) input, p__, k__
#' @param ps a [`phyloseq::phyloseq-class`] object
#' @param ranks_prefix character vector, prefix of ranks to add, e.g. "p", "c"
#' @importFrom phyloseq taxa_names<-
#' @noRd
add_prefix_summarized <- function(ps, ranks_prefix, sep = "|") {
    tax <- tax_table(ps)@.Data[, 1]
    tax_split <- strsplit(tax, split = sep, fixed = TRUE)

    if (max(lengths(tax_split)) != length(ranks_prefix)) {
        stop(
            "The length of `ranks_prefix` must be",
            " equal to number of taxonomic ranks.",
            call. = FALSE
        )
    }

    # ensure the ranks_prefix is contained in available_ranks
    # and in descending order
    available_prefix <- get_available_prefix(available_ranks)
    if (!all(ranks_prefix %in% available_prefix)) {
        stop(
            "all elements of ranks_prefix must be contained",
            " in available_ranks"
        )
    }
    ranks_prefix <- keep_prefix_desc(ranks_prefix, type = "ranks_prefix")

    # prepend "x__" to every level of every lineage, e.g. "k__A|p__B"
    tax_prefix <- purrr::map(
        tax_split,
        ~ paste0(ranks_prefix[seq_along(.x)], "__", .x) %>%
            paste0(collapse = sep)
    )
    tax_prefix <- do.call(rbind, tax_prefix)
    colnames(tax_prefix) <- paste0(ranks_prefix, collapse = sep)

    tax_table(ps) <- tax_table(tax_prefix)
    taxa_names(ps) <- tax_prefix[, 1]

    ps
}

# extract the first letter of taxonomic ranks as the prefixes of the ranks
get_available_prefix <- function(ranks) {
    substr(ranks, 1, 1) %>%
        tolower()
}

# keep prefix in descending order: "k" "p" "c" "o" "f" "g" "s"
keep_prefix_desc <- function(ranks_prefix, type = c("ranks", "ranks_prefix")) {
    type <- match.arg(type, choices = c("ranks", "ranks_prefix"))
    available_prefix <- get_available_prefix(available_ranks)
    # sorting the match indices re-orders the prefixes by rank depth
    idx_desc <- sort(match(ranks_prefix, available_prefix))

    if (type == "ranks") {
        return(available_ranks[idx_desc])
    } else {
        return(available_prefix[idx_desc])
    }
}

# check whether var in sample meta data, or raise an error
check_var_in_meta <- function(var, sample_meta) {
    stopifnot(inherits(sample_meta, "sample_data"))
    meta_nms <- names(sample_meta)
    if (!var %in% meta_nms) {
        stop(var, " must be one of variable of `sample_meta`", call. = FALSE)
    }
}

################################################################################
## preprocessing ps object
################################################################################

# preprocess of phyloseq object, including keep taxa in rows,
# filter taxa whose abundance is zero, fix duplicated tax
# filter samples whose library size is zero
#' @importFrom phyloseq prune_samples
preprocess_ps <- function(ps) {
    zero_sample <- check_samples(ps)
    if (!is.null(zero_sample)) {
        warning(
            "The library size of sample(s): ",
            paste(zero_sample, collapse = ", "),
            " is/are zero, and will be removed in the subsequent analysis."
        )
        keep <- setdiff(sample_names(ps), zero_sample)
        ps <- prune_samples(keep, ps)
    }
    # keep taxa in rows
    ps <- keep_taxa_in_rows(ps)
    # filter the taxa whose abundance is zero
    ps <- phyloseq_qc(ps)
    # fix duplicated tax
    ps <- fix_duplicate_tax(ps)

    ps
}

#' phyloseq quality control, remove otu/asv of which abundance is zero
#' @noRd
phyloseq_qc <- function(ps) {
    prune_taxa(taxa_sums(ps) > 0, ps)
}

#' Transpose the phyloseq object to ensure taxa are in rows
#' @param ps a [phyloseq::phyloseq-class] object
#' @importMethodsFrom phyloseq t
#' @keywords internal
#' @noRd
keep_taxa_in_rows <- function(ps) {
    if (!taxa_are_rows(ps)) {
        ps <- t(ps)
    }

    ps
}

# https://github.com/lch14forever/microbiomeVizb
# /94cbfe452a735aadf88733b27b8221a03f450a55/R/utils.R#L68-L86
#
#' Duplicated taxa: e.g. maybe multiple species (s__uncultured)
#' belong to different genera.
#' append the upper level taxa to the taxa to
#' distinguish this duplicated taxa
#' @param ps [phyloseq::phyloseq-class] object or
#' [phyloseq::taxonomyTable-class] object
#' @importFrom phyloseq tax_table<-
#' @keywords internal
#' @noRd
fix_duplicate_tax <- function(ps) {
    # convert na to Unknown first
    ps <- fix_na_tax(ps)

    tax <- tax_table(ps)
    # single-rank table: nothing to disambiguate
    if (ncol(tax) == 1) {
        return(ps)
    }
    # walk the ranks from the second column down; whenever one name at rank i
    # maps to more than one parent at rank i - 1, prepend the parent name so
    # the taxon becomes unique
    for (i in 2:ncol(tax)) {
        tax_uniq <- unique(tax[, i])
        for (j in seq_along(tax_uniq)) {
            if (is.na(tax_uniq[j])) next
            ind <- which(tax[, i] == as.character(tax_uniq[j]))
            if (length(unique(tax[ind, i - 1])) > 1) {
                tax[ind, i] <- paste(tax[ind, i - 1], tax[ind, i], sep = "_")
            }
        }
    }
    tax_table(ps) <- tax
    ps
}

#' set NA (missing) tax to its prefix, e.g. s__ (or s__unknown?)
#' @keywords internal
#' @noRd
fix_na_tax <- function(ps) {
    tax <- as.data.frame(tax_table(ps))
    # replace NA entries column by column with the rank prefix (e.g. "s__")
    tax_fixed <- purrr::imap_dfc(
        tax,
        ~ ifelse(is.na(.x), get_prefix(.y), .x)
    ) %>%
        as.matrix()
    row.names(tax_fixed) <- taxa_names(ps)
    tax_table(ps) <- tax_fixed

    ps
}

# extract the prefix of taxonomic ranks: first letter, lower-cased,
# followed by "__"
get_prefix <- function(ranks) {
    prefix <- substr(ranks, 1, 1) %>%
        tolower() %>%
        paste("__", sep = "")

    prefix
}

# `metagenomeSeq::cumNormStatFast()` requires counts of all samples at least
# have two non zero features. Thus, if there are samples with only one non-zero
# features, `cumNormStat()` is taken to compute the pth quantile.
# This function was used to select the function to calculate the quantile used
# for CSS norm factors estimation in metagenomeSeq.
select_quantile_func <- function(counts) {
    # any sample with fewer than two non-zero features forces the slow path
    if (sum(colSums(counts > 0) > 1) < ncol(counts)) {
        fun_p <- metagenomeSeq::cumNormStat
    } else {
        fun_p <- metagenomeSeq::cumNormStatFast
    }

    fun_p
}

# human-readable label for the normalization method (numeric norm means
# per-sample scaling to that total)
get_norm_method <- function(norm) {
    new_norm <- ifelse(
        is.numeric(norm),
        paste("per-sample normalized (sum of all taxa) to", norm),
        norm
    )

    new_norm
}

# check samples in ps, make sure at least one non zero features in a sample;
# returns the names of empty samples, or NULL if none
check_samples <- function(ps) {
    if (!taxa_are_rows(ps)) {
        ps <- t(ps)
    }
    lib_size <- colSums(otu_table(ps))
    zero_ind <- which(lib_size == 0)
    if (length(zero_ind) == 0) {
        return(NULL)
    }

    return(sample_names(ps)[zero_ind])
}

# remove samples with missing values for any variable specified in the group
remove_na_samples <- function(ps, group) {
    groups <- sample_data(ps)[[group]]
    na_idx <- is.na(groups)
    if (all(!na_idx)) {
        return(ps)
    }

    sample_nms <- sample_names(ps)
    warning(
        "remove sample(s): ", paste(sample_nms[na_idx], collapse = ","),
        ", which with missing value in `", group, "`"
    )
    ps <- phyloseq::prune_samples(sample_nms[!na_idx], ps)

    ps
}

## calculate coef for edgeR, metagenomeSeq
# if contrast is a two length character, set the first element as the first level
# (reference group), the second element as the second level, return a single
# integer
#
# if contrast is null, return a integer vector (number of levels - 1)
check_contrast <- function(contrast) {
    # NULL is allowed (means "all pairwise"); otherwise it must be a
    # two-element character vector
    if (!is.null(contrast)) {
        if (!is.character(contrast) || length(contrast) != 2) {
            stop("`contrast` must be a two length character", call. = FALSE)
        }
    }

    contrast
}

# reorder the factor levels so the contrast groups come first
set_lvl <- function(groups, contrast) {
    if (!is.factor(groups)) {
        stop("`groups` must be a factor", call. = FALSE)
    }
    # this will change the elements simultaneously
    # levels(groups) <- c(contrast, setdiff(levels(groups), contrast))
    groups <- factor(
        groups,
        levels = c(contrast, setdiff(levels(groups), contrast))
    )

    groups
}

# build the model matrix for the group variable, optionally adjusting for
# confounders taken from the sample meta data
create_design <- function(groups, meta, confounders = character(0)) {
    if (inherits(meta, "sample_data")) {
        meta <- data.frame(meta)
    }
    model_data <- data.frame(group = groups)
    if (!length(confounders)) {
        design <- stats::model.matrix(~ group, data = model_data)
    } else {
        # confounders enter the formula before group so the group
        # coefficients stay at the tail of the design matrix
        model_data[confounders] <- meta[confounders]
        design <- stats::model.matrix(
            formula(paste(
                "~ + ",
                paste(c(confounders, "group"), collapse = " + "))),
            data = model_data
        )
    }

    design
}

# select the design-matrix coefficient(s) to test: a single column for a
# two-group or contrasted comparison, the trailing group columns otherwise
calc_coef <- function(groups, design, contrast = NULL) {
    contrast <- check_contrast(contrast)
    groups <- set_lvl(groups, contrast)
    lvl <- levels(groups)
    n_lvl <- length(lvl)
    n_design <- ncol(design)
    if (n_lvl < 2) {
        stop("Differential analysis requires at least two groups.")
    }

    if (n_lvl == 2) { # two groups
        if (!is.null(contrast)) {
            warning(
                "`contrast` is ignored, you do not need to set it",
                call. = FALSE
            )
        }
        coef <- n_design
    } else { # multiple groups
        if (!is.null(contrast)) {
            # contrast groups are the first two levels, so their comparison
            # is the first group coefficient after the confounders
            coef <- n_design - n_lvl + 2L
        } else {
            coef <- (n_design - n_lvl + 2L):n_design
        }
    }

    coef
}

# create_contrast <- function(groups, contrast = NULL) {
#     if (!is.factor(groups)) {
#         groups <- factor(groups)
#     }
#     lvl <- levels(groups)
#     n_lvl <- length(lvl)
#     if (n_lvl < 2) {
#         stop("Differential analysis requires at least two groups.")
#     }
#
#     if (n_lvl == 2) { # two groups
#         if (!is.null(contrast)) {
#             warning(
#                 "`contrast` is ignored, you do not need to set it",
#                 call. = FALSE
#             )
#         }
#         design <- rep(0, n_lvl)
#         design[1] <- -1
#         design[2] <- 1
#     } else { # multiple groups
#         if (!is.null(contrast)) {
#             if (!is.character(contrast) || length(contrast) != 2) {
#                 stop("`contrast` must be a two length character", call.
= FALSE) # } # # idx <- match(contrast, lvl, nomatch = 0L) # if (!all(idx)) { # stop( # "all elements of `contrast` must be contained in `groups`", # call. = FALSE # ) # } # design <- rep(0, n_lvl) # design[idx[1]] <- -1 # design[idx[2]] <- 1 # design <- matrix(design) # row.names(design) <- lvl # colnames(design) <- paste0(contrast[2], "-", contrast[1]) # } else { # design <- create_pairwise_contrast(lvl) # } # } # # design # } # # # create all pair-wise comparisons (contrasts) for anova-like test # create_pairwise_contrast <- function(groups) { # groups <- factor(groups) # lvl <- levels(groups) # n <- length(lvl) # # design <- matrix(0, n, choose(n, 2)) # rownames(design) <- lvl # colnames(design) <- seq_len(choose(n, 2)) # k <- 0 # for (i in seq_len(n - 1)) { # for (j in (i + 1):n) { # k <- k + 1 # design[j, k] <- 1 # design[i, k] <- -1 # colnames(design)[k] <- paste(lvl[j], "-", lvl[i], sep = "") # } # } # design # } # extract the specify rank of phyloseq object, return a phyloseq object # with only one rank extract_rank <- function(ps, taxa_rank) { ranks <- rank_names(ps) if (!taxa_rank %in% c("none", ranks)) { stop( "`taxa_rank` must be one of options: none, ", paste(rank_names(ps), collapse = ", "), call. = FALSE ) } if (taxa_rank != "none") { ps <- aggregate_taxa(ps, taxa_rank) new_tax_table <- tax_table(ps)[, taxa_rank] } else { taxon <- taxa_names(ps) new_tax_table <- tax_table(matrix(taxon)) colnames(new_tax_table) <- "otu" rownames(new_tax_table) <- taxon } tax_table(ps) <- new_tax_table # set the taxa names as the corresponding names if (taxa_rank != "none") { taxa_names(ps) <- new_tax_table[, 1] } ps } # only used for check the argument taxa_rank which is used to specify # taxonomic rank to perform differential analysis on check_taxa_rank <- function(ps, taxa_rank) { ranks <- rank_names(ps) all_taxa_rank <- c("all", "none", ranks) if (!taxa_rank %in% all_taxa_rank) { stop( "`taxa_rank` must be one of ", paste(all_taxa_rank, collapse = ", "), call. 
= FALSE ) } invisible(ps) } # preprocess the ps according to para taxa_rank pre_ps_taxa_rank <- function(ps, taxa_rank) { if (is_picrust2(ps)) { warning("para `taxa_rank` is not worked for picrust2 function profile", " and it will be ignored") return(ps) } ps <- check_taxa_rank(ps, taxa_rank) if (taxa_rank == "all") { ps_orig_summarized <- summarize_taxa(ps) } else if (taxa_rank == "none") { ps_orig_summarized <- extract_rank(ps, taxa_rank) } else { ps_orig_summarized <- aggregate_taxa(ps, taxa_rank) %>% extract_rank(taxa_rank) } return(ps_orig_summarized) } # return the marker_table, if no significant marker return NULL return_marker <- function(sig_feature, all_feature) { if (nrow(sig_feature)) { row.names(sig_feature) <- paste0("marker", seq_len(nrow(sig_feature))) marker <- marker_table(sig_feature) } else { warning("No marker was identified", call. = FALSE) marker <- NULL } marker } # For multiple groups comparison of LRT test of DESeq2. # Only fold changes of pair-wise comparisons are supported in DESse2. # https://support.bioconductor.org/p/131272/#131450 # https://github.com/qiime2/q2-composition/blob/HEAD/q2_composition/_ancom.py #' #' Calculate effect size, mean differences for two groups, and f statistic of #' one-way anova for multiple groups. #' @noRd #' @importFrom stats lm aov calc_ef_md_f <- function(feature_abd, group) { group_n <- length(unique(group)) if (group_n < 2) { stop("The number of group must be greater than 2", call. 
= FALSE) } if (group_n == 2) { ef <- abs(lm(feature_abd ~ group)$coefficients[2]) } else if (group_n > 2) { # f statistic from aov ef <- summary(aov(feature_abd ~ group))[[1]]$`F value`[1] } ef } # create phyloseq from microbiomeMarker object, # and keep only nodes correlated with significant features create_ps_from_mm <- function(mm, only_marker = TRUE) { ot <- otu_table(mm) tt <- tax_table(mm) st <- sample_data(mm) mt <- marker_table(mm) sig_features <- mt$feature # extract all nodes correlated with the significant features # First, all parent nodes of marker down_nodes <- strsplit(sig_features, "|", fixed = TRUE) %>% purrr::map(~ purrr::map_chr( seq_along(.x), function(y) paste(.x[1:y], collapse = "|"))) down_nodes <- unique(unlist(down_nodes)) # Two, all children nodes of marker all_features <- tt@.Data[, 1] up_nodes <- purrr::map(sig_features, ~ all_features[grepl(.x, all_features, fixed = TRUE)]) up_nodes <- unique(unlist(up_nodes)) idx <- match(unique(c(down_nodes, up_nodes)), all_features) if (only_marker) { ot <- ot[idx, ] tt <- tt[idx, ] } ps <- phyloseq(ot, tt, st) ps } # check confounder check_confounder <- function(ps, target_var, confounders = NULL) { meta <- sample_data(ps) vars <- names(meta) if (! target_var %in% vars) { stop( "the interested var `target_var` must be contained in the meta data", call. = FALSE ) } other_vars <- setdiff(vars, target_var) if (is.null(confounders)) { confounders <- other_vars if (! 
length(confounders)) { stop("No confounding var in sample meta data") } } else { out_confounder <- setdiff(confounders, other_vars) if (length(out_confounder)) { stop("var(s) `", paste(out_confounder, collapse = "`, ` "), "` not be contained in the sample meta data") } } confounders } # generate n spaces character space <- function(n) { paste(rep(" ", each = n), collapse = "") } ================================================ FILE: README.Rmd ================================================ --- title:: R package for microbiome biomarker discovery output: github_document --- # microbiomeMarker [![](https://img.shields.io/badge/release%20version-1.2.1-green.svg)](https://www.bioconductor.org/packages/microbiomeMarker) [![](https://img.shields.io/badge/devel%20version-1.3.2-green.svg)](https://github.com/yiluheihei/microbiomeMarker) [![platform](http://www.bioconductor.org/shields/availability/devel/microbiomeMarker.svg)](https://www.bioconductor.org/packages/devel/bioc/html/microbiomeMarker.html#archives) [![BioC status](http://www.bioconductor.org/shields/build/release/bioc/microbiomeMarker.svg)](https://bioconductor.org/checkResults/release/bioc-LATEST/microbiomeMarker) [![Bioc years](http://www.bioconductor.org/shields/years-in-bioc/microbiomeMarker.svg)](https://www.bioconductor.org/packages/devel/bioc/html/microbiomeMarker.html#since) [![R build status](https://github.com/yiluheihei/microbiomeMarker/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/yiluheihei/microbiomeMarker/actions) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://github.com/yiluheihei/microbiomeMarker/blob/master/LICENSE.md) [![Codecov test coverage](https://codecov.io/gh/yiluheihei/microbiomeMarker/branch/master/graph/badge.svg)](https://codecov.io/gh/yiluheihei/microbiomeMarker?branch=master) [![DOI](https://zenodo.org/badge/215731961.svg)](https://zenodo.org/badge/latestdoi/215731961) [![Lifecycle: 
stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html#stable) ![GitHub Repo stars](https://img.shields.io/github/stars/yiluheihei/microbiomeMarker?style=social) ***microbiomeMarker*** is still under development, your suggestion and contribution will be highly appreciated. If you think this project is helpful to you, you can give this project a :star:. ## Motivation The aim of this package is to build a unified toolbox in R for microbiome biomarker discovery by integrating existing widely used differential analysis methods. ## Installation Install the package from Bioconductor directly: ```{r install-bioc,eval=FALSE} if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("microbiomeMarker") ``` Or install the development version of the package from [Github](https://github.com/yiluheihei/microbiomeMarker). ```{r install-gh,eval=FALSE} if (!requireNamespace("remotes", quietly=TRUE)) install.packages("remotes") remotes::install_github("yiluheihei/microbiomeMarker") ``` For more details on how to use ***microbiomeMarker***, please see the help page or [website](https://yiluheihei.github.io/microbiomeMarker/index.html) of our package. ## Citation Kindly cite as follows: **Yang Cao**, Qingyang Dong, Dan Wang, Pengcheng Zhang, Ying Liu, Chao Niu, microbiomeMarker: an R/Bioconductor package for microbiome marker identification and visualization. Bioinformatics, 2022, btac438. doi: [10.1093/bioinformatics/btac438](https://doi.org/10.1093/bioinformatics/btac438) Please cite the corresponding methods paper too: - LEfSe: Segata, Nicola, Jacques Izard, et al. 2011. Metagenomic Biomarker Discovery and Explanation. Genome Biology 12 (6): 1–18. doi: [10.1186/gb-2011-12-6-r60](https://doi.org/10.1186/gb-2011-12-6-r60) - metagenomeSeq: Paulson, Joseph N, O Colin Stine, et al. 2013. Differential Abundance Analysis for Microbial Marker-Gene Surveys. 
Nature Methods 10 (12): 1200–1202. doi: [10.1038/nmeth.2658](https://doi.org/10.1038/nmeth.2658) - ANCOM: Mandal, Siddhartha, Will Van Treuren, et al. 2015. Analysis of Composition of Microbiomes: A Novel Method for Studying Microbial Composition. Microbial Ecology in Health and Disease 26 (1): 27663. doi: [10.3402/mehd.v26.27663](https://doi.org/10.3402/mehd.v26.27663) - ANCOMBC: Lin, Huang, and Shyamal Das Peddada. 2020. Analysis of Compositions of Microbiomes with Bias Correction. Nature Communications 11 (1): 1–11. doi: [10.1038/s41522-020-00160-w](https://doi.org/10.1038/s41522-020-00160-w) - ALDEx2: Fernandes, Andrew D, Jennifer Ns Reid, et al. 2014. Unifying the Analysis of High-Throughput Sequencing Datasets: Characterizing Rna-Seq, 16S rRNA Gene Sequencing and Selective Growth Experiments by Compositional Data Analysis. Genome Biology 15(2): 1–17. doi: [10.1186/2049-2618-2-15](https://doi.org/10.1186/2049-2618-2-15) - edgeR: Robinson, Mark D, Davis J McCarthy, and Gordon K Smyth. 2010. EdgeR: A Bioconductor Package for Differential Expression Analysis of Digital Gene Expression Data. Bioinformatics 26 (1): 139–40. doi: [10.1093/bioinformatics/btp616](https://doi.org/10.1093/bioinformatics/btp616) - DESeq2: Love, Michael I, Wolfgang Huber, and Simon Anders. 2014. Moderated Estimation of Fold Change and Dispersion for Rna-Seq Data with Deseq2. Genome Biology 15 (12): 1–21. doi: [10.1186/s13059-014-0550-8](https://doi.org/10.1186/s13059-014-0550-8) - limma-voom: Law, Charity W, Yunshun Chen, et al. 2014. Voom: Precision Weights Unlock Linear Model Analysis Tools for Rna-Seq Read Counts. Genome biology, 15(2), 1-17. doi: [10.1186/gb-2014-15-2-r29](https://doi.org/10.1186/gb-2014-15-2-r29) ### Publications citing microbiomeMarker - Jorge F, Froissard C, Dheilly N M, et al. Bacterial community dynamics following antibiotic exposure in a trematode parasite. International journal for parasitology, 2022, 52(5): 265-274. 
https://doi.org/10.1016/j.ijpara.2021.11.006 - Ghosh A, Thakur M, Sharma L K, et al. Linking gut microbiome with the feeding behavior of the Arunachal macaque (Macaca munzala). Scientific reports, 2021, 11(1): 1-10. https://doi.org/10.1038/s41598-021-01316-0 - Doi R, Wu Y, Kawai Y, et al. Transition and regulation mechanism of bacteria biota in Kishu saba-narezushi (mackerel narezushi) during its fermentation step. Journal of bioscience and bioengineering, 2021, 132(6): 606-612. https://doi.org/10.1016/j.jbiosc.2021.09.002 - Nielsen K R, Ingham A C, Midjord J, et al. Similar Gut Bacterial Composition Between Patients With Ulcerative Colitis and Healthy Controls in a High Incidence Population: A Cross-sectional Study of the Faroe Islands IBD Cohort. Inflammatory Bowel Diseases. https://doi.org/10.1093/ibd/izab355 - Prevel R, Enaud R, Orieux A, et al. Gut bacteriobiota and mycobiota are both associated with Day-28 mortality among critically ill patients. Critical Care, 2022, 26(1): 1-9. https://doi.org/10.1186/s13054-022-03980-8 - Tandon K, Chiou Y J, Yu S P, et al. Microbiome restructuring: dominant coral bacterium Endozoicomonas species display differential adaptive capabilities to environmental changes[J]. bioRxiv, 2021. https://doi.org/10.1101/2021.10.31.466697 - Dai D, Zhu J, Sun C, et al. GMrepo v2: a curated human gut microbiome database with special focus on disease markers and cross-dataset comparison. Nucleic acids research, 2022, 50(D1): D777-D784. https://doi.org/10.1093/nar/gkab1019 - Molinero N, Taladrid D, Zorraquín-Peña I, et al. Ulcerative Colitis Seems to Imply Oral Microbiome Dysbiosis. Current Issues in Molecular Biology, 2022, 44(4): 1513-1527. https://doi.org/10.3390/cimb44040103 - Ricci F, Tandon K, Black J R, et al. Host Traits and Phylogeny Contribute to Shaping Coral-Bacterial Symbioses[J]. Msystems, 2022, 7(2): e00044-22. https://doi.org/10.1128/msystems.00044-22 - Chavarria K A, Saltonstall K, Vinda J, et al. 
Land use influences stream bacterial communities in lowland tropical watersheds. Scientific reports, 2021, 11(1): 1-12. https://doi.org/10.1038/s41598-021-01193-7 - Lu H, Gao N L, Tong F, et al. Alterations of the Human Lung and Gut Microbiomes in Non-Small Cell Lung Carcinomas and Distant Metastasis. Microbiology spectrum, 2021, 9(3): e00802-21. https://doi.org/10.1128/Spectrum.00802-21 - Ingham A C, Kielsen K, Mordhorst H, et al. Microbiota long-term dynamics and prediction of acute graft-versus-host disease in pediatric allogeneic stem cell transplantation[J]. Microbiome, 2021, 9(1): 1-28. https://doi.org/10.1186/s40168-021-01100-2 - Wang R, Cao S, Bashir M E H, et al. Microbial metabolite butyrate-prodrug polymeric micelles promote gut health and treat food allergies. bioRxiv, 2022. https://doi.org/10.1101/2022.05.01.490224 - Shanmugam G, Lee SH, Jeon J. EzMAP: Easy Microbiome Analysis Platform. BMC bioinformatics. 2021 Dec;22(1):1-0. https://doi.org/10.1186/s12859-021-04106-7 - Altaib H, Nakamura K, Abe M, et al. Differences in the concentration of the fecal neurotransmitters GABA and glutamate are associated with microbial composition among healthy human subjects. Microorganisms. 2021. Feb;9(2):378. https://doi.org/10.3390/microorganisms9020378 - Künstner A, Aherrahrou R, Hirose M, et al. Effect of Differences in the Microbiome of Cyp17a1-Deficient Mice on Atherosclerotic Background. Cells. 2021 Jun;10(6):1292. https://doi.org/10.3390/cells10061292 - Ingham AC, Urth TR, Sieber RN, et al. Dynamics of the human nasal microbiota and Staphylococcus aureus CC398 carriage in pig truck drivers across one workweek. Applied and Environmental Microbiology. 2021 Jun 30:AEM-01225. https://doi.org/10.1128/AEM.01225-21 - Shibata T, Nakagawa M, Coleman HN, et al. Evaluation of DNA extraction protocols from liquid-based cytology specimens for studying cervical microbiota. Plos one 16, no. 8 2021. 
https://doi.org/10.1371/journal.pone.0237556 ## Question If you have any question, please file an issue on the issue tracker following the instructions in the issue template: Please briefly describe your problem, what output actually happened, and what output you expect. Please provide a minimal reproducible example. For more details on how to make a great minimal reproducible example, see https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example and https://www.tidyverse.org/help/#reprex. ``` Brief description of the problem # insert minimal reproducible example here ``` ## Acknowledgement We thank all the developers of the methods integrated into our package. - [lefse python script](https://bitbucket.org/biobakery/biobakery/wiki/lefse), The main lefse code is translated from ***lefse python script***, - [microbiomeViz](https://github.com/lch14forever/microbiomeViz), cladogram visualization of lefse is modified from ***microbiomeViz***. - [phyloseq](https://github.com/joey711/phyloseq), the main data structures used in ***microbiomeMarker*** are from or inherit from `phyloseq-class` in package ***phyloseq***. - [MicrobiotaProcess](https://github.com/YuLab-SMU/MicrobiotaProcess), function `import_dada2()` and `import_qiime2()` are modified from the `MicrobiotaProcess::import_dada2()`. 
================================================ FILE: README.md ================================================ # microbiomeMarker [![](https://img.shields.io/badge/release%20version-1.2.1-green.svg)](https://www.bioconductor.org/packages/microbiomeMarker) [![](https://img.shields.io/badge/devel%20version-1.3.2-green.svg)](https://github.com/yiluheihei/microbiomeMarker) [![platform](http://www.bioconductor.org/shields/availability/devel/microbiomeMarker.svg)](https://www.bioconductor.org/packages/devel/bioc/html/microbiomeMarker.html#archives) [![BioC status](http://www.bioconductor.org/shields/build/release/bioc/microbiomeMarker.svg)](https://bioconductor.org/checkResults/release/bioc-LATEST/microbiomeMarker) [![Bioc years](http://www.bioconductor.org/shields/years-in-bioc/microbiomeMarker.svg)](https://www.bioconductor.org/packages/devel/bioc/html/microbiomeMarker.html#since) [![R build status](https://github.com/yiluheihei/microbiomeMarker/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/yiluheihei/microbiomeMarker/actions) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://github.com/yiluheihei/microbiomeMarker/blob/master/LICENSE.md) [![Codecov test coverage](https://codecov.io/gh/yiluheihei/microbiomeMarker/branch/master/graph/badge.svg)](https://codecov.io/gh/yiluheihei/microbiomeMarker?branch=master) [![DOI](https://zenodo.org/badge/215731961.svg)](https://zenodo.org/badge/latestdoi/215731961) [![Lifecycle: stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html#stable) ![GitHub Repo stars](https://img.shields.io/github/stars/yiluheihei/microbiomeMarker?style=social) ***microbiomeMarker*** is still under development, your suggestion and contribution will be highly appreciated. If you think this project is helpful to you, you can give this project a :star:. 
## Motivation The aim of this package is to build a unified toolbox in R for microbiome biomarker discovery by integrating existing widely used differential analysis methods. ## Installation Install the package from Bioconductor directly: ``` r if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("microbiomeMarker") ``` Or install the development version of the package from [Github](https://github.com/yiluheihei/microbiomeMarker). ``` r if (!requireNamespace("remotes", quietly=TRUE)) install.packages("remotes") remotes::install_github("yiluheihei/microbiomeMarker") ``` For more details on how to use ***microbiomeMarker***, please see the help page or [website](https://yiluheihei.github.io/microbiomeMarker/index.html) of our package. ## Citation Kindly cite as follows: **Yang Cao**, Qingyang Dong, Dan Wang, Pengcheng Zhang, Ying Liu, Chao Niu, microbiomeMarker: an R/Bioconductor package for microbiome marker identification and visualization. Bioinformatics, 2022, btac438. doi: [10.1093/bioinformatics/btac438](https://doi.org/10.1093/bioinformatics/btac438) Please cite the corresponding methods paper too: - LEfSe: Segata, Nicola, Jacques Izard, et al. 2011. Metagenomic Biomarker Discovery and Explanation. Genome Biology 12 (6): 1–18. doi: [10.1186/gb-2011-12-6-r60](https://doi.org/10.1186/gb-2011-12-6-r60) - metagenomeSeq: Paulson, Joseph N, O Colin Stine, et al. 2013. Differential Abundance Analysis for Microbial Marker-Gene Surveys. Nature Methods 10 (12): 1200–1202. doi: [10.1038/nmeth.2658](https://doi.org/10.1038/nmeth.2658) - ANCOM: Mandal, Siddhartha, Will Van Treuren, et al. 2015. Analysis of Composition of Microbiomes: A Novel Method for Studying Microbial Composition. Microbial Ecology in Health and Disease 26 (1): 27663. doi: [10.3402/mehd.v26.27663](https://doi.org/10.3402/mehd.v26.27663) - ANCOMBC: Lin, Huang, and Shyamal Das Peddada. 2020. Analysis of Compositions of Microbiomes with Bias Correction. 
Nature Communications 11 (1): 1–11. doi: [10.1038/s41522-020-00160-w](https://doi.org/10.1038/s41522-020-00160-w) - ALDEx2: Fernandes, Andrew D, Jennifer Ns Reid, et al. 2014. Unifying the Analysis of High-Throughput Sequencing Datasets: Characterizing Rna-Seq, 16S rRNA Gene Sequencing and Selective Growth Experiments by Compositional Data Analysis. Genome Biology 15(2): 1–17. doi: [10.1186/2049-2618-2-15](https://doi.org/10.1186/2049-2618-2-15) - edgeR: Robinson, Mark D, Davis J McCarthy, and Gordon K Smyth. 2010. EdgeR: A Bioconductor Package for Differential Expression Analysis of Digital Gene Expression Data. Bioinformatics 26 (1): 139–40. doi: [10.1093/bioinformatics/btp616](https://doi.org/10.1093/bioinformatics/btp616) - DESeq2: Love, Michael I, Wolfgang Huber, and Simon Anders. 2014. Moderated Estimation of Fold Change and Dispersion for Rna-Seq Data with Deseq2. Genome Biology 15 (12): 1–21. doi: [10.1186/s13059-014-0550-8](https://doi.org/10.1186/s13059-014-0550-8) - limma-voom: Law, Charity W, Yunshun Chen, et al. 2014. Voom: Precision Weights Unlock Linear Model Analysis Tools for Rna-Seq Read Counts. Genome biology, 15(2), 1-17. doi: [10.1186/gb-2014-15-2-r29](https://doi.org/10.1186/gb-2014-15-2-r29) ### Publications citing microbiomeMarker - Jorge F, Froissard C, Dheilly N M, et al. Bacterial community dynamics following antibiotic exposure in a trematode parasite. International journal for parasitology, 2022, 52(5): 265-274. - Ghosh A, Thakur M, Sharma L K, et al. Linking gut microbiome with the feeding behavior of the Arunachal macaque (Macaca munzala). Scientific reports, 2021, 11(1): 1-10. - Doi R, Wu Y, Kawai Y, et al. Transition and regulation mechanism of bacteria biota in Kishu saba-narezushi (mackerel narezushi) during its fermentation step. Journal of bioscience and bioengineering, 2021, 132(6): 606-612. - Nielsen K R, Ingham A C, Midjord J, et al. 
Similar Gut Bacterial Composition Between Patients With Ulcerative Colitis and Healthy Controls in a High Incidence Population: A Cross-sectional Study of the Faroe Islands IBD Cohort. Inflammatory Bowel Diseases. - Prevel R, Enaud R, Orieux A, et al. Gut bacteriobiota and mycobiota are both associated with Day-28 mortality among critically ill patients. Critical Care, 2022, 26(1): 1-9. - Tandon K, Chiou Y J, Yu S P, et al. Microbiome restructuring: dominant coral bacterium Endozoicomonas species display differential adaptive capabilities to environmental changes\[J\]. bioRxiv, 2021. - Dai D, Zhu J, Sun C, et al. GMrepo v2: a curated human gut microbiome database with special focus on disease markers and cross-dataset comparison. Nucleic acids research, 2022, 50(D1): D777-D784. - Molinero N, Taladrid D, Zorraquín-Peña I, et al. Ulcerative Colitis Seems to Imply Oral Microbiome Dysbiosis. Current Issues in Molecular Biology, 2022, 44(4): 1513-1527. - Ricci F, Tandon K, Black J R, et al. Host Traits and Phylogeny Contribute to Shaping Coral-Bacterial Symbioses\[J\]. Msystems, 2022, 7(2): e00044-22. - Chavarria K A, Saltonstall K, Vinda J, et al. Land use influences stream bacterial communities in lowland tropical watersheds. Scientific reports, 2021, 11(1): 1-12. - Lu H, Gao N L, Tong F, et al. Alterations of the Human Lung and Gut Microbiomes in Non-Small Cell Lung Carcinomas and Distant Metastasis. Microbiology spectrum, 2021, 9(3): e00802-21. - Ingham A C, Kielsen K, Mordhorst H, et al. Microbiota long-term dynamics and prediction of acute graft-versus-host disease in pediatric allogeneic stem cell transplantation\[J\]. Microbiome, 2021, 9(1): 1-28. - Wang R, Cao S, Bashir M E H, et al. Microbial metabolite butyrate-prodrug polymeric micelles promote gut health and treat food allergies. bioRxiv, 2022. - Shanmugam G, Lee SH, Jeon J. EzMAP: Easy Microbiome Analysis Platform. BMC bioinformatics. 2021 Dec;22(1):1-0. - Altaib H, Nakamura K, Abe M, et al. 
Differences in the concentration of the fecal neurotransmitters GABA and glutamate are associated with microbial composition among healthy human subjects. Microorganisms. 2021. Feb;9(2):378. - Künstner A, Aherrahrou R, Hirose M, et al. Effect of Differences in the Microbiome of Cyp17a1-Deficient Mice on Atherosclerotic Background. Cells. 2021 Jun;10(6):1292. - Ingham AC, Urth TR, Sieber RN, et al. Dynamics of the human nasal microbiota and Staphylococcus aureus CC398 carriage in pig truck drivers across one workweek. Applied and Environmental Microbiology. 2021 Jun 30:AEM-01225. - Shibata T, Nakagawa M, Coleman HN, et al. Evaluation of DNA extraction protocols from liquid-based cytology specimens for studying cervical microbiota. Plos one 16, no. 8 2021. ## Question If you have any question, please file an issue on the issue tracker following the instructions in the issue template: Please briefly describe your problem, what output actually happened, and what output you expect. Please provide a minimal reproducible example. For more details on how to make a great minimal reproducible example, see <https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example> and <https://www.tidyverse.org/help/#reprex>. Brief description of the problem # insert minimal reproducible example here ## Acknowledgement We thank all the developers of the methods integrated into our package. - [lefse python script](https://bitbucket.org/biobakery/biobakery/wiki/lefse), The main lefse code is translated from ***lefse python script***, - [microbiomeViz](https://github.com/lch14forever/microbiomeViz), cladogram visualization of lefse is modified from ***microbiomeViz***. - [phyloseq](https://github.com/joey711/phyloseq), the main data structures used in ***microbiomeMarker*** are from or inherit from `phyloseq-class` in package ***phyloseq***. - [MicrobiotaProcess](https://github.com/YuLab-SMU/MicrobiotaProcess), function `import_dada2()` and `import_qiime2()` are modified from the `MicrobiotaProcess::import_dada2()`. 
- [qiime2R](https://github.com/jbisanz/qiime2R), `import_qiime2()` are refer to the functions in ***qiime2R***. ================================================ FILE: _pkgdown.yml ================================================ reference: - title: "Data import" desc: > Functions for importing external data and converting other R object as phyloseq or reverse converting contents: - starts_with("import") - import_biom - phyloseq2DESeq2 - phyloseq2edgeR - phyloseq2metagenomeSeq - otu_table2metagenomeSeq - title: "Microbiome marker" desc: S4 class and methods for microbiomeMarker contents: - microbiomeMarker - starts_with("microbiomeMarker-class") - marker_table-class - marker_table - marker_table<- - assign-otu_table - abundances - nmarker - "[" - starts_with("postHocTest-class") - postHocTest - extract_posthoc_res - title: Functions reexports from phyloseq contents: reexports # - nsamples # - ntaxa # - otu_table # - sample_data # - sample_names # - tax_table # - taxa_names - title: "Normalization" contents: - abundances - transform_abundances - starts_with("norm") - title: "Differential analysis" desc: Functions for identifying the microbiome markers contents: - confounder - starts_with("run") - title: "Comparison of different methods" contents: - compare_DA - summary.compareDA - title: "Visualization" contents: - starts_with("plot") - title: "Example data" contents: starts_with("data") - title: Miscellaneous contents: - aggregate_taxa - subset_marker - summarize_taxa ================================================ FILE: codecov.yml ================================================ comment: false # disable project and patch check coverage: status: project: default: enabled: false patch: default: enabled: false changes: no after_success: - Rscript -e 'covr::codecov()' ================================================ FILE: data-raw/available_ranks.R ================================================ # available taxonomic ranks, Summarize represents summarized tax 
available_ranks <- c( "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species" ) available_ranks <- factor( available_ranks, levels = available_ranks ) usethis::use_data(available_ranks, internal = TRUE, overwrite = TRUE) ================================================ FILE: data-raw/data.R ================================================ library(MicrobiomeAnalystR) library(phyloseq) library(magrittr) # Human Moving Picture from MicrobiomeAnalyst server ---------------------- download.file( "https://www.microbiomeanalyst.ca/MicrobiomeAnalyst/resources/data/treebiom.zip", "data-raw/caporaso.zip" ) unzip("data-raw/caporaso.zip", exdir = "data-raw/") file.rename("data-raw/treebiom/", "data-raw/caporaso/") ps <- import_biom( "data-raw/caporaso/otu_table_mc2_w_tax_no_pynast_failures.biom", treefilename = "data-raw/caporaso/rep_set.tre", ) colnames(tax_table(ps)) <- c( "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species" ) sampledata <- read.delim("data-raw/caporaso/map.txt", row.names = 1) %>% sample_data() caporaso <- merge_phyloseq(ps, sampledata) usethis::use_data(caporaso, overwrite = TRUE) unlink("data-raw/cap*", recursive = TRUE) # cid data from github.com/ying14/yingtools2 ------------------------------ download.file( "https://github.com/ying14/yingtools2/raw/master/data/cid.phy.rda", "data-raw/cid.phy.rda" ) load("data-raw/cid.phy.rda") cid_ying <- cid.phy tax_table(cid_ying) <- tax_table(cid.phy)[, -7] usethis::use_data(cid_ying, overwrite = TRUE) unlink("data-raw/cid*") # pediatric ibd ----------------------------------------------------------- # https://www.microbiomeanalyst.ca/MicrobiomeAnalyst/resources/data/ibd_data.zip download.file( "https://www.microbiomeanalyst.ca/MicrobiomeAnalyst/resources/data/ibd_data.zip", "data-raw/pediatric_idb.zip" ) unzip("data-raw/pediatric_idb.zip", exdir = "data-raw/") asv_abundance <- readr::read_tsv("data-raw/ibd_data/IBD_data/ibd_asv_table.txt") %>% tibble::column_to_rownames("#NAME") asv_table 
<- readr::read_tsv("data-raw/ibd_data/IBD_data/ibd_taxa.txt") %>% tibble::column_to_rownames("#TAXONOMY") sample_table <- readr::read_csv("data-raw/ibd_data/IBD_data/ibd_meta.csv") %>% tibble::column_to_rownames("#NAME") pediatric_ibd <- phyloseq( otu_table(asv_abundance, taxa_are_rows = TRUE), tax_table(as.matrix(asv_table)), sample_data(sample_table) ) tree <- read_tree(treefile = "data-raw/ibd_data/IBD_data/ibd_tree.tre") phy_tree(pediatric_ibd) <- tree usethis::use_data(pediatric_ibd, overwrite = TRUE) unlink("data-raw/ibd_data", recursive = TRUE) unlink("data-raw/pediatric_idb.zip") # oxygen availability ----------------------------------------------------- # a small subset of the HMP 16S dataset for finding biomarkers characterizing # different level of oxygen availability in different bodysites # oxygen_dat <- readr::read_tsv( # "https://raw.githubusercontent.com/biobakery/biobakery/master/demos/biobakery_demos/data/lefse/input/hmp_small_aerobiosis.txt", # col_names = FALSE # ) # sample_meta <- data.frame( # oxygen_availability = c(oxygen_dat[1, ][-1], recursive = TRUE), # body_site = c(oxygen_dat[2, ][-1], recursive = TRUE), # subject_id = c(oxygen_dat[3, ][-1], recursive = TRUE) # ) # # tibble::rownames_to_column() %>% # # tidyr::pivot_longer(-rowname) %>% # #tidyr::pivot_wider(names_from = "rowname", values_from = "value") %>% # #tibble::column_to_rownames("name") # tax_dat <- oxygen_dat$X1[-(1:3)] # # sample_abd <- dplyr::slice(oxygen_dat, -(1:3)) %>% # dplyr::select(-1) %>% # purrr::map_df(as.numeric) # row.names(sample_abd) <- tax_dat # # tax_mat <- as.matrix(tax_dat) # row.names(tax_mat) <- tax_dat # colnames(tax_mat) <- "Summarize" # # oxygen <- phyloseq( # otu_table(sample_abd, taxa_are_rows = TRUE), # tax_table(tax_mat), # sample_data(sample_meta) # ) # download.file("https://raw.githubusercontent.com/biobakery/biobakery/master/demos/biobakery_demos/data/lefse/input/hmp_small_aerobiosis.txt", "data-raw/oxygen.txt") oxygen <- 
import_biobakery_lefse_in( "data-raw/oxygen.txt", ranks_prefix = c("k", "p", "c", "o", "f", "g"), meta_rows = 1:3, ) unlink("data-raw/oxygen.txt") usethis::use_data(oxygen, overwrite = TRUE) # data from lefse galaxy -------------------------------------------------- # Fecal microbiota in a mouse model of spontaneous colitis. The dataset contains # 30 abundance profiles (obtained processing the 16S reads with RDP) belonging # to 10 rag2 (control) and 20 truc (case) mice # spontaneous_colitis <- readr::read_tsv( # "https://raw.githubusercontent.com/biobakery/galaxy_lefse/master/test-data/lefse_input", # col_names = FALSE # ) # class <- spontaneous_colitis[1, ] # taxas <- spontaneous_colitis[, 1] # # sample_meta <- data.frame( # class = unlist(class[-1]), # stringsAsFactors = FALSE # ) # tax_dat <- as.matrix(taxas[-1, ]) # row.names(tax_dat) <- tax_dat # colnames(tax_dat) <- "Summarize" # tax_abd <- spontaneous_colitis[-1, -1] %>% # purrr::map_df(as.numeric) # row.names(tax_abd) <- tax_dat[,1] # # spontaneous_colitis <- phyloseq( # otu_table(tax_abd, taxa_are_rows = TRUE), # tax_table(tax_dat), # sample_data(sample_meta) # ) download.file("https://raw.githubusercontent.com/biobakery/galaxy_lefse/master/test-data/lefse_input", "data-raw/lefse_in") spontaneous_colitis <- import_biobakery_lefse_in( "data-raw/lefse_in", ranks_prefix = c("k", "p", "c", "o", "f", "g"), meta_rows = 1, ) unlink("data-raw/lefse_in") usethis::use_data(spontaneous_colitis, overwrite = TRUE) # Enterotypes data from Arumugam's paper from stamp ----------------------- enterotypes_arumugam <- readr::read_tsv("https://github.com/yiluheihei/STAMP/raw/master/examples/EnterotypesArumugam/Enterotypes.profile.spf") enterotypes_arumugam_meta <- readr::read_tsv("https://github.com/yiluheihei/STAMP/raw/master/examples/EnterotypesArumugam/Enterotypes.metadata.tsv") %>% as.data.frame() row.names(enterotypes_arumugam_meta) <- enterotypes_arumugam_meta$`Sample Id` enterotype_abd <- 
dplyr::select(enterotypes_arumugam, -Phyla, -Genera) enterotype_tax <- dplyr::select(enterotypes_arumugam, Phylum = Phyla, Genus = Genera) enterotypes_arumugam <- phyloseq( otu_table(enterotype_abd, taxa_are_rows = TRUE), tax_table(as.matrix(enterotype_tax)), sample_data(enterotypes_arumugam_meta) ) usethis::use_data(enterotypes_arumugam, overwrite = TRUE) # kostic crc -------------------------------------------------------------- # data from https://bioconductor.org/packages/devel/bioc/vignettes/phyloseq/inst/doc/phyloseq-mixture-models.html # A publicly available data from a study on colorectal cancer: # Genomic analysis identifies association of Fusobacterium with colorectal # carcinoma. Kostic, A. D., Gevers, D., Pedamallu, C. S., Michaud, M., Duke, # ., Earl, A. M., et al. (2012). Genome research, 22(2), 292-298. # filepath <- system.file( "extdata", "study_1457_split_library_seqs_and_mapping.zip", package = "phyloseq" ) kostic <- phyloseq::microbio_me_qiime(filepath) # remove the 5 samples that had no DIAGNOSIS attribute assigned kostic <- subset_samples(kostic, DIAGNOSIS != "None") # remove samples with less than 500 reads (counts) kostic_crc <- prune_samples(sample_sums(kostic) > 500, kostic) usethis::use_data(kostic_crc, overwrite = TRUE) # ecam from ancom paper --------------------------------------------------- ecam_meta <- readr::read_tsv("https://raw.githubusercontent.com/FrederickHuangLin/ANCOM/master/data/ecam-sample-metadata.tsv") # remove var types: #q2:types ecam_meta <- ecam_meta[-1, ] ecam_feature_table <- readr::read_tsv("https://raw.githubusercontent.com/FrederickHuangLin/ANCOM/master/data/ecam-table-taxa.tsv", skip = 1) taxa <- ecam_feature_table$`feature-id` feature_table <- dplyr::select(ecam_feature_table, -`feature-id`) # taxa table taxa_table <- lapply(taxa, strsplit, split = ";", fixed = TRUE) taxa_table <- lapply(taxa_table, unlist) taxa_table <- do.call(rbind, taxa_table) taxa_table <- data.frame(taxa_table) names(taxa_table) <- 
c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus") # make sure all unknown taxa as prefix__ # set the NA|unknown|unclassified to __, and then add prefix prefixes <- c("k", "p", "c", "o", "f", "g") taxa_table <- purrr::map2_df( taxa_table, prefixes, ~ ifelse(.x == "__", paste0(.y, .x), .x) ) ecam_meta <- phyloseq::sample_data(ecam_meta) row.names(ecam_meta) <- names(feature_table) ecam <- phyloseq::phyloseq( phyloseq::otu_table(as(feature_table, "matrix"), taxa_are_rows = TRUE), phyloseq::tax_table(as(taxa_table, "matrix")), phyloseq::sample_data(ecam_meta) ) usethis::use_data(ecam, overwrite = TRUE) ================================================ FILE: inst/CITATION ================================================ citHeader("To cite microbiomeMarker in publications use:") citEntry( entry = "article", title = "microbiomeMarker: an R/Bioconductor package for microbiome marker identification and visualization", author = personList( as.person("Yang Cao"), as.person("Qingyang Dong"), as.person("Dan Wang"), as.person("Pengcheng Zhang"), as.person("Ying Liu"), as.person("Chao Niu") ), journal = "Bioinformatics", year = "2022", month = "06", doi = "10.1093/bioinformatics/btac438", PMID = "35771644", textVersion = paste("Yang Cao, Qingyang Dong, Dan Wang, Pengcheng Zhang, Ying Liu, Chao Niu.", "microbiomeMarker: an R/Bioconductor package for microbiome marker identification and visualization.", "Bioinformatics, 2022, btac438. 
doi: 10.1093/bioinformatics/btac438") ) ================================================ FILE: inst/extdata/dada2_samdata.txt ================================================ Subject Gender Day When F3D0 3 F 0 Early F3D1 3 F 1 Early F3D141 3 F 141 Late F3D142 3 F 142 Late F3D143 3 F 143 Late F3D144 3 F 144 Late F3D145 3 F 145 Late F3D146 3 F 146 Late F3D147 3 F 147 Late F3D148 3 F 148 Late F3D149 3 F 149 Late F3D150 3 F 150 Late F3D2 3 F 2 Early F3D3 3 F 3 Early F3D5 3 F 5 Early F3D6 3 F 6 Early F3D7 3 F 7 Early F3D8 3 F 8 Early F3D9 3 F 9 Early Mock ock M NA Early ================================================ FILE: inst/extdata/picrust2_metadata.tsv ================================================ SampleID Facility Genotype 100CHE6KO PaloAlto KO 101CHE6WT PaloAlto WT 102CHE6WT PaloAlto WT 103CHE6KO PaloAlto KO 104CHE6KO PaloAlto KO 20CMK6KO Dalhousie KO 21CMK6WT Dalhousie WT 22CMK6KO Dalhousie KO 23CMK6WT Dalhousie WT 24CMK6KO Dalhousie KO 26CMK6WT Dalhousie WT 30CMK6KO Dalhousie KO 32CMK6KO Dalhousie KO 33CMK6WT Dalhousie WT 34CMK6KO Dalhousie KO 36CMK6WT Dalhousie WT 81CHE6WT PaloAlto WT 82CHE6WT PaloAlto WT 84CHE6KO PaloAlto KO 86CHE6WT PaloAlto WT 87CHE6KO PaloAlto KO 88CHE6KO PaloAlto KO 99CHE6KO PaloAlto KO 9CMK6KO Dalhousie KO ================================================ FILE: inst/extdata/sample-metadata.tsv ================================================ sample-id barcode-sequence body-site year month day subject reported-antibiotic-usage days-since-experiment-start #q2:types categorical categorical numeric numeric numeric categorical categorical numeric L1S8 AGCTGACTAGTC gut 2008 10 28 subject-1 Yes 0 L1S57 ACACACTATGGC gut 2009 1 20 subject-1 No 84 L1S76 ACTACGTGTGGT gut 2009 2 17 subject-1 No 112 L1S105 AGTGCGATGCGT gut 2009 3 17 subject-1 No 140 L2S155 ACGATGCGACCA left palm 2009 1 20 subject-1 No 84 L2S175 AGCTATCCACGA left palm 2009 2 17 subject-1 No 112 L2S204 ATGCAGCTCAGT left palm 2009 3 17 subject-1 No 140 L2S222 CACGTGACATGT left palm 
2009 4 14 subject-1 No 168 L3S242 ACAGTTGCGCGA right palm 2008 10 28 subject-1 Yes 0 L3S294 CACGACAGGCTA right palm 2009 1 20 subject-1 No 84 L3S313 AGTGTCACGGTG right palm 2009 2 17 subject-1 No 112 L3S341 CAAGTGAGAGAG right palm 2009 3 17 subject-1 No 140 L3S360 CATCGTATCAAC right palm 2009 4 14 subject-1 No 168 L5S104 CAGTGTCAGGAC tongue 2008 10 28 subject-1 Yes 0 L5S155 ATCTTAGACTGC tongue 2009 1 20 subject-1 No 84 L5S174 CAGACATTGCGT tongue 2009 2 17 subject-1 No 112 L5S203 CGATGCACCAGA tongue 2009 3 17 subject-1 No 140 L5S222 CTAGAGACTCTT tongue 2009 4 14 subject-1 No 168 L1S140 ATGGCAGCTCTA gut 2008 10 28 subject-2 Yes 0 L1S208 CTGAGATACGCG gut 2009 1 20 subject-2 No 84 L1S257 CCGACTGAGATG gut 2009 3 17 subject-2 No 140 L1S281 CCTCTCGTGATC gut 2009 4 14 subject-2 No 168 L2S240 CATATCGCAGTT left palm 2008 10 28 subject-2 Yes 0 L2S309 CGTGCATTATCA left palm 2009 1 20 subject-2 No 84 L2S357 CTAACGCAGTCA left palm 2009 3 17 subject-2 No 140 L2S382 CTCAATGACTCA left palm 2009 4 14 subject-2 No 168 L3S378 ATCGATCTGTGG right palm 2008 10 28 subject-2 Yes 0 L4S63 CTCGTGGAGTAG right palm 2009 1 20 subject-2 No 84 L4S112 GCGTTACACACA right palm 2009 3 17 subject-2 No 140 L4S137 GAACTGTATCTC right palm 2009 4 14 subject-2 No 168 L5S240 CTGGACTCATAG tongue 2008 10 28 subject-2 Yes 0 L6S20 GAGGCTCATCAT tongue 2009 1 20 subject-2 No 84 L6S68 GATACGTCCTGA tongue 2009 3 17 subject-2 No 140 L6S93 GATTAGCACTCT tongue 2009 4 14 subject-2 No 168 ================================================ FILE: man/abundances-methods.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/abundances-methods.R \docType{methods} \name{abundances} \alias{abundances} \alias{abundances,otu_table-method} \alias{abundances,} \alias{otu_table-method} \alias{abundances,phyloseq-method} \alias{abundances,microbiomeMarker-method} \title{Extract taxa abundances} \usage{ abundances(object, transform = c("identity", "log10", 
"log10p"), norm = FALSE) \S4method{abundances}{otu_table}(object, transform = c("identity", "log10", "log10p"), norm = FALSE) \S4method{abundances}{phyloseq}(object, transform = c("identity", "log10", "log10p"), norm = FALSE) \S4method{abundances}{microbiomeMarker}(object, transform = c("identity", "log10", "log10p")) } \arguments{ \item{object}{\code{\linkS4class{otu_table}}, \code{\linkS4class{phyloseq}}, or \code{\linkS4class{microbiomeMarker}}.} \item{transform}{transformation to apply, the options inclulde: \itemize{ \item "identity", return the original data without any transformation. \item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{logical, indicating whether or not to return the normalized taxa abundances.} } \value{ abundance matrix with taxa in rows and samples in columns. } \description{ Extract taxa abundances from phyloseq objects. } \examples{ data(caporaso) abd <- abundances(caporaso) } \seealso{ \code{\linkS4class{otu_table}}, \code{\linkS4class{phyloseq}}, \code{\linkS4class{microbiomeMarker}},\code{\link{transform_abundances}} } ================================================ FILE: man/aggregate_taxa.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/aggregate-taxa.R \name{aggregate_taxa} \alias{aggregate_taxa} \title{Aggregate Taxa} \usage{ aggregate_taxa(x, level, verbose = FALSE) } \arguments{ \item{x}{\code{\link{phyloseq-class}} object} \item{level}{Summarization level (from \code{rank_names(pseq)})} \item{verbose}{verbose} } \value{ Summarized phyloseq object } \description{ Summarize phyloseq data into a higher phylogenetic level. } \details{ This provides a convenient way to aggregate phyloseq OTUs (or other taxa) when the phylogenetic tree is missing. 
Calculates the sum of OTU abundances over all OTUs that map to the same higher-level group. Removes ambiguous levels from the taxonomy table. Returns a phyloseq object with the summarized abundances. } \examples{ data(caporaso) caporaso_phylum <- aggregate_taxa(caporaso, "Phylum") } \references{ See citation('microbiome') } \author{ Contact: Leo Lahti \email{microbiome-admin@googlegroups.com} } \keyword{utilities} ================================================ FILE: man/assign-marker_table.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/AllGenerics.R \name{marker_table<-} \alias{marker_table<-} \alias{assign-marker_table} \title{Assign marker_table to \code{object}} \usage{ marker_table(object) <- value } \arguments{ \item{object}{a \code{\linkS4class{microbiomeMarker}} object to modify.} \item{value}{new value to replace the \code{marker_table} slot of \code{object}. Either a \code{marker_table-class}, a \code{data.frame} that can be coerced into \code{marker_table-class}.} } \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ This function replace the \code{marker_table} slot of \code{object} with \code{value}. 
} \examples{ data(enterotypes_arumugam) mm <- run_limma_voom( enterotypes_arumugam, "Enterotype", contrast = c("Enterotype 3", "Enterotype 2"), pvalue_cutoff = 0.1, p_adjust = "fdr" ) mm_marker <- marker_table(mm) mm_marker marker_table(mm) <- mm_marker[1:2, ] marker_table(mm) } ================================================ FILE: man/assign-otu_table.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assignment-methods.R \docType{methods} \name{assign-otu_table} \alias{assign-otu_table} \alias{otu_table<-,microbiomeMarker,otu_table-method} \alias{otu_table<-,microbiomeMarker,phyloseq-method} \alias{otu_table<-,microbiomeMarker,microbiomeMarker-method} \title{Assign a new OTU table} \usage{ \S4method{otu_table}{microbiomeMarker,otu_table}(x) <- value \S4method{otu_table}{microbiomeMarker,phyloseq}(x) <- value \S4method{otu_table}{microbiomeMarker,microbiomeMarker}(x) <- value } \arguments{ \item{x}{\code{\linkS4class{microbiomeMarker}}} \item{value}{\code{\linkS4class{otu_table}}, \code{\linkS4class{phyloseq}}, or \code{\linkS4class{microbiomeMarker}}} } \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ Assign a new OTU table in microbiomeMarker object } ================================================ FILE: man/compare_DA.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-comparing.R \name{compare_DA} \alias{compare_DA} \title{Comparing the results of differential analysis methods by Empirical power and False Discovery Rate} \usage{ compare_DA( ps, group, taxa_rank = "none", methods, args = list(), n_rep = 20, effect_size = 5, k = NULL, relative = TRUE, BPPARAM = BiocParallel::SnowParam(progressbar = TRUE) ) } \arguments{ \item{ps, group, taxa_rank}{main arguments of all differential analysis methods. 
\code{ps}: a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object; \code{group}, character, the variable to set the group, must be one of the var of the sample metadata; \code{taxa_rank}: character, taxonomic rank, please not that \strong{since the abundance table is spiked in the lowest level, only \code{taxa_rank = "none"} is allowed}.} \item{methods}{character vector, differential analysis methods to be compared, available methods are "aldex", "ancom", "ancombc", "deseq2", "edger", "lefse", "limma_voom", "metagenomeseq", "simple_stat".} \item{args}{named list, which used to set the extra arguments of the differential analysis methods, so the names must be contained in \code{methods}. For more see details below.} \item{n_rep}{integer, number of times to run the differential analyses.} \item{effect_size}{numeric, the effect size for the spike-ins. Default 5.} \item{k}{numeric vector of length 3, number of features to spike in each tertile (lower, mid, upper), e.g. \code{k=c(5,10,15)} means 5 features spiked in low abundance tertile, 10 features spiked in mid abundance tertile and 15 features spiked in high abundance tertile. Default \code{NULL}, which will spike 2 percent of the total amount of features in each tertile (a total of 6 percent), but minimum c(5,5,5).} \item{relative}{logical, whether rescale the total number of individuals observed for each sample to the original level after spike-in. Default \code{TRUE}.} \item{BPPARAM}{\code{\link[BiocParallel:BiocParallelParam-class]{BiocParallel::BiocParallelParam}} instance defining the parallel back-end.} } \value{ an \code{compareDA} object, which contains a two-length list of: \itemize{ \item \code{metrics}: \code{data.frame}, FPR, AUC and spike detection rate for each run. \item \code{mm}: differential analysis results. } } \description{ Calculating power, false discovery rates, false positive rates and auc ( area under the receiver operating characteristic (ROC) curve) for various DA methods. 
} \details{ To make this function support for different arguments for a certain DA method \code{args} allows list of list of list e.g. \code{args = list(lefse = list(list(norm = "CPM"), list(norm = "TSS")))}, which specify to compare the different norm arguments for lefse analysis. For \code{taxa_rank}, only \code{taxa_rank = "none"} is supported, if this argument is not "none", it will be forced to "none" internally. } ================================================ FILE: man/confounder.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/confounder.R \name{confounder} \alias{confounder} \title{Confounder analysis} \usage{ confounder( ps, target_var, norm = "none", confounders = NULL, permutations = 999, ... ) } \arguments{ \item{ps}{a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object.} \item{target_var}{character, the variable of interest} \item{norm}{norm the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details.} \item{confounders}{the confounding variables to be measured, if \code{NULL}, all variables in the meta data will be analyzed.} \item{permutations}{the number of permutations, see \code{\link[vegan:anova.cca]{vegan::anova.cca()}}.} \item{...}{extra arguments passed to \code{\link[vegan:anova.cca]{vegan::anova.cca()}}.} } \value{ a \code{data.frame} contains three variables: confounder, pseudo-F and p value. } \description{ Confounding variables may mask the actual differential features. This function utilizes constrained correspondence analysis (CCA) to measure the confounding factors. 
} \examples{ data(caporaso) confounder(caporaso, "SampleType", confounders = "ReportedAntibioticUsage") } ================================================ FILE: man/data-caporaso.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{data-caporaso} \alias{data-caporaso} \alias{caporaso} \title{16S rRNA data from "Moving pictures of the human microbiome"} \format{ a \link[phyloseq:phyloseq]{phyloseq::phyloseq} object } \source{ Data was downloaded from https://www.microbiomeanalyst.ca } \description{ 16S read counts and phylogenetic tree file of 34 Illumina samples derived from Moving Pictures of the Human Microbiome (Caporaso et al.) Group label: gut, left palm, right palm, and tongue - indicating different sampled body sites. } \references{ Caporaso, et al. Moving pictures of the human microbiome. Genome Biol 12, R50 (2011). \url{https://doi.org/10.1186/gb-2011-12-5-r50} } \author{ Yang Cao } ================================================ FILE: man/data-cid_ying.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{data-cid_ying} \alias{data-cid_ying} \alias{cid_ying} \title{16S rRNA data of 94 patients from CID 2012} \format{ a \link[phyloseq:phyloseq]{phyloseq::phyloseq} object } \source{ \url{https://github.com/ying14/yingtools2/tree/master/data} } \description{ Data from a cohort of 94 Bone Marrow Transplant patients previously published on in CID } \references{ Ying, et al. 
Intestinal Domination and the Risk of Bacteremia in Patients Undergoing Allogeneic Hematopoietic Stem Cell Transplantation, Clinical Infectious Diseases, Volume 55, Issue 7, 1 October 2012, Pages 905–914, \url{https://academic.oup.com/cid/article/55/7/905/428203} } \author{ Yang Cao } ================================================ FILE: man/data-ecam.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{data-ecam} \alias{data-ecam} \alias{ecam} \title{Data from Early Childhood Antibiotics and the Microbiome (ECAM) study} \format{ a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object } \description{ The data from a subset of the Early Childhood Antibiotics and the Microbiome (ECAM) study, which tracked the microbiome composition and development of 43 infants in the United States from birth to 2 years of age, identifying microbiome associations with antibiotic exposure, delivery mode, and diet. } \references{ Bokulich, Nicholas A., et al. "Antibiotics, birth mode, and diet shape microbiome maturation during early life." Science translational medicine 8.343 (2016): 343ra82-343ra82. \url{https://github.com/FrederickHuangLin/ANCOM/tree/master/data} } ================================================ FILE: man/data-enterotypes_arumugam.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{data-enterotypes_arumugam} \alias{data-enterotypes_arumugam} \alias{enterotypes_arumugam} \title{Enterotypes data of 39 samples} \format{ a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object } \description{ The data contains 22 European metagenomes from Danish, French, Italian, and Spanish individuals, and 13 Japanese and 4 American. } \references{ Arumugam, Manimozhiyan, et al. Enterotypes of the human gut microbiome. nature 473.7346 (2011): 174-180. 
} \author{ Yang Cao } ================================================ FILE: man/data-kostic_crc.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{data-kostic_crc} \alias{data-kostic_crc} \alias{kostic_crc} \title{Data from a study on colorectal cancer (kostic 2012)} \format{ a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object } \description{ The data from a study on colorectal cancer. Samples that had no \code{DIAGNOSIS} attribute assigned and with less than 500 reads (counts) were removed, and 191 samples remains (91 healthy and 86 Tumors). } \references{ Kostic et al. Genomic analysis identifies association of Fusobacterium with colorectal carcinoma. Genome research, 2012, 22(2), 292-298. } \author{ Yang Cao } ================================================ FILE: man/data-oxygen.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{data-oxygen} \alias{data-oxygen} \alias{oxygen} \title{Oxygen availability 16S dataset, of which taxa table has been summarized for python lefse input} \format{ a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object } \source{ \url{http://huttenhower.sph.harvard.edu/webfm_send/129} } \description{ A small subset of the HMP 16S dataset for finding biomarkers characterizing different level of oxygen availability in different bodysites } \author{ Yang Cao } ================================================ FILE: man/data-pediatric_ibd.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{data-pediatric_ibd} \alias{data-pediatric_ibd} \alias{pediatric_ibd} \title{IBD stool samples} \format{ a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object } \source{ 
\url{https://www.microbiomeanalyst.ca/MicrobiomeAnalyst/resources} } \description{ 43 pediatric IBD stool samples obtained from the Integrative Human Microbiome Project Consortium (iHMP). Group label: CD and Controls. } \author{ Yang Cao } ================================================ FILE: man/data-spontaneous_colitis.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{data-spontaneous_colitis} \alias{data-spontaneous_colitis} \alias{spontaneous_colitis} \title{This is a sample data from lefse python script, a 16S dataset for studying the characteristics of the fecal microbiota in a mouse model of spontaneous colitis.} \format{ a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object } \source{ \url{http://www.huttenhower.org/webfm_send/73} } \description{ The dataset contains 30 abundance profiles (obtained processing the 16S reads with RDP) belonging to 10 rag2 (control) and 20 truc (case) mice. 
} \author{ Yang Cao } ================================================ FILE: man/effect_size-plot.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plot-effect-size.R \name{plot_ef_bar} \alias{plot_ef_bar} \alias{ef-barplot,ef-dotplot} \alias{plot_ef_dot} \title{bar and dot plot of effect size of microbiomeMarker data} \usage{ plot_ef_bar(mm, label_level = 1, max_label_len = 60, markers = NULL) plot_ef_dot(mm, label_level = 1, max_label_len = 60, markers = NULL) } \arguments{ \item{mm}{a \code{\linkS4class{microbiomeMarker}} object} \item{label_level}{integer, number of label levels to be displayed, default \code{1}, \code{0} means display the full name of the feature} \item{max_label_len}{integer, maximum number of characters of feature label, default \code{60}} \item{markers}{character vector, markers to display, default \code{NULL}, indicating plot all markers.} } \value{ a ggplot project } \description{ bar and dot plot of effect size microbiomeMarker data. This function returns a \code{ggplot2} object that can be saved or further customized using \strong{ggplot2} package. 
} \examples{ data(enterotypes_arumugam) mm <- run_limma_voom( enterotypes_arumugam, "Enterotype", contrast = c("Enterotype 3", "Enterotype 2"), pvalue_cutoff = 0.01, p_adjust = "none" ) plot_ef_bar(mm) } ================================================ FILE: man/extract-methods.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/extract-methods.R \name{[} \alias{[} \alias{[,marker_table,ANY,ANY,ANY-method} \title{Extract \code{marker_table} object} \usage{ \S4method{[}{marker_table,ANY,ANY,ANY}(x, i, j, ..., drop = TRUE) } \arguments{ \item{x}{a \code{\link{marker_table}} object.} \item{i, j}{elements to extract.} \item{...}{see \code{\link[base:Extract]{base::Extract()}}.} \item{drop}{ignored now.} } \value{ a \code{marker_table} object. } \description{ Operators acting on \code{marker_table} to extract parts. } \seealso{ \code{\link[base:Extract]{base::Extract()}} } ================================================ FILE: man/extract_posthoc_res.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/post-hoc-test.R \name{extract_posthoc_res} \alias{extract_posthoc_res} \title{Extract results from a posthoc test} \usage{ extract_posthoc_res(object, features = NULL) } \arguments{ \item{object}{a \code{\linkS4class{postHocTest}} object.} \item{features}{either \code{NULL} extracts results of all features, or a character vector to specify the test resuts of which features are extracted.} } \value{ a \code{\link[IRanges:DataFrameList-class]{IRanges::SimpleDFrameList}} object. } \description{ This function extracts the results of posthoc test. 
} \examples{ require(IRanges) pht <- postHocTest( result = DataFrameList( featureA = DataFrame( comparisons = c("group2-group1", "group3-group1", "group3-group2"), diff_mean = runif(3), pvalue = rep(0.01, 3), ci_lower = rep(0.01, 3), ci_upper = rep(0.011, 3) ), featureB = DataFrame( comparisons = c("group2-group1", "group3-group1", "group3-group2"), diff_mean = runif(3), pvalue = rep(0.01, 3), ci_lower = rep(0.01, 3), ci_upper = rep(0.011, 3) ) ), abundance = data.frame( featureA = runif(3), featureB = runif(3), group = c("group1", "group2", "grou3") ) ) extract_posthoc_res(pht, "featureA")[[1]] } ================================================ FILE: man/figures/sticker.R ================================================ imgurl <- "man/figures/microbiome.png" sticker( subplot = imgurl, package="microbiomeMarker", p_size=14, s_x=1, s_y=0.75 , s_width = 0.6, asp = 0.8, filename="man/figures/microbiomeMarker.png", h_color = "grey" ) ================================================ FILE: man/get_treedata_phyloseq.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plot-cladogram.R \name{get_treedata_phyloseq} \alias{get_treedata_phyloseq} \title{Generate tree data from phyloseq object} \usage{ get_treedata_phyloseq(ps, sep = "|") } \arguments{ \item{ps}{a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object} \item{sep}{character, separate between different levels of taxa, default \code{|}} } \value{ a \code{\link[tidytree:treedata-class]{tidytree::treedata}} object } \description{ Generate tree data from phyloseq object } \author{ Yang Cao } \keyword{internal} ================================================ FILE: man/import_dada2.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/import-dada2.R \name{import_dada2} \alias{import_dada2} \title{Import function to read the the output of dada2 as phyloseq 
object} \usage{ import_dada2( seq_tab, tax_tab = NULL, sam_tab = NULL, phy_tree = NULL, keep_taxa_rows = TRUE ) } \arguments{ \item{seq_tab}{matrix-like, ASV table, the output of \code{dada2::removeBimeraDenovo}.} \item{tax_tab}{matrix, taxonomy table, the output of \code{dada2::assignTaxonomy} or \code{dada2::addSpecies}.} \item{sam_tab}{data.frame or \code{\link[phyloseq:sample_data-class]{phyloseq::sample_data}}, sample data} \item{phy_tree}{\code{\link[ape:read.tree]{ape::phylo}} class or character represents the path of the tree file} \item{keep_taxa_rows}{logical, whether keep taxa in rows or not in the \code{otu_table} of the returned \code{phyloseq} object, default \code{TRUE}.} } \value{ \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object hold the taxonomy info, sample metadata, number of reads per ASV. } \description{ Import the output of dada2 into phyloseq object } \details{ The output of the dada2 pipeline is a feature table of amplicon sequence variants (an ASV table): A matrix with rows corresponding to samples and columns to ASVs, in which the value of each entry is the number of times that ASV was observed in that sample. This table is analogous to the traditional OTU table. Conveniently, taxa names are saved as ASV1, ASV2, ..., in the returned phyloseq object. 
} \examples{ seq_tab <- readRDS(system.file("extdata", "dada2_seqtab.rds", package = "microbiomeMarker" )) tax_tab <- readRDS(system.file("extdata", "dada2_taxtab.rds", package = "microbiomeMarker" )) sam_tab <- read.table(system.file("extdata", "dada2_samdata.txt", package = "microbiomeMarker" ), sep = "\t", header = TRUE, row.names = 1) ps <- import_dada2(seq_tab = seq_tab, tax_tab = tax_tab, sam_tab = sam_tab) ps } ================================================ FILE: man/import_picrust2.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/import-picrust2.R \name{import_picrust2} \alias{import_picrust2} \title{Import function to read the output of picrust2 as phyloseq object} \usage{ import_picrust2( feature_tab, sam_tab = NULL, trait = c("PATHWAY", "COG", "EC", "KO", "PFAM", "TIGRFAM", "PHENO") ) } \arguments{ \item{feature_tab}{character, file path of the prediction abundance table of functional feature.} \item{sam_tab}{character, file path of the sample meta data.} \item{trait}{character, options are picrust2 function traits (including "COG", "EC", "KO", "PFAM", "TIGRFAM", and "PHENO") and "PATHWAY".} } \value{ \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object. } \description{ Import the output of picrust2 into phyloseq object } \details{ \href{https://huttenhower.sph.harvard.edu/picrust/}{PICRUSt2} is a software for predicting abundances of functional profiles based on marker gene sequencing data. The functional profiles can be predicted from the taxonomic profiles using PICRUSt2. "Function" usually refers to gene families such as KEGG orthologs and Enzyme Classification numbers, but predictions can be made for any arbitrary trait. In the \verb{phyloseq object}, the predicted function abundance profile is stored in \code{otu_table} slot. 
And the functional trait is saved in \code{tax_table} slot, if the descriptions of function features is not added to the predicted table, \code{tax_table} will have only one rank \code{Picrust_trait} to represent the function feature id, or if the desciptions are added one more rank \code{Picrust_description} will be added to represent the description of function feature. } \examples{ sam_tab <- system.file( "extdata", "picrust2_metadata.tsv", package = "microbiomeMarker") feature_tab <- system.file( "extdata", "path_abun_unstrat_descrip.tsv.gz", package = "microbiomeMarker") ps <- import_picrust2(feature_tab, sam_tab, trait = "PATHWAY") ps } ================================================ FILE: man/import_qiime2.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/import-qiime2.R \name{import_qiime2} \alias{import_qiime2} \title{Import function to read the the output of dada2 as phyloseq object} \usage{ import_qiime2( otu_qza, taxa_qza = NULL, sam_tab = NULL, refseq_qza = NULL, tree_qza = NULL ) } \arguments{ \item{otu_qza}{character, file path of the feature table from qiime2.} \item{taxa_qza}{character, file path of the taxonomic table from qiime2, default \code{NULL}.} \item{sam_tab}{character, file path of the sample metadata in tsv format, default \code{NULL}.} \item{refseq_qza}{character, file path of the representative sequences from qiime2, default \code{NULL}.} \item{tree_qza}{character, file path of the phylogenetic tree from qiime2, default \code{NULL}.} } \value{ \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object. } \description{ Import the qiime2 artifacts, including feature table, taxonomic table, phylogenetic tree, representative sequence and sample metadata into phyloseq object. 
} \examples{ otuqza_file <- system.file( "extdata", "table.qza", package = "microbiomeMarker" ) taxaqza_file <- system.file( "extdata", "taxonomy.qza", package = "microbiomeMarker" ) sample_file <- system.file( "extdata", "sample-metadata.tsv", package = "microbiomeMarker" ) treeqza_file <- system.file( "extdata", "tree.qza", package = "microbiomeMarker" ) ps <- import_qiime2( otu_qza = otuqza_file, taxa_qza = taxaqza_file, sam_tab = sample_file, tree_qza = treeqza_file ) ps } ================================================ FILE: man/marker_table-class.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/AllClasses.R \docType{class} \name{marker_table-class} \alias{marker_table-class} \title{The S4 class for storing microbiome marker information} \description{ This Class is inherit from \code{data.frame}. Rows represent the microbiome markers and variables represents feature of the marker. } \section{Fields}{ \describe{ \item{\code{names,row.names}}{a character vector, inherited from the input data.frame} \item{\code{.data}}{a list, each element corresponding the each column of the input data.frame} \item{\code{.S3Class}}{character, the S3 class \code{marker_table} inherited from: "\code{data.frame}"} }} \author{ Yang Cao } ================================================ FILE: man/marker_table-methods.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/AllGenerics.R \name{marker_table} \alias{marker_table} \alias{marker_table,data.frame-method} \alias{marker_table,microbiomeMarker-method} \title{Build or access the marker_table} \usage{ marker_table(object) \S4method{marker_table}{data.frame}(object) \S4method{marker_table}{microbiomeMarker}(object) } \arguments{ \item{object}{an object among the set of classes defined by the microbiomeMarker package that contain \code{marker_table}} } \value{ a 
\code{\linkS4class{marker_table}} object. } \description{ This is the recommended function for both building and accessing microbiome marker table (\code{\linkS4class{marker_table}}). } \examples{ data(enterotypes_arumugam) mm <- run_limma_voom( enterotypes_arumugam, "Enterotype", contrast = c("Enterotype 3", "Enterotype 2"), pvalue_cutoff = 0.05, p_adjust = "fdr" ) marker_table(mm) } ================================================ FILE: man/microbiomeMarker-class.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/AllClasses.R, R/AllGenerics.R \docType{class} \name{microbiomeMarker-class} \alias{microbiomeMarker-class} \alias{show,microbiomeMarker-method} \title{The main class for microbiomeMarker data} \usage{ \S4method{show}{microbiomeMarker}(object) } \arguments{ \item{object}{a \code{microbiomeMarker-class} object} } \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ \code{microbiomeMarker-class} is inherited from the \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} by adding a custom slot \code{microbiome_marker} to save the differential analysis results. And it provides a seamless interface with \strong{phyloseq}, which makes \strong{microbiomeMarker} simple and easy to use. For more details on see the document of \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}}. 
} \section{Slots}{ \describe{ \item{\code{marker_table}}{a data.frame, a \code{\linkS4class{marker_table}} object.} \item{\code{norm_method}}{character, method used to normalize the input \code{phyloseq} object.} \item{\code{diff_method}}{character, method used for marker identification.} }} \seealso{ \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}}, \code{\linkS4class{marker_table}}, \code{\link[=summarize_taxa]{summarize_taxa()}} } ================================================ FILE: man/microbiomeMarker-package.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/microbiomeMarker.R \docType{package} \name{microbiomeMarker-package} \alias{microbiomeMarker-package} \title{microbiomeMarker: A package for microbiome biomarker discovery} \description{ The microboimeMarker package provides several methods to identify micribome biomarker, such as lefse, deseq2. } \seealso{ Useful links: \itemize{ \item \url{https://github.com/yiluheihei/microbiomeMarker} \item Report bugs at \url{https://github.com/yiluheihei/microbiomeMarker/issues} } } \author{ \strong{Maintainer}: Yang Cao \email{caoyang.name@gmail.com} } \keyword{internal} ================================================ FILE: man/microbiomeMarker.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/AllClasses.R \name{microbiomeMarker} \alias{microbiomeMarker} \title{Build microbiomeMarker-class objects} \usage{ microbiomeMarker( marker_table = NULL, norm_method = NULL, diff_method = NULL, ... 
) } \arguments{ \item{marker_table}{a \code{\linkS4class{marker_table}} object differtial analysis.} \item{norm_method}{character, method used to normalize the input \code{phyloseq} object.} \item{diff_method}{character, method used for microbiome marker identification.} \item{...}{arguments passed to \code{\link[phyloseq:phyloseq]{phyloseq::phyloseq()}}} } \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ This the constructor to build the \code{\linkS4class{microbiomeMarker}} object, don't use the \code{new()} constructor. } \examples{ microbiomeMarker( marker_table = marker_table(data.frame( feature = c("speciesA", "speciesB"), enrich_group = c("groupA", "groupB"), ef_logFC = c(-2, 2), pvalue = c(0.01, 0.01), padj = c(0.01, 0.01), row.names = c("marker1", "marker2") )), norm_method = "TSS", diff_method = "DESeq2", otu_table = otu_table(matrix( c(4, 1, 1, 4), nrow = 2, byrow = TRUE, dimnames = list(c("speciesA", "speciesB"), c("sample1", "sample2")) ), taxa_are_rows = TRUE ), tax_table = tax_table(matrix( c("speciesA", "speciesB"), nrow = 2, dimnames = list(c("speciesA", "speciesB"), "Species") )), sam_data = sample_data(data.frame( group = c("groupA", "groupB"), row.names = c("sample1", "sample2") )) ) } \seealso{ \code{\link[phyloseq:phyloseq]{phyloseq::phyloseq()}} } ================================================ FILE: man/nmarker-methods.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/AllGenerics.R \docType{methods} \name{nmarker} \alias{nmarker} \alias{nmarker,microbiomeMarker-method} \alias{nmarker,marker_table-method} \title{Get the number of microbiome markers} \usage{ nmarker(object) \S4method{nmarker}{microbiomeMarker}(object) \S4method{nmarker}{marker_table}(object) } \arguments{ \item{object}{a \code{\linkS4class{microbiomeMarker}} or \code{\linkS4class{marker_table}} object} } \value{ an integer, the number of microbiome markers } \description{ 
Get the number of microbiome markers } \examples{ mt <- marker_table(data.frame( feature = c("speciesA", "speciesB"), enrich_group = c("groupA", "groupB"), ef_logFC = c(-2, 2), pvalue = c(0.01, 0.01), padj = c(0.01, 0.01), row.names = c("marker1", "marker2") )) nmarker(mt) } ================================================ FILE: man/normalize-methods.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/normalization.R \name{normalize,phyloseq-method} \alias{normalize,phyloseq-method} \alias{normalize,otu_table-method} \alias{normalize} \alias{normalize,data.frame-method} \alias{normalize,matrix-method} \alias{norm_rarefy} \alias{norm_tss} \alias{norm_css} \alias{norm_rle} \alias{norm_tmm} \alias{norm_clr} \alias{norm_cpm} \title{Normalize the microbial abundance data} \usage{ \S4method{normalize}{phyloseq}(object, method = "TSS", ...) \S4method{normalize}{otu_table}(object, method = "TSS", ...) \S4method{normalize}{data.frame}(object, method = "TSS", ...) \S4method{normalize}{matrix}(object, method = "TSS", ...) norm_rarefy( object, size = min(sample_sums(object)), rng_seed = FALSE, replace = TRUE, trim_otus = TRUE, verbose = TRUE ) norm_tss(object) norm_css(object, sl = 1000) norm_rle( object, locfunc = stats::median, type = c("poscounts", "ratio"), geo_means = NULL, control_genes = NULL ) norm_tmm( object, ref_column = NULL, logratio_trim = 0.3, sum_trim = 0.05, do_weighting = TRUE, Acutoff = -1e+10 ) norm_clr(object) norm_cpm(object) } \arguments{ \item{object}{a \link[phyloseq:phyloseq-class]{phyloseq::phyloseq} or \link[phyloseq:otu_table-class]{phyloseq::otu_table}} \item{method}{the methods used to normalize the microbial abundance data. Options includes: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. 
\item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. \item "CPM": per-sample normalization of the sum of the values to 1e+06. }} \item{...}{other arguments passed to the corresponding normalization methods.} \item{size, rng_seed, replace, trim_otus, verbose}{extra arguments passed to \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth()}}.} \item{sl}{The value to scale.} \item{locfunc}{a function to compute a location for a sample. By default, the median is used.} \item{type}{method for estimation: either "ratio" or "poscounts" (recommended).} \item{geo_means}{default \code{NULL}, which means the geometric means of the counts are used. A vector of geometric means from another count matrix can be provided for a "frozen" size factor calculation.} \item{control_genes}{default \code{NULL}, which means all taxa are used for size factor estimation, numeric or logical index vector specifying the taxa used for size factor estimation (e.g. 
core taxa).} \item{ref_column}{column to use as reference} \item{logratio_trim}{amount of trim to use on log-ratios} \item{sum_trim}{amount of trim to use on the combined absolute levels ("A" values)} \item{do_weighting}{whether to compute the weights or not} \item{Acutoff}{cutoff on "A" values to use before trimming} } \value{ the same class as \code{object}. } \description{ It is critical to normalize the feature table to eliminate any bias due to differences in the sampling sequencing depth. This function implements seven widely-used normalization methods for microbial compositional data. For rarefying, reads in the different samples are randomly removed until the same predefined number has been reached, to assure all samples have the same library size. Rarefying normalization method is the standard in microbial ecology. Please note that the authors of phyloseq do not advocate using rarefying as a normalization procedure, despite its recent popularity. TSS simply transforms the feature table into relative abundance by dividing the number of total reads of each sample. CSS is based on the assumption that the count distributions in each sample are equivalent for low abundant genes up to a certain threshold. Only the segment of each sample’s count distribution that is relatively invariant across samples is scaled by CSS. RLE assumes most features are not differential and uses the relative abundances to calculate the normalization factor. TMM calculates the normalization factor using robust statistics based on the assumption that most features are not differential and should, on average, be equal between the samples. The TMM scaling factor is calculated as the weighted mean of log-ratios between each pair of samples, after excluding the highest count OTUs and OTUs with the largest log-fold change. In CLR, the log-ratios are computed relative to the geometric mean of all features. 
\code{norm_cpm}: This normalization method is from the original LEfSe algorithm, recommended when very low values are present (as shown in the LEfSe galaxy). } \examples{ data(caporaso) normalize(caporaso, "TSS") } \seealso{ \code{\link[edgeR:calcNormFactors]{edgeR::calcNormFactors()}},\code{\link[DESeq2:estimateSizeFactorsForMatrix]{DESeq2::estimateSizeFactorsForMatrix()}}, \code{\link[metagenomeSeq:cumNorm]{metagenomeSeq::cumNorm()}} \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth()}} \code{\link[metagenomeSeq:calcNormFactors]{metagenomeSeq::calcNormFactors()}} \code{\link[DESeq2:estimateSizeFactorsForMatrix]{DESeq2::estimateSizeFactorsForMatrix()}} \code{\link[edgeR:calcNormFactors]{edgeR::calcNormFactors()}} } ================================================ FILE: man/phyloseq2DESeq2.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-deseq2.R \name{phyloseq2DESeq2} \alias{phyloseq2DESeq2} \title{Convert \code{phyloseq-class} object to \code{DESeqDataSet-class} object} \usage{ phyloseq2DESeq2(ps, design, ...) } \arguments{ \item{ps}{the [phyloseq::phyloseq-class\verb{] object to convert, which must have a [}phyloseq::sample_data()`] component.} \item{design}{a \code{formula} or \code{matrix}, the formula expresses how the counts for each gene depend on the variables in colData. Many R formula are valid, including designs with multiple variables, e.g., \code{~ group + condition}. This argument is passed to \code{\link[DESeq2:DESeqDataSet]{DESeq2::DESeqDataSetFromMatrix()}}.} \item{...}{additional arguments passed to \code{\link[DESeq2:DESeqDataSet]{DESeq2::DESeqDataSetFromMatrix()}}, Most users will not need to pass any additional arguments here.} } \value{ a \code{\link[DESeq2:DESeqDataSet]{DESeq2::DESeqDataSet}} object. 
} \description{ This function converts \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} to \code{\link[DESeq2:DESeqDataSet]{DESeq2::DESeqDataSet}}, which can then be tested using \code{\link[DESeq2:DESeq]{DESeq2::DESeq()}}. } \examples{ data(caporaso) phyloseq2DESeq2(caporaso, ~SampleType) } \seealso{ \code{\link[DESeq2:DESeqDataSet]{DESeq2::DESeqDataSetFromMatrix()}},\code{\link[DESeq2:DESeq]{DESeq2::DESeq()}} } ================================================ FILE: man/phyloseq2edgeR.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-edgeR.R \name{phyloseq2edgeR} \alias{phyloseq2edgeR} \title{Convert phyloseq data to edgeR \code{DGEList} object} \usage{ phyloseq2edgeR(ps, ...) } \arguments{ \item{ps}{a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object.} \item{...}{optional, additional named arguments passed to \code{\link[edgeR:DGEList]{edgeR::DGEList()}}. Most users will not need to pass any additional arguments here.} } \value{ A \code{\link[edgeR:DGEList-class]{edgeR::DGEList}} object. } \description{ This function converts a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object to a \code{\link[edgeR:DGEList-class]{edgeR::DGEList}} object, which can then be used to perform differential analysis using the methods in \strong{edgeR}. } \examples{ data(caporaso) dge <- phyloseq2edgeR(caporaso) } ================================================ FILE: man/phyloseq2metagenomeSeq.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-metagenomeSeq.R \name{phyloseq2metagenomeSeq} \alias{phyloseq2metagenomeSeq} \alias{otu_table2metagenomeSeq} \title{Convert phyloseq data to MetagenomeSeq \code{MRexperiment} object} \usage{ phyloseq2metagenomeSeq(ps, ...) otu_table2metagenomeSeq(ps, ...) 
} \arguments{ \item{ps}{\code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object for \code{phyloseq2metagenomeSeq()}, or \code{\link[phyloseq:otu_table-class]{phyloseq::otu_table}} object for \code{otu_table2metagenomeseq()}.} \item{...}{optional, additional named arguments passed to \code{\link[metagenomeSeq:newMRexperiment]{metagenomeSeq::newMRexperiment()}}. Most users will not need to pass any additional arguments here.} } \value{ A \code{\link[metagenomeSeq:MRexperiment-class]{metagenomeSeq::MRexperiment}} object. } \description{ The phyloseq data is converted to the relevant \code{\link[metagenomeSeq:MRexperiment-class]{metagenomeSeq::MRexperiment}} object, which can then be tested in the zero-inflated mixture model framework in the metagenomeSeq package. } \examples{ data(caporaso) phyloseq2metagenomeSeq(caporaso) } \seealso{ \code{\link[metagenomeSeq:fitTimeSeries]{metagenomeSeq::fitTimeSeries()}}, \code{\link[metagenomeSeq:fitLogNormal]{metagenomeSeq::fitLogNormal()}},\code{\link[metagenomeSeq:fitZig]{metagenomeSeq::fitZig()}}, \code{\link[metagenomeSeq:MRtable]{metagenomeSeq::MRtable()}},\code{\link[metagenomeSeq:MRfulltable]{metagenomeSeq::MRfulltable()}} } ================================================ FILE: man/plot.compareDA.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plot-comparing.R \name{plot.compareDA} \alias{plot.compareDA} \title{Plotting DA comparing result} \usage{ \method{plot}{compareDA}(x, sort = c("score", "auc", "fpr", "power"), ...) } \arguments{ \item{x}{an \code{compareDA} object, output from \code{\link[=compare_DA]{compare_DA()}}.} \item{sort}{character string specifying sort method. 
Possibilities are "score" which is calculated as \eqn{(auc - 0.5) * power - fdr}, "auc" for area under the ROC curve, "fpr" for false positive rate, "power" for empirical power.} \item{...}{extra arguments, just ignore it.} } \value{ a \code{\link[ggplot2:ggplot]{ggplot2::ggplot}} object containing 4 subplots: "auc", "fdr", "power"and "score" plot. } \description{ Plotting DA comparing result } ================================================ FILE: man/plot_abundance.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plot-abundance.R \name{plot_abundance} \alias{plot_abundance} \title{plot the abundances of markers} \usage{ plot_abundance(mm, label_level = 1, max_label_len = 60, markers = NULL, group) } \arguments{ \item{mm}{a \code{\linkS4class{microbiomeMarker}} object} \item{label_level}{integer, number of label levels to be displayed, default \code{1}, \code{0} means display the full name of the feature} \item{max_label_len}{integer, maximum number of characters of feature label, default \code{60}} \item{markers}{character vector, markers to display, default \code{NULL}, indicating plot all markers.} \item{group}{character, the variable to set the group} } \value{ a \code{\link[ggplot2:ggplot]{ggplot2::ggplot}} object. 
} \description{ plot the abundances of markers } \examples{ data(enterotypes_arumugam) mm <- run_limma_voom( enterotypes_arumugam, "Enterotype", contrast = c("Enterotype 3", "Enterotype 2"), pvalue_cutoff = 0.01, p_adjust = "none" ) plot_abundance(mm, group = "Enterotype") } ================================================ FILE: man/plot_cladogram.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plot-cladogram.R \name{plot_cladogram} \alias{plot_cladogram} \title{plot cladogram of microbiomeMarker results} \usage{ plot_cladogram( mm, color, only_marker = FALSE, branch_size = 0.2, alpha = 0.2, node_size_scale = 1, node_size_offset = 1, clade_label_level = 4, clade_label_font_size = 4, annotation_shape = 22, annotation_shape_size = 5, group_legend_param = list(), marker_legend_param = list() ) } \arguments{ \item{mm}{a \linkS4class{microbiomeMarker} object} \item{color}{a color vector, used to highlight the clades of microbiome biomarker. The values will be matched in order (usually alphabetical) with the groups. 
If this is a named vector, then the colors will be matched based on the names instead.} \item{only_marker}{logical, whether to show all the features or only markers in the cladogram, default \code{FALSE}.} \item{branch_size}{numeric, size of branch, default \code{0.2}} \item{alpha}{alpha parameter for shading, default \code{0.2}} \item{node_size_scale}{the parameter 'a' controlling node size: \code{node_size=a*log(relative_abundance) + b}} \item{node_size_offset}{the parameter 'b' controlling node size: \code{node_size=a*log(relative_abundance) + b}} \item{clade_label_level}{max level of taxa used to label the clade, other level of taxa will be shown on the side.} \item{clade_label_font_size}{font size of the clade label, default 4.} \item{annotation_shape}{shape used for annotation, default \code{22}} \item{annotation_shape_size}{size used for annotation shape, default \code{5}} \item{group_legend_param, marker_legend_param}{a list specifying extra parameters of group legend and marker legend, such as \code{direction} (the direction of the guide), \code{nrow} (the desired number of rows of legends). See \code{\link[ggplot2:guide_legend]{ggplot2::guide_legend()}} for more details.} } \value{ a ggtree object } \description{ plot cladogram of microbiomeMarker results } \examples{ data(kostic_crc) kostic_crc_small <- phyloseq::subset_taxa( kostic_crc, Phylum \%in\% c("Firmicutes") ) mm_lefse <- run_lefse( kostic_crc_small, wilcoxon_cutoff = 0.01, group = "DIAGNOSIS", kw_cutoff = 0.01, multigrp_strat = TRUE, lda_cutoff = 4 ) plot_cladogram(mm_lefse, color = c("darkgreen", "red")) } \references{ This function is modified from \code{clada.anno} from microbiomeViz. 
} \seealso{ \code{\link[ggtree:ggtree]{ggtree::ggtree()}} } \author{ Chenhao Li, Guangchuang Yu, Chenghao Zhu, Yang Cao } ================================================ FILE: man/plot_heatmap.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plot-heatmap.R \name{plot_heatmap} \alias{plot_heatmap} \title{Heatmap of microbiome marker} \usage{ plot_heatmap( mm, transform = c("log10", "log10p", "identity"), cluster_marker = FALSE, cluster_sample = FALSE, markers = NULL, label_level = 1, max_label_len = 60, sample_label = FALSE, scale_by_row = FALSE, annotation_col = NULL, group, ... ) } \arguments{ \item{mm}{a \code{\linkS4class{microbiomeMarker}} object} \item{transform}{transformation to apply, for more details see \code{\link[=transform_abundances]{transform_abundances()}}: \itemize{ \item "identity", return the original data without any transformation. \item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. 
}} \item{cluster_marker, cluster_sample}{logical, controls whether to perform clustering in markers (rows) and samples (cols), default \code{FALSE}.} \item{markers}{character vector, markers to display, default \code{NULL}, indicating plot all markers.} \item{label_level}{integer, number of label levels to be displayed, default \code{1}, \code{0} means display the full name of the feature} \item{max_label_len}{integer, maximum number of characters of feature label, default \code{60}} \item{sample_label}{logical, controls whether to show the sample labels in the heatmap, default \code{FALSE}.} \item{scale_by_row}{logical, controls whether to scale the heatmap by the row (marker) values, default \code{FALSE}.} \item{annotation_col}{assign colors for the top annotation using a named vector, passed to \code{col} in \code{\link[ComplexHeatmap:HeatmapAnnotation]{ComplexHeatmap::HeatmapAnnotation()}}.} \item{group}{character, the variable to set the group} \item{...}{extra arguments passed to \code{\link[ComplexHeatmap:Heatmap]{ComplexHeatmap::Heatmap()}}.} } \value{ a \code{\link[ComplexHeatmap:Heatmap-class]{ComplexHeatmap::Heatmap}} object. } \description{ Display the microbiome marker using heatmap, in which rows represents the marker and columns represents the samples. 
} \examples{ data(kostic_crc) kostic_crc_small <- phyloseq::subset_taxa( kostic_crc, Phylum \%in\% c("Firmicutes") ) mm_lefse <- run_lefse( kostic_crc_small, wilcoxon_cutoff = 0.01, group = "DIAGNOSIS", kw_cutoff = 0.01, multigrp_strat = TRUE, lda_cutoff = 4 ) plot_heatmap(mm_lefse, group = "DIAGNOSIS") } \seealso{ \code{\link{transform_abundances}},\code{\link[ComplexHeatmap:Heatmap]{ComplexHeatmap::Heatmap()}} } ================================================ FILE: man/plot_postHocTest.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plot-postHocTest.R \name{plot_postHocTest} \alias{plot_postHocTest} \title{\code{postHocTest} plot} \usage{ plot_postHocTest(pht, feature, step_increase = 0.12) } \arguments{ \item{pht}{a \code{\linkS4class{postHocTest}} object} \item{feature}{character, to plot the post-toc test result of this feature} \item{step_increase}{numeric vector with the increase in fraction of total height for every additional comparison to minimize overlap, default \code{0.12}.} } \value{ a \code{ggplot} object } \description{ Visualize the result of post-hoc test using ggplot2 } \examples{ data(enterotypes_arumugam) ps <- phyloseq::subset_samples( enterotypes_arumugam, Enterotype \%in\% c("Enterotype 3", "Enterotype 2", "Enterotype 1") ) \%>\% phyloseq::subset_taxa(Phylum == "Bacteroidetes") pht <- run_posthoc_test(ps, group = "Enterotype") plot_postHocTest(pht, feature = "p__Bacteroidetes|g__Alistipes") } ================================================ FILE: man/plot_sl_roc.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plot-sl-roc.R \name{plot_sl_roc} \alias{plot_sl_roc} \title{ROC curve of microbiome marker from supervised learning methods} \usage{ plot_sl_roc(mm, group, nfolds = 3, nrepeats = 3, tune_length = 5, ...) 
} \arguments{ \item{mm}{a \linkS4class{microbiomeMarker} object.} \item{group, nfolds, nrepeats, tune_length, ...}{same with the \code{run_sl()}.} } \value{ a \code{\link[ggplot2:ggplot]{ggplot2::ggplot}} object. } \description{ Show the ROC curve of the microbiome marker calculated by \code{run_sl}. } \examples{ data(enterotypes_arumugam) # small example phyloseq object for test ps_s <- phyloseq::subset_taxa( enterotypes_arumugam, Phylum \%in\% c("Firmicutes", "Bacteroidetes") ) set.seed(2021) mm <- run_sl( ps_s, group = "Gender", taxa_rank = "Genus", nfolds = 2, nrepeats = 1, top_n = 15, norm = "TSS", method = "LR", ) plot_sl_roc(mm, group = "Gender") } \seealso{ \code{\link[=run_sl]{run_sl()}} } ================================================ FILE: man/postHocTest-class.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/AllClasses.R, R/AllGenerics.R \docType{class} \name{postHocTest-class} \alias{postHocTest-class} \alias{show,postHocTest-method} \alias{show,} \alias{postHocTest-method} \title{The postHocTest Class, represents the result of post-hoc test result among multiple groups} \usage{ \S4method{show}{postHocTest}(object) } \arguments{ \item{object}{a \code{postHocTest-class} object} } \value{ a \code{\linkS4class{postHocTest}} object. 
} \description{ The postHocTest Class, represents the result of post-hoc test result among multiple groups } \section{Slots}{ \describe{ \item{\code{result}}{a \code{\link[IRanges:DataFrameList-class]{IRanges::DataFrameList}}, each \code{DataFrame} consists of five variables: \itemize{ \item \code{comparisons}: character, specify which two groups to test (the group names are separated by "_) \item \code{diff_mean}: numeric, difference in mean abundances \item \code{pvalue}: numeric, p values \item \code{ci_lower} and \code{ci_upper}: numeric, lower and upper confidence interval of difference in mean abundances }} \item{\code{abundance}}{abundance of each feature in each group} \item{\code{conf_level}}{confidence level} \item{\code{method}}{method used for post-hoc test} \item{\code{method_str}}{method illustration} }} \author{ Yang Cao } ================================================ FILE: man/postHocTest.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/AllClasses.R \name{postHocTest} \alias{postHocTest} \title{Build postHocTest object} \usage{ postHocTest( result, abundance, conf_level = 0.95, method = "tukey", method_str = paste("Posthoc multiple comparisons of means: ", method) ) } \arguments{ \item{result}{a \code{\link[IRanges:DataFrameList-class]{IRanges::SimpleDFrameList}} object.} \item{abundance}{data.frame.} \item{conf_level}{numeric, confidence level.} \item{method}{character, method for posthoc test.} \item{method_str}{character, illustrates which method is used for posthoc test.} } \value{ a \code{\linkS4class{postHocTest}} object. } \description{ This function is used for create \code{postHocTest} object, and is only used for developers. 
} \examples{ require(IRanges) pht <- postHocTest( result = DataFrameList( featureA = DataFrame( comparisons = c("group2-group1", "group3-group1", "group3-group2"), diff_mean = runif(3), pvalue = rep(0.01, 3), ci_lower = rep(0.01, 3), ci_upper = rep(0.011, 3) ), featureB = DataFrame( comparisons = c("group2-group1", "group3-group1", "group3-group2"), diff_mean = runif(3), pvalue = rep(0.01, 3), ci_lower = rep(0.01, 3), ci_upper = rep(0.011, 3) ) ), abundance = data.frame( featureA = runif(3), featureB = runif(3), group = c("group1", "group2", "grou3") ) ) pht } ================================================ FILE: man/reexports.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/reexports.R \docType{import} \name{reexports} \alias{reexports} \alias{ntaxa} \alias{nsamples} \alias{otu_table} \alias{sample_data} \alias{tax_table} \alias{taxa_names} \alias{sample_names} \alias{\%>\%} \alias{import_qiime} \alias{import_mothur} \alias{import_biom} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. 
\describe{ \item{magrittr}{\code{\link[magrittr:pipe]{\%>\%}}} \item{phyloseq}{\code{\link[phyloseq]{import_biom}}, \code{\link[phyloseq]{import_mothur}}, \code{\link[phyloseq]{import_qiime}}, \code{\link[phyloseq:nsamples-methods]{nsamples}}, \code{\link[phyloseq:ntaxa-methods]{ntaxa}}, \code{\link[phyloseq:otu_table-methods]{otu_table}}, \code{\link[phyloseq:sample_data-methods]{sample_data}}, \code{\link[phyloseq:sample_names-methods]{sample_names}}, \code{\link[phyloseq:tax_table-methods]{tax_table}}, \code{\link[phyloseq:taxa_names-methods]{taxa_names}}} }} ================================================ FILE: man/run_aldex.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-aldex.R \name{run_aldex} \alias{run_aldex} \title{Perform differential analysis using ALDEx2} \usage{ run_aldex( ps, group, taxa_rank = "all", transform = c("identity", "log10", "log10p"), norm = "none", norm_para = list(), method = c("t.test", "wilcox.test", "kruskal", "glm_anova"), p_adjust = c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY"), pvalue_cutoff = 0.05, mc_samples = 128, denom = c("all", "iqlr", "zero", "lvha"), paired = FALSE ) } \arguments{ \item{ps}{a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object} \item{group}{character, the variable to set the group} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. Should be one of \code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(phyloseq)}, e.g., OTU or ASV).} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. 
The options include: \itemize{ \item "identity", return the original data without any transformation (default). \item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. \item "CPM": pre-sample normalization of the sum of the values to 1e+06. 
}} \item{norm_para}{arguments passed to specific normalization methods} \item{method}{test method, options include: "t.test" and "wilcox.test" for two groups comparison, "kruskal" and "glm_anova" for multiple groups comparison.} \item{p_adjust}{method for multiple test correction, default \code{none}, for more details see \link[stats:p.adjust]{stats::p.adjust}.} \item{pvalue_cutoff}{cutoff of p value, default 0.05.} \item{mc_samples}{integer, the number of Monte Carlo samples to use for underlying distributions estimation, 128 is usually sufficient.} \item{denom}{character string, specifiy which features used to as the denominator for the geometric mean calculation. Options are: \itemize{ \item "all", with all features. \item "iqlr", accounts for data with systematic variation and centers the features on the set features that have variance that is between the lower and upper quartile of variance. \item "zero", a more extreme case where there are many non-zero features in one condition but many zeros in another. In this case the geometric mean of each group is calculated using the set of per-group non-zero features. \item "lvha", with house keeping features. }} \item{paired}{logical, whether to perform paired tests, only worked for method "t.test" and "wilcox.test".} } \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ Perform differential analysis using ALDEx2 } \examples{ data(enterotypes_arumugam) ps <- phyloseq::subset_samples( enterotypes_arumugam, Enterotype \%in\% c("Enterotype 3", "Enterotype 2") ) run_aldex(ps, group = "Enterotype") } \references{ Fernandes, A.D., Reid, J.N., Macklaim, J.M. et al. Unifying the analysis of high-throughput sequencing datasets: characterizing RNA-seq, 16S rRNA gene sequencing and selective growth experiments by compositional data analysis. Microbiome 2, 15 (2014). 
} \seealso{ \code{\link[ALDEx2:aldex]{ALDEx2::aldex()}} } ================================================ FILE: man/run_ancom.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-ancom.R \name{run_ancom} \alias{run_ancom} \title{Perform differential analysis using ANCOM} \usage{ run_ancom( ps, group, confounders = character(0), taxa_rank = "all", transform = c("identity", "log10", "log10p"), norm = "TSS", norm_para = list(), p_adjust = c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY"), pvalue_cutoff = 0.05, W_cutoff = 0.75 ) } \arguments{ \item{ps}{a \code{\link[phyloseq]{phyloseq-class}} object.} \item{group}{character, the variable to set the group.} \item{confounders}{character vector, the confounding variables to be adjusted. default \code{character(0)}, indicating no confounding variable.} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. Should be one of \code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(phyloseq)}, e.g., OTU or ASV).} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The options include: \itemize{ \item "identity", return the original data without any transformation. \item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item "none": do not normalize. 
\item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. \item "CPM": pre-sample normalization of the sum of the values to 1e+06. }} \item{norm_para}{named \code{list}. other arguments passed to specific normalization methods. Most users will not need to pass any additional arguments here.} \item{p_adjust}{method for multiple test correction, default \code{none}, for more details see \link[stats:p.adjust]{stats::p.adjust}.} \item{pvalue_cutoff}{significance level for each of the statistical tests, default 0.05.} \item{W_cutoff}{lower bound for the proportion for the W-statistic, default 0.7.} } \value{ a \linkS4class{microbiomeMarker} object, in which the \code{slot} of \code{marker_table} contains four variables: \itemize{ \item \code{feature}, significantly different features. \item \code{enrich_group}, the class of the differential features enriched. \item \code{effect_size}, differential means for two groups, or F statistic for more than two groups. \item \code{W}, the W-statistic, number of features that a single feature is tested to be significantly different against. 
} } \description{ Perform significant test by comparing the pairwise log ratios between all features. } \details{ In an experiment with only two treatments, this tests the following hypothesis for feature \eqn{i}: \deqn{H_{0i}: E(log(\mu_i^1)) = E(log(\mu_i^2))} where \eqn{\mu_i^1} and \eqn{\mu_i^2} are the mean abundances for feature \eqn{i} in the two groups. The developers of this method recommend the following significance tests if there are 2 groups, use non-parametric Wilcoxon rank sum test \code{\link[stats:wilcox.test]{stats::wilcox.test()}}. If there are more than 2 groups, use nonparametric \code{\link[stats:kruskal.test]{stats::kruskal.test()}} or one-way ANOVA \code{\link[stats:aov]{stats::aov()}}. } \examples{ \donttest{ data(enterotypes_arumugam) ps <- phyloseq::subset_samples( enterotypes_arumugam, Enterotype \%in\% c("Enterotype 3", "Enterotype 2") ) run_ancom(ps, group = "Enterotype") } } \references{ Mandal et al. "Analysis of composition of microbiomes: a novel method for studying microbial composition", Microbial Ecology in Health & Disease, (2015), 26. 
} \author{ Huang Lin, Yang Cao } ================================================ FILE: man/run_ancombc.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-ancombc.R \name{run_ancombc} \alias{run_ancombc} \title{Differential analysis of compositions of microbiomes with bias correction (ANCOM-BC).} \usage{ run_ancombc( ps, group, confounders = character(0), contrast = NULL, taxa_rank = "all", transform = c("identity", "log10", "log10p"), norm = "none", norm_para = list(), p_adjust = c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY"), prv_cut = 0.1, lib_cut = 0, struc_zero = FALSE, neg_lb = FALSE, tol = 1e-05, max_iter = 100, conserve = FALSE, pvalue_cutoff = 0.05 ) } \arguments{ \item{ps}{a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object, which consists of a feature table, a sample metadata and a taxonomy table.} \item{group}{the name of the group variable in metadata. Specifying \code{group} is required for detecting structural zeros and performing global test.} \item{confounders}{character vector, the confounding variables to be adjusted. default \code{character(0)}, indicating no confounding variable.} \item{contrast}{this parameter only used for two groups comparison while there are multiple groups. For more please see the following details.} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. Should be one of \code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(phyloseq)}, e.g., OTU or ASV).} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. 
The options include: \itemize{ \item "identity", return the original data without any transformation (default). \item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. \item "CPM": pre-sample normalization of the sum of the values to 1e+06. }} \item{norm_para}{named \code{list}. other arguments passed to specific normalization methods. Most users will not need to pass any additional arguments here.} \item{p_adjust}{method to adjust p-values by. Default is "none". Options include "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none". See \code{\link[stats:p.adjust]{stats::p.adjust()}} for more details.} \item{prv_cut}{a numerical fraction between 0 and 1. 
Taxa with prevalences less than \code{prv_cut} will be excluded in the analysis. Default is 0.10.} \item{lib_cut}{a numerical threshold for filtering samples based on library sizes. Samples with library sizes less than \code{lib_cut} will be excluded in the analysis. Default is 0, i.e. do not filter any sample.} \item{struc_zero}{whether to detect structural zeros. Default is FALSE.} \item{neg_lb}{whether to classify a taxon as a structural zero in the corresponding study group using its asymptotic lower bound. Default is FALSE.} \item{tol}{the iteration convergence tolerance for the E-M algorithm. Default is 1e-05.} \item{max_iter}{the maximum number of iterations for the E-M algorithm. Default is 100.} \item{conserve}{whether to use a conservative variance estimate of the test statistic. It is recommended if the sample size is small and/or the number of differentially abundant taxa is believed to be large. Default is FALSE.} \item{pvalue_cutoff}{level of significance. Default is 0.05.} } \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ Differential abundance analysis for microbial absolute abundance data. This function is a wrapper of \code{\link[ANCOMBC:ancombc]{ANCOMBC::ancombc()}}. } \details{ \code{contrast} must be a character vector of length two or \code{NULL} (default). It is only required to set manually for two groups comparison when there are multiple groups. The order determines the direction of comparison, the first element is used to specify the reference group (control). This means that, the first element is the denominator for the fold change, and the second element is the numerator for the fold change. Otherwise, users are not required to set this parameter (leave it as the default \code{NULL}), and if there are two groups, the first level of the groups will be set as the reference group; if there are multiple groups, it will perform an ANOVA-like testing to find markers which differ in any of the groups. 
} \examples{ data(enterotypes_arumugam) ps <- phyloseq::subset_samples( enterotypes_arumugam, Enterotype \%in\% c("Enterotype 3", "Enterotype 2") ) if (requireNamespace("microbiome", quietly = TRUE)) { run_ancombc(ps, group = "Enterotype") } else { message("The 'microbiome' package is not installed, please install it to use this example") } } \references{ Lin, Huang, and Shyamal Das Peddada. "Analysis of compositions of microbiomes with bias correction." Nature communications 11.1 (2020): 1-11. } \seealso{ \code{\link[ANCOMBC:ancombc]{ANCOMBC::ancombc}} } ================================================ FILE: man/run_deseq2.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-deseq2.R \name{run_deseq2} \alias{run_deseq2} \title{Perform DESeq differential analysis} \usage{ run_deseq2( ps, group, confounders = character(0), contrast = NULL, taxa_rank = "all", norm = "RLE", norm_para = list(), transform = c("identity", "log10", "log10p"), fitType = c("parametric", "local", "mean", "glmGamPoi"), sfType = "poscounts", betaPrior = FALSE, modelMatrixType, useT = FALSE, minmu = ifelse(fitType == "glmGamPoi", 1e-06, 0.5), p_adjust = c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY"), pvalue_cutoff = 0.05, ... ) } \arguments{ \item{ps}{a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object.} \item{group}{character, the variable to set the group, must be one of the var of the sample metadata.} \item{confounders}{character vector, the confounding variables to be adjusted. default \code{character(0)}, indicating no confounding variable.} \item{contrast}{this parameter only used for two groups comparison while there are multiple groups. For more please see the following details.} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. 
Should be one of \code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(phyloseq)}, e.g., OTU or ASV).} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. }} \item{norm_para}{arguments passed to specific normalization methods. Most users will not need to pass any additional arguments here.} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The options include: \itemize{ \item "identity", return the original data without any transformation (default). \item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{fitType, sfType, betaPrior, modelMatrixType, useT, minmu}{these six parameters are inherited from \code{\link[DESeq2:DESeq]{DESeq2::DESeq()}}. 
\itemize{ \item \code{fitType}, either "parametric", "local", "mean", or "glmGamPoi" for the type of fitting of dispersions to the mean intensity. \item \code{sfType}, either "ratio", "poscounts", or "iterate" for the type of size factor estimation. We recommend to use "poscounts". \item \code{betaPrior}, whether or not to put a zero-mean normal prior on the non-intercept coefficients. \item \code{modelMatrixType}, either "standard" or "expanded", which describe how the model matrix, \item \code{useT}, logical, where Wald statistics are assumed to follow a standard Normal. \item \code{minmu}, lower bound on the estimated count for fitting gene-wise dispersion. } For more details, see \code{\link[DESeq2:DESeq]{DESeq2::DESeq()}}. Most users will not need to set this arguments (just use the defaults).} \item{p_adjust}{method for multiple test correction, default \code{none}, for more details see \link[stats:p.adjust]{stats::p.adjust}.} \item{pvalue_cutoff}{pvalue_cutoff numeric, p value cutoff, default 0.05.} \item{...}{extra parameters passed to \code{\link[DESeq2:DESeq]{DESeq2::DESeq()}}.} } \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ Differential expression analysis based on the Negative Binomial distribution using \strong{DESeq2}. } \details{ \strong{Note}: DESeq2 requires the input is raw counts (un-normalized counts), as only the counts values allow assessing the measurement precision correctly. For more details see the vignette of DESeq2 (\code{vignette("DESeq2")}). Thus, this function only supports "none", "rarefy", "RLE", "CSS", and "TMM" normalization methods. We strongly recommend using the "RLE" method (default normalization method in the DESeq2 package). The other normalization methods are used for expert users and comparisons among different normalization methods. For two groups comparison, this function utilizes the Wald test (defined by \code{\link[DESeq2:nbinomWaldTest]{DESeq2::nbinomWaldTest()}}) for hypothesis testing. 
A Wald test statistic is computed along with a probability (p-value) that a test statistic at least as extreme as the observed value were selected at random. \code{contrasts} are used to specify which two groups to compare. The order of the names determines the direction of fold change that is reported. Likelihood ratio test (LRT) is used to identify the genes that significantly changed across all the different levels for multiple groups comparisons. The LRT identified the significant features by comparing the full model to the reduced model. It is testing whether a feature removed in the reduced model explains a significant variation in the data. \code{contrast} must be a character vector of length two or \code{NULL} (default). It is only required to set manually for two groups comparison when there are multiple groups. The order determines the direction of comparison, the first element is used to specify the reference group (control). This means that, the first element is the denominator for the fold change, and the second element is the numerator for the fold change. Otherwise, users are not required to set this parameter (leave it as the default \code{NULL}), and if there are two groups, the first level of the groups will be set as the reference group; if there are multiple groups, it will perform an ANOVA-like testing to find markers which differ in any of the groups. } \examples{ data(enterotypes_arumugam) ps <- phyloseq::subset_samples( enterotypes_arumugam, Enterotype \%in\% c("Enterotype 3", "Enterotype 2")) \%>\% phyloseq::subset_taxa(Phylum \%in\% c("Firmicutes")) run_deseq2(ps, group = "Enterotype") } \references{ Love, Michael I., Wolfgang Huber, and Simon Anders. "Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2." Genome biology 15.12 (2014): 1-21. 
} \seealso{ \code{\link[DESeq2:results]{DESeq2::results()}},\code{\link[DESeq2:DESeq]{DESeq2::DESeq()}} } ================================================ FILE: man/run_edger.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-edgeR.R \name{run_edger} \alias{run_edger} \title{Perform differential analysis using edgeR} \usage{ run_edger( ps, group, confounders = character(0), contrast = NULL, taxa_rank = "all", method = c("LRT", "QLFT"), transform = c("identity", "log10", "log10p"), norm = "TMM", norm_para = list(), disp_para = list(), p_adjust = c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY"), pvalue_cutoff = 0.05, ... ) } \arguments{ \item{ps}{ps a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object.} \item{group}{character, the variable to set the group, must be one of the var of the sample metadata.} \item{confounders}{character vector, the confounding variables to be adjusted. default \code{character(0)}, indicating no confounding variable.} \item{contrast}{this parameter only used for two groups comparison while there are multiple groups. For more please see the following details.} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. Should be one of \code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(phyloseq)}, e.g., OTU or ASV).} \item{method}{character, used for differential analysis, please see details below for more info.} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The options include: \itemize{ \item "identity", return the original data without any transformation (default). 
\item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. \item "CPM": pre-sample normalization of the sum of the values to 1e+06. }} \item{norm_para}{arguments passed to specific normalization methods. Most users will not need to pass any additional arguments here.} \item{disp_para}{additional arguments passed to \code{\link[edgeR:estimateDisp]{edgeR::estimateDisp()}} used for dispersions estimation. Most users will not need to pass any additional arguments here.} \item{p_adjust}{method for multiple test correction, default \code{none}, for more details see \link[stats:p.adjust]{stats::p.adjust}.} \item{pvalue_cutoff}{numeric, p value cutoff, default 0.05} \item{...}{extra arguments passed to the model. 
See \code{\link[edgeR:glmQLFit]{edgeR::glmQLFit()}} and \code{\link[edgeR:glmfit]{edgeR::glmFit()}} for more details.} \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ Differential expression analysis based on the Negative Binomial distribution using \strong{edgeR}. } \details{ \strong{Note} that edgeR is designed to work with actual counts. This means that transformation is not required in any way before inputting them to edgeR. There are two test methods for differential analysis in \strong{edgeR}, likelihood ratio test (LRT) and quasi-likelihood F-test (QLFT). The QLFT method is recommended as it allows stricter error rate control by accounting for the uncertainty in dispersion estimation. \code{contrast} must be a character vector of length two or \code{NULL} (default). It is only required to set manually for two groups comparison when there are multiple groups. The order determines the direction of comparison, the first element is used to specify the reference group (control). This means that, the first element is the denominator for the fold change, and the second element is the numerator for the fold change. Otherwise, users are not required to set this parameter (leave it as the default \code{NULL}), and if there are two groups, the first level of the groups will be set as the reference group; if there are multiple groups, it will perform an ANOVA-like testing to find markers which differ in any of the groups. } \examples{ data(enterotypes_arumugam) ps <- phyloseq::subset_samples( enterotypes_arumugam, Enterotype \%in\% c("Enterotype 3", "Enterotype 2") ) run_edger(ps, group = "Enterotype") } \references{ Robinson, Mark D., and Alicia Oshlack. "A scaling normalization method for differential expression analysis of RNA-seq data." Genome biology 11.3 (2010): 1-9. Robinson, Mark D., Davis J. McCarthy, and Gordon K. Smyth. "edgeR: a Bioconductor package for differential expression analysis of digital gene expression data." 
Bioinformatics 26.1 (2010): 139-140. } \seealso{ \code{\link[edgeR:glmfit]{edgeR::glmFit()}},\code{\link[edgeR:glmQLFit]{edgeR::glmQLFit()}},\code{\link[edgeR:estimateDisp]{edgeR::estimateDisp()}} ,\code{\link[=normalize]{normalize()}} } \author{ Yang Cao } ================================================ FILE: man/run_lefse.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-lefse.R \name{run_lefse} \alias{run_lefse} \title{Liner discriminant analysis (LDA) effect size (LEFSe) analysis} \usage{ run_lefse( ps, group, subgroup = NULL, taxa_rank = "all", transform = c("identity", "log10", "log10p"), norm = "CPM", norm_para = list(), kw_cutoff = 0.05, lda_cutoff = 2, bootstrap_n = 30, bootstrap_fraction = 2/3, wilcoxon_cutoff = 0.05, multigrp_strat = FALSE, strict = c("0", "1", "2"), sample_min = 10, only_same_subgrp = FALSE, curv = FALSE ) } \arguments{ \item{ps}{a \code{\link[phyloseq]{phyloseq-class}} object} \item{group}{character, the column name to set the group} \item{subgroup}{character, the column name to set the subgroup} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. Should be one of \code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(phyloseq)}, e.g., OTU or ASV).} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The options include: \itemize{ \item "identity", return the original data without any transformation (default). \item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. 
}} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. \item "CPM": pre-sample normalization of the sum of the values to 1e+06. }} \item{norm_para}{named \code{list}. other arguments passed to specific normalization methods. 
Most users will not need to pass any additional arguments here.} \item{kw_cutoff}{numeric, p value cutoff of kw test, default 0.05} \item{lda_cutoff}{numeric, lda score cutoff, default 2} \item{bootstrap_n}{integer, the number of bootstrap iteration for LDA, default 30} \item{bootstrap_fraction}{numeric, the subsampling fraction value for each bootstrap iteration, default \code{2/3}} \item{wilcoxon_cutoff}{numeric, p value cutoff of wilcoxon test, default 0.05} \item{multigrp_strat}{logical, for multiple group tasks, whether the test is performed in a one-against one (more strict) or in a one-against all setting, default \code{FALSE}.} \item{strict}{multiple testing options, 0 for no correction (default), 1 for independent comparisons, 2 for dependent comparisons.} \item{sample_min}{integer, minimum number of samples per subclass for performing wilcoxon test, default 10} \item{only_same_subgrp}{logical, whether to perform the wilcoxon test only among the subgroups with the same name, default \code{FALSE}} \item{curv}{logical, whether to perform the wilcoxon test using the Curtis's approach, default \code{FALSE}} } \value{ a \linkS4class{microbiomeMarker} object, in which the \code{slot} of \code{marker_table} contains four variables: \itemize{ \item \code{feature}, significantly different features. \item \code{enrich_group}, the class of the differential features enriched. \item \code{lda}, logarithmic LDA score (effect size) \item \code{pvalue}, p value of kw test. } } \description{ Perform Metagenomic LEFSe analysis based on phyloseq object. } \examples{ data(kostic_crc) kostic_crc_small <- phyloseq::subset_taxa( kostic_crc, Phylum == "Firmicutes" ) mm_lefse <- run_lefse( kostic_crc_small, wilcoxon_cutoff = 0.01, group = "DIAGNOSIS", kw_cutoff = 0.01, multigrp_strat = TRUE, lda_cutoff = 4 ) } \references{ Segata, Nicola, et al. Metagenomic biomarker discovery and explanation. Genome biology 12.6 (2011): R60. 
} \seealso{ \link{normalize} } \author{ Yang Cao } ================================================ FILE: man/run_limma_voom.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-limma-voom.R \name{run_limma_voom} \alias{run_limma_voom} \title{Differential analysis using limma-voom} \usage{ run_limma_voom( ps, group, confounders = character(0), contrast = NULL, taxa_rank = "all", transform = c("identity", "log10", "log10p"), norm = "none", norm_para = list(), voom_span = 0.5, p_adjust = c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY"), pvalue_cutoff = 0.05, ... ) } \arguments{ \item{ps}{ps a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object.} \item{group}{character, the variable to set the group, must be one of the var of the sample metadata.} \item{confounders}{character vector, the confounding variables to be adjusted. default \code{character(0)}, indicating no confounding variable.} \item{contrast}{this parameter only used for two groups comparison while there are multiple groups. For more please see the following details.} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. Should be one of \code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(phyloseq)}, e.g., OTU or ASV).} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The options include: \itemize{ \item "identity", return the original data without any transformation (default). \item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. 
\item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. }} \item{norm_para}{arguments passed to specific normalization methods. Most users will not need to pass any additional arguments here.} \item{voom_span}{width of the smoothing window used for the lowess mean-variance trend for \code{\link[limma:voom]{limma::voom()}}. Expressed as a proportion between 0 and 1.} \item{p_adjust}{method for multiple test correction, default \code{none}, for more details see \link[stats:p.adjust]{stats::p.adjust}.} \item{pvalue_cutoff}{cutoff of p value, default 0.05.} \item{...}{extra arguments passed to \code{\link[limma:ebayes]{limma::eBayes()}}.} } \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ Differential analysis using limma-voom } \details{ \code{contrast} must be a two length character or \code{NULL} (default). It is only required to set manually for two groups comparison when there are multiple groups. The order determines the direction of comparison, the first element is used to specify the reference group (control). 
This means that, the first element is the denominator for the fold change, and the second element is the numerator for the fold change. Otherwise, users are not required to set this parameter (leave it as the default \code{NULL}), and if there are two groups, the first level of the groups will be set as the reference group; if there are multiple groups, it will perform an ANOVA-like testing to find markers which differ in any of the groups. } \examples{ data(enterotypes_arumugam) mm <- run_limma_voom( enterotypes_arumugam, "Enterotype", contrast = c("Enterotype 3", "Enterotype 2"), pvalue_cutoff = 0.01, p_adjust = "none" ) mm } \references{ Law, C. W., Chen, Y., Shi, W., & Smyth, G. K. (2014). voom: Precision weights unlock linear model analysis tools for RNA-seq read counts. Genome biology, 15(2), 1-17. } ================================================ FILE: man/run_marker.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-all.R \name{run_marker} \alias{run_marker} \title{Find markers (differentially expressed metagenomic features)} \usage{ run_marker( ps, group, da_method = c("lefse", "simple_t", "simple_welch", "simple_white", "simple_kruskal", "simple_anova", "edger", "deseq2", "metagenomeseq", "ancom", "ancombc", "aldex", "limma_voom", "sl_lr", "sl_rf", "sl_svm"), taxa_rank = "all", transform = c("identity", "log10", "log10p"), norm = "none", norm_para = list(), p_adjust = c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY"), pvalue_cutoff = 0.05, ... ) } \arguments{ \item{ps}{a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object} \item{group}{character, the variable to set the group} \item{da_method}{character to specify the differential analysis method. The options include: \itemize{ \item "lefse", linear discriminant analysis (LDA) effect size (LEfSe) method, for more details see \code{\link[=run_lefse]{run_lefse()}}. 
\item "simple_t", "simple_welch", "simple_white", "simple_kruskal", and "simple_anova", simple statistic methods; "simple_t", "simple_welch" and "simple_white" for two groups comparison; "simple_kruskal", and "simple_anova" for multiple groups comparison. For more details see \code{\link[=run_simple_stat]{run_simple_stat()}}. \item "edger", see \code{\link[=run_edger]{run_edger()}}. \item "deseq2", see \code{\link[=run_deseq2]{run_deseq2()}}. \item "metagenomeseq", differential expression analysis based on the Zero-inflated Log-Normal mixture model or Zero-inflated Gaussian mixture model using metagenomeSeq, see \code{\link[=run_metagenomeseq]{run_metagenomeseq()}}. \item "ancom", see \code{\link[=run_ancom]{run_ancom()}}. \item "ancombc", differential analysis of compositions of microbiomes with bias correction, see \code{\link[=run_ancombc]{run_ancombc()}}. \item "aldex", see \code{\link[=run_aldex]{run_aldex()}}. \item "limma_voom", see \code{\link[=run_limma_voom]{run_limma_voom()}}. \item "sl_lr", "sl_rf", and "sl_svm", three supervised learning (SL) methods: logistic regression (lr), random forest (rf), or support vector machine (svm). For more details see \code{\link[=run_sl]{run_sl()}}. }} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. Should be one of \code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(phyloseq)}, e.g., OTU or ASV).} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The options include: \itemize{ \item "identity", return the original data without any transformation (default). 
\item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. \item "CPM": pre-sample normalization of the sum of the values to 1e+06. }} \item{norm_para}{arguments passed to specific normalization methods} \item{p_adjust}{method for multiple test correction, default \code{none}, for more details see \link[stats:p.adjust]{stats::p.adjust}.} \item{pvalue_cutoff}{numeric, p value cutoff, default 0.05.} \item{...}{extra arguments passed to the corresponding differential analysis functions, e.g. \code{\link[=run_lefse]{run_lefse()}}.} } \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ \code{run_marker} is a wrapper of all differential analysis functions. 
} \details{ This function is only a wrapper of all differential analysis functions, We recommend to use the corresponding function, since it has a better default arguments setting. } \seealso{ \code{\link[=run_lefse]{run_lefse()}},\code{\link[=run_simple_stat]{run_simple_stat()}},\code{\link[=run_test_two_groups]{run_test_two_groups()}}, \code{\link[=run_test_multiple_groups]{run_test_multiple_groups()}},\code{\link[=run_edger]{run_edger()}},\code{\link[=run_deseq2]{run_deseq2()}}, \code{\link{run_metagenomeseq}},\code{\link[=run_ancom]{run_ancom()}},\code{\link[=run_ancombc]{run_ancombc()}},\code{\link[=run_aldex]{run_aldex()}}, \code{\link[=run_limma_voom]{run_limma_voom()}},\code{\link[=run_sl]{run_sl()}} } ================================================ FILE: man/run_metagenomeseq.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-metagenomeSeq.R \name{run_metagenomeseq} \alias{run_metagenomeseq} \title{metagenomeSeq differential analysis} \usage{ run_metagenomeseq( ps, group, confounders = character(0), contrast = NULL, taxa_rank = "all", transform = c("identity", "log10", "log10p"), norm = "CSS", norm_para = list(), method = c("ZILN", "ZIG"), p_adjust = c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY"), pvalue_cutoff = 0.05, ... ) } \arguments{ \item{ps}{ps a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object.} \item{group}{character, the variable to set the group, must be one of the var of the sample metadata.} \item{confounders}{character vector, the confounding variables to be adjusted. default \code{character(0)}, indicating no confounding variable.} \item{contrast}{this parameter only used for two groups comparison while there are multiple groups. For more please see the following details.} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. 
Should be one of \code{phyloseq::rank_names(ps)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(ps)}, e.g., OTU or ASV).} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The options include: \itemize{ \item "identity", return the original data without any transformation (default). \item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. \item "CPM": pre-sample normalization of the sum of the values to 1e+06. 
}} \item{norm_para}{arguments passed to specific normalization methods.} \item{method}{character, which model used for differential analysis, "ZILN" (Zero-inflated Log-Normal mixture model) or "ZIG" (Zero-inflated Gaussian mixture model). And the zero-inflated log-normal model is preferred due to the high sensitivity and low FDR.} \item{p_adjust}{method for multiple test correction, default \code{none}, for more details see \link[stats:p.adjust]{stats::p.adjust}.} \item{pvalue_cutoff}{numeric, p value cutoff, default 0.05} \item{...}{extra arguments passed to the model. For more details see \code{\link[metagenomeSeq:fitFeatureModel]{metagenomeSeq::fitFeatureModel()}} and \code{\link[metagenomeSeq:fitZig]{metagenomeSeq::fitZig()}}, e.g. \code{control} (can be set using \code{\link[metagenomeSeq:zigControl]{metagenomeSeq::zigControl()}}) for \code{\link[metagenomeSeq:fitZig]{metagenomeSeq::fitZig()}}.} } \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ Differential expression analysis based on the Zero-inflated Log-Normal mixture model or Zero-inflated Gaussian mixture model using metagenomeSeq. } \details{ metagenomeSeq provides two differential analysis methods, zero-inflated log-normal mixture model (implemented in \code{\link[metagenomeSeq:fitFeatureModel]{metagenomeSeq::fitFeatureModel()}}) and zero-inflated Gaussian mixture model (implemented in \code{\link[metagenomeSeq:fitZig]{metagenomeSeq::fitZig()}}). We recommend fitFeatureModel over fitZig due to high sensitivity and low FDR. Both \code{\link[metagenomeSeq:fitFeatureModel]{metagenomeSeq::fitFeatureModel()}} and \code{\link[metagenomeSeq:fitZig]{metagenomeSeq::fitZig()}} require the abundance profiles before normalization. For \code{\link[metagenomeSeq:fitZig]{metagenomeSeq::fitZig()}}, the output column is the coefficient of interest, and logFC column in the output of \code{\link[metagenomeSeq:fitFeatureModel]{metagenomeSeq::fitFeatureModel()}} is analogous to coefficient. 
Thus, logFC is really just the estimate of the coefficient of interest in \code{\link[metagenomeSeq:fitFeatureModel]{metagenomeSeq::fitFeatureModel()}}. For more details see this question \href{https://support.bioconductor.org/p/94138/}{Difference between fitFeatureModel and fitZIG in metagenomeSeq}. \code{contrast} must be a two length character or \code{NULL} (default). It is only required to set manually for two groups comparison when there are multiple groups. The order determines the direction of comparison, the first element is used to specify the reference group (control). This means that, the first element is the denominator for the fold change, and the second element is used as baseline (numerator for fold change). Otherwise, users are not required to be concerned with this parameter (set as default \code{NULL}), and if there are two groups, the first level of groups will be set as the reference group; if there are multiple groups, it will perform an ANOVA-like testing to find markers which differ in any of the groups. Of note, \code{\link[metagenomeSeq:fitFeatureModel]{metagenomeSeq::fitFeatureModel()}} does not allow for multiple groups comparison. } \examples{ data(enterotypes_arumugam) ps <- phyloseq::subset_samples( enterotypes_arumugam, Enterotype \%in\% c("Enterotype 3", "Enterotype 2") ) run_metagenomeseq(ps, group = "Enterotype") } \references{ Paulson, Joseph N., et al. "Differential abundance analysis for microbial marker-gene surveys." Nature methods 10.12 (2013): 1200-1202. 
} \author{ Yang Cao } ================================================ FILE: man/run_posthoc_test.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/post-hoc-test.R \name{run_posthoc_test} \alias{run_posthoc_test} \title{Post hoc pairwise comparisons for multiple groups test.} \usage{ run_posthoc_test( ps, group, transform = c("identity", "log10", "log10p"), norm = "TSS", norm_para = list(), conf_level = 0.95, method = c("tukey", "games_howell", "scheffe", "welch_uncorrected") ) } \arguments{ \item{ps}{a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object} \item{group}{character, the variable to set the group} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The options include: \itemize{ \item "identity", return the original data without any transformation (default). \item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item a integer, e.g. 1e6 (default), indicating pre-sample normalization of the sum of the values to 1e6. \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. 
\item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. }} \item{norm_para}{arguments passed to specific normalization methods} \item{conf_level}{confidence level, default 0.95} \item{method}{one of "tukey", "games_howell", "scheffe", "welch_uncorrected", defining the method for the pairwise comparisons. See details for more information.} } \value{ a \linkS4class{postHocTest} object } \description{ Multiple group test, such as anova and Kruskal-Wallis rank sum test, can be used to uncover the significant feature among all groups. Post hoc tests are used to uncover specific mean differences between pair of groups. 
} \examples{ data(enterotypes_arumugam) ps <- phyloseq::subset_samples( enterotypes_arumugam, Enterotype \%in\% c("Enterotype 3", "Enterotype 2", "Enterotype 1") ) \%>\% phyloseq::subset_taxa(Phylum == "Bacteroidetes") pht <- run_posthoc_test(ps, group = "Enterotype") pht } \seealso{ \linkS4class{postHocTest}, \code{\link[=run_test_multiple_groups]{run_test_multiple_groups()}} } ================================================ FILE: man/run_simple_stat.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-simple-statistic.R \name{run_simple_stat} \alias{run_simple_stat} \title{Simple statistical analysis of metagenomic profiles} \usage{ run_simple_stat( ps, group, taxa_rank = "all", transform = c("identity", "log10", "log10p"), norm = "TSS", norm_para = list(), method = c("welch.test", "t.test", "white.test", "anova", "kruskal"), p_adjust = c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY"), pvalue_cutoff = 0.05, diff_mean_cutoff = NULL, ratio_cutoff = NULL, eta_squared_cutoff = NULL, conf_level = 0.95, nperm = 1000, ... ) } \arguments{ \item{ps}{a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object} \item{group}{character, the variable to set the group} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. Should be one of \code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(phyloseq)}, e.g., OTU or ASV).} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The options include: \itemize{ \item "identity", return the original data without any transformation (default). 
\item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. \item "CPM": pre-sample normalization of the sum of the values to 1e+06. 
}} \item{norm_para}{arguments passed to specific normalization methods} \item{method}{test method, options include: "welch.test", "t.test" and "white.test" for two groups comparison, "anova" and "kruskal" for multiple groups comparison.} \item{p_adjust}{method for multiple test correction, default \code{none}, for more details see \link[stats:p.adjust]{stats::p.adjust}.} \item{pvalue_cutoff}{numeric, p value cutoff, default 0.05} \item{diff_mean_cutoff, ratio_cutoff}{only used for two groups comparison, cutoff of different means and ratios, default \code{NULL} which means no effect size filter.} \item{eta_squared_cutoff}{only used for multiple groups comparison, numeric, cutoff of effect size (eta squared) default \code{NULL} which means no effect size filter.} \item{conf_level}{only used for two groups comparison, numeric, confidence level of interval.} \item{nperm}{integer, only used for two groups comparison, number of permutations for white non parametric t test estimation} \item{...}{only used for two groups comparison, extra arguments passed to \code{\link[=t.test]{t.test()}} or \code{\link[=fisher.test]{fisher.test()}}.} } \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ Perform simple statistical analysis of metagenomic profiles. This function is a wrapper of \code{run_test_two_groups} and \code{run_test_multiple_groups}. 
} \examples{ data(enterotypes_arumugam) ps <- phyloseq::subset_samples( enterotypes_arumugam, Enterotype \%in\% c("Enterotype 3", "Enterotype 2") ) run_simple_stat(ps, group = "Enterotype") } \seealso{ \code{\link[=run_test_two_groups]{run_test_two_groups()}},\code{\link[=run_test_multiple_groups]{run_test_multiple_groups()}} } ================================================ FILE: man/run_sl.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-sl.R \name{run_sl} \alias{run_sl} \title{Identify biomarkers using supervised learning (SL) methods} \usage{ run_sl( ps, group, taxa_rank = "all", transform = c("identity", "log10", "log10p"), norm = "none", norm_para = list(), nfolds = 3, nrepeats = 3, sampling = NULL, tune_length = 5, top_n = 10, method = c("LR", "RF", "SVM"), ... ) } \arguments{ \item{ps}{a \code{\link[phyloseq]{phyloseq-class}} object.} \item{group}{character, the variable to set the group.} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. Should be one of \code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(phyloseq)}, e.g., OTU or ASV).} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The options include: \itemize{ \item "identity", return the original data without any transformation (default). \item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. 
Options include: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. \item "CPM": pre-sample normalization of the sum of the values to 1e+06. }} \item{norm_para}{named \code{list}. other arguments passed to specific normalization methods. Most users will not need to pass any additional arguments here.} \item{nfolds}{the number of splits in CV.} \item{nrepeats}{the number of complete sets of folds to compute.} \item{sampling}{a single character value describing the type of additional sampling that is conducted after resampling (usually to resolve class imbalances). Values are "none", "down", "up", "smote", or "rose". For more details see \code{\link[caret:trainControl]{caret::trainControl()}}.} \item{tune_length}{an integer denoting the amount of granularity in the tuning parameter grid. 
For more details see \code{\link[caret:train]{caret::train()}}.} \item{top_n}{an integer denoting the top \code{n} features as the biomarker according to the importance score.} \item{method}{supervised learning method, options are "LR" (logistic regression), "RF" (random forest), or "SVM" (support vector machine).} \item{...}{extra arguments passed to the classification. e.g., \code{importance} for \code{randomForest::randomForest}.} } \value{ a \linkS4class{microbiomeMarker} object. } \description{ Identify biomarkers using logistic regression, random forest, or support vector machine. } \details{ Only support two groups comparison in the current version. And the marker was selected based on its importance score. Moreover, the hyper-parameters are selected automatically by a grid-search based method in the N-time K-fold cross-validation. Thus, the identified biomarkers can be biased due to model overfitting for small datasets (e.g., with less than 100 samples). The argument \code{top_n} is used to denote the number of markers based on the importance score. There is no rule or principle on how to select \code{top_n}, however, usually it is very useful to try a different \code{top_n} and compare the performance of the marker predictions for the testing data. 
} \examples{ data(enterotypes_arumugam) # small example phyloseq object for test ps_small <- phyloseq::subset_taxa( enterotypes_arumugam, Phylum \%in\% c("Firmicutes", "Bacteroidetes") ) set.seed(2021) mm <- run_sl( ps_small, group = "Gender", taxa_rank = "Genus", nfolds = 2, nrepeats = 1, top_n = 15, norm = "TSS", method = "LR" ) mm } \seealso{ \code{\link[caret:train]{caret::train()}},\code{\link[caret:trainControl]{caret::trainControl()}} } \author{ Yang Cao } ================================================ FILE: man/run_test_multiple_groups.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-test-multiple-groups.R \name{run_test_multiple_groups} \alias{run_test_multiple_groups} \title{Statistical test for multiple groups} \usage{ run_test_multiple_groups( ps, group, taxa_rank = "all", transform = c("identity", "log10", "log10p"), norm = "TSS", norm_para = list(), method = c("anova", "kruskal"), p_adjust = c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY"), pvalue_cutoff = 0.05, effect_size_cutoff = NULL ) } \arguments{ \item{ps}{a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object} \item{group}{character, the variable to set the group} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. Should be one of \code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(phyloseq)}, e.g., OTU or ASV).} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The options include: \itemize{ \item "identity", return the original data without any transformation (default). 
\item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. \item "CPM": pre-sample normalization of the sum of the values to 1e+06. }} \item{norm_para}{arguments passed to specific normalization methods} \item{method}{test method, must be one of "anova" or "kruskal"} \item{p_adjust}{method for multiple test correction, default \code{none}, for more details see \link[stats:p.adjust]{stats::p.adjust}.} \item{pvalue_cutoff}{numeric, p value cutoff, default 0.05.} \item{effect_size_cutoff}{numeric, cutoff of effect size default \code{NULL} which means no effect size filter. The eta squared is used to measure the effect size for anova/kruskal test.} } \value{ a \code{\linkS4class{microbiomeMarker}} object. 
} \description{ Statistical test for multiple groups } \examples{ data(enterotypes_arumugam) ps <- phyloseq::subset_samples( enterotypes_arumugam, Enterotype \%in\% c("Enterotype 3", "Enterotype 2", "Enterotype 1") ) mm_anova <- run_test_multiple_groups( ps, group = "Enterotype", method = "anova" ) } \seealso{ \code{\link[=run_posthoc_test]{run_posthoc_test()}},\code{\link[=run_test_two_groups]{run_test_two_groups()}},\code{\link[=run_simple_stat]{run_simple_stat()}} } ================================================ FILE: man/run_test_two_groups.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-test-two-groups.R \name{run_test_two_groups} \alias{run_test_two_groups} \title{Statistical test between two groups} \usage{ run_test_two_groups( ps, group, taxa_rank = "all", transform = c("identity", "log10", "log10p"), norm = "TSS", norm_para = list(), method = c("welch.test", "t.test", "white.test"), p_adjust = c("none", "fdr", "bonferroni", "holm", "hochberg", "hommel", "BH", "BY"), pvalue_cutoff = 0.05, diff_mean_cutoff = NULL, ratio_cutoff = NULL, conf_level = 0.95, nperm = 1000, ... ) } \arguments{ \item{ps}{a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object} \item{group}{character, the variable to set the group} \item{taxa_rank}{character to specify taxonomic rank to perform differential analysis on. Should be one of \code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or "none" means perform differential analysis on the original taxa (\code{taxa_names(phyloseq)}, e.g., OTU or ASV).} \item{transform}{character, the methods used to transform the microbial abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The options include: \itemize{ \item "identity", return the original data without any transformation (default). 
\item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}. \item "log10p", the transformation is \code{log10(1 + object)}. }} \item{norm}{the methods used to normalize the microbial abundance data. See \code{\link[=normalize]{normalize()}} for more details. Options include: \itemize{ \item "none": do not normalize. \item "rarefy": random subsampling counts to the smallest library size in the data set. \item "TSS": total sum scaling, also referred to as "relative abundance", the abundances were normalized by dividing the corresponding sample library size. \item "TMM": trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. \item "RLE", relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. \item "CSS": cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. \item "CLR": centered log-ratio normalization. \item "CPM": pre-sample normalization of the sum of the values to 1e+06. 
}} \item{norm_para}{arguments passed to specific normalization methods} \item{method}{test method, must be one of "welch.test", "t.test" or "white.test"} \item{p_adjust}{method for multiple test correction, default \code{none}, for more details see \link[stats:p.adjust]{stats::p.adjust}.} \item{pvalue_cutoff}{numeric, p value cutoff, default 0.05} \item{diff_mean_cutoff, ratio_cutoff}{cutoff of different means and ratios, default \code{NULL} which means no effect size filter.} \item{conf_level}{numeric, confidence level of interval.} \item{nperm}{integer, number of permutations for white non parametric t test estimation} \item{...}{extra arguments passed to \code{\link[=t.test]{t.test()}} or \code{\link[=fisher.test]{fisher.test()}}} } \value{ a \code{\linkS4class{microbiomeMarker}} object. } \description{ Statistical test between two groups } \examples{ data(enterotypes_arumugam) mm_welch <- run_test_two_groups( enterotypes_arumugam, group = "Gender", method = "welch.test" ) mm_welch } \seealso{ \code{\link[=run_test_multiple_groups]{run_test_multiple_groups()}},\code{\link{run_simple_stat}} } \author{ Yang Cao } ================================================ FILE: man/subset_marker.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/subset-marker.R \name{subset_marker} \alias{subset_marker} \title{Subset microbiome markers} \usage{ subset_marker(mm, ...) } \arguments{ \item{mm}{a \code{\linkS4class{microbiomeMarker}} or \code{\linkS4class{marker_table}} object.} \item{...}{the subsetting expression passed to \code{\link[base:subset]{base::subset()}}.} } \value{ a subset object in the same class with \code{mm}. } \description{ Subset markers based on an expression related to the columns and values within the \code{marker_table} slot of \code{mm}. 
} \examples{ data(enterotypes_arumugam) mm <- run_limma_voom( enterotypes_arumugam, "Enterotype", contrast = c("Enterotype 3", "Enterotype 2"), pvalue_cutoff = 0.01, p_adjust = "none" ) subset_marker(mm, pvalue < 0.005) } ================================================ FILE: man/summarize_taxa.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/summarize-taxa.R \name{summarize_taxa} \alias{summarize_taxa} \title{Summarize taxa into a taxonomic level within each sample} \usage{ summarize_taxa(ps, level = rank_names(ps)[1], absolute = TRUE, sep = "|") } \arguments{ \item{ps}{a \code{\link[phyloseq]{phyloseq-class}} object.} \item{level}{taxonomic level to summarize, default the top level rank of the \code{ps}.} \item{absolute}{logical, whether to return the absolute abundance or relative abundance, default \code{TRUE}.} \item{sep}{a character string to separate the taxonomic levels.} } \value{ a \code{\link[phyloseq:phyloseq-class]{phyloseq::phyloseq}} object, where each row represents a taxon, and each col represents the taxa abundance of each sample. } \description{ Provides summary information of the representation of a taxonomic level within each sample. } \examples{ data(enterotypes_arumugam) summarize_taxa(enterotypes_arumugam) } ================================================ FILE: man/summary.compareDA.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/DA-comparing.R \name{summary.compareDA} \alias{summary.compareDA} \title{Summary differential analysis methods comparison results} \usage{ \method{summary}{compareDA}( object, sort = c("score", "auc", "fpr", "power"), boot = TRUE, boot_n = 1000L, prob = c(0.05, 0.95), ... ) } \arguments{ \item{object}{a \code{compareDA} object, output from \code{\link[=compare_DA]{compare_DA()}}.} \item{sort}{character string specifying sort method. 
Possibilities are "score" which is calculated as \eqn{(auc - 0.5) * power - fdr}, "auc" for area under the ROC curve, "fpr" for false positive rate, "power" for empirical power.} \item{boot}{logical, whether use bootstrap for confidence limits of the score, default \code{TRUE}. Recommended to be \code{TRUE} unless \code{n_rep} is larger than 100 in \code{\link[=compare_DA]{compare_DA()}}.} \item{boot_n}{integer, number of bootstraps, default 1000L.} \item{prob}{two length numeric vector, confidence limits for score, default \code{c(0.05, 0.95)}.} \item{...}{extra arguments affecting the summary produced.} } \value{ a \code{data.frame} containing measurements for differential analysis methods: \itemize{ \item \code{call}: differential analysis commands. \item \code{auc}: area under curve of ROC. \item \code{fpr}: false positive rate. \item \code{power}: empirical power. \item \code{fdr}: false discovery rate. \item \code{score}: score which is calculated as \eqn{(auc - 0.5) * power - fdr}. \item \verb{score_*}: confidence limits of score. } } \description{ Summary differential analysis methods comparison results } ================================================ FILE: man/transform_abundances.Rd ================================================ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/transform.R \name{transform_abundances} \alias{transform_abundances} \title{Transform the taxa abundances in \code{otu_table} sample by sample} \usage{ transform_abundances(object, transform = c("identity", "log10", "log10p")) } \arguments{ \item{object}{\code{\linkS4class{otu_table}}, \code{\linkS4class{phyloseq}}, or \code{\linkS4class{microbiomeMarker}}.} \item{transform}{transformation to apply, the options include: \itemize{ \item "identity", return the original data without any transformation. \item "log10", the transformation is \code{log10(object)}, and if the data contains zeros the transformation is \code{log10(1 + object)}.
\item "log10p", the transformation is \code{log10(1 + object)}. }} } \value{ An object matching the class of argument \code{object} with the transformed \code{otu_table}. } \description{ Transform the taxa abundances in \code{otu_table} sample by sample, which means the counts of each sample will be transformed individually. } \examples{ data(oxygen) x1 <- transform_abundances(oxygen) head(otu_table(x1), 10) x2 <- transform_abundances(oxygen, "log10") head(otu_table(x2), 10) x3 <- transform_abundances(oxygen, "log10p") head(otu_table(x3), 10) } \seealso{ \code{\link[=abundances]{abundances()}} } ================================================ FILE: tests/testthat/_snaps/ancom.md ================================================ # ancom result Code print(head(curr_marker), digits = 5) Output feature enrich_group ef_CLR_diff_mean W marker1 __ Cesarean 0.076290 1 marker2 c__Actinobacteria Cesarean 0.310201 10 marker3 c__Coriobacteriia Vaginal 0.110342 4 marker4 c__Bacteroidia Vaginal 0.120525 6 marker5 c__Fusobacteriia Cesarean 0.083236 1 marker6 c__Betaproteobacteria Cesarean 0.188186 3 ================================================ FILE: tests/testthat/_snaps/edgeR.md ================================================ # result of edger Code print(marker_table(mm_edger), digits = 5) Output feature marker1 k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria marker2 k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales marker3 k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae marker4 k__Bacteria|p__Proteobacteria marker5 k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia marker6 k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia|s__coli marker7 k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Klebsiella marker8
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Klebsiella|g__Klebsiella_s__ marker9 k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Roseburia marker10 k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Roseburia|g__Roseburia_s__ marker11 k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Clostridiaceae|f__Clostridiaceae_g__Clostridium|f__Clostridiaceae_g__Clostridium_s__ marker12 k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__[Paraprevotellaceae]|g__Paraprevotella marker13 k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__[Paraprevotellaceae]|g__Paraprevotella|g__Paraprevotella_s__ marker14 k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__adolescentis marker15 k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rickettsiales|f__mitochondria marker16 k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rickettsiales|f__mitochondria|f__mitochondria_g__ marker17 k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rickettsiales|f__mitochondria|f__mitochondria_g__|f__mitochondria_g___s__ marker18 k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales marker19 k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae marker20 k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rickettsiales marker21 k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium marker22 k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria marker23 k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales marker24 k__Bacteria|p__Cyanobacteria|c__4C0d-2 marker25 k__Bacteria|p__Cyanobacteria|c__4C0d-2|o__YS2 marker26 k__Bacteria|p__Cyanobacteria|c__4C0d-2|o__YS2|o__YS2_f__ marker27 k__Bacteria|p__Cyanobacteria|c__4C0d-2|o__YS2|o__YS2_f__|o__YS2_f___g__ marker28 
k__Bacteria|p__Cyanobacteria|c__4C0d-2|o__YS2|o__YS2_f__|o__YS2_f___g__|o__YS2_f___g___s__ marker29 k__Bacteria|p__Firmicutes|c__Erysipelotrichi|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Coprobacillus marker30 k__Bacteria|p__Firmicutes|c__Erysipelotrichi|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Coprobacillus|g__Coprobacillus_s__ marker31 k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Eubacteriaceae marker32 k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella|g__Prevotella_s__ marker33 k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Eubacteriaceae|g__Pseudoramibacter_Eubacterium marker34 k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Eubacteriaceae|g__Pseudoramibacter_Eubacterium|g__Pseudoramibacter_Eubacterium_s__ marker35 k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales enrich_group ef_logFC pvalue padj marker1 CD -4.7938 1.3030e-06 0.0010241 marker2 CD -5.2183 4.2890e-06 0.0011237 marker3 CD -5.2183 4.2890e-06 0.0011237 marker4 CD -3.4448 1.0392e-05 0.0020420 marker5 CD -5.2156 1.6192e-05 0.0021211 marker6 CD -5.2156 1.6192e-05 0.0021211 marker7 CD -9.5467 3.1943e-05 0.0028091 marker8 CD -9.5467 3.1943e-05 0.0028091 marker9 Control 3.2153 3.2165e-05 0.0028091 marker10 Control 3.1372 5.3612e-05 0.0042139 marker11 Control 5.0869 6.1050e-05 0.0043623 marker12 Control 9.0024 1.2050e-04 0.0072857 marker13 Control 9.0024 1.2050e-04 0.0072857 marker14 Control 7.8854 2.6810e-04 0.0150517 marker15 CD -3.7258 4.2948e-04 0.0185286 marker16 CD -3.7256 4.2979e-04 0.0185286 marker17 CD -3.7256 4.2979e-04 0.0185286 marker18 Control 3.0126 4.4789e-04 0.0185286 marker19 Control 3.0126 4.4789e-04 0.0185286 marker20 CD -3.6959 4.7188e-04 0.0185449 marker21 Control 3.0175 5.8161e-04 0.0214183 marker22 CD -3.4967 5.9950e-04 0.0214183 marker23 CD -3.7713 6.8581e-04 0.0234368 marker24 Control 8.3772 1.2118e-03 0.0340169 marker25 Control 8.3772 1.2118e-03 0.0340169 marker26 
Control 8.3772 1.2118e-03 0.0340169 marker27 Control 8.3772 1.2118e-03 0.0340169 marker28 Control 8.3772 1.2118e-03 0.0340169 marker29 Control 5.4046 1.4182e-03 0.0363597 marker30 Control 5.4046 1.4182e-03 0.0363597 marker31 Control 5.4734 1.4340e-03 0.0363597 marker32 CD -6.5865 3.3337e-03 0.0818835 marker33 Control 5.3561 3.6295e-03 0.0828593 marker34 Control 5.3561 3.6295e-03 0.0828593 marker35 CD -3.6249 3.6897e-03 0.0828593 ================================================ FILE: tests/testthat/_snaps/lefse.md ================================================ # lefse output of oxygen Code mm_lefse Output microbiomeMarker-class inherited from phyloseq-class normalization method: [ CPM ] microbiome marker identity method: [ lefse ] marker_table() Marker Table: [ 12 microbiome markers with 5 variables ] otu_table() OTU Table: [ 276 taxa and 177 samples ] sample_data() Sample Data: [ 177 samples by 71 sample variables ] tax_table() Taxonomy Table: [ 276 taxa by 1 taxonomic ranks ] --- Code marker_table(mm_lefse) Output feature marker1 k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae marker2 k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Faecalibacterium marker3 k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Faecalibacterium|s__Faecalibacterium_s__ marker4 k__Bacteria|p__Firmicutes|c__Clostridia marker5 k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales marker6 k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcaceae_g__ marker7 k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcaceae_g__|s__Ruminococcaceae_g___s__ marker8 k__Bacteria|p__Firmicutes|c__Bacilli marker9 k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales marker10 k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus|s__Streptococcus_s__ marker11 
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae marker12 k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus enrich_group ef_lda pvalue padj marker1 Healthy 4.993303 7.154793e-05 7.154793e-05 marker2 Healthy 4.832765 5.914547e-04 5.914547e-04 marker3 Healthy 4.830757 6.043983e-04 6.043983e-04 marker4 Healthy 4.648541 7.176046e-04 7.176046e-04 marker5 Healthy 4.648541 7.176046e-04 7.176046e-04 marker6 Healthy 4.317335 6.990210e-03 6.990210e-03 marker7 Healthy 4.317335 6.990210e-03 6.990210e-03 marker8 Tumor 4.648541 7.176046e-04 7.176046e-04 marker9 Tumor 4.514859 3.267911e-03 3.267911e-03 marker10 Tumor 4.211031 4.371742e-03 4.371742e-03 marker11 Tumor 4.122063 4.724670e-03 4.724670e-03 marker12 Tumor 4.121855 4.812008e-03 4.812008e-03 ================================================ FILE: tests/testthat/_snaps/limma-voom.md ================================================ # limma voom Code print(marker_table(mm_lv), digits = 5) Output feature enrich_group ef_logFC pvalue marker1 p__Bacteroidetes|g__Prevotella Enterotype.2 5.8537 1.0514e-12 marker2 p__Verrucomicrobia|g__Akkermansia Enterotype.3 -8.6516 3.1047e-04 marker3 p__Verrucomicrobia Enterotype.3 -8.3000 5.4262e-04 padj marker1 2.5023e-10 marker2 3.6946e-02 marker3 4.3048e-02 ================================================ FILE: tests/testthat/_snaps/multiple-groups-test.md ================================================ # test post hoc test result Code print(res_test, digits = 5) Output comparisons diff_mean pvalue Enterotype 2-Enterotype 1 Enterotype 2-Enterotype 1 -0.28139 4.7701e-08 Enterotype 3-Enterotype 1 Enterotype 3-Enterotype 1 -0.26045 1.6364e-09 Enterotype 3-Enterotype 2 Enterotype 3-Enterotype 2 0.02094 7.8899e-01 ci_lower ci_upper Enterotype 2-Enterotype 1 -0.371347 -0.191443 Enterotype 3-Enterotype 1 -0.331229 -0.189681 Enterotype 3-Enterotype 2 -0.057576 0.099457 # test visualization of posthoc test, data of signicance 
level annotation Code annotation_single Output xmin xmax y_position annotation 1 Enterotype 2 Enterotype 1 0.54668 *** 2 Enterotype 3 Enterotype 1 0.61035 *** 3 Enterotype 3 Enterotype 2 0.30558 NS. --- Code head(annotation_all) Output xmin xmax y_position annotation feature 1 Enterotype 2 Enterotype 1 0.13116 NS. p__Actinobacteria 2 Enterotype 3 Enterotype 1 0.2694 NS. p__Actinobacteria 3 Enterotype 3 Enterotype 2 0.29804 NS. p__Actinobacteria 4 Enterotype 2 Enterotype 1 0.57527 NS. p__Bacteroidetes 5 Enterotype 3 Enterotype 1 0.63693 *** p__Bacteroidetes 6 Enterotype 3 Enterotype 2 0.56712 ** p__Bacteroidetes ================================================ FILE: tests/testthat/_snaps/two-group-test.md ================================================ # test two group result Code mm_welch Output microbiomeMarker-class inherited from phyloseq-class normalization method: [ TSS ] microbiome marker identity method: [ welch.test ] marker_table() Marker Table: [ 3 microbiome markers with 5 variables ] otu_table() OTU Table: [ 244 taxa and 39 samples ] sample_data() Sample Data: [ 39 samples by 9 sample variables ] tax_table() Taxonomy Table: [ 244 taxa by 1 taxonomic ranks ] --- Code mm_t Output microbiomeMarker-class inherited from phyloseq-class normalization method: [ TSS ] microbiome marker identity method: [ t.test ] marker_table() Marker Table: [ 2 microbiome markers with 5 variables ] otu_table() OTU Table: [ 244 taxa and 39 samples ] sample_data() Sample Data: [ 39 samples by 9 sample variables ] tax_table() Taxonomy Table: [ 244 taxa by 1 taxonomic ranks ] ================================================ FILE: tests/testthat/data/ancom-zero.csv ================================================ "","structural_zero..delivery...Cesarean.","structural_zero..delivery...Vaginal." 
"sp1",FALSE,FALSE "sp2",FALSE,FALSE "sp3",FALSE,FALSE "sp4",FALSE,FALSE "sp5",FALSE,FALSE "sp6",FALSE,FALSE "sp7",FALSE,FALSE "sp8",FALSE,FALSE "sp9",FALSE,FALSE "sp10",FALSE,FALSE "sp11",FALSE,FALSE "sp12",FALSE,FALSE "sp13",FALSE,FALSE "sp14",FALSE,FALSE "sp15",FALSE,FALSE "sp16",FALSE,FALSE "sp17",FALSE,FALSE "sp18",FALSE,FALSE "sp19",FALSE,FALSE "sp20",FALSE,FALSE "sp21",FALSE,FALSE "sp22",FALSE,FALSE "sp23",FALSE,FALSE "sp24",FALSE,FALSE "sp25",FALSE,FALSE "sp26",FALSE,FALSE "sp27",FALSE,FALSE "sp28",FALSE,FALSE "sp29",FALSE,FALSE "sp30",FALSE,FALSE "sp31",FALSE,FALSE "sp32",FALSE,FALSE "sp33",FALSE,FALSE "sp34",FALSE,FALSE "sp35",FALSE,FALSE "sp36",FALSE,FALSE "sp37",FALSE,FALSE "sp38",FALSE,FALSE "sp39",FALSE,FALSE "sp40",FALSE,FALSE "sp41",FALSE,FALSE "sp42",FALSE,FALSE "sp43",FALSE,FALSE "sp44",FALSE,FALSE "sp45",FALSE,FALSE "sp46",FALSE,FALSE "sp47",FALSE,FALSE "sp48",FALSE,FALSE ================================================ FILE: tests/testthat/data/ancom-zero_neg_lb.csv ================================================ "","structural_zero..delivery...Cesarean.","structural_zero..delivery...Vaginal." 
"sp1",FALSE,FALSE "sp2",FALSE,FALSE "sp3",FALSE,FALSE "sp4",FALSE,FALSE "sp5",FALSE,FALSE "sp6",FALSE,FALSE "sp7",FALSE,FALSE "sp8",FALSE,FALSE "sp9",FALSE,FALSE "sp10",FALSE,FALSE "sp11",FALSE,FALSE "sp12",FALSE,FALSE "sp13",FALSE,FALSE "sp14",FALSE,FALSE "sp15",FALSE,FALSE "sp16",FALSE,FALSE "sp17",FALSE,FALSE "sp18",FALSE,FALSE "sp19",FALSE,FALSE "sp20",FALSE,FALSE "sp21",FALSE,FALSE "sp22",FALSE,FALSE "sp23",FALSE,FALSE "sp24",FALSE,FALSE "sp25",FALSE,FALSE "sp26",FALSE,FALSE "sp27",FALSE,FALSE "sp28",FALSE,FALSE "sp29",FALSE,FALSE "sp30",FALSE,FALSE "sp31",FALSE,FALSE "sp32",FALSE,FALSE "sp33",FALSE,FALSE "sp34",FALSE,FALSE "sp35",FALSE,FALSE "sp36",FALSE,FALSE "sp37",FALSE,FALSE "sp38",FALSE,FALSE "sp39",FALSE,FALSE "sp40",FALSE,FALSE "sp41",FALSE,FALSE "sp42",FALSE,FALSE "sp43",FALSE,FALSE "sp44",FALSE,FALSE "sp45",FALSE,FALSE "sp46",FALSE,FALSE "sp47",FALSE,FALSE "sp48",FALSE,FALSE ================================================ FILE: tests/testthat/test-abundances.R ================================================ ps <- phyloseq::phyloseq( otu_table = otu_table( matrix( sample(100, 40), nrow = 2, dimnames = list( c("feature1", "feature2"), paste0("sample", 1:20) ) ), taxa_are_rows = TRUE ), tax_table = tax_table( matrix( c("taxa1", "taxa2"), nrow = 2, dimnames = list(c("feature1", "feature2"), c("Species")) ) ), sam_data = sample_data( data.frame( group = rep(c("group1", "group2"), 10), row.names = paste0("sample", 1:20) ) ) ) test_that("abundances", { abd <- abundances(ps, "log10p") abd_t <- transform_abundances(ps, "log10p") expect_identical(abd, as(otu_table(abd_t), "matrix")) }) test_that("abundances normalization", { abd_norm <- normalize(ps, "TSS") expect_true(all(colSums(otu_table(abd_norm)) == 1)) expect_s4_class(abd_norm, "phyloseq") expect_s4_class(normalize(otu_table(ps)), "otu_table") }) ================================================ FILE: tests/testthat/test-aldex.R ================================================ test_that("convert mc 
instances", { instance <- list( sample1 = data.frame(inst1 = runif(10), inst2 = runif(10)), sample2 = data.frame(inst1 = runif(10), inst2 = runif(10)), sample3 = data.frame(inst1 = runif(10), inst2 = runif(10)) ) instance_converted <- convert_instance(instance, 2) expect_identical(length(instance_converted), 2L) expect_identical(names(instance_converted[[1]]), paste0("sample", 1:3)) expect_equal( instance_converted[[1]], tibble::tibble( sample1 = instance[[1]][[1]], sample2 = instance[[2]][[1]], sample3 = instance[[3]][[1]] ) ) }) # fast test data(kostic_crc) x <- as(phyloseq::otu_table(kostic_crc), "matrix")[1:20, 1:20] groups <- phyloseq::sample_data(kostic_crc)[["DIAGNOSIS"]][1:20] idx1 <- groups == "Tumor" idx2 <- groups == "Healthy" x_clr <- suppressWarnings(ALDEx2::aldex.clr(x, groups, mc.samples = 12)) instance <- convert_instance(x_clr@analysisData, 12)[[1]] # keep the same number of samples for two groups idx2 <- which(idx2)[1:4] instance <- instance[c(which(idx1), idx2)] new_groups <- groups[c(which(idx1), idx2)] test_that("t_fast equal to t.test and ALDEx2:::t.fast", { # unpaired t_res_up <- apply( instance, 1, function(x) t.test(x[1:4], x[5:8], alternative = "greater")$p.value ) expect_equal(t_res_up, t_fast(instance, new_groups)) expect_equal( t_res_up, ALDEx2:::t.fast(instance, new_groups, paired = FALSE)$p ) # paired t_res_p <- apply( instance, 1, function(x) t.test(x[1:4], x[5:8], paired = TRUE, alternative = "greater")$p.value ) expect_equal( t_res_p, t_fast(instance, new_groups, paired = TRUE) ) expect_equal( t_res_p, ALDEx2:::t.fast(instance, new_groups, paired = TRUE)$p ) }) test_that("wilcox_fast equal to wilcox.test", { # unpaired wilcox_res_p <- apply( instance, 1, function(x) wilcox.test(x[1:4], x[5:8], alternative = "greater")$p.value ) expect_equal( wilcox_res_p, wilcox_fast(instance, new_groups) ) expect_equal( wilcox_res_p, ALDEx2:::wilcox.fast(instance, new_groups, paired = FALSE)) # paired wilcox_res_p <- apply( instance, 1, 
function(x) wilcox.test(x[1:4], x[5:8], paired = TRUE, alternative = "greater")$p.value ) expect_equal( wilcox_res_p, wilcox_fast(instance, new_groups, paired = TRUE) ) expect_equal( wilcox_res_p, ALDEx2:::wilcox.fast(instance, new_groups, paired = TRUE) ) expect_equal( apply( instance, 1, function(x) { wilcox.test(x[1:4], x[5:8], alternative = "greater", paired = TRUE, exact = TRUE)$p.value } ), wilcox_fast(instance, new_groups, paired = TRUE) ) }) test_that("aldex_t", { aldex_test_out <- ALDEx2::aldex.ttest(x_clr) t_out <- aldex_t(x_clr, groups, method = "t.test", mc_samples = 12) expect_equal(t_out$pvalue, aldex_test_out$we.ep) wilcox_out <- aldex_t(x_clr, groups, method = "wilcox.test", mc_samples = 12) expect_equal(wilcox_out$pvalue, aldex_test_out$wi.ep) }) test_that("aldex_kw", { aldex_kw_out <- ALDEx2::aldex.kw(x_clr) kw_out <- aldex_kw(x_clr, groups, method = "kruskal", mc_samples = 12) expect_equal(kw_out$pvalue, aldex_kw_out$kw.ep) glm_out <- aldex_kw(x_clr, groups, method = "glm_anova", mc_samples = 12) expect_equal(glm_out$pvalue, aldex_kw_out$glm.ep) }) ================================================ FILE: tests/testthat/test-ancom.R ================================================ if (FALSE) { zero_neg_lb <- ANCOMBC:::get_struc_zero( data.frame(otu_table(ecam)), as(sample_data(ecam), "matrix"), "delivery", neg_lb = TRUE ) write.csv( data.frame(zero_neg_lb), file = test_path("data", "ancom-zero_neg_lb.csv") ) zero <- ANCOMBC:::get_struc_zero( data.frame(otu_table(ecam)), as(sample_data(ecam), "matrix"), "delivery", neg_lb = FALSE ) write.csv( data.frame(zero), file = test_path("data", "ancom-zero.csv") ) } data(ecam) test_that("identify structural zeros", { expect_identical( get_struc_zero(ecam, "delivery", TRUE), read.csv(test_path("data/ancom-zero_neg_lb.csv"), row.names = 1) ) expect_identical( get_struc_zero(ecam, "delivery", FALSE), read.csv(test_path("data/ancom-zero.csv"), row.names = 1) ) }) test_that("ancom result", { ancom_res <- run_ancom( 
ecam, "delivery", p_adjust = "BH", W_cutoff = 0, taxa_rank = "Class" ) curr_marker <- marker_table(ancom_res) expect_snapshot(print(head(curr_marker), digits = 5)) }) ================================================ FILE: tests/testthat/test-ancombc.R ================================================ test_that("ancombc works correctly", { if (FALSE) { # result of ancombc package, example from the package vignette # atalas1006 from microbiome package pseq <- subset_samples(atlas1006, time == 0) sample_data(pseq)$bmi_group <- recode( sample_data(pseq)$bmi_group, underweight = "lean", lean = "lean", overweight = "overweight", obese = "obese", severeobese = "obese", morbidobese = "obese" ) sample_data(pseq)$nation <- dplyr::recode( sample_data(pseq)$nationality, Scandinavia = "NE", UKIE = "NE", SouthEurope = "SE", CentralEurope = "CE", EasternEurope = "EE" ) phylum_data <- aggregate_taxa(pseq, "Phylum") tax_table(phylum_data) <- tax_table(phylum_data)[, 1] out <- ANCOMBC::ancombc( data = phylum_data, formula = "nation", p_adj_method = "holm", prv_cut = 0.10, lib_cut = 1000, group = "nation", struc_zero = FALSE, neg_lb = FALSE, tol = 1e-5, max_iter = 100, conserve = FALSE, alpha = 0.05, global = TRUE ) group_lvls <- levels(phyloseq::sample_data(phylum_data)[["nation"]]) ef <- out$res$lfc # extract enrich groups according the effect size # # https://github.com/FrederickHuangLin/ANCOMBC/issues/8 # The first level (CE) of a categorical value will be set as a # reference level by default in R. 
# Yes, the W statistics from the primary result (res), which aims to # test for the effects of covariates of interest, can be either positive # or negative and its sign indicates the direction; get_enrich_group <- function(ef, group_lvls) { if (all(ef < 0)) { enrich_group <- group_lvls[1] } else { enrich_group <- group_lvls[which.max(ef) + 1] } enrich_group } enrich_group <- apply(ef, 1, get_enrich_group, group_lvls) global_res <- out$res_global global_res$enrich_group <- enrich_group global_res <- global_res[global_res$diff_abn, ] res <- data.frame( feature = paste0("p__", rownames(global_res)), enrich_group = global_res$enrich_group, ef_W = global_res$W, pvalue = global_res$p_val, padj = global_res$q_val ) rownames(res) <- paste0("marker", 1:nrow(res)) out2 <- run_ancombc( phylum_data, p_adjust = "holm", prv_cut = 0.10, lib_cut = 1000, group = "nation", struc_zero = FALSE, neg_lb = FALSE, tol = 1e-5, max_iter = 100, conserve = FALSE, pvalue_cutoff = 0.05 ) marker <- data.frame(marker_table(out2)) # TRUE identical(marker, res) } }) ================================================ FILE: tests/testthat/test-assignment.R ================================================ marker <- marker_table( data.frame( feature = paste0("sp", 1:5), enrich_group = c("cr", "er", "cr", "cr", "er"), stringsAsFactors = FALSE ) ) otu <- otu_table( data.frame( s1 = runif(10), s2 = runif(10) ), taxa_are_rows = TRUE ) tax <- tax_table(data.frame(feature = paste0("sp", 1:10)) %>% as.matrix()) test_mm <- microbiomeMarker( marker = marker, otu_table = otu, tax_table = tax ) test_that("otu_table<- works correctly", { expect_silent(otu_table(test_mm) <- otu_table(test_mm)) expect_silent(otu_table(test_mm) <- test_mm) ps_assign <- phyloseq(otu_table(test_mm), tax_table(test_mm)) expect_silent(otu_table(test_mm) <- ps_assign) }) test_that("marker_table<- works correcly", { new_marker <- marker[1, ] expect_silent(marker_table(test_mm) <- new_marker) }) 
================================================ FILE: tests/testthat/test-barplot.R ================================================ test_that("feature label in bar plot", { feature <- "Bacteria|Bacteroidetes|Bacteroidia|Bacteroidales|Bacteroidaceae" short_feature <- "Bacteria|Bacteroidetes" expect_equal(get_feature_label(feature, 1), "Bacteroidaceae") expect_equal(get_feature_label(feature, 2), "Bacteroidales|Bacteroidaceae") expect_equal( get_feature_label(feature, 7), "Bacteria|Bacteroidetes|Bacte..Bacteroidales|Bacteroidaceae" ) expect_equal( get_feature_label(feature, 0), "Bacteria|Bacteroidetes|Bacte..Bacteroidales|Bacteroidaceae" ) expect_equal(get_feature_label(short_feature, 0), "Bacteria|Bacteroidetes") # replace "Unknown" in the species level as "sp." expect_equal( replace_unknown_species("s__Leucobacter_Unknown"), "s__Leucobacter_sp." ) expect_equal( replace_unknown_species("g__abcd|s__Leucobacter_Unknown"), "g__abcd|s__Leucobacter_sp." ) expect_equal( replace_unknown_species("g__abcd"), "g__abcd" ) expect_equal( replace_unknown_species("g__abcd|s__Leucobacter_sp."), "g__abcd|s__Leucobacter_sp." 
) }) mm <- microbiomeMarker( marker_table = marker_table(data.frame( feature = c("speciesA", "speciesB"), enrich_group = c("groupA", "groupB"), ef_logFC = c(-2, 2), pvalue = c(0.01, 0.01), padj = c(0.01, 0.01), row.names = c("marker1", "marker2") )), otu_table = otu_table(matrix( c(4, 1, 1, 4), nrow = 2, byrow = TRUE, dimnames = list(c("speciesA", "speciesB"), c("sample1", "sample2")) ), taxa_are_rows = TRUE ), tax_table = tax_table(matrix( c("speciesA", "speciesB"), nrow = 2, dimnames = list(c("speciesA", "speciesB"), "Species") )) ) test_that("label of x, and effect size in descending order", { # logFC, such as edgeR, DESeq2 for two groups comparison p_logfc <- plot_ef_bar(mm) expect_identical(p_logfc$labels$x, "log2 Fold Change") # lefse - lda mt <- marker_table(data.frame( feature = c("speciesA", "speciesB"), enrich_group = c("groupA", "groupA"), ef_lda = c(2, 3), pvalue = c(0.01, 0.01), padj = c(0.01, 0.01), row.names = c("marker1", "marker2") )) marker_table(mm) <- mt p_lda <- plot_ef_bar(mm) expect_identical(p_lda$labels$x, "LDA score (log10)") # descending order ef <- p_lda$data$effect_size expect_true(all(diff(ef) >= 0)) # two groups test, diff_mean names(mt)[3] <- "ef_diff_mean" marker_table(mm) <- mt p_diff_mean <- plot_ef_bar(mm) expect_identical(p_diff_mean$labels$x, "Differential means") # multiple group names(mt)[3] <- "ef_eta_squared" marker_table(mm) <- mt p_eta_squared <- plot_ef_bar(mm) expect_identical(p_eta_squared$labels$x, "Eta squared") # CLR diff mean names(mt)[3] <- "ef_CLR_diff_mean" marker_table(mm) <- mt p_clr_diff <- plot_ef_bar(mm) expect_identical(p_clr_diff$labels$x, "CLR differential means") # CLR F statistic names(mt)[3] <- "ef_CLR_F_statistic" marker_table(mm) <- mt p_clr_f <- plot_ef_bar(mm) expect_identical(p_clr_f$labels$x, "CLR F statistic") # W statistic names(mt)[3] <- "ef_W" marker_table(mm) <- mt p_w <- plot_ef_bar(mm) expect_identical(p_w$labels$x, "W") # importance names(mt)[3] <- "ef_imp" marker_table(mm) <- mt p_imp 
<- plot_ef_bar(mm) expect_identical(p_imp$labels$x, "Importance score") # likelihood ratio statistic names(mt)[3] <- "ef_LR" marker_table(mm) <- mt p_lr <- plot_ef_bar(mm) expect_identical(p_lr$labels$x, "Likelihood ratio statistic") # F statistic names(mt)[3] <- "ef_F" marker_table(mm) <- mt p_f <- plot_ef_bar(mm) expect_identical(p_f$labels$x, "F statistic") }) ================================================ FILE: tests/testthat/test-comparing.R ================================================ test_that("comparing da methods", { data(ecam) expect_error(compare_DA(ecam, "delivery", methods = c("test", "ancombc")), "methods test not available") }) test_that("flatten args",{ expect_error(generate_compare_args("lefse", list(a=1)), "does not match DA methods") args <- list(lefse = list(list(norm = "CPM"), list(norm = "TSS"))) out <- generate_compare_args("lefse", args) expect_identical(out$methods, c("lefse_1", "lefse_2")) expect_identical(out$args, list(lefse_1 = list(norm = "CPM"), lefse_2 = list(norm = "TSS"))) expect_identical(out$methods, names(out$args)) out <- generate_compare_args(c("lefse", "ancom"), args) expect_identical(out$methods, c("lefse_1", "lefse_2", "ancom")) expect_identical(out$args, list(lefse_1 = list(norm = "CPM"), lefse_2 = list(norm = "TSS"), ancom = list())) args <- list(lefse = list(list(norm = "CPM", pvalue_cutoff = 0.05), list(norm = "TSS", pvalue_cutoff = 0.05)), ancom = list(norm = "CPM", W_cutoff = 0.75)) out <- generate_compare_args(c("lefse", "ancom"), args) expect_identical(out$methods, c("lefse_1", "lefse_2", "ancom")) expect_identical(out$args, list(lefse_1 = list(norm = "CPM", pvalue_cutoff = 0.05), lefse_2 = list(norm = "TSS", pvalue_cutoff = 0.05), ancom = list(norm = "CPM", W_cutoff = 0.75))) expect_identical(out$methods, names(out$args)) }) ================================================ FILE: tests/testthat/test-confounder.R ================================================ data("caporaso") data("pediatric_ibd") 
test_that("check confounding variables", { expect_error( check_confounder(caporaso, "target_var"), "`target_var` must be contained in the meta data") expect_error( check_confounder(pediatric_ibd, target_var = "Class"), "No confounding var" ) expect_error( check_confounder(caporaso, "SampleType", c("Year", "test")), "`test` not be contained" ) expect_identical( check_confounder(caporaso, "SampleType", c("Year", "Day")), c("Year", "Day") ) vars <- names(sample_data(caporaso)) confounders <- setdiff(vars, "SampleType") expect_identical( check_confounder(caporaso, "SampleType"), confounders ) }) test_that("confounder analysis", { res <- withr::with_seed( 2022, confounder(caporaso, "SampleType", confounders = "ReportedAntibioticUsage" ) ) expect_identical(res$confounder, "ReportedAntibioticUsage") # expect_equal(res$pvalue, 0.239) # permutation testfor CCA were completely redesigned for vegan 2.6.6 # https://github.com/vegandevs/vegan/blob/master/NEWS.md#new-features-2 }) ================================================ FILE: tests/testthat/test-edgeR.R ================================================ test_that("result of edger", { data(pediatric_ibd) mm_edger <- run_edger( pediatric_ibd, "Class", pvalue_cutoff = 0.1, p_adjust = "fdr" ) expect_snapshot(print(marker_table(mm_edger), digits = 5)) }) ================================================ FILE: tests/testthat/test-extract.R ================================================ test_that("extract methods", { marker <- marker_table( data.frame( feature = paste0("sp", 1:5), enrich_group = c("cr", "er", "cr", "cr", "er"), stringsAsFactors = FALSE ) ) expect_s4_class(marker[1], "marker_table") expect_s4_class(marker[1, ], "marker_table") }) ================================================ FILE: tests/testthat/test-import-picrust2.R ================================================ test_that("import_picrust2 works", { sam_tab <- system.file( "extdata", "picrust2_metadata.tsv", package = "microbiomeMarker") feature_tab <- 
system.file( "extdata", "path_abun_unstrat_descrip.tsv.gz", package = "microbiomeMarker") ps <- import_picrust2(feature_tab, sam_tab, trait = "PATHWAY") expect_identical(rank_names(ps), c("Picrust_trait", "Picrust_description")) }) ================================================ FILE: tests/testthat/test-import-qiime2.R ================================================ test_that("whether the row.names of feature table is dna sequence or not", { expect_false(is_dna_seq("3597a2689efaf5525ce460494a8ac383")) expect_true(is_dna_seq(paste0( "TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGTG", "GTGATTTAAGTCAGCGGTGAAAGTTTGTGGCTCAACCATAAAATTGCCGTTGAA", "ACTGGGTTACTTGAGTGTGTTTGAGGTAGGCGGAATGCGTGG" ))) }) ================================================ FILE: tests/testthat/test-lefse-input.R ================================================ test_that("add missing levels: keep abundance is lower than 1", { data(oxygen) oxygen_feature <- otu_table(oxygen) feature <- add_missing_levels(oxygen_feature) expect_true(max(feature) <= 1) expect_true(min(feature) >= 0) }) test_that("check whether taxa has level prefix", { prefix <- paste0(c("k", "p", "c", "o", "f", "g", "s"), "__") tax_nms1 <- c("Bacteria|Verrucomicrobia|Verrucomicrobiae|") tax_nms2 <- paste0(prefix, tax_nms1) check1 <- check_tax_prefix(tax_nms1) check2 <- purrr::map_lgl(tax_nms2, check_tax_prefix) expect_false(check1) expect_true(all(check2)) }) test_that("format lefse input: subgroups, group_hie", { sample_meta <- data.frame(group = rep(c("cr", "exp"), 3), subgroup = rep(c("a", "b"), each = 3)) group_info <- lefse_format_grp(sample_meta, "group", "subgroup") expect_identical(group_info$subgroup, paste(sample_meta$group, sample_meta$subgroup, sep = "_")) expect_identical(group_info$group_hie, list(cr = c("cr_a", "cr_b"), exp = c("exp_a", "exp_b"))) }) ================================================ FILE: tests/testthat/test-lefse.R ================================================ # lefse - lda data(kostic_crc) 
kostic_crc_small <- phyloseq::subset_taxa(
    kostic_crc,
    Phylum == "Firmicutes"
)
# seed fixed: LDA step of lefse is stochastic
mm_lefse <- withr::with_seed(
    2020,
    run_lefse(kostic_crc_small,
        wilcoxon_cutoff = 0.01,
        group = "DIAGNOSIS",
        kw_cutoff = 0.01,
        multigrp_strat = TRUE,
        lda_cutoff = 4
    )
)

# NOTE(review): the test name says "oxygen" but the fixture is kostic_crc;
# renaming is deferred because snapshot entries are keyed by test name
# (_snaps/lefse.md) and a rename would orphan them.
test_that("lefse output of oxygen", {
    expect_snapshot(mm_lefse)
    expect_snapshot(marker_table(mm_lefse))
})

test_that("create phyloseq object from microbiomeMarker object", {
    # default: keep only the marker features
    ps <- create_ps_from_mm(mm_lefse)
    expect_true(all(marker_table(mm_lefse)$feature %in% taxa_names(ps)))
    # only_marker = FALSE keeps every feature
    ps2 <- create_ps_from_mm(mm_lefse, only_marker = FALSE)
    expect_identical(taxa_names(ps2), taxa_names(mm_lefse))
})

================================================
FILE: tests/testthat/test-limma-voom.R
================================================

test_that("limma voom", {
    data(enterotypes_arumugam)
    enterotype <- phyloseq::subset_samples(
        enterotypes_arumugam,
        Enterotype %in% c("Enterotype 1", "Enterotype 2", "Enterotype 3")
    )
    mm_lv <- run_limma_voom(
        enterotype,
        "Enterotype",
        contrast = c("Enterotype 3", "Enterotype 2"),
        pvalue_cutoff = 0.05,
        p_adjust = "fdr"
    )
    expect_snapshot(print(marker_table(mm_lv), digits = 5))
})

================================================
FILE: tests/testthat/test-metagenomeSeq.R
================================================

test_that("result of metagenomeSeq", {
    # minimal 2-feature, 6-sample, 3-group fixture
    ps <- phyloseq::phyloseq(
        otu_table = otu_table(
            matrix(
                1:12,
                nrow = 2,
                dimnames = list(
                    c("feature1", "feature2"),
                    paste0("sample", 1:6)
                )
            ),
            taxa_are_rows = TRUE
        ),
        tax_table = tax_table(
            matrix(
                c("taxa1", "taxa2"),
                nrow = 2,
                dimnames = list(c("feature1", "feature2"), c("Species"))
            )
        ),
        sam_data = sample_data(
            data.frame(
                group = rep(c("group1", "group2", "group3"), 2),
                row.names = paste0("sample", 1:6)
            )
        )
    )
    # the default ZILN model only supports two-group comparison
    expect_error(
        run_metagenomeseq(ps, "group"),
        "ZILN method do not allows"
    )
    expect_error(
        run_metagenomeseq(ps, "group", contrast = c("group1", "group2")),
        "ZILN method do not allows"
    )
})

# typo fixed in test name: "featrue" -> "feature" (not snapshot-keyed, safe)
test_that("get enrich group of a feature of multiple groups comparison", {
    group_pairs <- list(
        c("a", "b"),
        c("a", "c"),
        c("b", "c")
    )
    # positive logFC for every pair containing "a" => "a" is enriched
    logFC_pairs <- c(1, 1, 1)
    expect_identical(get_mgs_enrich_group(group_pairs, logFC_pairs), "a")
})

================================================
FILE: tests/testthat/test-microbiomeMaker-methods.R
================================================

test_that("nmarker method", {
    mm <- microbiomeMarker(
        marker_table = marker_table(data.frame(
            feature = c("speciesA", "speciesB"),
            enrich_group = c("groupA", "groupB"),
            ef_logFC = c(-2, 2),
            pvalue = c(0.01, 0.01),
            padj = c(0.01, 0.01),
            row.names = c("marker1", "marker2")
        )),
        norm_method = "TSS",
        diff_method = "DESeq2",
        otu_table = otu_table(matrix(
            c(4, 1, 1, 4),
            nrow = 2, byrow = TRUE,
            dimnames = list(c("speciesA", "speciesB"),
                c("sample1", "sample2"))
        ),
        taxa_are_rows = TRUE
        ),
        tax_table = tax_table(matrix(
            c("speciesA", "speciesB"),
            nrow = 2,
            dimnames = list(c("speciesA", "speciesB"), "Species")
        )),
        sam_data = sample_data(data.frame(
            group = c("groupA", "groupB"),
            row.names = c("sample1", "sample2")
        ))
    )
    # nmarker works on both the container and the marker_table itself
    expect_identical(nmarker(mm), 2L)
    expect_identical(nmarker(marker_table(mm)), 2L)
    # removing the marker table yields zero markers, not an error
    marker_table(mm) <- NULL
    expect_identical(nmarker(mm), 0L)
})

================================================
FILE: tests/testthat/test-microbiomeMarker-class.R
================================================

test_that("microbiomeMarker constructor", {
    marker1 <- marker_table(
        data.frame(
            feature = paste0("sp", 1:5),
            enrich_group = c("cr", "er", "cr", "cr", "er"),
            stringsAsFactors = FALSE
        )
    )
    # marker2 references sp11, which is absent from tax1
    marker2 <- marker_table(
        data.frame(
            feature = paste0("sp", c(1:5, 11)),
            enrich_group = c("cr", "er", "cr", "cr", "er", "cr"),
            stringsAsFactors = FALSE
        )
    )
    otu1 <- otu_table(
        data.frame(
            s1 = runif(10),
            s2 = runif(10)
        ),
        taxa_are_rows = TRUE
    )
    tax1 <- tax_table(data.frame(feature = paste0("sp", 1:10)) %>%
        as.matrix())
    otu2 <- otu1[1:3, ]
    tax2 <- tax1[1:3, ]

    # expect error message for microbiomeMarker constructor
    expect_microbiomeMarker_error <- function(message,
                                              ...) {
        expect_error(microbiomeMarker(...), message, fixed = TRUE)
    }

    # otu_table is required
    expect_microbiomeMarker_error("otu_table is required", marker1)
    expect_microbiomeMarker_error(
        "otu_table is required",
        marker1,
        tax_table = tax1
    )
    # tax_table is required
    expect_microbiomeMarker_error(
        "tax_table is required",
        marker1,
        otu_table = otu1
    )
    expect_silent(microbiomeMarker(marker1, otu_table = otu1,
        tax_table = tax1))
    msg1 <- paste0(
        "The number of different feature must be smaller than the",
        " total number of feature"
    )
    expect_microbiomeMarker_error(
        msg1,
        marker1,
        tax_table = tax2,
        otu_table = otu2
    )
    msg2 <- "marker in marker_table must be contained in tax"
    expect_microbiomeMarker_error(
        msg2,
        marker2,
        otu_table = otu1,
        tax_table = tax1
    )
    msg4 <- "nrow of `otu_table` must be equal to the length of `tax_table()`"
    # fix: stray trailing comma removed — it silently passed an empty
    # argument through `...` into microbiomeMarker()
    expect_microbiomeMarker_error(
        msg4,
        marker1,
        tax_table = tax1,
        otu_table = otu2
    )
    expect_equal(
        is(microbiomeMarker(marker1, otu_table = otu1, tax_table = tax1)),
        c("microbiomeMarker", "phyloseq")
    )
})

================================================
FILE: tests/testthat/test-multiple-groups-test.R
================================================

data(enterotypes_arumugam)
enterotype <- phyloseq::subset_samples(
    enterotypes_arumugam,
    Enterotype %in% c("Enterotype 3", "Enterotype 2", "Enterotype 1")
)
tukey_res <- run_posthoc_test(enterotype, "Enterotype", method = "tukey")

test_that("etaseq effect size", {
    etasq <- calc_etasq(c(1, 2, 1.2, 3, 4, 1.4),
        c("a", "b", "c", "a", "b", "c"))
    expect_equal(signif(etasq, 3), 0.421)
})

test_that("test multiple group enterotype result", {
    # error group: "Entertype" is deliberately misspelled to trigger the check
    expect_error(
        run_test_multiple_groups(enterotype, "Entertype"),
        regexp = "`group` must in the field of sample meta data",
        fixed = TRUE
    )
})

test_that("test post hoc test result", {
    res_test <- tukey_res@result[["p__Bacteroidetes|g__Bacteroides"]] %>%
        data.frame
    expect_snapshot(print(res_test, digits = 5))
})

test_that("test visualization of post hoc test, p value significance level ", {
    expect_equal(
        pvalue2siglevel(c(0.05, 0.01, 0.0001, 0.06)),
        c("*", "**", "***", "NS.")
    )
})

# NOTE(review): "signicance" typo kept — snapshot entries are keyed by name
test_that(
    "test visualization of posthoc test, data of signicance level annotation",
    {
        # single feature
        abd <- tukey_res@abundance
        group <- abd$group
        pht_df <- data.frame(
            tukey_res@result[["p__Bacteroidetes|g__Bacteroides"]])
        annotation_single <- get_sig_annotation_single(
            abd[["p__Bacteroidetes|g__Bacteroides"]],
            pht_df,
            group
        )
        annotation_single$y_position <- formatC(
            annotation_single$y_position,
            format = "g",
            digits = 5
        )
        expect_snapshot(annotation_single)

        # all features
        annotation_all <- get_sig_annotation(tukey_res)
        annotation_all$y_position <- formatC(
            annotation_all$y_position,
            format = "g",
            digits = 5
        )
        expect_snapshot(head(annotation_all))
    }
)

================================================
FILE: tests/testthat/test-normalization.R
================================================

ct <- as(otu_table(pediatric_ibd), "matrix")
# geometric mean over the positive entries only
gm_mean <- function(x, na.rm = TRUE) {
    exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}
geoMeans <- apply(ct, 1, gm_mean)

test_that("ensure the results are the same for object in different class ", {
    ot <- otu_table(enterotypes_arumugam)
    df <- as.data.frame(ot)
    mat <- as.matrix(df)

    # tss ---------------------------------------------------------------------
    # results are the same
    tss_ps <- norm_tss(enterotypes_arumugam)
    tss_ot <- norm_tss(ot)
    tss_df <- normalize(df, "TSS")
    tss_mat <- normalize(mat, "TSS")
    expect_equal(tss_ot, otu_table(tss_ps))
    expect_equal(as.data.frame(tss_ot), tss_df)
    expect_equal(as.matrix(tss_df), tss_mat)
    # no norm_factor
    expect_true(is.null(attr(tss_df, "norm_factor")))
    expect_true(is.null(attr(tss_mat, "norm_factor")))
    expect_true(is.null(attr(tss_ot, "norm_factor")))
    expect_true(is.null(attr(otu_table(tss_ps), "norm_factor")))
    expect_false("norm_factor" %in% names(sample_data(tss_ps)))

    # rarefy ------------------------------------------------------------------
    rarefy_ps <-
norm_rarefy(enterotypes_arumugam, rng_seed = 2020) rarefy_ot <- norm_rarefy(ot, rng_seed = 2020) rarefy_df <- normalize(df, "rarefy", rng_seed = 2020) rarefy_mat <- normalize(mat, "rarefy", rng_seed = 2020) expect_equal(otu_table(rarefy_ps), rarefy_ot) expect_equal(as.data.frame(rarefy_ot), rarefy_df) expect_equal(as.matrix(rarefy_df), rarefy_mat) # no norm_factor expect_true(is.null(attr(rarefy_df, "norm_factor"))) expect_true(is.null(attr(rarefy_mat, "norm_factor"))) expect_true(is.null(attr(rarefy_ot, "norm_factor"))) expect_true(is.null(attr(otu_table(rarefy_ps), "norm_factor"))) expect_false("norm_factor" %in% names(sample_data(rarefy_ps))) # css --------------------------------------------------------------------- css_ps <- norm_css(enterotypes_arumugam) css_ot <- norm_css(ot) css_df <- normalize(df, "CSS") css_mat <- normalize(mat, "CSS") expect_identical(otu_table(css_ps), css_ot) expect_equal(as.data.frame(css_ot), css_df, ignore_attr = TRUE) expect_equal(as.matrix(css_df), css_mat, ignore_attr = TRUE) # norm factor css_nf_ot <- attr(css_ot, "norm_factor") css_nf_ps <- sample_data(css_ps)$norm_factor css_nf_df <- attr(css_df, "norm_factor") css_nf_mat <- attr(css_mat, "norm_factor") expect_identical(css_nf_ps, css_nf_ot) expect_identical(css_nf_ot, css_nf_df) expect_identical(css_nf_mat, css_nf_df) # rle --------------------------------------------------------------------- rle_ps <- norm_rle(enterotypes_arumugam) rle_ot <- norm_rle(ot) rle_df <- normalize(df, "RLE") rle_mat <- normalize(mat, "RLE") expect_identical(otu_table(rle_ps), rle_ot) expect_equal(as.data.frame(rle_ot), rle_df, ignore_attr = TRUE) expect_equal(as.matrix(rle_df), rle_mat, ignore_attr = TRUE) # norm factor rle_nf_ot <- attr(rle_ot, "norm_factor") rle_nf_ps <- sample_data(rle_ps)$norm_factor rle_nf_df <- attr(rle_df, "norm_factor") rle_nf_mat <- attr(rle_mat, "norm_factor") expect_identical(rle_nf_ps, rle_nf_ot) expect_identical(rle_nf_ot, rle_nf_df) expect_identical(rle_nf_mat, 
rle_nf_df) # TMM --------------------------------------------------------------------- tmm_ps <- norm_tmm(enterotypes_arumugam) tmm_ot <- norm_tmm(ot) tmm_df <- normalize(df, "TMM") tmm_mat <- normalize(mat, "TMM") expect_identical(otu_table(tmm_ps), tmm_ot) expect_equal(as.data.frame(tmm_ot), tmm_df, ignore_attr = TRUE) expect_equal(as.matrix(tmm_df), tmm_mat, ignore_attr = TRUE) # norm factor tmm_nf_ot <- attr(tmm_ot, "norm_factor") tmm_nf_ps <- sample_data(tmm_ps)$norm_factor tmm_nf_df <- attr(tmm_df, "norm_factor") tmm_nf_mat <- attr(tmm_mat, "norm_factor") expect_identical(tmm_nf_ps, tmm_nf_ot) expect_identical(tmm_nf_ot, tmm_nf_df) expect_identical(tmm_nf_mat, tmm_nf_df) # clr --------------------------------------------------------------------- clr_ps <- norm_clr(enterotypes_arumugam) clr_ot <- norm_clr(ot) clr_df <- normalize(df, "CLR") clr_mat <- normalize(mat, "CLR") expect_identical(otu_table(clr_ps), clr_ot) expect_identical(as.data.frame(clr_ot), clr_df) expect_identical(as.matrix(clr_df), clr_mat) # no norm_factor expect_true(is.null(attr(clr_df, "norm_factor"))) expect_true(is.null(attr(clr_mat, "norm_factor"))) expect_true(is.null(attr(clr_ot, "norm_factor"))) expect_true(is.null(attr(otu_table(clr_ps), "norm_factor"))) expect_false("norm_factor" %in% names(sample_data(clr_ps))) # cpm ------------------------------------------------------------------- value_ps <- norm_cpm(enterotypes_arumugam) value_ot <- norm_cpm(ot) value_df <- normalize(df, "CPM") value_mat <- normalize(mat, "CPM") expect_identical(otu_table(value_ps), value_ot) expect_identical(as.data.frame(value_ot), value_df) expect_identical(as.matrix(value_df), value_mat) # no norm_factor expect_true(is.null(attr(value_df, "norm_factor"))) expect_true(is.null(attr(value_mat, "norm_factor"))) expect_true(is.null(attr(value_ot, "norm_factor"))) expect_true(is.null(attr(otu_table(value_ps), "norm_factor"))) expect_false("norm_factor" %in% names(sample_data(value_ps))) }) test_that(paste0( 
"`geoMeans` and `type = 'poscounts'` in ", "`estimateSizeFactorsForMatrix` are equal" ), { expect_equal( estimateSizeFactorsForMatrix(ct, geoMeans = geoMeans), estimateSizeFactorsForMatrix(ct, type = "poscounts") ) }) test_that(paste0( "the size factors from `estimateSizeFactorForMatrix()` ", "and `DESeq2::estimateSizeFactors` are equal" ), { suppressWarnings(dds <- phyloseq2DESeq2(pediatric_ibd, ~Class)) sf1 <- DESeq2::estimateSizeFactors(dds, type = "poscounts") %>% DESeq2::sizeFactors() sf2 <- DESeq2::estimateSizeFactors(dds, geoMeans = geoMeans) %>% DESeq2::sizeFactors() sf3 <- estimateSizeFactorsForMatrix(ct, geoMeans = geoMeans) sf4 <- estimateSizeFactorsForMatrix(ct, type = "poscounts") expect_identical(sf1, sf2) expect_equal(sf3, sf4) expect_equal(sf1, sf3) }) # clr normalization test_that("gm_mean works well with NA values", { expect_equal(gm_mean(c(1, NA, 2, 3)), gm_mean(c(1, 0, 2, 3))) expect_equal(gm_mean(c(1, -1, 2, 3), na.rm = FALSE), gm_mean(c(1, 0, 2, 3))) }) test_that("trans_clr works well with NA/infinite values", { expect_equal(trans_clr(c(1, NA, 2, 3)), trans_clr(c(1, 0, 2, 3))) # Actually,Inf will unlikely to appear expect_equal(trans_clr(c(1, Inf, 2, 3)), c(0, 0, 0, 0)) }) ================================================ FILE: tests/testthat/test-sl.R ================================================ test_that("supervised machine learning method workds properly", { data(enterotypes_arumugam) ps_small <- phyloseq::subset_taxa( enterotypes_arumugam, Phylum %in% c("Firmicutes", "Bacteroidetes") ) set.seed(2021) mm_lr <- run_sl( ps_small, group = "Gender", taxa_rank = "Genus", nfolds = 2, nrepeats = 1, top_n = 15, norm = "TSS", method = "LR", ) expect_identical(nmarker(mm_lr), 15L) expect_identical(mm_lr@norm_method, "TSS") expect_identical(mm_lr@diff_method, "logistic regression") }) ================================================ FILE: tests/testthat/test-summarize-tax.R ================================================ ps <- phyloseq::phyloseq( 
    otu_table = otu_table(
        matrix(
            sample(100, 40),
            nrow = 4,
            dimnames = list(
                paste0("otu", 1:4),
                paste0("sample", 1:10)
            )
        ),
        taxa_are_rows = TRUE
    ),
    tax_table = tax_table(
        matrix(
            c(rep("g1", 4), rep(c("s1", "s2"), 2)),
            nrow = 4,
            byrow = FALSE,
            dimnames = list(paste0("otu", 1:4), c("Genus", "Species"))
        )
    ),
    sam_data = sample_data(
        data.frame(
            group = rep(c("group1", "group2"), 5),
            row.names = paste0("sample", 1:10)
        )
    )
)
ps_summarized <- summarize_taxa(ps)

test_that("check whether phyloseq tax summarized or not", {
    expect_false(check_tax_summarize(ps))
    expect_true(check_tax_summarize(ps_summarized))
})

test_that("check the summarize_taxa", {
    # summarized taxa names join ranks with "|"
    sep <- "|"
    taxa <- otu_table(ps_summarized)
    expect_true(any(grepl(sep, row.names(taxa), fixed = TRUE)))
    # 1 genus + 2 species = 3 summarized rows
    expect_equal(nrow(taxa), 3L)
    expect_error(
        summarize_taxa(ps, level = "abc"),
        "`level` must in the ranks"
    )
})

test_that("extract prefix of names of taxonomic ranks", {
    expect_identical(extract_prefix(ps), c("g", "s"))
    expect_identical(
        extract_prefix(rank_names(ps)),
        c("g", "s")
    )
})

================================================
FILE: tests/testthat/test-transform.R
================================================

data(enterotypes_arumugam)
ps_t <- transform_abundances(enterotypes_arumugam, "log10p")
otutable_t <- transform_abundances(otu_table(enterotypes_arumugam), "log10p")
mm <- microbiomeMarker(
    marker_table = marker_table(data.frame(
        feature = c("speciesA", "speciesB"),
        enrich_group = c("groupA", "groupB"),
        ef_logFC = c(-2, 2),
        pvalue = c(0.01, 0.01),
        padj = c(0.01, 0.01),
        row.names = c("marker1", "marker2")
    )),
    otu_table = otu_table(matrix(
        c(4, 1, 1, 4),
        nrow = 2, byrow = TRUE,
        dimnames = list(c("speciesA", "speciesB"), c("sample1", "sample2"))
    ),
    taxa_are_rows = TRUE
    ),
    tax_table = tax_table(matrix(
        c("speciesA", "speciesB"),
        nrow = 2,
        dimnames = list(c("speciesA", "speciesB"), "Species")
    ))
)
mm_transed <- transform_abundances(mm, "log10p")

test_that("return a object: the same class with the argument `object`", {
    expect_true(inherits(ps_t, "phyloseq"))
    expect_true(inherits(otutable_t, "otu_table"))
    expect_true(inherits(mm_transed, "microbiomeMarker"))
    expect_equal(otu_table(ps_t), otutable_t)
})

test_that("transformation", {
    # transformed using log10(1 + x) if data contains zero
    expect_warning(
        t_log10 <- transform_abundances(enterotypes_arumugam, "log10"),
        "Using log10(1 + x) instead",
        fixed = TRUE
    )
    t_log10p <- transform_abundances(enterotypes_arumugam, "log10p")
    expect_identical(t_log10, t_log10p)
})

================================================
FILE: tests/testthat/test-two-group-test.R
================================================

test_that("ration", {
    # both groups all-zero => ratio is defined as 0
    abd1 <- rep(0, 6)
    abd2 <- rep(0, 6)
    expect_equal(calc_ratio(abd1, abd2), 0)
    abd1 <- c(0.00014940, 0.05774625, 0.05419578, 0.05414964, 0.02028051)
    abd2 <- rep(0, 5)
    expect_equal(round(calc_ratio(abd1, abd2), 5), 1.00278)
    abd2 <- rev(abd1)
    expect_equal(round(calc_ratio(abd1, abd2), 5), 1)
})

test_that("test two group result", {
    data(enterotypes_arumugam)
    mm_welch <- run_test_two_groups(enterotypes_arumugam, "Gender")
    mm_t <- run_test_two_groups(
        enterotypes_arumugam,
        group = "Gender",
        method = "t.test"
    )
    expect_snapshot(mm_welch)
    expect_snapshot(mm_t)
})

================================================
FILE: tests/testthat/test-utilities.R
================================================

test_that("check upper first letter", {
    expect_equal(
        upper_firstletter(c("abc", "ABC", "Abc")),
        c("Abc", "Abc", "Abc")
    )
})

test_that("check rank names and para `taxa_rank`", {
    ## check rank names
    # taxonomic profile
    ot <- otu_table(
        matrix(
            sample(100, 40),
            nrow = 4,
            dimnames = list(
                paste0("otu", 1:4),
                paste0("sample", 1:10)
            )
        ),
        taxa_are_rows = TRUE
    )
    ps <- phyloseq::phyloseq(
        otu_table = ot,
        tax_table = tax_table(
            matrix(
                c(rep("g1", 4), rep(c("s1", "s2"), 2)),
                nrow = 4,
                byrow = FALSE,
                dimnames = list(paste0("otu", 1:4), c("Genus", "Species"))
            )
        )
    )
    # "xxx" is not a recognized rank name
    ps2 <- phyloseq::phyloseq(
        otu_table = ot,
        tax_table = tax_table(
            matrix(
                c(rep("g1", 4), rep(c("s1", "s2"), 2)),
                nrow = 4,
                byrow = FALSE,
                dimnames = list(paste0("otu", 1:4), c("xxx", "Species"))
            )
        )
    )
    expect_invisible(check_rank_names(ps))
    # NOTE: "taxonimic" matches the actual (misspelled) package message
    expect_error(check_rank_names(ps2), "ranks of taxonimic profile")

    # picrust2 functional profile
    ot_picrust2 <- otu_table(
        matrix(
            sample(100, 40),
            nrow = 4,
            dimnames = list(
                paste0("path", 1:4),
                paste0("sample", 1:10)
            )
        ),
        taxa_are_rows = TRUE
    )
    ps_picrust2 <- phyloseq::phyloseq(
        otu_table = ot_picrust2,
        tax_table = tax_table(
            matrix(
                c(rep("pathway", 4), paste("desp", 1:4)),
                nrow = 4,
                byrow = FALSE,
                dimnames = list(paste0("path", 1:4),
                    c("Picrust_trait", "Picrust_description"))
            )
        )
    )
    ps_picrust2_err <- phyloseq::phyloseq(
        otu_table = ot_picrust2,
        tax_table = tax_table(
            matrix(
                c(rep("pathway", 4), paste("desp", 1:4)),
                nrow = 4,
                byrow = FALSE,
                dimnames = list(paste0("path", 1:4),
                    c("Picrust_trait", "xxxx"))
            )
        )
    )
    expect_invisible(check_rank_names(ps_picrust2))
    expect_error(check_rank_names(ps_picrust2_err),
        "ranks of picrust2 functional profile")

    # check whether the ps is created from picrust2 or not
    expect_true(is_picrust2(ps_picrust2))
    expect_false(is_picrust2(ps))

    ## check para `taxa_rank`
    expect_invisible(check_taxa_rank(ps, "all"))
    expect_invisible(check_taxa_rank(ps, "none"))
    expect_invisible(check_taxa_rank(ps, "Genus"))
    expect_error(check_taxa_rank(ps, "xxx"), "`taxa_rank` must be one of")
})

test_that(
    "taxa prefix", {
        expect_identical(get_prefix(c("Phylum", "Genus")), c("p__", "g__"))
    })

test_that("var in sample_data", {
    st <- sample_data(
        data.frame(group = paste("group", 1:3))
    )
    expect_error(
        check_var_in_meta("abc", st),
        "variable of `sample_meta`"
    )
    expect_silent(check_var_in_meta("group", st))
})

test_that("get_norm_method works well", {
    expect_identical(
        get_norm_method(100),
        "per-sample normalized (sum of all taxa) to 100"
    )
    expect_identical(get_norm_method("a"), "a")
})

test_that("check_samples, at least one non zero features in a sample", {
    # third sample is all-zero and should be flagged
    test_ot <- otu_table(
        cbind(matrix(1:12,
            6, 2), 0),
        taxa_are_rows = TRUE
    )
    test_sa <- sample_data(
        data.frame(sample = c("sa1", "sa2", "sa3"))
    )
    test_ps <- phyloseq(test_ot, test_sa)
    test_ot2 <- otu_table(
        cbind(matrix(1:12, 6, 2)),
        taxa_are_rows = TRUE
    )
    expect_identical(check_samples(test_ps), "sa3")
    expect_null(check_samples(test_ot2))
})

test_that("remove samples with missing values in the specified var", {
    otu <- otu_table(
        data.frame(
            s1 = runif(10),
            s2 = runif(10),
            s3 = runif(10)
        ),
        taxa_are_rows = TRUE
    )
    tax <- tax_table(data.frame(feature = paste0("sp", 1:10)) %>%
        as.matrix())
    # s1 has NA group and must be dropped
    sam <- data.frame(group = c(NA, "A", "B"))
    rownames(sam) <- paste0("s", 1:3)
    test_ps <- phyloseq(otu, tax, sample_data(sam))
    new_samples <- sample_names(remove_na_samples(test_ps, "group"))
    expect_identical(new_samples, c("s2", "s3"))
})

# reset group levels according to the contrast
test_that("reset group levels", {
    groups <- factor(rep(c("a", "b", "c"), each = 3))
    new_groups <- set_lvl(groups, c("b", "a"))
    expect_identical(levels(new_groups), c("b", "a", "c"))
    # reordering levels must not change the underlying values
    expect_identical(as.character(groups), as.character(new_groups))
    new_groups <- set_lvl(groups, NULL)
    expect_identical(levels(new_groups), c("a", "b", "c"))
})

# check contrast
test_that("check contrast", {
    expect_null(check_contrast(contrast = NULL))
    expect_error(check_contrast(c("a", "b", "c")), "two length character")
    expect_error(check_contrast(c(1, 2)), "two length character")
    expect_identical(check_contrast(c("a", "b")), c("a", "b"))
})

# create design
test_that("create design", {
    groups <- factor(rep(c("a", "b", "c"), each = 3))
    meta <- data.frame(group = groups, conf = paste0("conf", 1:3), day = 1:3)
    des <- create_design(groups, meta)
    expect_identical(colnames(des), c("(Intercept)", "groupb", "groupc"))
    # confounder order must not change the design dimensions
    des <- create_design(groups, meta, confounders = c("conf", "day"))
    expect_identical(ncol(des), 6L)
    des <- create_design(groups, meta, confounders = c("day", "conf"))
    expect_identical(ncol(des), 6L)
    # support sample_data object for meta
    meta <-
sample_data(meta) des <- create_design(groups, meta, confounders = c("day", "conf")) expect_identical(ncol(des), 6L) }) # calculate argument of ceof test_that("calculate coef", { ## multiple groups groups <- factor(rep(c("a", "b", "c"), each = 3)) meta <- data.frame(group = groups, conf = paste0("conf", 1:3), day = 1:3) des <- create_design(groups, meta, confounders = c("conf", "day")) expect_identical(calc_coef(groups, des, contrast = NULL), c(5L, 6L)) groups <- set_lvl(groups, contrast = c("b", "a")) des <- create_design(groups, meta, confounders = c("conf", "day")) expect_identical(calc_coef(groups, des, contrast = c("b", "a")), 5L) ## two groups groups <- factor(rep(c("a", "b"), each = 3)) meta <- data.frame(group = groups, conf = paste0("conf", 1:2), day = 1:3) des <- create_design(groups, meta) expect_identical(calc_coef(groups, des), 2L) groups <-set_lvl(groups, c("b", "a")) des <- create_design(groups, meta) expect_warning(calc_coef(groups, des, c("b", "a")), "`contrast` is ignored") des <- create_design(groups, meta, confounders = "conf") expect_warning(calc_coef(groups, des, c("b", "a")), "`contrast` is ignored") }) # create contrast # test_that("create contrast", { # expect_error(create_contrast("a"), "at least two groups") # # # multiple groups, pairwise comparisons # groups <- factor(rep(c("a", "b", "c"), each = 3)) # model_dat <- data.frame(group = groups, # confounder1 = c("big", "small", "medium"), # confounder2 = c(1, 2, 3)) # # ## no confounders # design1 <- stats::model.matrix(~ 0 + group, data = model_dat) # mat <- matrix(c(-1, 1, 0, -1, 0, 1, 0, -1, 1), 3) # row.names(mat) <- c("a", "b", "c") # colnames(mat) <- c("b-a", "c-a", "c-b") # # expect_identical(create_pairwise_contrast(levels(groups)), mat) # expect_identical(create_contrast(groups, design1), mat) # # expect_error(create_contrast(groups, design1, c("a", "d")), # "contained in `groups`") # expect_error(create_contrast(groups, design1, "a"), "two length") # expect_identical( # 
create_contrast(groups, design1, c("a", "b")), # matrix(c(-1, 1, 0), dimnames = list(c("a", "b", "c"), "b-a")) # ) # expect_identical( # create_contrast(groups, design1, c("b", "c")), # matrix(c(0, -1, 1), dimnames = list(c("a", "b", "c"), "c-b")) # ) # # ## confounders # design2 <- stats::model.matrix(~ confounder2 + confounder1 + 0+ group, # data = model_dat) # create_contrast(groups, design2) # # # create contrast # groups_two <- factor(rep(c("a", "b"), each = 3)) # expect_identical(create_contrast(groups_two), c(-1, 1)) # expect_identical(create_contrast(groups), mat) # expect_warning( # ctra <- create_contrast(groups_two, c("a", "b")), # "`contrast` is ignored" # ) # expect_identical(ctra, c(-1, 1)) # expect_identical(ctra, create_contrast(groups_two)) # }) # # # # return marker # test_that("marker_table, if no significant marker return all the features", { # sig_ft1 <- data.frame() # ft <- data.frame(feature = letters[1:3], ef = runif(3)) # expect_warning( # marker_null <- return_marker(sig_ft1, ft), # "No marker was identified") # expect_identical(NULL, marker_null) # # sig_ft2 <- data.frame(feature = "a", ef = 1) # expect_identical(marker_table(sig_ft2), return_marker(sig_ft2, ft)) # }) # extract the specific taxa rank test_that("extract the specific taxa rank", { otu <- otu_table( data.frame( s1 = runif(10), s2 = runif(10), s3 = runif(10) ), taxa_are_rows = TRUE ) tax <- tax_table( data.frame( rank1 = paste0("rank1", 1:10), rank2 = paste0("rank2", 1:10) ) %>% as.matrix() ) test_ps <- phyloseq(otu, tax) # taxa names keep inconsistent with the taxa_rank expect_identical( taxa_names(extract_rank(test_ps, "rank2")), paste0("rank2", 1:10) ) expect_identical( taxa_names(extract_rank(test_ps, "none")), paste0("sp", 1:10) ) }) test_that("create a chracter consistes of n spaces", { expect_identical(space(0), "") expect_identical(space(3), " ") }) ================================================ FILE: tests/testthat/test_cladogram.R 
================================================ test_that("Generate unique id for short annotation label", { uid <- get_unique_id(500) expect_equal(uid[26], "z") expect_equal(uid[30], "ad") expect_equal(uid[260], "iz") expect_equal(uid[500], "sf") }) test_that("drop the levels (no marker) if the enrich_group is a factor", { marker <- readRDS("data/generate_cladogram_annotation.rds") group_n <- length(unique(marker$enrich_group)) expect_error( generate_cladogram_annotation(marker, color = colors()[1:group_n]), NA ) }) test_that("generate tree data from phyloseq", { data(pediatric_ibd) td <- get_treedata_phyloseq(pediatric_ibd) # all node classes must be in r, k,p, c,o, f, g, s expect_true( all(levels(td@data$node_class) %in% c("r", "k", "p", "c", "o", "f", "g", "s")) ) }) ================================================ FILE: tests/testthat/test_fix_duplicate_tax.R ================================================ test_that("fix duplicate tax", { ps <- readRDS("data/data_tax_duplicate.rds") ps_fixed <- fix_duplicate_tax(ps) expect_fixed_length_qual <- function(ps, ps_fixed, level) { tax <- tax_table(ps)@.Data l <- length(unique(apply(tax[, 1:level], 1, paste, collapse = "_"))) tax_fixed <- tax_table(ps_fixed)@.Data l_fixed <- length(unique(tax_fixed[, level])) expect_equal(l, l_fixed) } expect_fixed_length_qual(ps, ps_fixed, 7) expect_fixed_length_qual(ps, ps_fixed, 6) expect_fixed_length_qual(ps, ps_fixed, 5) expect_fixed_length_qual(ps, ps_fixed, 4) expect_fixed_length_qual(ps, ps_fixed, 3) expect_fixed_length_qual(ps, ps_fixed, 2) }) ================================================ FILE: tests/testthat.R ================================================ library(testthat) library(microbiomeMarker) test_check("microbiomeMarker") ================================================ FILE: vignettes/.gitignore ================================================ *.html *.R ================================================ FILE: vignettes/microbiomeMarker-vignette.Rmd 
================================================

---
title: "Tools for microbiome marker identification"
author:
  - name: Yang Cao
    affiliation: Department of Environmental Medicine, Tianjin Institute of
      Environmental and Operational Medicine
    email: caoyang.name@gmail.com
output:
  BiocStyle::html_document:
    toc: true
bibliography: vignette.bib
vignette: >
  %\VignetteIndexEntry{Tools for microbiome marker identification}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    message = FALSE,
    warning = FALSE,
    fig.align = "center",
    crop = NULL
)
library(BiocStyle)
```

# Introduction

It is well established that the microbiome plays a key role in human health
and disease, due to its functions such as host nutrient production (e.g.
short-chain fatty acids, SCFA), defense against pathogens, and development of
immunity [@gilbert2018current]. The microbiome provides novel biomarkers for
many diseases, and characterizing biomarkers based on microbiome profiles has
great potential for translational medicine and precision medicine
[@manor2020health].

Differential analysis (DA) is a widely used approach to identify biomarkers.
To date, a number of methods have been developed for microbiome marker
discovery based on metagenomic profiles, e.g. simple statistical analysis
methods STAMP [@parks2014stamp], RNA-seq based methods such as edgeR
[@robinson2010edger] and DESeq2 [@love2014moderated], metagenomeSeq
[@paulson2013differential], and Linear Discriminant Analysis Effect Size
(LEfSe) [@segata2011metagenomic]. However, each of these methods has its own
advantages and disadvantages, and none of them is considered standard or
universal. Moreover, the programs/software for different DA methods may be
developed using different programming languages, even in different operating
systems.
Here, we have developed an all-in-one R/Bioconductor package
[`microbiomeMarker`](https://yiluheihei.github.io/microbiomeMarker) that
integrates commonly used differential analysis methods as well as three
machine learning-based approaches (Logistic regression, Random forest, and
Support vector machine) to facilitate the identification of microbiome
markers.

# Installation

Install the package from Bioconductor directly:

```{r install-bioc,eval=FALSE}
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}
BiocManager::install("microbiomeMarker")
```

Or install the development version of the package from
[Github](https://github.com/yiluheihei/microbiomeMarker).

```{r install-gh,eval=FALSE}
if (!requireNamespace("remotes", quietly = TRUE)) {
    install.packages("remotes")
}
remotes::install_github("yiluheihei/microbiomeMarker")
```

# Package loading

Load the `microbiomeMarker` into the R session:

```{r load}
library(microbiomeMarker)
```

# Data structure

## Input phyloseq-class object

`r Biocpkg("phyloseq")` is the most popular
[Bioconductor](https://bioconductor.org/) package used by the microbiome
research community, and `phyloseq-class` objects are a great data-standard for
microbiome data in R. Therefore, the core functions in `microbiomeMarker`
take a `phyloseq-class` object as input. Conveniently, `microbiomeMarker`
provides features to import external metagenomic abundance profiles from two
popular microbiome analysis pipelines, [qiime2](http://qiime.org/)
[@bolyen2019reproducible] and [dada2](https://benjjneb.github.io/dada2)
[@callahan2016dada2], and return a `phyloseq-class` object.

### Import from dada2

The output of the [dada2](https://benjjneb.github.io/dada2) pipeline is a
feature table of amplicon sequence variants (an ASV table): a matrix with rows
corresponding to samples and columns to ASVs, in which the value of each entry
is the number of times that ASV was observed in that sample.
This table is analogous to the traditional OTU table. Conveniently, taxa names are saved as ```{r import-dada2} seq_tab <- readRDS( system.file( "extdata", "dada2_seqtab.rds", package = "microbiomeMarker" ) ) tax_tab <- readRDS( system.file( "extdata", "dada2_taxtab.rds", package = "microbiomeMarker" ) ) sam_tab <- read.table( system.file( "extdata", "dada2_samdata.txt", package = "microbiomeMarker" ), sep = "\t", header = TRUE, row.names = 1 ) ps <- import_dada2(seq_tab = seq_tab, tax_tab = tax_tab, sam_tab = sam_tab) ps ``` ### Import from qiime2 [qiime2](http://qiime.org/) is the most widely used software for metagenomic analysis. User can import the feature table, taxonomic table, phylogenetic tree, representative sequence and sample metadata from qiime2 using `import_qiime2()`. ```{r import-qiime2,message=FALSE} otuqza_file <- system.file( "extdata", "table.qza", package = "microbiomeMarker" ) taxaqza_file <- system.file( "extdata", "taxonomy.qza", package = "microbiomeMarker" ) sample_file <- system.file( "extdata", "sample-metadata.tsv", package = "microbiomeMarker" ) treeqza_file <- system.file( "extdata", "tree.qza", package = "microbiomeMarker" ) ps <- import_qiime2( otu_qza = otuqza_file, taxa_qza = taxaqza_file, sam_tab = sample_file, tree_qza = treeqza_file ) ps ``` ### Other import functions reexport from phyloseq Moreover, `microbiomeMarker` reexports three import functions from `r Biocpkg("phyloseq")`, including `import_biom()`, `import_qiime()` and `import_mothur()`, to help users to import abundance data from [biom file](http://biom-format.org/), [qiime1](http://www.qiime.org/), and [mothur](http://www.mothur.org/). More details on these three import functions can be see from [here](https://joey711.github.io/phyloseq/import-data.html#the_import_family_of_functions). Users can also import the external files into `phyloseq-class` object manually. 
For more details on how to create `phyloseq-class` object from manually imported data, please see [this tutorial](http://joey711.github.io/phyloseq/import-data.html#manual). ## Output microbiomeMarker-class object The object class used by the `microbiomeMarker` package to store the result of microbiome marker analysis (also referred to as DA) is the `microbiomeMarker-class` object. The `microbiomeMarker-class` extends the `phyloseq-class` by adding three custom slots: - `marker_table`: also a new S4 class to store the markers, which inherits from `data.frame`. Rows represent the microbiome markers and variables represent features of the markers, such as feature names, effect size and p value. - `norm_method`: normalization method. - `diff_method`: DA method. Once users have a `microbiomeMarker-class` object, many accessor functions are available to query aspects of the data set. The function name and its purpose can be seen [here](https://yiluheihei.github.io/microbiomeMarker/reference/index.html#section-microbiome-marker). # Differential analysis A number of methods have been developed for identifying differentially abundant metagenomic features. `microbiomeMarker` provides the most commonly used DA methods which can be divided into three main categories: a) simple statistical tests; b) RNA-seq based methods; c) metagenomic based methods. All the names of DA functions in `microbiomeMarker` are prefixed with `run_` (the `run_*` family of functions). By default, all the methods will perform DA on all levels of features (`taxa_rank = "all"` in DA functions) like LEfSe [@segata2011metagenomic], therefore, the corrected p value in the result (variable `padj` in the `marker_table` object) may be over-corrected. Users can change the parameter `taxa_rank` to a specific level of interest, and DA will then only be performed at the specified level. For simplicity, DA on a specific level of feature is not contained in this vignette. 
## Normalization It is critical to normalize the metagenomic data to eliminate artifactual bias in the original measurements prior to DA [@weiss2017normalization]. Here in `microbiomeMarker`, we provide seven popular normalization methods, including: - `rarefy`: random subsampling counts to the smallest library size in the data set. - `TSS`: total sum scaling, also referred to as "relative abundance", the abundances are normalized by dividing by the corresponding sample library size. - `TMM`: trimmed mean of m-values. First, a sample is chosen as reference. The scaling factor is then derived using a weighted trimmed mean over the differences of the log-transformed gene-count fold-change between the sample and the reference. - `RLE`: relative log expression, RLE uses a pseudo-reference calculated using the geometric mean of the gene-specific abundances over all samples. The scaling factors are then calculated as the median of the gene counts ratios between the samples and the reference. - `CSS`: cumulative sum scaling, calculates scaling factors as the cumulative sum of gene abundances up to a data-derived threshold. - `CLR`: centered log-ratio normalization. - `CPM`: per-sample normalization of the sum of the values to 1e+06. We can use the `norm_*()` family of functions or a wrapper function `normalize` to normalize the original metagenomic abundance data. ```{r norm} # take tss as example norm_tss(ps) normalize(ps, method = "TSS") ``` --------------- ***Note***: all the DA functions provide a parameter to specify the normalization method. We emphasize that users should specify the normalization method in the DA functions rather than using these normalization functions directly. If you normalize the data first and then perform DA, you should set the `norm_method` manually. We recommend using the default normalization methods for the corresponding DA methods, e.g. 
"CPM" for LEfSe and "CSS" for metagenomeSeq, and the default values of `norm` in the DA functions are set as their default normalization methods. ```{r norm-note,eval=FALSE} data(kostic_crc) mm_test <- normalize(kostic_crc, method = "CPM") %>% run_lefse( wilcoxon_cutoff = 0.01, norm = "none", # must be "none" since the input has been normalized group = "DIAGNOSIS", kw_cutoff = 0.01, multigrp_strat = TRUE, lda_cutoff = 4 ) # equivalent to run_lefse( wilcoxon_cutoff = 0.01, norm = "CPM", group = "DIAGNOSIS", kw_cutoff = 0.01, multigrp_strat = TRUE, lda_cutoff = 4 ) ``` ## Simple statistical tests {#simple-stat} In practice, simple statistical tests such as t-test (for two groups comparison) and Kruskal-Wallis rank sum test (for multiple groups comparison) are frequently used for metagenomic differential analysis. STAMP [@parks2014stamp] is a widely-used graphical software package that provides "best practices" in choosing appropriate statistical methods for metagenomic analysis. Here in `microbiomeMarker`, `t-test`, Welch’s `t-test`, and White’s non-parametric `t-test` are provided for two groups comparison, and ANOVA and Kruskal–Wallis test for multiple groups comparisons. We can use `run_test_two_groups()` to perform simple statistical differential test between two groups. ```{r two-group-test} data(enterotypes_arumugam) tg_welch <- run_test_two_groups( enterotypes_arumugam, group = "Gender", method = "welch.test" ) # three significantly differential genera (marker) tg_welch # details of result of the three markers head(marker_table(tg_welch)) ``` Function `run_test_multiple_groups()` is constructed for statistical differential test for multiple groups. 
```{r multi-group-test} # three groups ps <- phyloseq::subset_samples( enterotypes_arumugam, Enterotype %in% c("Enterotype 3", "Enterotype 2", "Enterotype 1") ) mg_anova <- run_test_multiple_groups( ps, group = "Enterotype", method = "anova" ) # 24 markers mg_anova head(marker_table(mg_anova)) ``` Moreover, a wrapper of `run_test_two_groups()` and `run_test_multiple_groups()` named `run_simple_stat()` is provided for simple statistical differential analysis. ## RNA-seq based DA methods Some models developed specifically for RNA-Seq data have been proposed for metagenomic differential analysis. Three popular methods, including DESeq2 [@love2014moderated] (`run_deseq2()`), edgeR [@robinson2010edger] (`run_edger()`), and Voom [@law2014voom] (`run_limma_voom()`) are provided in `microbiomeMarker`. Here we take the edgeR method as an example. ```{r edger} # contrast must be specified for two groups comparison data(pediatric_ibd) mm_edger <- run_edger( pediatric_ibd, group = "Class", pvalue_cutoff = 0.1, p_adjust = "fdr" ) mm_edger # multiple groups data(cid_ying) cid <- phyloseq::subset_samples( cid_ying, Consistency %in% c("formed stool", "liquid", "semi-formed") ) mm_edger_mg <- run_edger( cid, group = "Consistency", method = "QLFT", pvalue_cutoff = 0.05, p_adjust = "fdr" ) mm_edger_mg ``` ## metagenomic based methods Five methods, LEfSe [@segata2011metagenomic], metagenomeSeq [@paulson2013differential], ALDEx2 [@fernandes2014unifying], ANCOM [@mandal2015analysis], and ANCOMBC [@lin2020analysis], which were developed specifically for microbiome data (containing many more zeros than RNA-seq data), are also provided in our package. All these methods have greater power to detect differentially abundant features than simple statistical tests by incorporating more sensitive tests. Currently, LEfSe is the most popular tool for microbiome biomarker discovery. 
Here we take the LEfSe method for example: ```{r lefse} data(kostic_crc) kostic_crc_small <- phyloseq::subset_taxa( kostic_crc, Phylum %in% c("Firmicutes") ) mm_lefse <- run_lefse( kostic_crc_small, wilcoxon_cutoff = 0.01, group = "DIAGNOSIS", kw_cutoff = 0.01, multigrp_strat = TRUE, lda_cutoff = 4 ) mm_lefse head(marker_table(mm_lefse)) ``` ## Supervised machine learning methods Supervised learning (SL) methods can be used to differentiate samples based on their metagenomic profiles efficiently [@knights2011supervised]. `microbiomeMarker` also provides three SL classification models, random forest, logistic regression, and support vector machine, to identify microbiome biomarkers. In addition, the feature importance score for each marker will be provided too. Here we take random forest for example: ```{r rf} # must specify the importance para for random forest set.seed(2021) # small example phyloseq object for test ps_small <- phyloseq::subset_taxa( enterotypes_arumugam, Phylum %in% c("Firmicutes", "Bacteroidetes") ) mm_lr <- run_sl( ps_small, group = "Gender", nfolds = 2, nrepeats = 1, taxa_rank = "Genus", top_n = 15, norm = "TSS", method = "LR", ) marker_table(mm_lr) ``` **Please note that SL methods can be biased for data with small sample size due to model overfitting. Thus, we advise users to use these SL methods with caution for smaller datasets.** ## Pair-wise comparison of multiple groups All the DE methods in ***microbiomeMarker***, except for simple statistical tests for two groups comparison (`run_test_two_groups()`), can be used for multiple groups comparison, that is to find markers that differ between any of the groups by analyzing all groups at once. Users can perform post-hoc test to identify which pairs of groups may differ from each other using `run_posthoc_test()`. Apparently, the multiple groups comparison will result in a larger number of markers than the individual pair-wise comparisons. 
```{r post-hoc-test} pht <- run_posthoc_test(ps, group = "Enterotype") pht # 24 significantly differential genera markers <- marker_table(mg_anova)$feature markers # take a marker "p__Bacteroidetes|g__Bacteroides" # for example, we will show how "p__Bacteroidetes|g__Bacteroides" differs # between Enterotype 2-Enterotype 1 and Enterotype 3-Enterotype 2. extract_posthoc_res(pht, "p__Bacteroidetes|g__Bacteroides")[[1]] ``` In addition, for the five linear models-based methods, including edgeR, DESeq2, metagenomeSeq, limma-voom, and ANCOMBC, users can perform pair-wise comparisons by setting the argument `contrast`, a character vector of length two in which the first element is the reference level (denominator of the logFC) and the second element is used as baseline (numerator for fold change). For more details on the `contrast` argument, please see the help page of the corresponding functions. Here we take the limma-voom method as example: ```{r pair-wise-linear} # comparison between Enterotype 3 and Enterotype 2 mm_lv_pair <- run_limma_voom( ps, "Enterotype", contrast = c("Enterotype 3", "Enterotype 2"), pvalue_cutoff = 0.05, p_adjust = "fdr" ) mm_lv_pair head(marker_table(mm_lv_pair)) ``` # Visualization In `microbiomeMarker`, users can visualize the microbiome biomarkers in different ways, such as box plot, bar plot, dot plot, heatmap, and cladogram. Except for heatmap, all these plots are generated using the most flexible and popular data visualization package `r CRANpkg("ggplot2")`. Therefore, these plots can be easily customized before they are generated using the built-in functions of `r CRANpkg("ggplot2")`, e.g. using `theme()` to modify the titles and labels. Heatmap is generated using the fantastic Bioconductor package `r Biocpkg("ComplexHeatmap")`. ## Abundance box plot First of all, users can visualize the abundances of markers using box plots with function `plot_abundance()`. 
We emphasize a concern that the `group` parameter for `plot_abundance()` must be kept the same as the `group` parameter in the differential analysis function. By default, `plot_abundance()` will plot all the markers, users can plot specific markers using the parameter `markers`. ```{r plot-abundance} p_abd <- plot_abundance(mm_lefse, group = "DIAGNOSIS") p_abd # customize the plot with ggplot2, modify the fill color manually library(ggplot2) p_abd + scale_fill_manual(values = c("Healthy" = "grey", "Tumor" = "red")) ``` ## Heat map Moreover, users can also visualize the abundances of markers using a heatmap, in which rows represent the markers and columns represent the samples. Like the above abundance box plot, users should pay attention to the parameter `group`, and control which markers to display by setting the parameter `markers`. ```{r heatmap} plot_heatmap(mm_edger, transform = "log10p", group = "Class") ``` ## Bar plot or dot plot for effect size We also estimate the effect size to measure the magnitude of the observed phenomenon due to each characterizing marker. `plot_ef_bar()` and `plot_ef_dot()` are used to show the bar and dot plot of the effect sizes of markers. ```{r ef-plot} # bar plot plot_ef_bar(mm_lefse) # dot plot plot_ef_dot(mm_lefse) ``` Different effect size measures can be calculated for different DA methods, e.g. `lda` (linear discriminant analysis) for LEfSe, `imp` (importance) for SL methods. `plot_ef_bar()` and `plot_ef_dot()` can set the axis label of effect size correctly without manual intervention. ```{r ef-plot-diff} # set the x axis to log2 Fold Change automatically without manual intervention plot_ef_bar(mm_edger) ``` ## Cladogram As mentioned above, the microbiome marker analysis will run on all levels of features by default. Users can plot a LEfSe cladogram using function `plot_cladogram()`. 
```{r cladogram,fig.width=7,fig.height=7} plot_cladogram(mm_lefse, color = c(Healthy = "darkgreen", Tumor = "red")) + theme(plot.margin = margin(0, 0, 0, 0)) ``` ## AUC-ROC curve from SL methods ROC (receiver operating characteristic) curve can be used to show the prediction performance of the identified marker. And AUC (area under the ROC curve) measures the ability of the identified marker to classify the samples. `plot_sl_roc()` is provided to show the ROC curve and AUC value to evaluate marker prediction performance. ```{r auc-roc} set.seed(2021) plot_sl_roc(mm_lr, group = "Gender") ``` ## Visualization for post-hoc test As shown in \@ref(simple-stat), post-hoc test can be used to identify which pairs of groups may differ from each other. `plot_postHocTest()` is provided to allow users to visualize the post-hoc test result. ```{r plot-pht} p_pht <- plot_postHocTest(pht, feature = "p__Bacteroidetes|g__Bacteroides") p_pht ``` The post-hoc plots are wrapped using `r CRANpkg("patchwork")`, and users can modify the themes of all subplots using `&`. ```{r customize-p-pht} p_pht & theme_bw() ``` # Citation Kindly cite as follows: Yang Cao (2020). microbiomeMarker: microbiome biomarker analysis. R package version 0.0.1.9000. https://github.com/yiluheihei/microbiomeMarker. DOI: [10.5281/zenodo.3749415](https://doi.org/10.5281/zenodo.3749415). # Question If you have any questions, please file an issue on the issue tracker following the instructions in the issue template: Please briefly describe your problem, what output actually happened, and what output you expect. Please provide a minimal reproducible example. For more details on how to make a great minimal reproducible example, see [how to make a great r reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) and https://www.tidyverse.org/help/#reprex. 
# Session information {-} This vignette was created under the following conditions: ```{r} sessionInfo() ``` # References {-} ================================================ FILE: vignettes/vignette.bib ================================================ @article{gilbert2018current, title={Current understanding of the human microbiome}, author={Gilbert, Jack A and Blaser, Martin J and Caporaso, J Gregory and Jansson, Janet K and Lynch, Susan V and Knight, Rob}, journal={Nature medicine}, volume={24}, number={4}, pages={392--400}, year={2018}, publisher={Nature Publishing Group} } @article{manor2020health, title={Health and disease markers correlate with gut microbiome composition across thousands of people}, author={Manor, Ohad and Dai, Chengzhen L and Kornilov, Sergey A and Smith, Brett and Price, Nathan D and Lovejoy, Jennifer C and Gibbons, Sean M and Magis, Andrew T}, journal={Nature communications}, volume={11}, number={1}, pages={1--12}, year={2020}, publisher={Nature Publishing Group} } @article{parks2014stamp, title={STAMP: statistical analysis of taxonomic and functional profiles}, author={Parks, Donovan H and Tyson, Gene W and Hugenholtz, Philip and Beiko, Robert G}, journal={Bioinformatics}, volume={30}, number={21}, pages={3123--3124}, year={2014}, publisher={Oxford University Press} } @article{robinson2010edger, title={edgeR: a Bioconductor package for differential expression analysis of digital gene expression data}, author={Robinson, Mark D and McCarthy, Davis J and Smyth, Gordon K}, journal={Bioinformatics}, volume={26}, number={1}, pages={139--140}, year={2010}, publisher={Oxford University Press} } @article{love2014moderated, title={Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2}, author={Love, Michael I and Huber, Wolfgang and Anders, Simon}, journal={Genome biology}, volume={15}, number={12}, pages={1--21}, year={2014}, publisher={BioMed Central} } @article{paulson2013differential, title={Differential abundance 
analysis for microbial marker-gene surveys}, author={Paulson, Joseph N and Stine, O Colin and Bravo, H{\'e}ctor Corrada and Pop, Mihai}, journal={Nature methods}, volume={10}, number={12}, pages={1200--1202}, year={2013}, publisher={Nature Publishing Group} } @article{segata2011metagenomic, title={Metagenomic biomarker discovery and explanation}, author={Segata, Nicola and Izard, Jacques and Waldron, Levi and Gevers, Dirk and Miropolsky, Larisa and Garrett, Wendy S and Huttenhower, Curtis}, journal={Genome biology}, volume={12}, number={6}, pages={1--18}, year={2011}, publisher={Springer} } @article{bolyen2019reproducible, title={Reproducible, interactive, scalable and extensible microbiome data science using QIIME 2}, author={Bolyen, Evan and Rideout, Jai Ram and Dillon, Matthew R and Bokulich, Nicholas A and Abnet, Christian C and Al-Ghalith, Gabriel A and Alexander, Harriet and Alm, Eric J and Arumugam, Manimozhiyan and Asnicar, Francesco and others}, journal={Nature biotechnology}, volume={37}, number={8}, pages={852--857}, year={2019}, publisher={Nature Publishing Group} } @article{callahan2016dada2, title={DADA2: high-resolution sample inference from Illumina amplicon data}, author={Callahan, Benjamin J and McMurdie, Paul J and Rosen, Michael J and Han, Andrew W and Johnson, Amy Jo A and Holmes, Susan P}, journal={Nature methods}, volume={13}, number={7}, pages={581--583}, year={2016}, publisher={Nature Publishing Group} } @article{weiss2017normalization, title={Normalization and microbial differential abundance strategies depend upon data characteristics}, author={Weiss, Sophie and Xu, Zhenjiang Zech and Peddada, Shyamal and Amir, Amnon and Bittinger, Kyle and Gonzalez, Antonio and Lozupone, Catherine and Zaneveld, Jesse R and V{\'a}zquez-Baeza, Yoshiki and Birmingham, Amanda and others}, journal={Microbiome}, volume={5}, number={1}, pages={1--18}, year={2017}, publisher={Springer} } @article{law2014voom, title={voom: Precision weights unlock linear model 
analysis tools for RNA-seq read counts}, author={Law, Charity W and Chen, Yunshun and Shi, Wei and Smyth, Gordon K}, journal={Genome biology}, volume={15}, number={2}, pages={1--17}, year={2014}, publisher={BioMed Central} } @article{fernandes2014unifying, title={Unifying the analysis of high-throughput sequencing datasets: characterizing RNA-seq, 16S rRNA gene sequencing and selective growth experiments by compositional data analysis}, author={Fernandes, Andrew D and Reid, Jennifer Ns and Macklaim, Jean M and McMurrough, Thomas A and Edgell, David R and Gloor, Gregory B}, journal={Microbiome}, volume={2}, number={1}, pages={1--13}, year={2014}, publisher={Springer} } @article{mandal2015analysis, title={Analysis of composition of microbiomes: a novel method for studying microbial composition}, author={Mandal, Siddhartha and Van Treuren, Will and White, Richard A and Eggesb{\o}, Merete and Knight, Rob and Peddada, Shyamal D}, journal={Microbial ecology in health and disease}, volume={26}, number={1}, pages={27663}, year={2015}, publisher={Taylor \& Francis} } @article{lin2020analysis, title={Analysis of compositions of microbiomes with bias correction}, author={Lin, Huang and Peddada, Shyamal Das}, journal={Nature communications}, volume={11}, number={1}, pages={1--11}, year={2020}, publisher={Nature Publishing Group} } @article{knights2011supervised, title={Supervised classification of microbiota mitigates mislabeling errors}, author={Knights, Dan and Kuczynski, Justin and Koren, Omry and Ley, Ruth E and Field, Dawn and Knight, Rob and DeSantis, Todd Z and Kelley, Scott T}, journal={The ISME journal}, volume={5}, number={4}, pages={570--573}, year={2011}, publisher={Nature Publishing Group} }