[
  {
    "path": ".gitignore",
    "content": "ls\n*~\nR/*~\n.DS_Store\n.Rbuildignore\n.Rhistory\n.Rproj.user\n.git\nsrc/*.o\nsrc/*.so\nscde.Rproj\n*_bak\n.*_bak"
  },
  {
    "path": ".travis.yml",
    "content": "# Use R\nlanguage: r\nsudo: false\ncache: packages\nwarnings_are_errors: false\n\nenv:\n  global:\n    - BIOC_USE_DEVEL=\"FALSE\"  ## Use the current release version\n    - _R_CHECK_FORCE_SUGGESTS_=\"FALSE\"  ## Some suggested packages not\n                                        ## available for r-oldrel\n\nr:\n - oldrel\n - release\n - devel\n\naddons:\n  apt:\n    packages:\n      - libnlopt-dev\n\ninstall:\n  - R -e 'install.packages(\"devtools\")'\n  - R -e 'devtools::install_deps(dependencies = TRUE)'\n  - R -e 'devtools::install_version(\"flexmix\", \"2.3-13\")'\n  # catch package installation issues sooner rather than later\n  - R -e 'devtools::install_github(\"hms-dbmi/scde\")'\n\n# do not build vignettes...takes too long and times out on travis\nscript:\n  - R CMD build --no-build-vignettes --no-manual .\n  - R CMD check --no-build-vignettes --no-manual --timings *tar.gz\n\n# we need to install BiocInstaller for testing Bioconductor packages\nbioc_required: true\n\n# only report coverage for the release version\nafter_success:\n  - test $TRAVIS_R_VERSION_STRING = 'release' && Rscript -e 'covr::codecov()'\n  \nnotifications:\n  email:\n    on_success: change\n    on_failure: change\n\n"
  },
  {
    "path": "CHANGELOG.md",
    "content": "## Upcoming\n\n\n## [2.26.2]\n- Version from Bioconductor, Packaged: 2023-January-19\n- Bioconductor version: Release (3.16)"
  },
  {
    "path": "DESCRIPTION",
    "content": "Package: scde\nType: Package\nTitle: Single Cell Differential Expression\nVersion: 2.27.1\nDescription: The scde package implements a set of statistical methods for\n    analyzing single-cell RNA-seq data. scde fits individual error models for\n    single-cell RNA-seq measurements. These models can then be used for assessment\n    of differential expression between groups of cells, as well as other types of\n    analysis. The scde package also contains the pagoda framework which applies\n    pathway and gene set overdispersion analysis to identify and characterize\n    putative cell subpopulations based on transcriptional signatures. The overall\n    approach to the differential expression analysis is detailed in the following\n    publication: \"Bayesian approach to single-cell differential expression\n    analysis\" (Kharchenko PV, Silberstein L, Scadden DT, Nature Methods, doi:\n    10.1038/nmeth.2967). The overall approach to subpopulation identification and\n    characterization is detailed in the following pre-print: \"Characterizing \n    transcriptional heterogeneity through pathway and gene set overdispersion \n    analysis\" (Fan J, Salathia N, Liu R, Kaeser G, Yung Y, Herman J, Kaper F,\n    Fan JB, Zhang K, Chun J, and Kharchenko PV, Nature Methods, doi:10.1038/nmeth.3734).\nAuthor: Peter Kharchenko [aut, cre], Jean Fan [aut], Evan Biederstedt [aut]\nAuthors@R: c(\n    person(\"Peter\", \"Kharchenko\", role = c(\"aut\", \"cre\"),\n             email = \"Peter_Kharchenko@hms.harvard.edu\"),\n    person(\"Jean\", \"Fan\", role = \"aut\",\n               email = \"jeanfan@jhu.edu\",     \n           comment = c(ORCID = \"0000-0002-0212-5451\")),\n    person(\"Evan\", \"Biederstedt\", role = \"aut\",\n           email = \"evan.biederstedt@gmail.com\")\n    )\nMaintainer: Evan Biederstedt <evan.biederstedt@gmail.com>\nURL: http://pklab.med.harvard.edu/scde\nBugReports: https://github.com/hms-dbmi/scde/issues\nLicense: GPL-2\nLazyData: 
true\nDepends: R (>= 3.0.0), flexmix\nImports: Rcpp (>= 0.10.4), RcppArmadillo (>= 0.5.400.2.0), mgcv, Rook,\n        rjson, MASS, Cairo, RColorBrewer, edgeR, quantreg, methods,\n        nnet, RMTstat, extRemes, pcaMethods, BiocParallel, parallel\nSuggests: knitr, cba, fastcluster, WGCNA, GO.db, org.Hs.eg.db, rmarkdown\nbiocViews: ImmunoOncology, RNASeq, StatisticalMethod, DifferentialExpression, Bayesian,\n        Transcription, Software\nLinkingTo: Rcpp, RcppArmadillo\nVignetteBuilder: knitr\nPackaged: 2015-11-02 14:30:04 UTC; reyes\nRoxygenNote: 5.0.0\nNeedsCompilation: yes\n"
  },
  {
    "path": "NAMESPACE",
    "content": "export(bwpca)\nexport(knn.error.models)\nexport(make.pagoda.app)\nexport(pagoda.cluster.cells)\nexport(pagoda.effective.cells)\nexport(pagoda.gene.clusters)\nexport(pagoda.pathway.wPCA)\nexport(pagoda.reduce.loading.redundancy)\nexport(pagoda.reduce.redundancy)\nexport(pagoda.show.pathways)\nexport(pagoda.subtract.aspect)\nexport(pagoda.top.aspects)\nexport(pagoda.varnorm)\nexport(pagoda.view.aspects)\nexport(scde.browse.diffexp)\nexport(scde.error.models)\nexport(scde.expression.difference)\nexport(scde.expression.magnitude)\nexport(scde.expression.prior)\nexport(scde.failure.probability)\nexport(scde.fit.models.to.reference)\nexport(scde.posteriors)\nexport(scde.test.gene.expression.difference)\nexport(show.app)\nexport(winsorize.matrix)\nexport(clean.counts)\nexport(clean.gos)\nuseDynLib(scde)\n\n# custom additions\nimportFrom(Cairo,CairoPNG)\nimportFrom(RColorBrewer,brewer.pal)\nimportFrom(parallel,mclapply,detectCores)\nimportFrom(BiocParallel,bplapply,MulticoreParam)\nimportFrom(edgeR,calcNormFactors)\nimportFrom(MASS,theta.ml,rnegbin,negative.binomial)\nimportFrom(rjson,fromJSON,toJSON)\nimportFrom(Rcpp,evalCpp)\nimportFrom(methods,setRefClass)\nimportFrom(quantreg,rq,predict.rq)\nimportFrom(nnet,multinom,nnet,nnet.default)\nimportFrom(mgcv,gam)\nimportFrom(RMTstat,WishartMaxPar)\nimportFrom(pcaMethods, sDev, pca, scores, loadings)\nimportFrom(extRemes, fevd, qevd, plot.fevd)\nimport(RcppArmadillo)\nimport(flexmix)\nimport(Rook)\n"
  },
  {
    "path": "R/functions.R",
    "content": "##' Single-cell Differential Expression (with Pathway And Gene set Overdispersion Analysis)\n##'\n##' The scde package implements a set of statistical methods for analyzing single-cell RNA-seq data.\n##' scde fits individual error models for single-cell RNA-seq measurements. These models can then be used for\n##' assessment of differential expression between groups of cells, as well as other types of analysis.\n##' The scde package also contains the pagoda framework which applies pathway and gene set overdispersion analysis\n##' to identify and characterize putative cell subpopulations based on transcriptional signatures.\n##' See vignette(\"diffexp\") for a brief tutorial on differential expression analysis.\n##' See vignette(\"pagoda\") for a brief tutorial on pathway and gene set overdispersion analysis to identify and characterize cell subpopulations.\n##' More extensive tutorials are available at \\url{http://pklab.med.harvard.edu/scde/index.html}.\n##'  (test)\n##' @name scde\n##' @docType package\n##' @author Peter Kharchenko \\email{Peter_Kharchenko@@hms.harvard.edu}\n##' @author Jean Fan \\email{jeanfan@@fas.harvard.edu}\nNULL\n\n################################# Sample data\n\n##' Sample data\n##'\n##' A subset of Saiful et al. 2011 dataset containing first 20 ES and 20 MEF cells.\n##'\n##' @name es.mef.small\n##' @docType data\n##' @references \\url{http://www.ncbi.nlm.nih.gov/pubmed/21543516}\n##' @export\nNULL\n\n##' Sample data\n##'\n##' Single cell data from Pollen et al. 2014 dataset.\n##'\n##' @name pollen\n##' @docType data\n##' @references \\url{www.ncbi.nlm.nih.gov/pubmed/25086649}\n##' @export\nNULL\n\n##' Sample error model\n##'\n##' SCDE error model generated from a subset of Saiful et al. 
2011 dataset containing first 20 ES and 20 MEF cells.\n##'\n##' @name o.ifm\n##' @docType data\n##' @references \\url{http://www.ncbi.nlm.nih.gov/pubmed/21543516}\n##' @export\nNULL\n\n##' Sample error model\n##'\n##' SCDE error model generated from the Pollen et al. 2014 dataset.\n##'\n##' @name knn\n##' @docType data\n##' @references \\url{www.ncbi.nlm.nih.gov/pubmed/25086649}\n##' @export\nNULL\n\n# Internal model data\n#\n# Numerically-derived correction for NB->chi squared approximation stored as an local regression model\n#\n# @name scde.edff\n\n\n################################# Generic methods\n\n##' Filter GOs list\n##'\n##' Filter GOs list and append GO names when appropriate\n##'\n##' @param go.env GO or gene set list\n##' @param min.size Minimum size for number of genes in a gene set (default: 5)\n##' @param max.size Maximum size for number of genes in a gene set (default: 5000)\n##' @param annot Whether to append GO annotations for easier interpretation (default: FALSE)\n##'\n##' @return a filtered GO list\n##'\n##' @examples\n##' \\donttest{\n##' # 10 sample GOs\n##' library(org.Hs.eg.db)\n##' go.env <- mget(ls(org.Hs.egGO2ALLEGS)[1:10], org.Hs.egGO2ALLEGS)\n##' # Filter this list and append names for easier interpretation\n##' go.env <- clean.gos(go.env)\n##' }\n##'\n##' @export\nclean.gos <- function(go.env, min.size = 5, max.size = 5000, annot = FALSE) {\n  go.env <- as.list(go.env)\n  size <- unlist(lapply(go.env, length))\n  go.env <- go.env[size > min.size & size < max.size]\n  # If we have GO.db installed, then add the term to each GO code.\n  if (annot && \"GO.db\" %in% installed.packages()[,1]) {\n    desc <- MASS::select(\n      GO.db,\n      keys = names(go.env),\n      columns = c(\"TERM\"),\n      multiVals = 'CharacterList'\n    )\n    stopifnot(all(names(go.env) == desc$GOID))\n    names(go.env) <- paste(names(go.env), desc$TERM)\n  }\n  return(go.env)\n}\n\n\n##' Filter counts matrix\n##'\n##' Filter counts matrix based on gene and 
cell requirements\n##'\n##' @param counts read count matrix. The rows correspond to genes, columns correspond to individual cells\n##' @param min.lib.size Minimum number of genes detected in a cell. Cells with fewer genes will be removed (default: 1.8e3)\n##' @param min.reads Minimum number of reads per gene. Genes with fewer reads will be removed (default: 10)\n##' @param min.detected Minimum number of cells a gene must be seen in. Genes not seen in a sufficient number of cells will be removed (default: 5)\n##'\n##' @return a filtered read count matrix\n##'\n##' @examples\n##' data(pollen)\n##' dim(pollen)\n##' cd <- clean.counts(pollen)\n##' dim(cd)\n##'\n##' @export\nclean.counts <- function(counts, min.lib.size = 1.8e3, min.reads = 10, min.detected = 5) {\n    # filter out low-gene cells\n    counts <- counts[, colSums(counts>0)>min.lib.size]\n    # remove genes that don't have many reads\n    counts <- counts[rowSums(counts)>min.reads, ]\n    # remove genes that are not seen in a sufficient number of cells\n    counts <- counts[rowSums(counts>0)>min.detected, ]\n    return(counts)\n}\n\n################################# SCDE Methods\n\n##' Fit single-cell error/regression models\n##'\n##' Fit error models given a set of single-cell data (counts) and an optional grouping factor (groups). The cells (within each group) are first cross-compared to determine a subset of genes showing consistent expression. The set of genes is then used to fit a mixture model (Poisson-NB mixture, with expression-dependent concomitant).\n##'\n##' Note: the default implementation has been changed to use linear-scale fit with expression-dependent NB size (overdispersion) fit. This represents an interative improvement on the originally published model. Use linear.fit=F to revert back to the original fitting procedure.\n##'\n##' @param counts read count matrix. The rows correspond to genes (should be named), columns correspond to individual cells. 
The matrix should contain integer counts\n##' @param groups an optional factor describing grouping of different cells. If provided, the cross-fits and the expected expression magnitudes will be determined separately within each group. The factor should have the same length as ncol(counts).\n##' @param min.nonfailed minimal number of non-failed observations required for a gene to be used in the final model fitting\n##' @param threshold.segmentation use a fast threshold-based segmentation during cross-fit (default: TRUE)\n##' @param min.count.threshold the number of reads to use to guess which genes may have \"failed\" to be detected in a given measurement during cross-cell comparison (default: 4)\n##' @param zero.count.threshold threshold to guess the initial value (failed/non-failed) during error model fitting procedure (defaults to the min.count.threshold value)\n##' @param zero.lambda the rate of the Poisson (failure) component (default: 0.1)\n##' @param save.crossfit.plots whether png files showing cross-fit segmentations should be written out (default: FALSE)\n##' @param save.model.plots whether pdf files showing model fits should be written out (default = TRUE)\n##' @param n.cores number of cores to use\n##' @param min.size.entries minimum number of genes to use when determining expected expression magnitude during model fitting\n##' @param max.pairs maximum number of cross-fit comparisons that should be performed per group (default: 5000)\n##' @param min.pairs.per.cell minimum number of pairs that each cell should be cross-compared with\n##' @param verbose 1 for increased output\n##' @param linear.fit Boolean of whether to use a linear fit in the regression (default: TRUE).\n##' @param local.theta.fit Boolean of whether to fit the overdispersion parameter theta, ie. 
the negative binomial size parameter, based on local regression (default: set to be equal to the linear.fit parameter)\n##' @param theta.fit.range Range of valid values for the overdispersion parameter theta, ie. the negative binomial size parameter (default: c(1e-2, 1e2))\n##'\n##' @return a model matrix, with rows corresponding to different cells, and columns representing different parameters of the determined models\n##'\n##' @useDynLib scde\n##'\n##' @examples\n##' data(es.mef.small)\n##' cd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\n##' sg <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", colnames(cd)), levels = c(\"ESC\", \"MEF\"))\n##' names(sg) <- colnames(cd)\n##' \\donttest{\n##' o.ifm <- scde.error.models(counts = cd, groups = sg, n.cores = 10, threshold.segmentation = TRUE)\n##' }\n##'\n##' @export\nscde.error.models <- function(counts, groups = NULL, min.nonfailed = 3, threshold.segmentation = TRUE, min.count.threshold = 4, zero.count.threshold = min.count.threshold, zero.lambda = 0.1, save.crossfit.plots = FALSE, save.model.plots = TRUE, n.cores = 12, min.size.entries = 2e3, max.pairs = 5000, min.pairs.per.cell = 10, verbose = 0, linear.fit = TRUE, local.theta.fit = linear.fit, theta.fit.range = c(1e-2, 1e2)) {\n    # default same group\n    if(is.null(groups)) {\n        groups <- as.factor(rep(\"cell\", ncol(counts)))\n    }\n    # check for integer counts\n    if(any(!unlist(lapply(counts,is.integer)))) {\n      stop(\"Some of the supplied counts are not integer values (or stored as non-integer types). Aborting!\\nThe method is designed to work on read counts - do not pass normalized read counts (e.g. FPKM values). 
If matrix contains read counts, but they are stored as numeric values, use counts<-apply(counts,2,function(x) {storage.mode(x) <- 'integer'; x}) to recast.\");\n    }\n\n    # crossfit\n    if(verbose) {\n        cat(\"cross-fitting cells.\\n\")\n    }\n    cfm <- calculate.crossfit.models(counts, groups, n.cores = n.cores, threshold.segmentation = threshold.segmentation, min.count.threshold = min.count.threshold, zero.lambda = zero.lambda, max.pairs = max.pairs, save.plots = save.crossfit.plots, min.pairs.per.cell = min.pairs.per.cell, verbose = verbose)\n    # error model for each cell\n    if(verbose) {\n        cat(\"building individual error models.\\n\")\n    }\n    ifm <- calculate.individual.models(counts, groups, cfm, min.nonfailed = min.nonfailed, zero.count.threshold = zero.count.threshold, n.cores = n.cores, save.plots = save.model.plots, linear.fit = linear.fit, return.compressed.models = TRUE, verbose = verbose, min.size.entries = min.size.entries, local.theta.fit = local.theta.fit, theta.fit.range = theta.fit.range)\n    rm(cfm)\n    gc()\n    return(ifm)\n}\n\n\n##' Estimate prior distribution for gene expression magnitudes\n##'\n##' Use existing count data to determine a prior distribution of genes in the dataset\n##'\n##' @param models models determined by \\code{\\link{scde.error.models}}\n##' @param counts count matrix\n##' @param length.out number of points (resolution) of the expression magnitude grid (default: 400). 
Note: larger numbers will linearly increase memory/CPU demands.\n##' @param show.plot show the estimate posterior\n##' @param pseudo.count pseudo-count value to use (default 1)\n##' @param bw smoothing bandwidth to use in estimating the prior (default: 0.1)\n##' @param max.quantile determine the maximum expression magnitude based on a quantile (default : 0.999)\n##' @param max.value alternatively, specify the exact maximum expression magnitude value\n##'\n##' @return a structure describing expression magnitude grid ($x, on log10 scale) and prior ($y)\n##'\n##' @examples\n##' data(es.mef.small)\n##' cd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\n##' data(o.ifm)  # Load precomputed model. Use ?scde.error.models to see how o.ifm was generated\n##' o.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n##'\n##' @export\nscde.expression.prior <- function(models, counts, length.out = 400, show.plot = FALSE, pseudo.count = 1, bw = 0.1, max.quantile = 1-1e-3, max.value = NULL) {\n    fpkm <- scde.expression.magnitude(models, counts)\n    fail <- scde.failure.probability(models, counts = counts)\n    fpkm <- log10(exp(as.matrix(fpkm))+1)\n    wts <- as.numeric(as.matrix(1-fail[, colnames(fpkm)]))\n    wts <- wts/sum(wts)\n\n    # fit density on a mirror image\n    if(is.null(max.value)) {\n        x <- as.numeric(fpkm)\n        max.value <- as.numeric(quantile(x[x<Inf], p = max.quantile))\n    }\n    md <- density(c(-1*as.numeric(fpkm), as.numeric(fpkm)), bw = bw, weights = c(wts/2, wts/2), n = 2*length.out+1, from = -1*max.value, to = max.value)\n\n    gep <- data.frame(x = md$x[-seq_len(length.out)], y = md$y[-seq_len(length.out)])\n    gep$y[is.na(gep$y)] <- 0\n    gep$y <- gep$y+pseudo.count/nrow(fpkm) # pseudo-count\n    gep$y <- gep$y/sum(gep$y)\n    if(show.plot) {\n        par(mfrow = c(1, 1), mar = c(3.5, 3.5, 3.5, 0.5), mgp = c(2.0, 0.65, 0), cex = 0.9)\n        plot(gep$x, gep$y, 
col = 4, panel.first = abline(h = 0, lty = 2), type = 'l', xlab = \"log10( signal+1 )\", ylab = \"probability density\", main = \"signal prior\")\n    }\n    gep$lp <- log(gep$y)\n\n    # grid weighting (for normalization)\n    gep$grid.weight <- diff(10^c(gep$x[1], gep$x+c(diff(gep$x)/2, 0))-1)\n\n    return(gep)\n    plot(x)\n}\n\n\n##' Test for expression differences between two sets of cells\n##'\n##' Use the individual cell error models to test for differential expression between two groups of cells.\n##'\n##' @param models models determined by \\code{\\link{scde.error.models}}\n##' @param counts read count matrix\n##' @param prior gene expression prior as determined by \\code{\\link{scde.expression.prior}}\n##' @param groups a factor determining the two groups of cells being compared. The factor entries should correspond to the rows of the model matrix. The factor should have two levels. NAs are allowed (cells will be omitted from comparison).\n##' @param batch a factor (corresponding to rows of the model matrix) specifying batch assignment of each cell, to perform batch correction\n##' @param n.randomizations number of bootstrap randomizations to be performed\n##' @param n.cores number of cores to utilize\n##' @param batch.models (optional) separate models for the batch data (if generated using batch-specific group argument). 
Normally the same models are used.\n##' @param return.posteriors whether joint posterior matrices should be returned\n##' @param verbose integer verbose level (1 for verbose)\n##'\n##' @return \\subsection{default}{\n##' a data frame with the following fields:\n##' \\itemize{\n##' \\item{lb, mle, ub} {lower bound, maximum likelihood estimate, and upper bound of the 95% confidence interval for the expression fold change on log2 scale.}\n##' \\item{ce} { conservative estimate of expression-fold change (equals to the min(abs(c(lb, ub))), or 0 if the CI crosses the 0}\n##' \\item{Z} { uncorrected Z-score of expression difference}\n##' \\item{cZ} {expression difference Z-score corrected for multiple hypothesis testing using Holm procedure}\n##' }\n##'  If batch correction has been performed (\\code{batch} has been supplied), analogous data frames are returned in slots \\code{$batch.adjusted} for batch-corrected results, and \\code{$batch.effect} for the differences explained by batch effects alone.\n##' }}\n##' \\subsection{return.posteriors = TRUE}{\n##' A list is returned, with the default results data frame given in the \\code{$results} slot.\n##' \\code{difference.posterior} returns a matrix of estimated expression difference posteriors (rows - genes, columns correspond to different magnitudes of fold-change - log2 values are given in the column names)\n##' \\code{joint.posteriors} a list of two joint posterior matrices (rows - genes, columns correspond to the expression levels, given by prior$x grid)\n##' }\n##'\n##' @examples\n##' data(es.mef.small)\n##' cd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\n##' sg <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", colnames(cd)), levels = c(\"ESC\", \"MEF\"))\n##' names(sg) <- colnames(cd)\n##' \\donttest{\n##' o.ifm <- scde.error.models(counts = cd, groups = sg, n.cores = 10, threshold.segmentation = TRUE)\n##' o.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, 
show.plot = FALSE)\n##' # make sure groups corresponds to the models (o.ifm)\n##' groups <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", rownames(o.ifm)), levels = c(\"ESC\", \"MEF\"))\n##' names(groups) <- row.names(o.ifm)\n##' ediff <- scde.expression.difference(o.ifm, cd, o.prior, groups = groups, n.randomizations = 100, n.cores = n.cores, verbose = 1)\n##' }\n##'\n##' @export\nscde.expression.difference <- function(models, counts, prior, groups = NULL, batch = NULL, n.randomizations = 150, n.cores = 10, batch.models = models, return.posteriors = FALSE, verbose = 0) {\n    if(!all(rownames(models) %in% colnames(counts))) {\n        stop(\"ERROR: provided count data does not cover all of the cells specified in the model matrix\")\n    }\n\n    ci <- match(rownames(models), colnames(counts))\n    counts <- as.matrix(counts[, ci])\n\n    if(is.null(groups)) { # recover groups from models\n        groups <- as.factor(attr(models, \"groups\"))\n        if(is.null(groups)) stop(\"ERROR: groups factor is not provided, and models structure is lacking groups attribute\")\n        names(groups) <- rownames(models)\n    }\n    if(length(levels(groups)) != 2) {\n        stop(paste(\"ERROR: wrong number of levels in the grouping factor (\", paste(levels(groups), collapse = \" \"), \"), but must be two.\", sep = \"\"))\n    }\n\n    correct.batch <- FALSE\n    if(!is.null(batch)) {\n        if(length(levels(batch)) > 1) {\n            correct.batch <- TRUE\n        } else {\n            if(verbose) {\n                cat(\"WARNING: only one batch level detected. Nothing to correct for.\")\n            }\n        }\n    }\n\n    # batch control\n    if(correct.batch) {\n        batch <- as.factor(batch)\n        # check batch-group interactions\n        bgti <- table(groups, batch)\n        bgti.ft <- fisher.test(bgti)\n        if(verbose) {\n            cat(\"controlling for batch effects. 
interaction:\\n\")\n            print(bgti)\n        }\n        #if(any(bgti == 0)) {\n        #  cat(\"ERROR: cannot control for batch effect, as some batches are found only in one group:\\n\")\n        #  print(bgti)\n        #}\n        if(bgti.ft$p.value < 1e-3) {\n            cat(\"WARNING: strong interaction between groups and batches! Correction may be ineffective:\\n\")\n            print(bgti.ft)\n        }\n\n        # calculate batch posterior\n        if(verbose) {\n            cat(\"calculating batch posteriors\\n\")\n        }\n        batch.jpl <- tapply(seq_len(nrow(models)), groups, function(ii) {\n            scde.posteriors(models = batch.models, counts = counts, prior = prior, batch = batch, composition = table(batch[ii]), n.cores = n.cores, n.randomizations = n.randomizations, return.individual.posteriors = FALSE)\n        })\n        if(verbose) {\n            cat(\"calculating batch differences\\n\")\n        }\n        batch.bdiffp <- calculate.ratio.posterior(batch.jpl[[1]], batch.jpl[[2]], prior, n.cores = n.cores)\n        batch.bdiffp.rep <- quick.distribution.summary(batch.bdiffp)\n    } else {\n        if(verbose) {\n            cat(\"comparing groups:\\n\")\n            print(table(as.character(groups)))\n        }\n    }\n\n\n    # fit joint posteriors for each group\n    jpl <- tapply(seq_len(nrow(models)), groups, function(ii) {\n        scde.posteriors(models = models[ii, , drop = FALSE], counts = counts[, ii, drop = FALSE], prior = prior, n.cores = n.cores, n.randomizations = n.randomizations)\n    })\n    if(verbose) {\n        cat(\"calculating difference posterior\\n\")\n    }\n    # calculate difference posterior\n    bdiffp <- calculate.ratio.posterior(jpl[[1]], jpl[[2]], prior, n.cores = n.cores)\n\n    if(verbose) {\n        cat(\"summarizing differences\\n\")\n    }\n    bdiffp.rep <- quick.distribution.summary(bdiffp)\n\n    if(correct.batch) {\n        if(verbose) {\n            cat(\"adjusting for batch effects\\n\")\n 
       }\n        # adjust for batch effects\n        a.bdiffp <- calculate.ratio.posterior(bdiffp, batch.bdiffp, prior = data.frame(x = as.numeric(colnames(bdiffp)), y = rep(1/ncol(bdiffp), ncol(bdiffp))), skip.prior.adjustment = TRUE, n.cores = n.cores)\n        a.bdiffp.rep <- quick.distribution.summary(a.bdiffp)\n\n        # return with batch correction info\n        if(return.posteriors) {\n            return(list(batch.adjusted = a.bdiffp.rep, results = bdiffp.rep, batch.effect = batch.bdiffp.rep, difference.posterior = bdiffp, batch.adjusted.difference.posterior = a.bdiffp, joint.posteriors = jpl))\n        } else {\n            return(list(batch.adjusted = a.bdiffp.rep, results = bdiffp.rep, batch.effect = batch.bdiffp.rep))\n        }\n    } else {\n        # no batch correction return\n        if(return.posteriors) {\n            return(list(results = bdiffp.rep, difference.posterior = bdiffp, joint.posteriors = jpl))\n        } else {\n            return(bdiffp.rep)\n        }\n    }\n}\n\n\n##' View differential expression results in a browser\n##'\n##' Launches a browser app that shows the differential expression results, allowing to sort, filter, etc.\n##' The arguments generally correspond to the \\code{scde.expression.difference()} call, except that the results of that call are also passed here. Requires \\code{Rook} and \\code{rjson} packages to be installed.\n##'\n##' @param results result object returned by \\code{scde.expression.difference()}. Note to browse group posterior levels, use \\code{return.posteriors = TRUE} in the \\code{scde.expression.difference()} call.\n##' @param models model matrix\n##' @param counts count matrix\n##' @param prior prior\n##' @param groups group information\n##' @param batch batch information\n##' @param geneLookupURL The URL that will be used to construct links to view more information on gene names. 
By default (if can't guess the organism) the links will forward to ENSEMBL site search, using \\code{geneLookupURL = \"http://useast.ensembl.org/Multi/Search/Results?q = {0}\"}. The \"{0}\" in the end will be substituted with the gene name. For instance, to link to GeneCards, use \\code{\"http://www.genecards.org/cgi-bin/carddisp.pl?gene = {0}\"}.\n##' @param server optional previously returned instance of the server, if want to reuse it.\n##' @param name app name (needs to be altered only if adding more than one app to the server using \\code{server} parameter)\n##' @param port Interactive browser port\n##'\n##' @return server instance, on which $stop() function can be called to kill the process.\n##'\n##' @examples\n##' data(es.mef.small)\n##' cd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\n##' sg <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", colnames(cd)), levels = c(\"ESC\", \"MEF\"))\n##' names(sg) <- colnames(cd)\n##' \\donttest{\n##' o.ifm <- scde.error.models(counts = cd, groups = sg, n.cores = 10, threshold.segmentation = TRUE)\n##' o.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n##' # make sure groups corresponds to the models (o.ifm)\n##' groups <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", rownames(o.ifm)), levels = c(\"ESC\", \"MEF\"))\n##' names(groups) <- row.names(o.ifm)\n##' ediff <- scde.expression.difference(o.ifm, cd, o.prior, groups = groups, n.randomizations = 100, n.cores = 10, verbose = 1)\n##' scde.browse.diffexp(ediff, o.ifm, cd, o.prior, groups = groups, geneLookupURL=\"http://www.informatics.jax.org/searchtool/Search.do?query={0}\")  # creates browser\n##' }\n##'\n##' @export\nscde.browse.diffexp <- function(results, models, counts, prior, groups = NULL, batch = NULL, geneLookupURL = NULL, server = NULL, name = \"scde\", port = NULL) {\n    #require(Rook)\n    #require(rjson)\n    if(is.null(server)) { server <- get.scde.server(port) }\n    sa <- 
ViewDiff$new(results, models, counts, prior, groups = groups, batch = batch, geneLookupURL = geneLookupURL)\n    server$add(app = sa, name = name)\n    browseURL(paste(server$full_url(name), \"index.html\", sep = \"/\"))\n    return(server)\n}\n\n\n##' View PAGODA application\n##'\n##' Installs a given pagoda app (or any other rook app) into a server, optionally\n##' making a call to show it in the browser.\n##'\n##' @param app pagoda app (output of make.pagoda.app()) or another rook app\n##' @param name URL path name for this app\n##' @param browse whether a call should be made for browser to show the app\n##' @param port optional port on which the server should be initiated\n##' @param ip IP on which the server should listen (typically localhost)\n##' @param server an (optional) Rook server instance (defaults to ___scde.server)\n##'\n##' @examples\n##' \\donttest{\n##' app <- make.pagoda.app(tamr2, tam, varinfo, go.env, pwpca, clpca, col.cols=col.cols, cell.clustering=hc, title=\"NPCs\")\n##' # show app in the browser (port 1468)\n##' show.app(app, \"pollen\", browse = TRUE, port=1468)\n##' }\n##'\n##' @return Rook server instance\n##'\n##' @export\nshow.app <- function(app, name, browse = TRUE, port = NULL, ip = '127.0.0.1', server = NULL) {\n    # replace special characters\n    name <- gsub(\"[^[:alnum:]]\", \"_\", name)\n    \n    if (tools:::httpdPort() !=0 && tools:::httpdPort() != port) {\n        cat(\"ERROR: port is already being used. The PAGODA app is currently incompatible with RStudio. 
Please try running the interactive app in the R console.\")\n    }\n    if(is.null(server)) { server <- get.scde.server(port) }\n    server$add(app = app, name = name)\n    if(browse) {\n        browseURL(paste(server$full_url(name), \"index.html\", sep = \"/\"))\n    }\n    return(server)\n}\n# get SCDE server from saved session\nget.scde.server <- function(port = NULL, ip = '127.0.0.1') {\n    if(exists(\"___scde.server\", envir = globalenv())) {\n        server <- get(\"___scde.server\", envir = globalenv())\n    } else {\n        require(Rook)\n        server <- Rhttpd$new()\n        assign(\"___scde.server\", server, envir = globalenv())\n        server$start(listen = ip, port = port)\n    }\n    return(server)\n}\n\n\n# calculate individual and joint posterior information\n# models - all or a subset of models belonging to a particular group\n#\n##' Calculate joint expression magnitude posteriors across a set of cells\n##'\n##' Calculates expression magnitude posteriors for the individual cells, and then uses bootstrap resampling to calculate a joint expression posterior for all the specified cells. Alternatively during batch-effect correction procedure, the joint posterior can be calculated for a random composition of cells of different groups (see \\code{batch} and \\code{composition} parameters).\n##'\n##' @param models models models determined by \\code{\\link{scde.error.models}}\n##' @param counts read count matrix\n##' @param prior gene expression prior as determined by \\code{\\link{scde.expression.prior}}\n##' @param n.randomizations number of bootstrap iterations to perform\n##' @param batch a factor describing which batch group each cell (i.e. 
each row of \\code{models} matrix) belongs to\n##' @param composition a vector describing the batch composition of a group to be sampled\n##' @param return.individual.posteriors whether expression posteriors of each cell should be returned\n##' @param return.individual.posterior.modes whether modes of expression posteriors of each cell should be returned\n##' @param ensemble.posterior Boolean of whether to calculate the ensemble posterior (sum of individual posteriors) instead of a joint (product) posterior. (default: FALSE)\n##' @param n.cores number of cores to utilize\n##'\n##' @return \\subsection{default}{ a posterior probability matrix, with rows corresponding to genes, and columns to expression levels (as defined by \\code{prior$x})\n##' }\n##' \\subsection{return.individual.posterior.modes}{ a list is returned, with the \\code{$jp} slot giving the joint posterior matrix, as described above. The \\code{$modes} slot gives a matrix of individual expression posterior mode values on log scale (rows - genes, columns -cells)}\n##' \\subsection{return.individual.posteriors}{ a list is returned, with the \\code{$post} slot giving a list of individual posterior matrices, in a form analogous to the joint posterior matrix, but reported on log scale }\n##'\n##' @examples\n##' data(es.mef.small)\n##' cd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\n##' data(o.ifm)  # Load precomputed model. 
Use ?scde.error.models to see how o.ifm was generated\n##' o.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n##' # calculate joint posteriors\n##' jp <- scde.posteriors(o.ifm, cd, o.prior, n.cores = 1)\n##'\n##' @export\nscde.posteriors <- function(models, counts, prior, n.randomizations = 100, batch = NULL, composition = NULL, return.individual.posteriors = FALSE, return.individual.posterior.modes = FALSE, ensemble.posterior = FALSE, n.cores = 20) {\n    if(!all(rownames(models) %in% colnames(counts))) { stop(\"ERROR: provided count data does not cover all of the cells specified in the model matrix\") }\n    if(!is.null(batch)) { # calculating batch-sampled posteriors instead of evenly sampled ones\n        if(is.null(composition)) { stop(\"ERROR: group composition must be provided if the batch argument is passed\") }\n        batchil <- tapply(c(1:nrow(models))-1, batch, I)\n    }\n    # order counts according to the cells\n    ci <- match(rownames(models), colnames(counts))\n    counts <- as.matrix(counts[, ci, drop = FALSE])\n    marginals <- 10^prior$x - 1\n    marginals[marginals<0] <- 0\n    marginals <- log(marginals)\n\n    min.slope <- 1e-10\n    if(any(models$corr.a<min.slope)) {\n        cat(\"WARNING: the following cells have negatively-correlated or 0-slope fits: \", paste(rownames(models)[models$corr.a<min.slope], collapse = \" \"), \". 
Setting slopes to 1e-10.\\n\")\n        models$corr.a[models$corr.a<min.slope] <- min.slope\n    }\n\n    postflag <- 0\n    if(return.individual.posteriors) {\n        postflag <- 2\n        if(return.individual.posterior.modes) {\n            postflag <- 3\n        }\n    } else if(return.individual.posterior.modes) {\n        postflag <- 1\n    }\n\n    ensembleflag <- ifelse(ensemble.posterior, 1, 0)\n\n    localthetaflag <- \"corr.ltheta.b\" %in% colnames(models)\n    squarelogitconc <- \"conc.a2\" %in% colnames(models)\n\n    # prepare matrix models\n    mn <- c(\"conc.b\", \"conc.a\", \"fail.r\", \"corr.b\", \"corr.a\", \"corr.theta\", \"corr.ltheta.b\", \"corr.ltheta.t\", \"corr.ltheta.m\", \"corr.ltheta.s\", \"corr.ltheta.r\", \"conc.a2\")\n    mc <- match(c(mn), colnames(models))\n    mm <- matrix(NA, nrow(models), length(mn))\n    mm[, which(!is.na(mc))] <- as.matrix(models[, mc[!is.na(mc)], drop = FALSE])\n\n    chunk <- function(x, n) split(x, sort(rank(x) %% n.cores))\n    if(n.cores > 1 && nrow(counts) > n.cores) { # split by genes\n        xl <- papply(chunk(seq_len(nrow(counts)), n.cores), function(ii) {\n            ucl <- lapply(seq_len(ncol(counts)), function(i) as.vector(unique(counts[ii, i, drop = FALSE])))\n            uci <- do.call(cbind, lapply(seq_len(ncol(counts)), function(i) match(counts[ii, i, drop = FALSE], ucl[[i]])-1))\n            #x <- logBootPosterior(models, ucl, uci, marginals, n.randomizations, 1, postflag)\n            if(!is.null(batch)) {\n                x <- .Call(\"logBootBatchPosterior\", mm, ucl, uci, marginals, batchil, composition, n.randomizations, ii[1], postflag, localthetaflag, squarelogitconc, PACKAGE = \"scde\")\n            } else {\n                x <- .Call(\"logBootPosterior\", mm, ucl, uci, marginals, n.randomizations, ii[1], postflag, localthetaflag, squarelogitconc, ensembleflag, PACKAGE = \"scde\")\n            }\n        }, n.cores = n.cores)\n        if(postflag == 0) {\n            x <- 
do.call(rbind, xl)\n        } else if(postflag == 1) {\n            x <- list(jp = do.call(rbind, lapply(xl, function(d) d$jp)), modes = do.call(rbind, lapply(xl, function(d) d$modes)))\n        } else if(postflag == 2) {\n            x <- list(jp = do.call(rbind, lapply(xl, function(d) d$jp)), post = lapply(seq_along(xl[[1]]$post), function(pi) { do.call(rbind, lapply(xl, function(d) d$post[[pi]])) }))\n        } else if(postflag == 3) {\n            x <- list(jp = do.call(rbind, lapply(xl, function(d) d$jp)), modes = do.call(rbind, lapply(xl, function(d) d$modes)), post = lapply(seq_along(xl[[1]]$post), function(pi) { do.call(rbind, lapply(xl, function(d) d$post[[pi]])) }))\n        }\n        rm(xl)\n        gc()\n    } else {\n        # unique count lists with matching indices\n        ucl <- lapply(seq_len(ncol(counts)), function(i) as.vector(unique(counts[, i, drop = FALSE])))\n        uci <- do.call(cbind, lapply(seq_len(ncol(counts)), function(i) match(counts[, i, drop = FALSE], ucl[[i]])-1))\n        #x <- logBootPosterior(models, ucl, uci, marginals, n.randomizations, 1, postflag)\n        if(!is.null(batch)) {\n            x <- .Call(\"logBootBatchPosterior\", mm, ucl, uci, marginals, batchil, composition, n.randomizations, 1, postflag, localthetaflag, squarelogitconc, PACKAGE = \"scde\")\n        } else {\n            x <- .Call(\"logBootPosterior\", mm, ucl, uci, marginals, n.randomizations, 1, postflag, localthetaflag, squarelogitconc, ensembleflag, PACKAGE = \"scde\")\n        }\n    }\n    if(postflag == 0) {\n        rownames(x) <- rownames(counts)\n        colnames(x) <- as.character(exp(marginals))\n    } else if(postflag == 1) {\n        rownames(x$jp) <- rownames(counts)\n        colnames(x$jp) <- as.character(exp(marginals))\n        rownames(x$modes) <- rownames(counts)\n        colnames(x$modes) <- rownames(models)\n    } else if(postflag == 2) {\n        rownames(x$jp) <- rownames(counts)\n        colnames(x$jp) <- 
as.character(exp(marginals))\n        names(x$post) <- rownames(models)\n        x$post <- lapply(x$post, function(d) {\n            rownames(d) <- rownames(counts)\n            colnames(d) <- as.character(exp(marginals))\n            return(d)\n        })\n    } else if(postflag == 3) {\n        rownames(x$jp) <- rownames(counts)\n        colnames(x$jp) <- as.character(exp(marginals))\n        rownames(x$modes) <- rownames(counts)\n        colnames(x$modes) <- rownames(models)\n        names(x$post) <- rownames(models)\n        x$post <- lapply(x$post, function(d) {\n            rownames(d) <- rownames(counts)\n            colnames(d) <- as.character(exp(marginals))\n            return(d)\n        })\n    }\n    return(x)\n}\n\n\n# get estimates of expression magnitude for a given set of models\n# models - entire model matrix, or a subset of cells (i.e. select rows) of the model matrix for which the estimates should be obtained\n# counts - count data that covers the desired set of genes (rows) and all specified cells (columns)\n# return - a matrix of log(FPM) estimates with genes as rows and cells  as columns (in the model matrix order).\n##' Return scaled expression magnitude estimates\n##'\n##' Return point estimates of expression magnitudes of each gene across a set of cells, based on the regression slopes determined during the model fitting procedure.\n##'\n##' @param models models determined by \\code{\\link{scde.error.models}}\n##' @param counts count matrix\n##'\n##' @return a matrix of expression magnitudes on a log scale (rows - genes, columns - cells)\n##'\n##' @examples\n##' data(es.mef.small)\n##' cd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\n##' data(o.ifm)  # Load precomputed model. 
Use ?scde.error.models to see how o.ifm was generated\n##' # get expression magnitude estimates\n##' lfpm <- scde.expression.magnitude(o.ifm, cd)\n##'\n##' @export\nscde.expression.magnitude <- function(models, counts) {\n    if(!all(rownames(models) %in% colnames(counts))) { stop(\"ERROR: provided count data does not cover all of the cells specified in the model matrix\") }\n    t((t(log(counts[, rownames(models), drop = FALSE]))-models$corr.b)/models$corr.a)\n}\n\n\n# calculate drop-out probability given either count data or magnitudes (log(FPM))\n# magnitudes can either be a per-cell matrix or a single vector of values which will be evaluated for each cell\n# returns a probability of a drop out event for every gene (rows) for every cell (columns)\n##' Calculate drop-out probabilities given a set of counts or expression magnitudes\n##'\n##' Returns estimated drop-out probability for each cell (row of \\code{models} matrix), given either an expression magnitude\n##' @param models models determined by \\code{\\link{scde.error.models}}\n##' @param magnitudes a vector (\\code{length(counts) == nrows(models)}) or a matrix (columns correspond to cells) of expression magnitudes, given on a log scale\n##' @param counts a vector (\\code{length(counts) == nrows(models)}) or a matrix (columns correspond to cells) of read counts from which the expression magnitude should be estimated\n##'\n##' @return a vector or a matrix of drop-out probabilities\n##'\n##' @examples\n##' data(es.mef.small)\n##' cd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\n##' data(o.ifm)  # Load precomputed model. 
Use ?scde.error.models to see how o.ifm was generated\n##' o.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n##' # calculate probability of observing a drop out at a given set of magnitudes in different cells\n##' mags <- c(1.0, 1.5, 2.0)\n##' p <- scde.failure.probability(o.ifm, magnitudes = mags)\n##' # calculate probability of observing the dropout at a magnitude corresponding to the\n##' # number of reads actually observed in each cell\n##' self.p <- scde.failure.probability(o.ifm, counts = cd)\n##'\n##' @export\nscde.failure.probability <- function(models, magnitudes = NULL, counts = NULL) {\n    if(is.null(magnitudes)) {\n        if(!is.null(counts)) {\n            magnitudes <- scde.expression.magnitude(models, counts)\n        } else {\n            stop(\"ERROR: either magnitudes or counts should be provided\")\n        }\n    }\n    if(is.matrix(magnitudes)) { # a different vector for every cell\n        if(!all(rownames(models) %in% colnames(magnitudes))) { stop(\"ERROR: provided magnitude data does not cover all of the cells specified in the model matrix\") }\n        if(\"conc.a2\" %in% names(models)) {\n            x <- t(1/(exp(t(magnitudes)*models$conc.a +t(magnitudes^2)*models$conc.a2 + models$conc.b)+1))\n        } else {\n            x <- t(1/(exp(t(magnitudes)*models$conc.a + models$conc.b)+1))\n        }\n    } else { # a common vector of magnitudes for all cells\n        if(\"conc.a2\" %in% names(models)) {\n            x <- t(1/(exp((models$conc.a %*% t(magnitudes)) + (models$conc.a2 %*% t(magnitudes^2)) + models$conc.b)+1))\n        } else {\n            x <- t(1/(exp((models$conc.a %*% t(magnitudes)) + models$conc.b)+1))\n        }\n    }\n    x[is.nan(x)] <- 0\n    colnames(x) <- rownames(models)\n    x\n}\n\n\n##' Test differential expression and plot posteriors for a particular gene\n##'\n##' The function performs differential expression test and optionally plots posteriors for a specified 
gene.\n##'\n##' @param gene name of the gene to be tested\n##' @param models models\n##' @param counts read count matrix (must contain the row corresponding to the specified gene)\n##' @param prior expression magnitude prior\n##' @param groups a two-level factor specifying between which cells (rows of the models matrix) the comparison should be made\n##' @param batch optional multi-level factor assigning the cells (rows of the model matrix) to different batches that should be controlled for (e.g. two or more biological replicates). The expression difference estimate will then take into account the likely difference between the two groups that is explained solely by their difference in batch composition. Not all batch configuration may be corrected this way.\n##' @param batch.models optional set of models for batch comparison (typically the same as models, but can be more extensive, or recalculated within each batch)\n##' @param n.randomizations number of bootstrap/sampling iterations that should be performed\n##' @param show.plots whether the plots should be shown\n##' @param return.details whether the posterior should be returned\n##' @param verbose set to T for some status output\n##' @param ratio.range optionally specifies the range of the log2 expression ratio plot\n##' @param show.individual.posteriors whether the individual cell expression posteriors should be plotted\n##' @param n.cores number of cores to use (default = 1)\n##'\n##' @return by default returns MLE of log2 expression difference, 95% CI (upper, lower bound), and a Z-score testing for expression difference. If return.details = TRUE, a list is returned containing the above structure, as well as the expression fold difference posterior itself.\n##'\n##' @examples\n##' data(es.mef.small)\n##' cd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\n##' data(o.ifm)  # Load precomputed model. 
Use ?scde.error.models to see how o.ifm was generated\n##' o.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n##' scde.test.gene.expression.difference(\"Tdh\", models = o.ifm, counts = cd, prior = o.prior)\n##'\n##' @export\nscde.test.gene.expression.difference <- function(gene, models, counts, prior, groups = NULL, batch = NULL, batch.models = models, n.randomizations = 1e3, show.plots = TRUE, return.details = FALSE, verbose = FALSE, ratio.range = NULL, show.individual.posteriors = TRUE, n.cores = 1) {\n    if(!gene %in% rownames(counts)) {\n        stop(\"ERROR: specified gene (\", gene, \") is not found in the count data\")\n    }\n\n    ci <- match(rownames(models), colnames(counts))\n    counts <- as.matrix(counts[gene, ci, drop = FALSE])\n\n\n    if(is.null(groups)) { # recover groups from models\n        groups <- as.factor(attr(models, \"groups\"))\n        if(is.null(groups)) stop(\"ERROR: groups factor is not provided, and models structure is lacking groups attribute\")\n        names(groups) <- rownames(models)\n    }\n    if(length(levels(groups)) != 2) {\n        stop(paste(\"ERROR: wrong number of levels in the grouping factor (\", paste(levels(groups), collapse = \" \"), \"), but must be two.\", sep = \"\"))\n    }\n\n    if(verbose) {\n        cat(\"comparing gene \", gene, \" between groups:\\n\")\n        print(table(as.character(groups)))\n    }\n\n    # calculate joint posteriors\n    jpl <- tapply(seq_len(nrow(models)), groups, function(ii) {\n        scde.posteriors(models = models[ii, , drop = FALSE], counts = counts[, ii, drop = FALSE], prior = prior, n.cores = n.cores, n.randomizations = n.randomizations, return.individual.posteriors = TRUE)\n    })\n\n    bdiffp <- calculate.ratio.posterior(jpl[[1]]$jp, jpl[[2]]$jp, prior, n.cores = n.cores)\n\n    bdiffp.rep <- quick.distribution.summary(bdiffp)\n\n    nam1 <- levels(groups)[1]\n    nam2 <- levels(groups)[2]\n\n    # batch control\n    
correct.batch <- !is.null(batch) && length(levels(batch)) > 1\n    if(correct.batch) {\n        batch <- as.factor(batch)\n        # check batch-group interactions\n        bgti <- table(groups, batch)\n        bgti.ft <- fisher.test(bgti)\n        if(verbose) {\n            cat(\"controlling for batch effects. interaction:\\n\")\n        }\n        if(any(bgti == 0)) {\n            cat(\"ERROR: cannot control for batch effect, as some batches are found only in one group:\\n\")\n            print(bgti)\n        }\n        if(bgti.ft$p.value<1e-3) {\n            cat(\"WARNING: strong interaction between groups and batches! Correction may be ineffective:\\n\")\n            print(bgti)\n            print(bgti.ft)\n        }\n        # calculate batch posterior\n        batch.jpl <- tapply(seq_len(nrow(models)), groups, function(ii) {\n            scde.posteriors(models = batch.models, counts = counts, prior = prior, batch = batch, composition = table(batch[ii]), n.cores = n.cores, n.randomizations = n.randomizations, return.individual.posteriors = FALSE)\n        })\n        batch.bdiffp <- calculate.ratio.posterior(batch.jpl[[1]], batch.jpl[[2]], prior, n.cores = n.cores)\n        a.bdiffp <- calculate.ratio.posterior(bdiffp, batch.bdiffp, prior = data.frame(x = as.numeric(colnames(bdiffp)), y = rep(1/ncol(bdiffp), ncol(bdiffp))), skip.prior.adjustment = TRUE)\n        a.bdiffp.rep <- quick.distribution.summary(a.bdiffp)\n    }\n\n\n    if(show.plots) {\n        # show each posterior\n        layout(matrix(c(1:3), 3, 1, byrow = TRUE), heights = c(2, 1, 2), widths = c(1), FALSE)\n        par(mar = c(2.5, 3.5, 2.5, 3.5), mgp = c(1.5, 0.65, 0), cex = 0.9)\n        #par(mar = c(2.5, 3.5, 0.5, 3.5), mgp = c(1.5, 0.65, 0), cex = 0.9)\n\n        pp <- exp(do.call(rbind, lapply(jpl[[1]]$post, as.numeric)))\n        cols <- rainbow(nrow(pp), s = 0.8)\n        plot(c(), c(), xlim = range(prior$x), ylim = range(c(0, pp)), xlab = \"expression level\", ylab = \"individual 
posterior\", main = nam1)\n        if(show.individual.posteriors) {\n            lapply(seq_len(nrow(pp)), function(i) lines(prior$x, pp[i, ], col = rgb(1, 0.5, 0, alpha = 0.25)))\n        }\n        #legend(x = ifelse(which.max(na.omit(pjpc)) > length(pjpc)/2, \"topleft\", \"topright\"), bty = \"n\", col = cols, legend = rownames(pp), lty = rep(1, nrow(pp)))\n        if(correct.batch) {\n            par(new = TRUE)\n            plot(prior$x, batch.jpl[[1]][1, ], axes = FALSE, ylab = \"\", xlab = \"\", type = 'l', col = 8, lty = 1, lwd = 2)\n        }\n        pjpc <- jpl[[1]]$jp\n        par(new = TRUE)\n        jpr <- range(c(0, na.omit(pjpc)))\n        plot(prior$x, pjpc, axes = FALSE, ylab = \"\", xlab = \"\", ylim = jpr, type = 'l', col = 1, lty = 1, lwd = 2)\n        axis(4, pretty(jpr, 5), col = 1)\n        mtext(\"joint posterior\", side = 4, outer = FALSE, line = 2)\n\n\n        # ratio plot\n        if(is.null(ratio.range)) { ratio.range <- range(as.numeric(colnames(bdiffp))/log10(2)) }\n\n        par(mar = c(2.5, 3.5, 0.5, 3.5), mgp = c(1.5, 0.65, 0), cex = 0.9)\n        rv <- as.numeric(colnames(bdiffp))/log10(2)\n        rp <- as.numeric(bdiffp[1, ])\n        plot(rv, rp, xlab = \"log2 expression ratio\", ylab = \"ratio posterior\", type = 'l', lwd = ifelse(correct.batch, 1, 2), main = \"\", axes = FALSE, xlim = ratio.range, ylim = c(0, max(bdiffp)))\n        axis(1, pretty(ratio.range, 5), col = 1)\n        abline(v = 0, lty = 2, col = 8)\n        if(correct.batch) { # with batch correction\n            # show batch difference\n            par(new = TRUE)\n            plot(as.numeric(colnames(batch.bdiffp))/log10(2), as.numeric(batch.bdiffp[1, ]), xlab = \"\", ylab = \"\", type = 'l', lwd = 1, main = \"\", axes = FALSE, xlim = ratio.range, col = 8, ylim = c(0, max(batch.bdiffp)))\n            # fill out the a.bdiffp confidence interval\n            par(new = TRUE)\n            rv <- as.numeric(colnames(a.bdiffp))/log10(2)\n            rp <- 
as.numeric(a.bdiffp[1, ])\n            plot(rv, rp, xlab = \"\", ylab = \"\", type = 'l', lwd = 2, main = \"\", axes = FALSE, xlim = ratio.range, col = 2, ylim = c(0, max(rp)))\n            axis(2, pretty(c(0, max(a.bdiffp)), 2), col = 1)\n            r.lb <- which.min(abs(rv-a.bdiffp.rep$lb))\n            r.ub <- which.min(abs(rv-a.bdiffp.rep$ub))\n            polygon(c(rv[r.lb], rv[r.lb:r.ub], rv[r.ub]), y = c(-10, rp[r.lb:r.ub], -10), col = rgb(1, 0, 0, alpha = 0.2), border = NA)\n            abline(v = a.bdiffp.rep$mle, col = 2, lty = 2)\n            abline(v = c(rv[r.ub], rv[r.lb]), col = 2, lty = 3)\n\n            legend(x = ifelse(a.bdiffp.rep$mle > 0, \"topleft\", \"topright\"), legend = c(paste(\"MLE: \", round(a.bdiffp.rep$mle, 2), sep = \"\"), paste(\"95% CI: \", round(a.bdiffp.rep$lb, 2), \" : \", round(a.bdiffp.rep$ub, 2), sep = \"\"), paste(\"Z = \", round(a.bdiffp.rep$Z, 2), sep = \"\"), paste(\"cZ = \", round(a.bdiffp.rep$cZ, 2), sep = \"\")), bty = \"n\")\n\n        } else {  # without batch correction\n            # fill out the bdiffp confidence interval\n            axis(2, pretty(c(0, max(bdiffp)), 2), col = 1)\n\n            r.lb <- which.min(abs(rv-bdiffp.rep$lb))\n            r.ub <- which.min(abs(rv-bdiffp.rep$ub))\n            polygon(c(rv[r.lb], rv[r.lb:r.ub], rv[r.ub]), y = c(-10, rp[r.lb:r.ub], -10), col = rgb(1, 0, 0, alpha = 0.2), border = NA)\n            abline(v = bdiffp.rep$mle, col = 2, lty = 2)\n            abline(v = c(rv[r.ub], rv[r.lb]), col = 2, lty = 3)\n\n            legend(x = ifelse(bdiffp.rep$mle > 0, \"topleft\", \"topright\"), legend = c(paste(\"MLE: \", round(bdiffp.rep$mle, 2), sep = \"\"), paste(\"95% CI: \", round(bdiffp.rep$lb, 2), \" : \", round(bdiffp.rep$ub, 2), sep = \"\"), paste(\"Z = \", round(bdiffp.rep$Z, 2), sep = \"\"), paste(\"aZ = \", round(bdiffp.rep$cZ, 2), sep = \"\")), bty = \"n\")\n        }\n\n        # distal plot\n        par(mar = c(2.5, 3.5, 2.5, 3.5), mgp = c(1.5, 0.65, 0), cex = 0.9)\n     
   #par(mar = c(2.5, 3.5, 0.5, 3.5), mgp = c(1.5, 0.65, 0), cex = 0.9)\n        dp <- exp(do.call(rbind, lapply(jpl[[2]]$post, as.numeric)))\n        cols <- rainbow(nrow(dp), s = 0.8)\n        plot(c(), c(), xlim = range(prior$x), ylim = range(c(0, dp)), xlab = \"expression level\", ylab = \"individual posterior\", main = nam2)\n        if(show.individual.posteriors) {\n            lapply(seq_len(nrow(dp)), function(i) lines(prior$x, dp[i, ], col = rgb(0, 0.5, 1, alpha = 0.25)))\n        }\n        if(correct.batch) {\n            par(new = TRUE)\n            plot(prior$x, batch.jpl[[2]][1, ], axes = FALSE, ylab = \"\", xlab = \"\", type = 'l', col = 8, lty = 1, lwd = 2)\n        }\n        djpc <- jpl[[2]]$jp\n        #legend(x = ifelse(which.max(na.omit(djpc)) > length(djpc)/2, \"topleft\", \"topright\"), bty = \"n\", col = cols, legend = rownames(dp), lty = rep(1, nrow(dp)))\n        par(new = TRUE)\n        jpr <- range(c(0, na.omit(djpc)))\n        plot(prior$x, djpc, axes = FALSE, ylab = \"\", xlab = \"\", ylim = jpr, type = 'l', col = 1, lty = 1, lwd = 2)\n        axis(4, pretty(jpr, 5), col = 1)\n        mtext(\"joint posterior\", side = 4, outer = FALSE, line = 2)\n    }\n\n    if(return.details) {\n        if(correct.batch) { # with batch correction\n            return(list(results = a.bdiffp.rep, difference.posterior = a.bdiffp, results.nobatchcorrection = bdiffp.rep))\n        } else {\n            return(list(results = bdiffp.rep, difference.posterior = bdiffp, posteriors = jpl))\n        }\n    } else {\n        if(correct.batch) { # with batch correction\n            return(a.bdiffp.rep)\n        } else {\n            return(bdiffp.rep)\n        }\n    }\n}\n\n\n# fit models to external (bulk) reference\n##' Fit scde models relative to provided set of expression magnitudes\n##'\n##' If group-average expression magnitudes are available (e.g. 
from bulk measurement), this method can be used
##' to fit individual cell error models relative to that reference
##'
##' @param counts count matrix
##' @param reference a vector of expression magnitudes (read counts) corresponding to the rows of the count matrix
##' @param min.fpm minimum reference fpm of genes that will be used to fit the models (defaults to 1). Note: fpm is calculated from the reference count vector as reference/sum(reference)*1e6
##' @param n.cores number of cores to use
##' @param zero.count.threshold read count to use as an initial guess for the zero threshold
##' @param nrep number independent of mixture fit iterations to try (default = 1)
##' @param save.plots whether to write out a pdf file showing the model fits
##' @param plot.filename model fit pdf filename
##' @param verbose verbose level
##'
##' @return matrix of scde models
##'
##' @examples
##' data(es.mef.small)
##' cd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)
##' \donttest{
##' sg <- factor(gsub("(MEF|ESC).*", "\\1", colnames(cd)), levels = c("ESC", "MEF"))
##' o.ifm <- scde.error.models(counts = cd, groups = sg, n.cores = 10, threshold.segmentation = TRUE)
##' o.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)
##' # calculate joint posteriors across all cells
##' jp <- scde.posteriors(models = o.ifm, cd, o.prior, n.cores = 10, return.individual.posterior.modes = TRUE, n.randomizations = 100)
##' # use expected expression magnitude for each gene
##' av.mag <- as.numeric(jp$jp %*% as.numeric(colnames(jp$jp)))
##' # translate into counts
##' av.mag.counts <- as.integer(round(av.mag))
##' # now, fit alternative models using av.mag as a reference (normally this would correspond to bulk RNA expression magnitude)
##' ref.models <- scde.fit.models.to.reference(cd, av.mag.counts, n.cores = 1)
##' }
##'
##' @export
scde.fit.models.to.reference <- function(counts, reference, n.cores = 10, zero.count.threshold = 1, nrep = 1, save.plots = FALSE, plot.filename = "reference.model.fits.pdf", verbose = 0, min.fpm = 1) {
    return.compressed.models <- TRUE
    # NOTE: 'verbose' is now honored as supplied; previously it was
    # unconditionally reset to 1, which made the argument a no-op.
    ids <- colnames(counts)
    # fit a two-component (failed/amplified) mixture model for each cell,
    # relating that cell's counts to the reference fpm
    ml <- papply(seq_along(ids), function(i) {
        df <- data.frame(count = counts[, ids[i]], fpm = reference/sum(reference)*1e6)
        df <- df[df$fpm > min.fpm, ]  # restrict fitting to sufficiently expressed genes
        m1 <- fit.nb2.mixture.model(df, nrep = nrep, verbose = verbose, zero.count.threshold = zero.count.threshold)
        if(return.compressed.models) {
            # keep only the compressed parameter vector and cluster assignments;
            # free the full flexmix object to limit memory use across cells
            v <- get.compressed.v1.model(m1)
            cl <- clusters(m1)
            rm(m1)
            gc()
            return(list(model = v, clusters = cl))
        } else {
            return(m1)
        }
    }, n.cores = n.cores)
    names(ml) <- ids

    # check if there were errors in the multithreaded portion
    lapply(seq_along(ml), function(i) {
        if(inherits(ml[[i]], "try-error")) {  # inherits() is robust to multi-class condition objects
            message("ERROR encountered in building a model for cell ", ids[i], ":")
            message(ml[[i]])
            stop(paste("ERROR encountered in building a model for cell ", ids[i]))
        }
    })

    if(save.plots) {
        # write out one row of diagnostic fit plots per cell
        pdf(file = plot.filename, width = 13, height = 4)
        l <- layout(matrix(seq(1, 4), nrow = 1, byrow = TRUE), rep(c(1, 1, 1, 0.5), 1), rep(1, 4), FALSE)
        par(mar = c(3.5, 3.5, 3.5, 0.5), mgp = c(2.0, 0.65, 0), cex = 0.9)
        invisible(lapply(seq_along(ids), function(i) {
            # rebuild the per-cell data frame used during fitting
            df <- data.frame(count = counts[, ids[i]], fpm = reference/sum(reference)*1e6)
            df <- df[df$fpm > min.fpm, ]
            plot.nb2.mixture.fit(ml[[i]], df, en = ids[i], do.par = FALSE, compressed.models = return.compressed.models)
        }))
        dev.off()
    }

    if(return.compressed.models) {
        # assemble a joint model matrix (one row of fitted parameters per cell)
        jmm <- data.frame(do.call(rbind, lapply(ml, function(m) m$model)))
        rownames(jmm) <- names(ml)
        return(jmm)
    } else {
        return(ml)
    }
}


##' Determine principal components of a matrix using per-observation/per-variable weights
##'
##' Implements a weighted PCA
##'
##' @param mat matrix of variables (columns) and observations (rows)
##' @param matw  corresponding weights
##' @param npcs number of principal components to extract
##' @param nstarts number of random starts to use
##' @param smooth smoothing span
##' @param em.tol desired EM algorithm tolerance
##' @param em.maxiter maximum number of EM iterations
##' @param seed random seed
##' @param center whether mat should be centered (weighted centering)
##' @param n.shuffles optional number of per-observation randomizations that should be performed in addition to the main calculations to determine the lambda1 (PC1 eigenvalue) magnitude under such randomizations (returned in $randvar)
##'
##' @return a list containing eigenvector matrix ($rotation), projections ($scores), variance (weighted) explained by each component ($var), total (weighted) variance of the dataset ($totalvar)
##'
##' @examples
##' set.seed(0)
##' mat <- matrix( c(rnorm(5*10,mean=0,sd=1), rnorm(5*10,mean=5,sd=1)), 10, 10)  # random matrix
##' base.pca <- bwpca(mat)  # non-weighted pca, equal weights set automatically
##' matw <- matrix( c(rnorm(5*10,mean=0,sd=1), rnorm(5*10,mean=5,sd=1)), 10, 10)  # random weight matrix
##' matw <- abs(matw)/max(matw)
##' base.pca.weighted <- bwpca(mat, matw)  # weighted pca
##'
##' @export
bwpca <- function(mat, matw = NULL, npcs = 2, nstarts = 1, smooth = 0, em.tol = 1e-6, em.maxiter = 25, seed = 1, center = TRUE, n.shuffles = 0) {
    # smoothing spans below 4 are treated as "no smoothing"
    if(smooth<4) { smooth <- 0 }
    # (is.nan(NULL) is logical(0), so these checks are safe before the NULL default below)
    if(any(is.nan(matw))) {
      stop("bwpca: weight matrix contains NaN values")
    }
    if(any(is.nan(mat))) {
      stop("bwpca: value matrix contains NaN values")
    }
    if(is.null(matw)) {
        # equal weights: multiple random starts are pointless in this case
        matw <- matrix(1, nrow(mat), ncol(mat))
        nstarts <- 1
    }
    # weighted column centering
    if(center) { mat <- t(t(mat)-colSums(mat*matw)/colSums(matw)) }

    # the heavy lifting is done in compiled code
    res <- .Call("baileyWPCA", mat, matw, npcs, nstarts, smooth, em.tol, em.maxiter, seed, n.shuffles, PACKAGE = "scde")
    rownames(res$rotation) <- colnames(mat)
    rownames(res$scores) <- rownames(mat)
    # seq_len() instead of seq(1:n), which misuses seq() on a vector
    colnames(res$rotation) <- paste("PC", seq_len(ncol(res$rotation)), sep = "")
    res$sd <- t(sqrt(res$var))
    res
}


##' Winsorize matrix
##'
##' Sets the ncol(mat)*trim top outliers in each row to the next lowest value same for the lowest outliers
##'
##' @param mat matrix
##' @param trim fraction of outliers (on each side) that should be Winsorized, or (if the value is  >= 1) the number of outliers to be trimmed on each side
##'
##' @return Winsorized matrix
##'
##' @examples
##' set.seed(0)
##' mat <- matrix( c(rnorm(5*10,mean=0,sd=1), rnorm(5*10,mean=5,sd=1)), 10, 10)  # random matrix
##' mat[1,1] <- 1000  # make outlier
##' range(mat)  # look at range of values
##' win.mat <- winsorize.matrix(mat, 0.1)
##' range(win.mat)  # note outliers removed
##'
##' @export
winsorize.matrix <- function(mat, trim) {
    # a trim value >= 1 (well, > 0.5) is interpreted as an absolute count per side
    if(trim  >  0.5) { trim <- trim/ncol(mat)  }
    wm <- .Call("winsorizeMatrix", mat, trim, PACKAGE = "scde")
    # compiled code drops dimnames; restore them
    rownames(wm) <- rownames(mat)
    colnames(wm) <- colnames(mat)
    return(wm)
}


############################ PAGODA functions


##' Build error models for heterogeneous cell populations, based on K-nearest neighbor cells.
##'
##' Builds cell-specific error models assuming that there are multiple subpopulations present
##' among the measured cells.
The models for each cell are based on average expression estimates
##' obtained from K closest cells within a given group (if groups = NULL, then within the entire
##' set of measured cells). The method implements fitting of both the original log-fit models
##' (when linear.fit = FALSE), or newer linear-fit models (linear.fit = TRUE, default) with locally
##' fit overdispersion coefficient (local.theta.fit = TRUE, default).
##'
##' @param counts count matrix (integer matrix, rows- genes, columns- cells)
##' @param groups optional groups partitioning known subpopulations
##' @param cor.method correlation measure to be used in determining k nearest cells
##' @param k number of nearest neighbor cells to use during fitting. If k is set sufficiently high, all of the cells within a given group will be used.
##' @param min.nonfailed minimum number of non-failed measurements (within the k nearest neighbor cells) required for a gene to be taken into account during error fitting procedure
##' @param min.size.entries minimum number of genes to use for model fitting
##' @param min.count.threshold minimum number of reads required for a measurement to be considered non-failed
##' @param save.model.plots whether model plots should be saved (file names are (group).models.pdf, or cell.models.pdf if no group was supplied)
##' @param max.model.plots maximum number of models to save plots for (saves time when there are too many cells)
##' @param n.cores number of cores to use through the calculations
##' @param min.fpm optional parameter to restrict model fitting to genes with group-average expression magnitude above a given value
##' @param verbose level of verbosity
##' @param fpm.estimate.trim trim fraction to be used in estimating group-average gene expression magnitude for model fitting (0.5 would be median, 0 would turn off trimming)
##' @param linear.fit whether newer linear model fit with zero intercept should be used (T), or the log-fit model published originally (F)
##' @param local.theta.fit whether local theta fitting should be used (only available for the linear fit models)
##' @param theta.fit.range allowed range of the theta values
##' @param alpha.weight.power 1/theta weight power used in fitting theta dependency on the expression magnitude
##'
##' @return a data frame with parameters of the fit error models (rows- cells, columns- fitted parameters)
##'
##' @examples
##' data(pollen)
##' cd <- clean.counts(pollen)
##' \donttest{
##' knn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)
##' }
##'
##' @export
knn.error.models <- function(counts, groups = NULL, k = round(ncol(counts)/2), min.nonfailed = 5, min.count.threshold = 1, save.model.plots = TRUE, max.model.plots = 50, n.cores = parallel::detectCores(), min.size.entries = 2e3, min.fpm = 0, cor.method = "pearson", verbose = 0, fpm.estimate.trim = 0.25, linear.fit = TRUE, local.theta.fit = linear.fit, theta.fit.range = c(1e-2, 1e2), alpha.weight.power = 1/2) {
    # prior weight assigned to the failed component for likely drop-out measurements
    threshold.prior <- 1-1e-6

    # check for integer counts
    if(any(!unlist(lapply(counts, is.integer)))) {
      stop("Some of the supplied counts are not integer values (or stored as non-integer types). Aborting!\nThe method is designed to work on read counts - do not pass normalized read counts (e.g. FPKM values). If matrix contains read counts, but they are stored as numeric values, use counts<-apply(counts,2,function(x) {storage.mode(x) <- 'integer'; x}) to recast.");
    }

    # TODO:
    #  - implement check for k >= n.cells (to avoid correlation calculations)
    #  - implement error reporting/handling for failed cell fits

    if(is.null(groups)) {
        # treat all cells as one group
        groups <- as.factor(rep("cell", ncol(counts)))
    }
    names(groups) <- colnames(counts)

    if(k >  ncol(counts)-1) {
        message("the value of k (", k, ") is too large, setting to ", (ncol(counts)-1))
        k <- ncol(counts)-1
    }

    ls <- estimate.library.sizes(counts, NULL, groups, min.size.entries, verbose = verbose, return.details = TRUE, vil = counts >= min.count.threshold)
    ca <- counts
    ca[ca<min.count.threshold] <- NA # a version of counts with all "drop-out" components set to NA
    mll <- tapply(colnames(counts), groups, function(ids) {
        # use pairwise-complete correlation to establish distance relationships between cells
        group <- as.character(groups[ids[1]])

        if(verbose > 0) {
            cat(group, ": calculating cell-cell similarities ...")
        }

        # use WGCNA's multithreaded correlation when the package is available;
        # fall back on single-threaded stats::cor otherwise
        if(requireNamespace("WGCNA", quietly = TRUE)) {
            celld <- WGCNA::cor(sqrt(matrix(as.numeric(as.matrix(ca[, ids])), nrow = nrow(ca), ncol = length(ids))), method = cor.method, use = "p", nThreads = n.cores)
        } else {
            celld <- stats::cor(sqrt(matrix(as.numeric(as.matrix(ca[, ids])), nrow = nrow(ca), ncol = length(ids))), method = cor.method, use = "p")
        }
        rownames(celld) <- colnames(celld) <- ids

        if(verbose > 0) {
            cat(" done\n")
        }

        # TODO: correct for batch effect in cell-cell similarity matrix

        if(verbose)  message(paste("fitting", group, "models:"))

        ml <- papply(seq_along(ids), function(i) { try({
            if(verbose)  message(paste(group, '.', i, " : ", ids[i], sep = ""))
            # determine k closest cells
            oc <- ids[-i][order(celld[ids[i], -i, drop = FALSE], decreasing = TRUE)[1:min(k, length(ids)-1)]]
            # trimmed-mean fpm estimate across the neighbor cells (NA = drop-out)
            fpm <- apply(t(ca[, oc, drop = FALSE])/(ls$ls[oc]), 2, mean, trim = fpm.estimate.trim, na.rm = TRUE)
            # restrict to genes observed in enough of the neighbor cells;
            # note: oc is a character vector, so length(oc) (not ncol(oc), which is NULL)
            # gives the neighbor count
            vi <- which(rowSums(counts[, oc] > min.count.threshold)  >=  min(length(oc)-1, min.nonfailed) & fpm > min.fpm)
            if(length(vi)<40)  message("WARNING: only ", length(vi), " valid genes were found to fit ", ids[i], " model")
            df <- data.frame(count = counts[vi, ids[i]], fpm = fpm[vi])

            # determine failed-component posteriors for each gene: low counts on genes
            # expressed at or above the median fpm of low-count genes are likely drop-outs
            fp <- ifelse(df$count <=  min.count.threshold & df$fpm  >=  median(df$fpm[df$count <=  min.count.threshold]), threshold.prior, 1-threshold.prior)
            cp <- cbind(fp, 1-fp)

            if(linear.fit) {
                # use a linear fit (nb2gth)
                m1 <- fit.nb2gth.mixture.model(df, prior = cp, nrep = 1, verbose = verbose, zero.count.threshold = min.count.threshold, full.theta.range = theta.fit.range, theta.fit.range = theta.fit.range, use.constant.theta.fit = !local.theta.fit, alpha.weight.power = alpha.weight.power)
            }  else {
                # mixture fit (the originally published method)
                m1 <- fit.nb2.mixture.model(df, prior = cp, nrep = 1, verbose = verbose, zero.count.threshold = min.count.threshold)
            }
            # keep only the compressed model and cluster assignments
            v <- get.compressed.v1.model(m1)
            cl <- clusters(m1)
            m1 <- list(model = v, clusters = cl)
            return(m1)
        })}, n.cores = n.cores)
        # drop cells whose fits failed, reporting each failure
        vic <- which(unlist(lapply(seq_along(ml), function(i) {
            if(inherits(ml[[i]], "try-error")) {  # inherits() is robust to multi-class objects
                message("ERROR encountered in building a model for cell ", ids[i], " - skipping the cell. Error:")
                message(ml[[i]])
                return(FALSE)
            }
            return(TRUE)
        })))
        ml <- ml[vic]; names(ml) <- ids[vic];

        if(length(vic)<length(ids)) {
          message("ERROR fitting of ", (length(ids)-length(vic)), " out of ", length(ids), " cells resulted in errors reporting remaining ", length(vic), " cells")
        }
        # model fit plots are gated on the save.model.plots argument
        # (previously this repeated the error condition above, so plots were
        # only produced when some cells had FAILED to fit)
        if(save.model.plots) {
            if(verbose)  message("plotting ", group, " model fits... ")
            tryCatch( {
                pdf(file = paste(group, "model.fits.pdf", sep = "."), width = ifelse(local.theta.fit, 13, 15), height = 4)
                l <- layout(matrix(seq(1, 4), nrow = 1, byrow = TRUE), rep(c(1, 1, 1, ifelse(local.theta.fit, 1, 0.5)), 1), rep(1, 4), FALSE)
                par(mar = c(3.5, 3.5, 3.5, 0.5), mgp = c(2.0, 0.65, 0), cex = 0.9)
                invisible(lapply(vic[1:min(max.model.plots, length(vic))], function(i) {
                    # recompute the neighbor set and gene subset used during fitting
                    oc <- ids[-i][order(celld[ids[i], -i, drop = FALSE], decreasing = TRUE)[1:min(k, length(ids)-1)]]
                    fpm <- apply(t(ca[, oc, drop = FALSE])/(ls$ls[oc]), 2, mean, trim = fpm.estimate.trim, na.rm = TRUE)
                    vi <- which(rowSums(counts[, oc] > min.count.threshold)  >=  min(length(oc)-1, min.nonfailed) & fpm > min.fpm)
                    df <- data.frame(count = counts[vi, ids[i]], fpm = fpm[vi])
                    plot.nb2.mixture.fit(ml[[ids[i]]], df, en = ids[i], do.par = FALSE, compressed.models = TRUE)
                }))
                dev.off()
            }, error = function(e) {
                message("ERROR encountered during model fit plot outputs:")
                message(e)
                dev.off()
            })
        }

        return(ml)
    })


    # make a joint model matrix (one row of fitted parameters per successfully fit cell)
    jmm <- data.frame(do.call(rbind, lapply(mll, function(tl) do.call(rbind, lapply(tl, function(m) m$model)))))
    rownames(jmm) <- unlist(lapply(mll, names))
    # record group membership of each row
    attr(jmm, "groups") <- rep(names(mll), unlist(lapply(mll, length)))
    return(jmm)
}


##' Normalize gene expression
variance relative to transcriptome-wide expectations\n##'\n##' Normalizes gene expression magnitudes to ensure that the variance follows chi-squared statistics\n##' with respect to its ratio to the transcriptome-wide expectation as determined by local regression\n##' on expression magnitude (and optionally gene length). Corrects for batch effects.\n##'\n##' @param models model matrix (select a subset of rows to normalize variance within a subset of cells)\n##' @param counts read count matrix\n##' @param batch measurement batch (optional)\n##' @param trim trim value for Winsorization (optional, can be set to 1-3 to reduce the impact of outliers, can be as large as 5 or 10 for datasets with several thousand cells)\n##' @param prior expression magnitude prior\n##' @param fit.genes a vector of gene names which should be used to establish the variance fit (default is NULL: use all genes). This can be used to specify, for instance, a set spike-in control transcripts such as ERCC.\n##' @param plot whether to plot the results\n##' @param minimize.underdispersion whether underdispersion should be minimized (can increase sensitivity in datasets with high complexity of population, however cannot be effectively used in datasets where multiple batches are present)\n##' @param n.cores number of cores to use\n##' @param n.randomizations number of bootstrap sampling rounds to use in estimating average expression magnitude for each gene within the given set of cells\n##' @param weight.k k value to use in the final weight matrix\n##' @param verbose verbosity level\n##' @param weight.df.power power factor to use in determining effective number of degrees of freedom (can be increased for datasets exhibiting particularly high levels of noise at low expression magnitudes)\n##' @param smooth.df degrees of freedom to be used in calculating smoothed local regression between coefficient of variation and expression magnitude (and gene length, if provided). 
Leave at -1 for automated guess.\n##' @param max.adj.var maximum value allowed for the estimated adjusted variance (capping of adjusted variance is recommended when scoring pathway overdispersion relative to randomly sampled gene sets)\n##' @param theta.range valid theta range (should be the same as was set in knn.error.models() call\n##' @param gene.length optional vector of gene lengths (corresponding to the rows of counts matrix)\n##'\n##' @examples\n##' data(pollen)\n##' cd <- clean.counts(pollen)\n##' \\donttest{\n##' knn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\n##' varinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\n##' }\n##'\n##' @return a list containing the following fields:\n##' \\itemize{\n##' \\item{mat} {adjusted expression magnitude values}\n##' \\item{matw} { weight matrix corresponding to the expression matrix}\n##' \\item{arv} { a vector giving adjusted variance values for each gene}\n##' \\item{avmodes} {a vector estimated average expression magnitudes for each gene}\n##' \\item{modes} {a list of batch-specific average expression magnitudes for each gene}\n##' \\item{prior} {estimated (or supplied) expression magnitude prior}\n##' \\item{edf} { estimated effective degrees of freedom}\n##' \\item{fit.genes} { fit.genes parameter }\n##' }\n##'\n##' @export\npagoda.varnorm <- function(models, counts, batch = NULL, trim = 0, prior = NULL, fit.genes=NULL, plot = TRUE, minimize.underdispersion = FALSE, n.cores = detectCores(), n.randomizations = 100, weight.k = 0.9, verbose = 0, weight.df.power = 1, smooth.df = -1, max.adj.var = 10, theta.range = c(1e-2, 1e2), gene.length = NULL) {\n\n    cd <- counts\n\n    min.edf <- 1\n    weight.k.internal <- 1\n    use.mean.fpm <- FALSE\n    use.expected.value <- TRUE\n    cv.fit <- TRUE\n    edf.damping <- 1\n\n    # load NB extensions\n    data(scde.edff, envir = environment())\n\n    # 
subset cd to the cells occurring in the models\n    if(verbose) { cat(\"checking counts ... \") }\n    if(!all(rownames(models) %in% colnames(cd))) {\n        stop(paste(\"supplied count matrix (cd) is missing data for the following cells:[\", paste(rownames(models)[!rownames(models) %in% colnames(cd)], collapse = \", \"), \"]\", sep = \"\"))\n    }\n    if(!length(rownames(models)) == length(colnames(cd)) || !all(rownames(models) == colnames(cd))) {\n        cd <- cd[, match(rownames(models), colnames(cd))]\n    }\n    if(verbose) { cat(\"done\\n\") }\n\n    # trim counts according to the extreme fpm values\n    if(trim > 0) {\n        if(verbose) { cat(\"Winsorizing count matrix ... \") }\n        fpm <- t((t(log(cd))-models$corr.b)/models$corr.a)\n        #tfpm <- log(winsorize.matrix(exp(fpm), trim = trim))\n        tfpm <- winsorize.matrix(fpm, trim)\n        rn <- rownames(cd)\n        cn <- colnames(cd)\n        cd <- round(exp(t(t(tfpm)*models$corr.a+models$corr.b)))\n        cd[cd<0] <- 0\n        rownames(cd) <- rn\n        colnames(cd) <- cn\n        rm(fpm, tfpm)\n        cd <- cd[rowSums(cd) > 0, ] # omit genes without any data after Winsorization\n        if(verbose) { cat(\"done\\n\") }\n    }\n\n    # check/fix batch vector\n    if(verbose) { cat(\"checking batch ... 
\") }\n    if(!is.null(batch)) {\n        if(!is.factor(batch)) {\n            batch <- as.factor(batch)\n        }\n        if(is.null(names(batch))) {\n            if(length(batch) != nrow(models)) {\n                stop(\"invalid batch vector supplied: length differs from nrow(models)!\")\n            }\n            names(batch) <- rownames(models)\n        } else {\n            if(!all(rownames(models) %in% names(batch))) {\n                stop(paste(\"invalid batch vector supplied: the following cell(s) are not present: [\", paste(rownames(models)[!rownames(models) %in% names(batch)], collapse = \", \"), \"]\", sep = \"\"))\n            }\n            batch <- batch[rownames(models)]\n        }\n\n        bt <- table(batch)\n        min.batch.level <- 2\n        if(any(bt<min.batch.level)) {\n            if(verbose) { cat(\"omitting small batch levels [\", paste(names(bt)[bt<min.batch.level], collapse = \" \"), \"] ... \") }\n            batch[batch %in% names(bt)[bt<min.batch.level]] <- names(bt)[which.max(bt)]\n        }\n    }\n    if(verbose) { cat(\"ok\\n\") }\n\n    # recalculate modes as needed\n    if(verbose) { cat(\"calculating modes ... \") }\n    if(is.null(prior)) {\n        if(verbose) { cat(\"prior \") }\n        prior <- scde.expression.prior(models = models, counts = cd, length.out = 400, show.plot = FALSE)\n    }\n    # dataset-wide mode\n    if(use.mean.fpm) { # use mean fpm across cells\n        avmodes <- modes <- rowMeans(exp(scde.expression.magnitude(models, cd)))\n    } else { # use joint posterior mode/expected value\n        jp <- scde.posteriors(models = models, cd, prior, n.cores = n.cores, return.individual.posterior.modes = TRUE, n.randomizations = n.randomizations)\n        if(use.expected.value) {\n            avmodes <- modes <- (jp$jp %*% as.numeric(colnames(jp$jp)))[, 1]\n        } else { # use mode\n            avmodes <- modes <- (as.numeric(colnames(jp$jp)))[max.col(jp$jp)]\n        }\n    }\n    if(verbose) { cat(\". 
\") }\n\n    # batch-specific modes, if necessary\n    if(!is.null(batch) && length(levels(batch)) > 1) {\n        # calculate mode for each batch\n        if(verbose) { cat(\"batch: [ \") }\n        modes <- tapply(seq_len(nrow(models)), batch, function(ii) {\n            if(verbose) { cat(as.character(batch[ii[1]]), \" \") }\n            if(use.mean.fpm) { # use mean fpm across cells\n                modes <- rowMeans(exp(scde.expression.magnitude(models[ii, ], cd[, ii])))\n            } else { # use joint posterior mode\n                jp <- scde.posteriors(models = models[ii, ], cd[, ii], prior, n.cores = n.cores, return.individual.posterior.modes = TRUE, n.randomizations = n.randomizations)\n                if(use.expected.value) {\n                    modes <- (jp$jp %*% as.numeric(colnames(jp$jp)))[, 1]\n                } else { # use mode\n                    modes <- (as.numeric(colnames(jp$jp)))[max.col(jp$jp)]\n                }\n            }\n        })\n        # set dataset-wide mode\n        #if(use.mean.fpm) { # use mean fpm across cells\n        #  avmodes <- colMeans(do.call(rbind, modes)*as.vector(unlist(tapply(1:length(batch), batch, length))))*length(levels(batch))/length(batch)\n        #jp <- scde.posteriors(models = models, cd, prior, n.cores = n.cores, return.individual.posterior.modes = TRUE, n.randomizations = n.randomizations)\n        if(verbose) { cat(\"] \") }\n    }\n    if(verbose) { cat(\"done\\n\") }\n\n    # check/calculate weights\n    if(verbose) { cat(\"calculating weight matrix ... \") }\n\n    # calculate default weighting scheme\n    if(verbose) { cat(\"calculating ... 
\") }\n\n    # dataset-wide version of matw (disregarding batch)\n    sfp <- do.call(cbind, lapply(seq_len(ncol(cd)), function(i) ppois(cd[, i]-1, exp(models[i, \"fail.r\"]), lower.tail = FALSE)))\n    mfp <- scde.failure.probability(models = models, magnitudes = log(avmodes))\n    ofpT <- do.call(cbind, lapply(seq_len(ncol(cd)), function(i) { # for each cell\n        lfpm <- log(avmodes)\n        mu <- models$corr.b[i] + models$corr.a[i]*lfpm\n        thetas <- get.corr.theta(models[i, ], lfpm, theta.range)\n        pnbinom(1, size = thetas, mu = exp(mu), lower.tail = TRUE)\n    }))\n    matw <- 1-weight.k.internal*mfp*sfp # only mode failure probability\n    # mode failure or NB failure\n    #tmfp <- 1-(1-mfp)*(1-ofpT)\n    #matw <- 1-weight.k.internal*tmfp*sfp\n\n\n    # calculate batch-specific version of the weight matrix if needed\n    if(!is.null(batch) && length(levels(batch)) > 1) { # with batch correction\n        # save the dataset-wide one as avmatw\n        # calculate mode for each batch\n        if(verbose) { cat(\"batch: [ \") }\n        bmatw <- do.call(cbind, tapply(seq_len(nrow(models)), batch, function(ii) {\n            if(verbose) { cat(as.character(batch[ii[1]]), \" \") }\n            # set self-fail probability to p(count|background)\n            # total mode failure (including overdispersion dropouts)\n            #sfp <- do.call(cbind, lapply(ii, function(i) dpois(cd[, i], exp(models[i, \"fail.r\"]), log = FALSE)))\n            sfp <- do.call(cbind, lapply(ii, function(i) ppois(cd[, i]-1, exp(models[i, \"fail.r\"]), lower.tail = FALSE)))\n\n            mfp <- scde.failure.probability(models = models[ii, ], magnitudes = log(modes[[batch[ii[1]]]]))\n            ofpT <- do.call(cbind, lapply(ii, function(i) { # for each cell\n                lfpm <- log(modes[[batch[i]]])\n                mu <- models$corr.b[i] + models$corr.a[i]*lfpm\n                thetas <- get.corr.theta(models[i, ], lfpm, theta.range)\n                pnbinom(1, size = 
thetas, mu = exp(mu), lower.tail = TRUE)\n            }))\n\n            x <- 1-weight.k.internal*mfp*sfp # only mode failure probability\n            # mode failure or NB failure\n            #tmfp <- 1-(1-mfp)*(1-ofpT)\n            #x <- 1-weight.k.internal*tmfp*sfp\n        }))\n        # reorder\n        bmatw <- bmatw[, rownames(models)]\n        if(verbose) { cat(\"] \") }\n    }\n    if(verbose) { cat(\"done\\n\") }\n\n    # calculate effective degrees of freedom\n    # total effective degrees of freedom per gene\n    if(verbose) { cat(\"calculating effective degrees of freedom ..\") }\n    ids <- 1:ncol(cd)\n    names(ids) <- colnames(cd)\n    # dataset-wide version\n    edf.mat <- do.call(cbind, papply(ids, function(i) {\n        v <- models[i, ]\n        lfpm <- log(avmodes)\n        mu <- exp(lfpm*v$corr.a + v$corr.b)\n        # adjust very low mu levels except for those that have 0 counts (to avoid inf values)\n\n        thetas <- get.corr.theta(v, lfpm, theta.range)\n        edf <- exp(predict(scde.edff, data.frame(lt = log(thetas))))\n        edf[thetas > 1e3] <- 1\n        edf\n    }, n.cores = n.cores))\n    if(edf.damping != 1) {\n        edf.mat <- ((edf.mat/ncol(edf.mat))^edf.damping) * ncol(edf.mat)\n    }\n\n    # incorporate weight into edf\n    #edf.mat <- ((matw^weight.df.power)*edf.mat)\n    edf.mat <- (matw*edf.mat)^weight.df.power\n    #edf <- rowSums(matw*edf.mat)+1.5 # summarize eDF per gene\n    edf <- rowSums(edf.mat)+1 # summarize eDF per gene\n    if(verbose) { cat(\".\") }\n\n    # batch-specific version if necessary\n    if(!is.null(batch) && is.list(modes)) { # batch-specific mode\n        bedf.mat <- do.call(cbind, papply(ids, function(i) {\n            v <- models[i, ]\n            lfpm <- log(modes[[batch[i]]])\n            mu <- exp(lfpm*v$corr.a + v$corr.b)\n            # adjust very low mu levels except for those that have 0 counts (to avoid inf values)\n\n            thetas <- get.corr.theta(v, lfpm, theta.range)\n         
   edf <- exp(predict(scde.edff, data.frame(lt = log(thetas))))\n            edf[thetas > 1e3] <- 1\n            return(edf)\n        }, n.cores = n.cores))\n        if(edf.damping != 1) { bedf.mat <-  ((bedf.mat/ncol(bedf.mat))^edf.damping) * ncol(edf.mat) }\n\n        # incorporate weight into edf\n        #bedf.mat <- ((bmatw^weight.df.power)*bedf.mat)\n        bedf.mat <- (bmatw*bedf.mat)^weight.df.power\n        bedf <- rowSums(bedf.mat)+1 # summarize eDF per gene\n        if(verbose) { cat(\".\") }\n    }\n\n    if(verbose) { cat(\" done\\n\") }\n\n    if(verbose) { cat(\"calculating normalized expression values ... \") }\n    # evaluate negative binomial deviations and effective degrees of freedom\n    ids <- 1:ncol(cd)\n    names(ids) <- colnames(cd)\n    mat <- do.call(cbind, papply(ids, function(i) {\n        v <- models[i, ]\n        lfpm <- log(avmodes)\n        mu <- exp(lfpm*v$corr.a + v$corr.b)\n        # adjust very low mu levels except for those that have 0 counts (to avoid inf values)\n        thetas <- get.corr.theta(v, lfpm, theta.range)\n\n        #matw[, i]*edf.mat[, i]*(cd[, i]-mu)^2/(mu+mu^2/thetas)\n        #x <- (cd[, i]-mu)^2/(mu+mu^2/thetas)\n        #edf.mat[, i]*(cd[, i]-mu)^2/(mu+mu^2/thetas)\n        # considering Poisson-nb mixture\n        fail.lambda <- exp(as.numeric(v[\"fail.r\"]))\n        #edf.mat[, i]*(cd[, i]-mu)^2/(matw[, i]*(mu+mu^2/thetas) + (1-matw[, i])*((mu-fail.lambda)^2 + fail.lambda))\n        edf.mat[, i]*(cd[, i]-mu)^2/(mu+mu^2/thetas +  fail.lambda)\n\n        #edf.mat[, i]*(cd[, i]-mu)^2/(matw[, i]*mu+(mu^2)*((1-matw[, i])+matw[, i]/thetas))\n    }, n.cores = n.cores))\n    rownames(mat) <- rownames(cd)\n    if(verbose) { cat(\".\") }\n    # batch-specific version of mat\n    if(!is.null(batch) && is.list(modes)) { # batch-specific mode\n        bmat <- do.call(cbind, papply(ids, function(i) {\n            v <- models[i, ]\n            lfpm <- log(modes[[batch[i]]])\n            mu <- exp(lfpm*v$corr.a + 
v$corr.b)\n            # adjust very low mu levels except for those that have 0 counts (to avoid inf values)\n            thetas <- get.corr.theta(v, lfpm, theta.range)\n\n            #matw[, i]*edf.mat[, i]*(cd[, i]-mu)^2/(mu+mu^2/thetas)\n            #x <- (cd[, i]-mu)^2/(mu+mu^2/thetas)\n            #edf.mat[, i]*(cd[, i]-mu)^2/(mu+mu^2/thetas)\n            #edf.mat[, i]*(cd[, i]-mu)^2/(matw[, i]*mu+(mu^2)*((1-matw[, i])+matw[, i]/thetas))\n            fail.lambda <- exp(as.numeric(v[\"fail.r\"]))\n            #edf.mat[, i]*(cd[, i]-mu)^2/(matw[, i]*(mu+mu^2/thetas) + (1-matw[, i])*((mu-fail.lambda)^2 + fail.lambda))\n            edf.mat[, i]*(cd[, i]-mu)^2/(mu+mu^2/thetas +  fail.lambda)\n        }, n.cores = n.cores))\n        rownames(bmat) <- rownames(cd)\n\n        if(verbose) { cat(\".\") }\n    }\n    if(verbose) { cat(\" done\\n\") }\n\n    # do a model fit on the weighted standard deviation (as a function of the batch-average expression mode)\n    wvar <- rowSums(mat)/rowSums(edf.mat)\n\n    if(!is.null(batch) && is.list(modes)) { # batch-specific mode\n        # estimate the ratio of the batch-specific variance to the total dataset variance\n        bwvar <- rowSums(bmat)/rowSums(bedf.mat)\n        bwvar.ratio <- bwvar/wvar\n        wvar <- bwvar # replace wvar now that we have the ratio of\n        matw <- bmatw # replace matw with the batch-specific one that will be used from here on\n        # ALTERNATIVE: could adjust wvar for the bwvar.ratio here, before fitting expression dependency\n      }\n    fvi <- vi <- rowSums(matw) > 0 & is.finite(wvar) & wvar > 0\n    if(!is.null(fit.genes)) { fvi <- fvi & rownames(mat) %in% fit.genes }\n    if(!any(fvi)) { stop(\"unable to find a set of valid genes to establish the variance fit\") }\n\n    # s = mgcv:::s\n    s = mgcv::s\n    if(cv.fit) {\n        #x <- gam(as.formula(\"cv2 ~ s(lev)\"), data = df[vi, ], weights = rowSums(matw[vi, ]))\n        if(is.null(gene.length)) {\n            df <- data.frame(lev 
= log10(avmodes), cv2 = log10(wvar/avmodes^2))\n            x <- mgcv::gam(cv2 ~ s(lev, k = smooth.df), data = df[fvi, ], weights = rowSums(matw[fvi, ]))\n        } else {\n            df <- data.frame(lev = log10(avmodes), cv2 = log10(wvar/avmodes^2), len = gene.length[rownames(cd)])\n            x <- mgcv::gam(cv2 ~ s(lev, k = smooth.df) + s(len, k = smooth.df), data = df[fvi, ], weights = rowSums(matw[fvi, ]))\n        }\n        #x <- lm(cv2~lev, data = df[vi, ], weights = rowSums(matw[vi, ]))\n\n        zval.m <- 10^(df$cv2[vi]-predict(x, newdata = df[vi, ]))\n\n        if(plot) {\n            par(mfrow = c(1, 2), mar = c(3.5, 3.5, 1.0, 1.0), mgp = c(2, 0.65, 0))\n            #smoothScatter(df$lev[vi], log(wvar[vi]), nbin = 256, xlab = \"expression magnitude (log10)\", ylab = \"wvar (log)\") abline(h = 0, lty = 2, col = 2)\n            #points(df[paste(\"g\", diff.exp.gene.ids, sep = \"\"), \"lev\"], log(wvar[paste(\"g\", diff.exp.gene.ids, sep = \"\")]), col = 2)\n\n            smoothScatter(df$lev[vi], df$cv2[vi], nbin = 256, xlab = \"expression magnitude (log10)\", ylab = \"cv^2 (log10)\")\n            lines(sort(df$lev[vi]), predict(x, newdata = df[vi, ])[order(df$lev[vi])], col = 2, pch = \".\", cex = 1)\n            if(!is.null(fit.genes)) { # show genes used for the fit\n              points(df$lev[fvi],df$cv2[fvi],pch=\".\",col=\"green\",cex=1)\n            }\n\n            #points(df[paste(\"g\", diff.exp.gene.ids, sep = \"\"), \"lev\"], df[paste(\"g\", diff.exp.gene.ids, sep = \"\"), \"cv2\"], col = 2)\n        }\n\n        # optional : re-weight to minimize the underdispersed points\n        if(minimize.underdispersion) {\n            pv <- pchisq(zval.m*(edf[vi]-1), edf[vi], log.p = FALSE, lower.tail = TRUE)\n            pv[edf[vi]<= min.edf] <- 0\n            pv <- p.adjust(pv)\n            #x <- gam(as.formula(\"cv2 ~ s(lev)\"), data = df[vi, ], weights = (pmin(10, -log(pv))+1)*rowSums(matw[vi, ]))\n            x <- mgcv::gam(cv2 ~ s(lev, k = 
smooth.df), data = df[fvi, ], weights = (pmin(10, -log(pv))+1)*rowSums(matw[fvi, ]))\n            zval.m <- 10^(df$cv2[vi]-predict(x,newdata=df[vi,]))\n            if(plot) {\n              lines(sort(df$lev[vi]), predict(x, newdata = df[vi, ])[order(df$lev[vi])], col = 4, pch = \".\", cex = 1)\n            }\n        }\n    } else {\n        df <- data.frame(lev = log10(avmodes), sd = sqrt(wvar))\n        #x <- gam(as.formula(\"sd ~ s(lev)\"), data = df[vi, ], weights = rowSums(matw[vi, ]))\n        x <- mgcv::gam(sd ~ s(lev, k = smooth.df), data = df[fvi, ], weights = rowSums(matw[fvi, ]))\n        zval.m <- (as.numeric((df$sd[vi])/pmax(min.sd, predict(x,newdata=df[vi,]))))^2\n\n        if(plot) {\n            par(mfrow = c(1, 2), mar = c(3.5, 3.5, 1.0, 1.0), mgp = c(2, 0.65, 0))\n            smoothScatter(df$lev[vi], df$sd[vi], nbin = 256, xlab = \"expression magnitude\", ylab = \"weighted sdiv\")\n            lines(sort(df$lev[vi]), predict(x, newdata = df[vi, ])[order(df$lev[vi])], col = 2, pch = \".\", cex = 1)\n            if(!is.null(fit.genes)) { # show genes used for the fit\n              points(df$lev[fvi],df$sd[fvi],pch=\".\",col=\"green\",cex=1)\n            }\n        }\n\n        # optional : re-weight to minimize the underdispersed points\n        if(minimize.underdispersion) {\n            pv <- pchisq(zval.m*(edf[vi]-1), edf[vi], log.p = FALSE, lower.tail = TRUE)\n            pv[edf[vi]<= min.edf] <- 0\n            pv <- p.adjust(pv)\n            #x <- gam(as.formula(\"sd ~ s(lev)\"), data = df[vi, ], weights = (pmin(20, -log(pv))+1)*rowSums(matw[vi, ]))\n            x <- mgcv::gam(sd ~ s(lev, k = smooth.df), data = df[fvi, ], weights = (pmin(20, -log(pv))+1)*rowSums(matw[fvi, ]))\n            zval.m <- (as.numeric((df$sd[vi])/pmax(x$fitted.values, min.sd)))^2\n            if(plot) {\n              lines(sort(df$lev[vi]), predict(x, newdata = df[vi, ])[order(df$lev[vi])], col = 4, pch = \".\", cex = 1)\n            }\n        }\n    }\n\n    # 
adjust for inter-batch variance\n    if(!is.null(batch) && is.list(modes)) { # batch-specific mode\n        #zval.m <- zval.m*pmin(bwvar.ratio[vi], 1) # don't increase zval.m even if batch-specific specific variance is higher than the dataset-wide variance\n        zval.m <- zval.m*pmin(bwvar.ratio[vi], 1/bwvar.ratio[vi]) # penalize for strong deviation in either direction\n    }\n\n    # calculate adjusted variance\n    qv <- pchisq(zval.m*(edf[vi]-1), edf[vi], log.p = TRUE, lower.tail = FALSE)\n    qv[edf[vi]<= min.edf] <- 0\n    qv[abs(qv)<1e-10] <- 0\n    arv <- rep(NA, length(vi))\n    arv[vi] <- qchisq(qv, ncol(matw)-1, lower.tail = FALSE, log.p = TRUE)/ncol(matw)\n    arv <- pmin(max.adj.var, arv)\n    names(arv) <- rownames(cd)\n    if(plot) {\n        smoothScatter(df$lev[vi], arv[vi], xlab = \"expression magnitude (log10)\", ylab = \"adjusted variance (log10)\", nbin = 256)\n        abline(h = 1, lty = 2, col = 8)\n        abline(h = max.adj.var, lty = 3, col = 2)\n        if(!is.null(fit.genes)) {\n          points(df$lev[fvi],arv[fvi],pch=\".\",col=\"green\",cex=1)\n        }\n        #points(df[paste(\"g\", diff.exp.gene.ids, sep = \"\"), \"lev\"], arv[paste(\"g\", diff.exp.gene.ids, sep = \"\")], col = 2)\n        #points(df$lev[vi], arv[vi], col = 2, pch = \".\", cex = 2)\n    }\n\n    # Wilcox score upper bound\n    wsu <- function(k, n, z = qnorm(0.975)) {\n        p <- k/n\n        pmin(1, (2*n*p+z^2+(z*sqrt(z^2-1/n+4*n*p*(1-p)-(4*p-2)) +1))/(2*(n+z^2)))\n    }\n\n    # use milder weight matrix\n    #matw <- 1-0.9*((1-matw)^2) # milder weighting for the the PCA (1-0.9*sp*mf)\n    matw <- 1-weight.k*(1-matw) # milder weighting for the the PCA (1-0.9*sp*mf)\n    matw <- matw/rowSums(matw)\n    mat <- log10(exp(scde.expression.magnitude(models, cd))+1)\n\n    # estimate observed variance (for scaling) before batch adjustments\n    #varm <- sqrt(arv/pmax(weightedMatVar(mat, matw, batch = batch), 1e-5)) varm[varm<1e-5] <- 1e-5 mat <- mat*varm\n    ov 
<- weightedMatVar(mat, matw)\n    vr <- arv/ov\n    vr[ov <=  0] <- 0\n\n    if(!is.null(batch) && is.list(modes)) { # batch-specific mode\n        # adjust proportion of zeros\n        # determine lowest upper bound of non-zero measurement probability among the batch (for each gene)\n        nbub <- apply(do.call(cbind, tapply(seq_len(ncol(mat)), batch, function(ii) {\n            wsu(rowSums(mat[, ii] > 0), length(ii), z = qnorm(1-1e-2))\n        })), 1, min)\n\n        # decrease the batch weights for each gene to match the total\n        # expectation of the non-zero measurements\n        nbo <- do.call(cbind, tapply(seq_len(ncol(mat)), batch, function(ii) {\n            matw[, ii]*pmin(1, ceiling(nbub*length(ii))/rowSums(mat[, ii] > 0))\n        }))\n        nbo <- nbo[, colnames(matw)]\n        matw <- nbo\n\n        ## # center 0 and non-0 observations between batches separately\n        ## amat <- mat amat[amat == 0] <- NA\n        ## amat.av <- rowMeans(amat, na.rm = TRUE) # dataset means\n        ## # adjust each batch by the mean of its non-0 measurements\n        ## amat <- do.call(cbind, tapply(1:ncol(amat), batch, function(ii) {\n        ##   amat[, ii]-rowMeans(amat[, ii], na.rm = TRUE)\n        ## }))\n        ## amat <- amat[, colnames(mat)] # fix the ordering\n        ## # shift up each gene by the dataset mean\n        ## amat <- amat+amat.av\n        ## amat[is.na(amat)] <- 0\n        ## mat <- amat\n\n        amat <- mat\n        nr <- ncol(matw)/rowSums(matw)\n        amat.av <- rowMeans(amat*matw)*nr # dataset means\n        amat <- do.call(cbind, tapply(seq_len(ncol(amat)), batch, function(ii) {\n            amat[, ii]-(rowMeans(amat[, ii]*matw[, ii]*nr, na.rm = TRUE))\n        }))\n        amat <- amat[, colnames(matw)]\n        mat <- amat+amat.av\n\n        # alternative: actually zero-out entries in mat\n        ## nbub <- rowMin(do.call(cbind, tapply(1:ncol(amat), batch, function(ii) {\n        ##   wsu(rowSums(amat[, ii] > 0), 
length(ii), z = qnorm(1-1e-2))\n        ## })))\n        ## set.seed(0)\n\n        ## # decrease the batch weights for each gene to match the total\n        ## # expectation of the non-zero measurements\n        ## matm <- do.call(cbind, tapply(1:ncol(amat), batch, function(ii) {\n        ##   # number of entries to zero-out per gene\n        ##   nze <- rowSums(amat[, ii] > 0) - ceiling(nbub*length(ii))\n        ##   # construct mat multiplier submatrix\n        ##   sa <- rep(1, length(ii))\n        ##   smatm <- do.call(rbind, lapply(1:length(nze), function(ri) {\n        ##     if(nze[ri]<1) { return(sa) }\n        ##     vi <- which(mat[ri, ii] > 0)\n        ##     a <- sa a[vi[sample.int(length(vi), nze[ri])]] <- 0\n        ##     a\n        ##   }))\n        ##   colnames(smatm) <- colnames(mat[, ii])\n        ##   rownames(smatm) <- rownames(mat)\n        ##   smatm\n        ## }))\n        ## matm <- matm[, colnames(mat)]\n        ## mat <- mat*matm\n        ## matw <- matw*matm\n    }\n\n    # center (no batch)\n    mat <- weightedMatCenter(mat, matw)\n    mat <- mat*sqrt(vr)\n\n    if(!is.null(batch) && is.list(modes)) { # batch-specific mode\n        return(list(mat = mat, matw = matw, arv = arv, modes = modes, avmodes = avmodes, prior = prior, edf = edf, batch = batch, trim = trim, bwvar.ratio = bwvar.ratio))\n    } else {\n        return(list(mat = mat, matw = matw, arv = arv, modes = modes, avmodes = avmodes, prior = prior, edf = edf, batch = batch, trim = trim))\n    }\n}\n\n\n##' Control for a particular aspect of expression heterogeneity in a given population\n##'\n##' Similar to subtracting n-th principal component, the current procedure determines\n##' (weighted) projection of the expression matrix onto a specified aspect (some pattern\n##' across cells, for instance sequencing depth, or PC corresponding to an undesired process\n##' such as ribosomal pathway variation) and subtracts it from the data so that it is controlled\n##' for in the 
subsequent weighted PCA analysis.
##'
##' @param varinfo normalized variance info (from pagoda.varnorm())
##' @param aspect a vector giving a cell-to-cell variation pattern that should be controlled for (length should be corresponding to ncol(varinfo$mat))
##' @param center whether the matrix should be re-centered following pattern subtraction
##'
##' @return a modified varinfo object with adjusted expression matrix (varinfo$mat)
##'
##' @examples
##' data(pollen)
##' cd <- clean.counts(pollen)
##' \donttest{
##' knn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)
##' varinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)
##' # create go environment
##' library(org.Hs.eg.db)
##' # translate gene names to ids
##' ids <- unlist(lapply(mget(rownames(cd), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1]))
##' rids <- names(ids); names(rids) <- ids
##' go.env <- lapply(mget(ls(org.Hs.egGO2ALLEGS), org.Hs.egGO2ALLEGS), function(x) as.character(na.omit(rids[x])))
##' # clean GOs
##' go.env <- clean.gos(go.env)
##' # convert to an environment
##' go.env <- list2env(go.env)
##' # subtract the pattern
##' cc.pattern <- pagoda.show.pathways(ls(go.env)[1:2], varinfo, go.env, show.cell.dendrogram = TRUE, showRowLabels = TRUE)  # Look at pattern from 2 GO annotations
##' varinfo.cc <- pagoda.subtract.aspect(varinfo, cc.pattern)
##' }
##'
##' @export
pagoda.subtract.aspect <- function(varinfo, aspect, center = TRUE) {
    # the aspect must provide one value per cell (column of the expression matrix)
    if(length(aspect) != ncol(varinfo$mat)) {
        stop("aspect should be a numeric vector of the same length as the number of cells (i.e. ncol(varinfo$mat))")
    }
    # turn the aspect into a zero-mean, unit-norm direction vector
    u <- aspect - mean(aspect)
    u <- u/sqrt(sum(u^2))
    # per-gene weighted projection coefficient onto the aspect direction
    proj <- ((varinfo$mat * varinfo$matw) %*% u)/(varinfo$matw %*% u^2)
    # remove the projected component from each gene's expression profile
    adjusted <- varinfo$mat - t(u %*% t(proj))
    if(center) {
        # re-center; this commonly re-introduces some background dependency because of the matw
        adjusted <- weightedMatCenter(adjusted, varinfo$matw)
    }
    varinfo$mat <- adjusted
    varinfo
}


##' Run weighted PCA analysis on pre-annotated gene sets
##'
##' For each valid gene set (having appropriate number of genes) in the provided environment (setenv),
##' the method will run weighted PCA analysis, along with analogous analyses of random gene sets of the
##' same size, or shuffled expression magnitudes for the same gene set.
##'
##' @param varinfo adjusted variance info from pagoda.varinfo() (or pagoda.subtract.aspect())
##' @param setenv environment listing gene sets (contains variables with names corresponding to gene set name, and values being vectors of gene names within each gene set)
##' @param n.components number of principal components to determine for each gene set
##' @param n.cores number of cores to use
##' @param min.pathway.size minimum number of observed genes that should be contained in a valid gene set
##' @param max.pathway.size maximum number of observed genes in a valid gene set
##' @param n.randomizations number of random gene sets (of the same size) to be evaluated in parallel with each gene set (can be kept at 5 or 10, but should be increased to 50-100 if the significance of pathway overdispersion will be determined relative to random gene set models)
##' @param n.internal.shuffles number of internal (independent row shuffles) randomizations of expression data that should be evaluated for each gene set (needed only if one is interested in gene set coherence P values, disabled by default; set to 10-30 to estimate)
##' @param n.starts number of random starts for the EM method in each evaluation
##' @param center 
whether the expression matrix should be recentered
##' @param batch.center whether batch-specific centering should be used
##' @param proper.gene.names alternative vector of gene names (replacing rownames(varinfo$mat)) to be used in cases when the provided setenv uses different gene names
##' @param verbose verbosity level
##'
##' @return a list of weighted PCA info for each valid gene set
##'
##' @examples
##' data(pollen)
##' cd <- clean.counts(pollen)
##' \donttest{
##' knn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)
##' varinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)
##' # create go environment
##' library(org.Hs.eg.db)
##' # translate gene names to ids
##' ids <- unlist(lapply(mget(rownames(cd), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1]))
##' rids <- names(ids); names(rids) <- ids
##' go.env <- lapply(mget(ls(org.Hs.egGO2ALLEGS), org.Hs.egGO2ALLEGS), function(x) as.character(na.omit(rids[x])))
##' # clean GOs
##' go.env <- clean.gos(go.env)
##' # convert to an environment
##' go.env <- list2env(go.env)
##' pwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)
##' }
##'
##' @export
pagoda.pathway.wPCA <- function(varinfo, setenv, n.components = 2, n.cores = detectCores(), min.pathway.size = 10, max.pathway.size = 1e3, n.randomizations = 10, n.internal.shuffles = 0, n.starts = 10, center = TRUE, batch.center = TRUE, proper.gene.names = NULL, verbose = 0) {
    mat <- varinfo$mat
    matw <- varinfo$matw
    # fixed internal settings: the gene-set list (gsl) is derived below, and no smoothing is applied
    gsl <- NULL
    return.gsl <- FALSE
    smooth <- 0
    # use the batch assignment from varinfo only when batch-specific centering was requested
    if(batch.center) { batch <- varinfo$batch } else { batch <- NULL }

    if(is.null(proper.gene.names)) { proper.gene.names <- rownames(mat) }


    if(center) {
        mat <- weightedMatCenter(mat, matw, batch = batch)
    }

    # keep only non-constant rows: a row is retained if any adjacent pair of values differs
    vi <- apply(mat, 1, function(x) sum(abs(diff(x))) > 0)
    vi[is.na(vi)] <- FALSE
    mat <- mat[vi, , drop = FALSE] # remove constant rows
    matw <- matw[vi, , drop = FALSE]
    proper.gene.names <- proper.gene.names[vi]

    if(is.null(gsl)) {
        # count observed genes per gene set and keep only sets within the allowed size range
        # NOTE(review): sn() is a project helper — presumably names a vector by its own values; confirm
        gsl <- ls(envir = setenv)
        gsl.ng <- unlist(lapply(sn(gsl), function(go) sum(unique(get(go, envir = setenv)) %in% proper.gene.names)))
        gsl <- gsl[gsl.ng >= min.pathway.size & gsl.ng<= max.pathway.size]
        names(gsl) <- gsl
    }
    if(verbose) {
        message("processing ", length(gsl), " valid pathways")
    }
    if(return.gsl) return(gsl)


    # transpose mat to save a bit of calculations
    mat <- t(mat)
    matw <- t(matw)

    # per-gene-set weighted PCA; the named papply result is the function's return value
    mcm.pc <- papply(gsl, function(x) {
        lab <- proper.gene.names %in% get(x, envir = setenv)
        if(sum(lab)<1) { return(NULL) }

        #smooth <- round(sum(lab)*smooth.fraction)
        #smooth <- max(sum(lab), smooth)

        #xp <- pca(d, nPcs = n.components, center = TRUE, scale = "none")
        #xp <- epca(mat[, lab], ncomp = n.components, center = FALSE, nstarts = n.starts)
        xp <- bwpca(mat[, lab, drop = FALSE], matw[, lab, drop = FALSE], npcs = n.components, center = FALSE, nstarts = n.starts, smooth = smooth, n.shuffles = n.internal.shuffles)

        # get standard deviations for the random samples
        # (random gene sets of the same size, used to standardize this set's PC variance below)
        ngenes <- sum(lab)
        z <- do.call(rbind, lapply(seq_len(n.randomizations), function(i) {
            si <- sample(1:ncol(mat), ngenes)
            #epca(mat[, si], ncomp = 1, center = FALSE, nstarts = n.starts)$sd
            xp <- bwpca(mat[, si, drop = FALSE], matw[, si, drop = FALSE], npcs = 1, center = FALSE, nstarts = n.starts, smooth = smooth)$sd
        }))

        # flip orientations to roughly correspond with the means
        cs <- unlist(lapply(seq_len(ncol(xp$scores)), function(i) sign(cor(xp$scores[, i], colMeans(t(mat[, lab, drop = FALSE])*abs(xp$rotation[, i]))))))

        xp$scores <- t(t(xp$scores)*cs)
        xp$rotation <- t(t(xp$rotation)*cs)

        # local normalization of each component relative to sampled PC1 sd
        # (excess variance over the random-set expectation, clipped at 0)
        avar <- pmax(0, (xp$sd^2-mean(z[, 1]^2))/sd(z[, 1]^2))
        xv <- t(xp$scores)
        xv <- xv/apply(xv, 1, sd)*sqrt(avar)
        return(list(xv = xv, xp = xp, z = z, sd = xp$sd, n = ngenes))
    }, n.cores = n.cores)
}


##' Estimate effective number of cells based on lambda1 of random gene sets
##'
##' Examines the dependency between the amount of variance explained by the first principal component
##' of a gene set and the number of genes in a gene set to determine the effective number of cells
##' for the Tracy-Widom distribution
##'
##' @param pwpca result of the pagoda.pathway.wPCA() call with n.randomizations > 1
##' @param start optional starting value for the optimization (if the NLS breaks, trying high starting values usually fixed the local gradient problem)
##'
##' @return effective number of cells
##'
##' @examples
##' data(pollen)
##' cd <- clean.counts(pollen)
##' \donttest{
##' knn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)
##' varinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)
##' pwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)
##' pagoda.effective.cells(pwpca)
##' }
##'
##' @export
pagoda.effective.cells <- function(pwpca, start = NULL) {
    # pool the random-set PC1 magnitudes (pwpca[[..]]$z) across all gene sets,
    # with the corresponding gene-set sizes (pwpca[[..]]$n) repeated per row of z
    n.genes <- unlist(lapply(pwpca, function(x) rep(x$n, nrow(x$z))))
    # note: 'var' shadows stats::var inside this function
    var <- unlist(lapply(pwpca, function(x) x$z[, 1]))^2
    if(is.null(start)) { start <- nrow(pwpca[[1]]$xp$scores)*10 }

    n.cells <- nrow(pwpca[[1]]$xp$scores)
    # objective: squared error between observed PC1 variances and the predicted mean;
    # NOTE(review): the vfit formula appears to follow the Wishart largest-eigenvalue
    # centering approximation (Tracy-Widom) with p[1] acting as an effective sqrt-dimension —
    # confirm against the RMTstat parameterization
    of <- function(p, v, sp) {
        sn <- p[1]
        vfit <- (sn+sp)^2/(sn*sn+1/2) -1.2065335745820*(sn+sp)*((1/sn + 1/sp)^(1/3))/(sn*sn+1/2)
        residuals <- (v-vfit)^2
        return(sum(residuals))
    }
    # bounded 1-D optimization of the effective sqrt-dimension, then map back to a cell count
    x <- nlminb(objective = of, start = c(start), v = var, sp = sqrt(n.genes-1/2), lower = c(1), upper = c(n.cells))
    return((x$par)^2+1/2)
}


##' Determine de-novo gene clusters and associated overdispersion info
##'
##' Determine de-novo gene clusters, their weighted PCA lambda1 values, and random matrix expectation.
##'
##' @param varinfo varinfo adjusted variance info from pagoda.varinfo() (or pagoda.subtract.aspect())
##' @param trim additional Winsorization trim value to be used in determining clusters (to remove clusters that group outliers occurring in a given cell). Use higher values (5-15) if the resulting clusters group outlier patterns
##' @param n.clusters number of clusters to be determined (recommended range is 100-200)
##' @param cor.method correlation method ("pearson", "spearman") to be used as a distance measure for clustering
##' @param n.samples number of randomly generated matrix samples to test the background distribution of lambda1 on
##' @param n.starts number of wPCA EM algorithm starts at each iteration
##' @param n.internal.shuffles number of internal shuffles to perform (only if interested in set coherence, which is quite high for clusters by definition, disabled by default; set to 10-30 shuffles to estimate)
##' @param n.cores number of cores to use
##' @param verbose verbosity level
##' @param plot whether a plot showing distribution of random lambda1 values should be shown (along with the extreme value distribution fit)
##' @param show.random whether the empirical random gene set values should be shown in addition to the Tracy-Widom analytical approximation
##' @param n.components number of PC to calculate (can be increased if the number of clusters is small and some contain strong secondary patterns - rarely the case)
##' @param method clustering method to be used in determining gene clusters
##' @param secondary.correlation whether clustering should be performed on the correlation of the correlation matrix instead
##' @param 
n.cells number of cells to use for the randomly generated cluster lambda1 model
##' @param old.results optionally, pass old results just to plot the model without recalculating the stats
##'
##' @return a list containing the following fields:
##' \itemize{
##' \item{clusters} {a list of genes in each cluster values}
##' \item{xf} { extreme value distribution fit for the standardized lambda1 of a randomly generated pattern}
##' \item{tci} { index of a top cluster in each random iteration}
##' \item{cl.goc} {weighted PCA info for each real gene cluster}
##' \item{varm} {standardized lambda1 values for each randomly generated matrix cluster}
##' \item{clvlm} {a linear model describing dependency of the cluster lambda1 on a Tracy-Widom lambda1 expectation}
##' }
##'
##' @examples
##' data(pollen)
##' cd <- clean.counts(pollen)
##' \donttest{
##' knn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)
##' varinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)
##' clpca <- pagoda.gene.clusters(varinfo, trim=7.1/ncol(varinfo$mat), n.clusters=150, n.cores=10, plot=FALSE)
##' }
##'
##' @export
pagoda.gene.clusters <- function(varinfo, trim = 3.1/ncol(varinfo$mat), n.clusters = 150, n.samples = 60, cor.method = "p", n.internal.shuffles = 0, n.starts = 10, n.cores = detectCores(), verbose = 0, plot = FALSE, show.random = FALSE, n.components = 1, method = "ward.D", secondary.correlation = FALSE, n.cells = ncol(varinfo$mat), old.results = NULL) {

    smooth <- 0
    mat <- varinfo$mat
    matw <- varinfo$matw
    batch = varinfo$batch

    if(trim > 0) {
        mat <- winsorize.matrix(mat, trim = trim)
    }
    if(!is.null(batch)) {
        # center mat by batch
        mat <- weightedMatCenter(mat, matw, batch)
    }


    if(!is.null(old.results)) {
        if(verbose) { cat ("reusing old results for the observed clusters\n")}
        gcls <- old.results$clusters
        cl.goc <- old.results$cl.goc
    } else {
        if(verbose) { cat ("determining gene clusters ...")}
        # actual clusters: restrict to non-constant rows (some adjacent pair of values differs)
        vi<-which(abs(apply(mat, 1, function(x) sum(abs(diff(x))))) > 0)
        # correlation-based distance; WGCNA::cor is used when available for multi-threading
        if(is.element("WGCNA", installed.packages()[, 1])) {
            gd <- as.dist(1-WGCNA::cor(t(mat)[, vi], method = cor.method, nThreads = n.cores))
        } else {
            gd <- as.dist(1-cor(t(mat)[, vi], method = cor.method))
        }

        if(secondary.correlation) {
            # cluster on the correlation of the correlation matrix instead
            if(is.element("WGCNA", installed.packages()[, 1])) {
                gd <- as.dist(1-WGCNA::cor(as.matrix(gd), method = "p", nThreads = n.cores))
            } else {
                gd <- as.dist(1-cor(as.matrix(gd), method = "p"))
            }
        }

        if(is.element("fastcluster", installed.packages()[, 1])) {
            gcl <- fastcluster::hclust(gd, method = method)
        } else {
            gcl <- stats::hclust(gd, method = method)
        }
        gcll <- cutree(gcl, n.clusters)
        gcls <- tapply(rownames(mat)[vi], as.factor(gcll), I)
        names(gcls) <- paste("geneCluster", names(gcls), sep = ".")

        rm(gd, gcl)
        gc()

        # determine PC1 for the actual clusters
        if(verbose) { cat (" cluster PCA ...")}
        il <- tapply(vi, factor(gcll, levels = c(1:length(gcls))), I)
        cl.goc <- papply(il, function(ii) {
            xp <- bwpca(t(mat[ii, , drop = FALSE]), t(matw[ii, , drop = FALSE]), npcs = n.components, center = FALSE, nstarts = n.starts, smooth = smooth, n.shuffles = n.internal.shuffles)

            # flip PC orientations to roughly correspond with the cluster means
            cs <- unlist(lapply(seq_len(ncol(xp$scores)), function(i) sign(cor(xp$scores[, i], colMeans(mat[ii, , drop = FALSE]*abs(xp$rotation[, i]))))))

            xp$scores <- t(t(xp$scores)*cs)
            xp$rotation <- t(t(xp$rotation)*cs)

            return(list(xp = xp, sd = xp$sd, n = length(ii)))
        }, n.cores = n.cores)
        names(cl.goc) <- paste("geneCluster", names(cl.goc), sep = ".")

        if(verbose) { cat ("done\n")}
    }

    # sampled variation: repeat the clustering/PCA procedure on random normal matrices
    # to obtain the background distribution of cluster lambda1
    if(!is.null(old.results) && !is.null(old.results$varm)) {
        if(verbose) { cat ("reusing old results for the sampled clusters\n")}
        varm <- old.results$varm } else {
            if(verbose) { cat ("generating", n.samples, "randomized samples ")}
            varm <- do.call(rbind, papply(seq_len(n.samples), function(i) { # each sampling iteration
                set.seed(i)
                # generate random normal matrix
                # TODO: use n.cells instead of ncol(matw)
                m <- matrix(rnorm(nrow(mat)*n.cells), nrow = nrow(mat), ncol = n.cells)
                #m <- weightedMatCenter(m, matw, batch = batch)

                if(show.random) {
                    full.m <- t(m) # save untrimmed version of m for random gene set controls
                }

                if(trim > 0) {
                    m <- winsorize.matrix(m, trim = trim)
                }

                # BUGFIX: was sum(diff(abs(x))), which telescopes to abs(x_n)-abs(x_1) and does
                # not detect constant rows; use the same non-constant-row filter as for the
                # observed data above
                vi<-which(abs(apply(m, 1, function(x) sum(abs(diff(x))))) > 0)
                if(is.element("WGCNA", installed.packages()[, 1])) {
                    gd <- as.dist(1-WGCNA::cor(t(m[vi, ]), method = cor.method, nThreads = 1))
                } else {
                    gd <- as.dist(1-cor(t(m[vi, ]), method = cor.method))
                }
                if(secondary.correlation) {
                    if(is.element("WGCNA", installed.packages()[, 1])) {
                        gd <- as.dist(1-WGCNA::cor(as.matrix(gd), method = "p", nThreads = 1))
                    } else {
                        gd <- as.dist(1-cor(as.matrix(gd), method = "p"))
                    }
                }

                if(is.element("fastcluster", installed.packages()[, 1])) {
                    gcl <- fastcluster::hclust(gd, method = method)
                } else {
                    gcl <- stats::hclust(gd, method = method)
                }
                gcll <- cutree(gcl, n.clusters)
                rm(gd, gcl)
                gc()

                # transpose to save time
                m <- t(m) # matw <- t(matw)

                # PC1 variance of each random cluster (unweighted PCA is sufficient here)
                sdv <- tapply(vi, gcll, function(ii) {
                    #as.numeric(bwpca(m[, ii], matw[, ii], npcs = 1, center = FALSE, nstarts = n.starts, smooth = smooth)$sd)^2
                    pcaMethods::sDev(pcaMethods::pca(m[, ii], nPcs = 1, center = FALSE))^2
                })

                pathsizes <- unlist(tapply(vi, gcll, length))
                names(pathsizes) <- pathsizes

                if(show.random) {
                    # matched random gene-set controls drawn from the untrimmed matrix
                    rsdv <- unlist(lapply(names(pathsizes), function(s) {
                        vi <- sample(1:ncol(full.m), as.integer(s))
                        pcaMethods::sDev(pcaMethods::pca(full.m[, vi], nPcs = 1, center = FALSE))^2
                    }))
                    if(verbose) { cat (".")}
                    return(data.frame(n = as.integer(pathsizes), var = unlist(sdv), round = i, rvar = rsdv))
                }

                if(verbose) { cat (".")}
                data.frame(n = as.integer(pathsizes), var = unlist(sdv), round = i)

            }, n.cores = n.cores))
            if(verbose) { cat ("done\n")}
        }

    # score relative to Tracy-Widom distribution
    #require(RMTstat)
    x <- RMTstat::WishartMaxPar(n.cells, varm$n)
    varm$pm <- x$centering-(1.2065335745820)*x$scaling # predicted mean of a random set
    varm$pv <- (1.607781034581)*x$scaling # predicted variance of a random set
    #clvlm <- lm(var~pm, data = varm)
    clvlm <- lm(var~0+pm+n, data = varm)
    varm$varst <- (varm$var-predict(clvlm))/sqrt(varm$pv)
    #varm$varst <- as.numeric(varm$var - (cbind(1, varm$pm) %*% coef(clvlm)))/sqrt(varm$pv)
    #varm$varst <- as.numeric(varm$var - (varm$pm* coef(clvlm)[2]))/sqrt(varm$pv)

    #varm$varst <- (varm$var-varm$pm)/sqrt(varm$pv)
    # index of the top (most overdispersed) cluster in each randomization round
    tci <- tapply(seq_len(nrow(varm)), as.factor(varm$round), function(ii) ii[which.max(varm$varst[ii])])

    #xf <- fevd(varm$varst[tci], type = "Gumbel") # fit on top clusters
    xf <- extRemes::fevd(varm$varst, type = "Gumbel") # fit on all clusters

    if(plot) {
        require(extRemes)
        par(mfrow = c(1, 2), mar = c(3.5, 3.5, 3.5, 1.0), mgp = c(2, 0.65, 0), cex = 0.9)
        smoothScatter(varm$n, varm$var, main = "simulations", xlab = "cluster size", ylab = "PC1 variance")
        if(show.random) {
            points(varm$n, varm$rvar, pch = ".", col = "red")
        }
        #pv <- predict(rsm, newdata = data.frame(n = sort(varm$n)), se.fit = TRUE)
        on <- order(varm$n, decreasing = TRUE)
        lines(varm$n[on], predict(clvlm)[on], col = 4, lty = 3)
        lines(varm$n[on], varm$pm[on], col = 2)
        lines(varm$n[on], (varm$pm+1.96*sqrt(varm$pv))[on], col = 2, lty = 2)
        lines(varm$n[on], (varm$pm-1.96*sqrt(varm$pv))[on], col = 2, lty = 2)
        legend(x = "bottomright", pch = c(1, 19, 19), col = c(1, 4, 2), legend = c("top clusters", "clusters", "random"), bty = "n")

        points(varm$n[tci], varm$var[tci], col = 1)
        extRemes::plot.fevd(xf, type = "density", main = "Gumbel fit")
        abline(v = 0, lty = 3, col = 4)
    }

    #pevd(9, loc = xf$results$par[1], scale = xf$results$par[2], lower.tail = FALSE)
    #xf$results$par

    return(list(clusters = gcls, xf = xf, tci = tci, cl.goc = cl.goc, varm = varm, clvlm = clvlm, trim = trim))
}


##' Score statistical significance of gene set and cluster overdispersion
##'
##' Evaluates statistical significance of the gene set and cluster lambda1 values, returning
##' either a text table of Z scores, etc, a structure containing normalized values of significant
##' aspects, or a set of genes 
underlying the significant aspects.\n##'\n##' @param pwpca output of pagoda.pathway.wPCA()\n##' @param clpca output of pagoda.gene.clusters() (optional)\n##' @param n.cells effective number of cells (if not provided, will be determined using pagoda.effective.cells())\n##' @param z.score Z score to be used as a cutoff for statistically significant patterns (defaults to 0.05 P-value\n##' @param return.table whether a text table showing\n##' @param return.genes whether a set of genes driving significant aspects should be returned\n##' @param plot whether to plot the cv/n vs. dataset size scatter showing significance models\n##' @param adjust.scores whether the normalization of the aspect patterns should be based on the adjusted Z scores - qnorm(0.05/2, lower.tail = FALSE)\n##' @param score.alpha significance level of the confidence interval for determining upper/lower bounds\n##' @param use.oe.scale whether the variance of the returned aspect patterns should be normalized using observed/expected value instead of the default chi-squared derived variance corresponding to overdispersion Z score\n##' @param effective.cells.start starting value for the pagoda.effective.cells() call\n##'\n##' @return if return.table = FALSE and return.genes = FALSE (default) returns a list structure containing the following items:\n##' \\itemize{\n##' \\item{xv} {a matrix of normalized aspect patterns (rows- significant aspects, columns- cells}\n##' \\item{xvw} { corresponding weight matrix }\n##' \\item{gw} { set of genes driving the significant aspects }\n##' \\item{df} { text table with the significance testing results }\n##' }\n##'\n##' @examples\n##' data(pollen)\n##' cd <- clean.counts(pollen)\n##' \\donttest{\n##' knn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\n##' varinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\n##' pwpca <- pagoda.pathway.wPCA(varinfo, 
go.env, n.components=1, n.cores=10, n.internal.shuffles=50)\n##' tam <- pagoda.top.aspects(pwpca, return.table = TRUE, plot=FALSE, z.score=1.96)  # top aspects based on GO only\n##' }\n##'\n##' @export\npagoda.top.aspects <- function(pwpca, clpca = NULL, n.cells = NULL, z.score = qnorm(0.05/2, lower.tail = FALSE), return.table = FALSE, return.genes = FALSE, plot = FALSE, adjust.scores = TRUE, score.alpha = 0.05, use.oe.scale = FALSE, effective.cells.start = NULL) {\n    basevar = 1\n\n    if(is.null(n.cells)) {\n        n.cells <- pagoda.effective.cells(pwpca, start = effective.cells.start)\n    }\n\n\n    vdf <- data.frame(do.call(rbind, lapply(seq_along(pwpca), function(i) {\n        vars <- as.numeric((pwpca[[i]]$sd)^2)\n        shz <- NA\n        if(!is.null(pwpca[[i]]$xp$randvar)) { shz <- (vars - mean(pwpca[[i]]$xp$randvar))/sd(pwpca[[i]]$xp$randvar) }\n        cbind(i = i, var = vars, n = pwpca[[i]]$n, npc = seq(1:ncol(pwpca[[i]]$xp$scores)), shz = shz)\n    })))\n\n    # fix p-to-q mistake in qWishartSpike\n    qWishartSpikeFixed <- function (q, spike, ndf = NA, pdim = NA, var = 1, beta = 1, lower.tail = TRUE, log.p = FALSE)  {\n        params <- RMTstat::WishartSpikePar(spike, ndf, pdim, var, beta)\n        qnorm(q, mean = params$centering, sd = params$scaling, lower.tail, log.p)\n    }\n\n    # add right tail approximation to ptw, which gives up quite early\n    pWishartMaxFixed <- function (q, ndf, pdim, var = 1, beta = 1, lower.tail = TRUE) {\n        params <- RMTstat::WishartMaxPar(ndf, pdim, var, beta)\n        q.tw <- (q - params$centering)/(params$scaling)\n        p <- RMTstat::ptw(q.tw, beta, lower.tail, log.p = TRUE)\n        p[p == -Inf] <- pgamma((2/3)*q.tw[p == -Inf]^(3/2), 2/3, lower.tail = FALSE, log.p = TRUE) + lgamma(2/3) + log((2/3)^(1/3))\n        p\n    }\n\n\n    #bi <- which.max(unlist(lapply(pwpca, function(x) x$n)))\n    #vshift <- mean(pwpca[[bi]]$z[, 1]^2)/pwpca[[bi]]$n\n    #ev <- ifelse(spike > 0, qWishartSpikeFixed(0.5, spike, 
n.cells, pwpca[[bi]]$n, var = basevar, lower.tail = FALSE), RMTstat::qWishartMax(0.5, n.cells, pwpca[[bi]]$n, var = basevar, lower.tail = FALSE))/pwpca[[bi]]$n\n    #cat(\"vshift = \", vshift)\n    vshift <- 0\n    ev <- 0\n\n    vdf$var <- vdf$var-(vshift-ev)*vdf$n\n\n    #vdf$var[vdf$npc == 1] <- vdf$var[vdf$npc == 1]-(vshift-ev)*vdf$n[vdf$npc == 1]\n    vdf$exp <- RMTstat::qWishartMax(0.5, n.cells, vdf$n, var = basevar, lower.tail = FALSE)\n    #vdf$z <- qnorm(pWishartMax(vdf$var, n.cells, vdf$n, log.p = TRUE, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)\n    vdf$z <- qnorm(pWishartMaxFixed(vdf$var, n.cells, vdf$n, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)\n    vdf$cz <- qnorm(bh.adjust(pnorm(as.numeric(vdf$z), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE)\n    vdf$ub <- RMTstat::qWishartMax(score.alpha/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)\n    vdf$ub.stringent <- RMTstat::qWishartMax(score.alpha/nrow(vdf)/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)\n\n    if(!is.null(clpca)) {\n        clpca$xf <- extRemes::fevd(varst, data = clpca$varm, type = \"Gumbel\")\n        #clpca$xf <- fevd(clpca$varm$varst[clpca$tci], type = \"Gumbel\")\n        clpca$xf$results$par <- c(clpca$xf$results$par, c(shape = 0))\n        #plot(xf)\n\n        clvdf <- data.frame(do.call(rbind, lapply(seq_along(clpca$cl.goc), function(i)  {\n            vars <- as.numeric((clpca$cl.goc[[i]]$sd)^2)\n            shz <- NA\n            if(!is.null(clpca$cl.goc[[i]]$xp$randvar)) {\n                shz <- (vars - mean(clpca$cl.goc[[i]]$xp$randvar))/sd(clpca$cl.goc[[i]]$xp$randvar)\n            }\n            cbind(i = i, var = vars, n = clpca$cl.goc[[i]]$n, npc = seq(1:ncol(clpca$cl.goc[[i]]$xp$scores)), shz = shz)\n        })))\n\n        clvdf$var <- clvdf$var-(vshift-ev)*clvdf$n\n\n        x <- RMTstat::WishartMaxPar(n.cells, clvdf$n)\n        clvdf$pm <- 
x$centering-(1.2065335745820)*x$scaling # predicted mean of a random set\n        clvdf$pv <- (1.607781034581)*x$scaling # predicted variance of a random set\n        pvar <- predict(clpca$clvlm, newdata = clvdf)\n        clvdf$varst <- (clvdf$var-pvar)/sqrt(clvdf$pv)\n        clvdf$exp <- clpca$xf$results$par[1]*sqrt(clvdf$pv)+pvar\n        #clvdf$varst <- (clvdf$var-clvdf$pm)/sqrt(clvdf$pv)\n        #clvdf$exp <- clpca$xf$results$par[1]*sqrt(clvdf$pv)+clvdf$pm\n\n        lp <- pgev.upper.log(clvdf$varst, clpca$xf$results$par[1], clpca$xf$results$par[2], rep(clpca$xf$results$par[3], nrow(clvdf)))\n        clvdf$z <- qnorm(lp, lower.tail = FALSE, log.p = TRUE)\n        clvdf$cz <- qnorm(bh.adjust(pnorm(as.numeric(clvdf$z), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE)\n\n        # CI relative to the background\n        clvdf$ub <- extRemes::qevd(score.alpha/2, loc = clpca$xf$results$par[1], scale = clpca$xf$results$par[2], shape = clpca$xf$results$par[3], lower.tail = FALSE)*sqrt(clvdf$pv) + pvar\n        clvdf$ub.stringent <- extRemes::qevd(score.alpha/2/nrow(clvdf), loc = clpca$xf$results$par[1], scale = clpca$xf$results$par[2], shape = clpca$xf$results$par[3], lower.tail = FALSE)*sqrt(clvdf$pv) + pvar\n\n    }\n\n    if(plot) {\n        par(mfrow = c(1, 1), mar = c(3.5, 3.5, 1.0, 1.0), mgp = c(2, 0.65, 0))\n        un <- sort(unique(vdf$n))\n        on <- order(vdf$n, decreasing = FALSE)\n        pccol <- colorRampPalette(c(\"black\", \"grey70\"), space = \"Lab\")(max(vdf$npc))\n        plot(vdf$n, vdf$var/vdf$n, xlab = \"gene set size\", ylab = \"PC1 var/n\", ylim = c(0, max(vdf$var/vdf$n)), col = pccol[vdf$npc])\n        lines(vdf$n[on], (vdf$exp/vdf$n)[on], col = 2, lty = 1)\n        lines(vdf$n[on], (vdf$ub.stringent/vdf$n)[on], col = 2, lty = 2)\n\n        if(!is.null(clpca)) {\n            pccol <- colorRampPalette(c(\"darkgreen\", \"lightgreen\"), space = \"Lab\")(max(clvdf$npc))\n            points(clvdf$n, 
clvdf$var/clvdf$n, col = pccol[clvdf$npc], pch = 1)\n\n            #clvm <- clpca$xf$results$par[1]*sqrt(pmax(1e-3, predict(vm, data.frame(n = un)))) + predict(mm, data.frame(n = un))\n            on <- order(clvdf$n, decreasing = FALSE)\n\n            lines(clvdf$n[on], (clvdf$exp/clvdf$n)[on], col = \"darkgreen\")\n            lines(clvdf$n[on], (clvdf$ub.stringent/clvdf$n)[on], col = \"darkgreen\", lty = 2)\n        }\n        #mi<-which.max(vdf$n) sv<- (vdf$var/vdf$n)[mi] - (vdf$exp/vdf$n)[mi]\n        #lines(vdf$n[on], (vdf$exp/vdf$n)[on]+sv, col = 2, lty = 3)\n        #lines(vdf$n[on], (vdf$ub.stringent/vdf$n)[on]+sv, col = 2, lty = 2)\n    }\n\n\n    if(!is.null(clpca)) { # merge in cluster stats based on their own model\n\n        # merge pwpca, psd and pm\n        # all processing from here is common\n        clvdf$i <- clvdf$i+length(pwpca) # shift cluster ids\n        pwpca <- c(pwpca, clpca$cl.goc)\n        vdf <- rbind(vdf, clvdf[, c(\"i\", \"var\", \"n\", \"npc\", \"exp\", \"cz\", \"z\", \"ub\", \"ub.stringent\", \"shz\")])\n    }\n\n    vdf$adj.shz <- qnorm(bh.adjust(pnorm(as.numeric(vdf$shz), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE)\n    #vdf$oe <- vdf$var/vdf$exp\n    rs <- (vshift-ev)*vdf$n\n    #rs <- ifelse(vdf$npc == 1, (vshift-ev)*vdf$n, 0)\n    vdf$oe <- (vdf$var+rs)/(vdf$exp+rs)\n    #vdf$oe[vdf$oe<0] <- 0\n    #vdf$oec <- (vdf$var-vdf$ub.stringent+vdf$exp)/vdf$exp\n    #vdf$oec <- (vdf$var-vdf$ub+vdf$exp)/vdf$exp\n    #vdf$oec <- (vdf$var-vdf$ub+vdf$exp+rs)/(vdf$exp+rs)\n    vdf$oec <- (vdf$var+rs)/(vdf$ub+rs)\n    #vdf$oec[vdf$oec<0] <- 0\n    #vdf$z[vdf$z<0] <- 0\n\n\n\n    df <- data.frame(name = names(pwpca)[vdf$i], npc = vdf$npc, n = vdf$n, score = vdf$oe, z = vdf$z, adj.z = vdf$cz, sh.z = vdf$shz, adj.sh.z = vdf$adj.shz, stringsAsFactors = FALSE)\n    if(adjust.scores) {\n        vdf$valid <- vdf$cz  >=  z.score\n    } else {\n        vdf$valid <- vdf$z  >=  z.score\n    }\n\n    
if(return.table) {\n        df <- df[vdf$valid, ]\n        df <- df[order(df$score, decreasing = TRUE), ]\n        return(df)\n    }\n\n    # determine genes driving significant pathways\n    # return genes within top 2/3rds of PC loading\n    gl <- lapply(which(vdf$valid), function(i) { s <- abs(pwpca[[vdf[i, \"i\"]]]$xp$rotation[, vdf[i, \"npc\"]] )\n    s[s >= max(s)/3] })\n    gw <- tapply(abs(unlist(gl)), as.factor(unlist(lapply(gl, names))), max)\n    if(return.genes) {\n        return(gw)\n    }\n    # return combined data structure\n\n    # weight\n    xvw <- do.call(rbind, lapply(pwpca, function(x) {\n        xm <- t(x$xp$scoreweights)\n    }))\n    vi <- vdf$valid\n    xvw <- xvw[vi, ]/rowSums(xvw[vi, ])\n\n    # return scaled patterns\n    xmv <- do.call(rbind, lapply(pwpca, function(x) {\n        xm <- t(x$xp$scores)\n    }))\n\n    if(use.oe.scale) {\n        xmv <- (xmv[vi, ] -rowMeans(xmv[vi, ]))* (as.numeric(vdf$oe[vi])/sqrt(apply(xmv[vi, ], 1, var)))\n    } else {\n        # chi-squared\n        xmv <- (xmv[vi, ]-rowMeans(xmv[vi, ])) * sqrt((qchisq(pnorm(vdf$z[vi], lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells)/apply(xmv[vi, ], 1, var))\n    }\n    rownames(xmv) <- paste(\"#PC\", vdf$npc[vi], \"# \", names(pwpca)[vdf$i[vi]], sep = \"\")\n\n    return(list(xv = xmv, xvw = xvw, gw = gw, df = df))\n\n}\n\n\n##' Collapse aspects driven by the same combinations of genes\n##'\n##' Examines PC loading vectors underlying the identified aspects and clusters aspects based\n##' on a product of loading and score correlation (raised to corr.power). 
Clusters of aspects\n##' driven by the same genes are determined based on the distance.threshold and collapsed.\n##'\n##' @param tam output of pagoda.top.aspects()\n##' @param pwpca output of pagoda.pathway.wPCA()\n##' @param clpca output of pagoda.gene.clusters() (optional)\n##' @param plot whether to plot the resulting clustering\n##' @param cluster.method one of the standard clustering methods to be used (fastcluster::hclust is used if available or stats::hclust)\n##' @param distance.threshold similarity threshold for grouping interdependent aspects\n##' @param corr.power power to which the product of loading and score correlation is raised\n##' @param abs Boolean of whether to use absolute correlation\n##' @param n.cores number of cores to use during processing\n##' @param ... additional arguments are passed to the pagoda.view.aspects() method during plotting\n##'\n##' @return a list structure analogous to that returned by pagoda.top.aspects(), but with addition of a $cnam element containing a list of aspects summarized by each row of the new (reduced) $xv and $xvw\n##'\n##' @examples\n##' data(pollen)\n##' cd <- clean.counts(pollen)\n##' \\donttest{\n##' knn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\n##' varinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\n##' pwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)\n##' tam <- pagoda.top.aspects(pwpca, return.table = TRUE, plot=FALSE, z.score=1.96)  # top aspects based on GO only\n##' tamr <- pagoda.reduce.loading.redundancy(tam, pwpca)\n##' }\n##'\n##' @export\npagoda.reduce.loading.redundancy <- function(tam, pwpca, clpca = NULL, plot = FALSE, cluster.method = \"complete\", distance.threshold = 0.01, corr.power = 4, n.cores = detectCores(), abs = TRUE, ...) 
{\n    pclc <- pathway.pc.correlation.distance(c(pwpca, clpca$cl.goc), tam$xv, target.ndf = 100, n.cores = n.cores)\n    cda <- cor(t(tam$xv))\n    if(abs) {\n        cda <- abs(cda)\n    } else {\n        cda[cda<0] <- 0\n    }\n    cda <- as.dist(1-cda)\n    cc <- (1-sqrt((1-pclc)*(1-cda)))^corr.power\n\n    if(is.element(\"fastcluster\", installed.packages()[, 1])) {\n        y <- fastcluster::hclust(cc, method = cluster.method)\n    } else {\n        y <- stats::hclust(cc, method = cluster.method)\n    }\n    ct <- cutree(y, h = distance.threshold)\n    ctf <- factor(ct, levels = sort(unique(ct)))\n    xvl <- collapse.aspect.clusters(tam$xv, tam$xvw, ct, pick.top = FALSE, scale = TRUE)\n\n    if(plot) {\n        sc <- sample(colors(), length(levels(ctf)), replace = TRUE)\n        view.aspects(tam$xv, row.clustering = y, row.cols = sc[as.integer(ctf)], ...)\n    }\n\n    # collapsed names\n    if(!is.null(tam$cnam)) { # already has collapsed names\n        cnam <- tapply(rownames(tam$xv), ctf, function(xn) unlist(tam$cnam[xn]))\n    } else {\n        cnam <- tapply(rownames(tam$xv), ctf, I)\n    }\n    names(cnam) <- rownames(xvl$d)\n    tam$xv <- xvl$d\n    tam$xvw <- xvl$w\n    tam$cnam <- cnam\n    return(tam)\n}\n\n\n##' Collapse aspects driven by similar patterns (i.e. separate the same sets of cells)\n##'\n##' Examines PC loading vectors underlying the identified aspects and clusters aspects based on score correlation. 
Clusters of aspects driven by the same patterns are determined based on the distance.threshold.\n##'\n##' @param tamr output of pagoda.reduce.loading.redundancy()\n##' @param distance.threshold similarity threshold for grouping interdependent aspects\n##' @param cluster.method one of the standard clustering methods to be used (fastcluster::hclust is used if available or stats::hclust)\n##' @param distance distance matrix\n##' @param weighted.correlation Boolean of whether to use a weighted correlation in determining the similarity of patterns\n##' @param plot Boolean of whether to show plot\n##' @param top Restrict output to the top n aspects of heterogeneity\n##' @param trim Winsorization trim to use prior to determining the top aspects\n##' @param abs Boolean of whether to use absolute correlation\n##' @param ... additional arguments are passed to the pagoda.view.aspects() method during plotting\n##'\n##' @return a list structure analogous to that returned by pagoda.top.aspects(), but with addition of a $cnam element containing a list of aspects summarized by each row of the new (reduced) $xv and $xvw\n##'\n##' @examples\n##' data(pollen)\n##' cd <- clean.counts(pollen)\n##' \\donttest{\n##' knn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\n##' varinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\n##' pwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)\n##' tam <- pagoda.top.aspects(pwpca, return.table = TRUE, plot=FALSE, z.score=1.96)  # top aspects based on GO only\n##' tamr <- pagoda.reduce.loading.redundancy(tam, pwpca)\n##' tamr2 <- pagoda.reduce.redundancy(tamr, distance.threshold = 0.9, plot = TRUE, labRow = NA, labCol = NA, box = TRUE, margins = c(0.5, 0.5), trim = 0)\n##' }\n##'\n##' @export\npagoda.reduce.redundancy <- function(tamr, distance.threshold = 0.2, cluster.method = \"complete\", 
distance = NULL, weighted.correlation = TRUE, plot = FALSE, top = Inf, trim = 0, abs = FALSE, ...) {\n    if(is.null(distance)) {\n        if(weighted.correlation) {\n            distance <- .Call(\"matWCorr\", t(tamr$xv), t(tamr$xvw), PACKAGE = \"scde\")\n            rownames(distance) <- colnames(distance) <- rownames(tamr$xv)\n            if(abs) {\n                distance <- stats::as.dist(1-abs(distance), upper = TRUE)\n            } else {\n                distance <- stats::as.dist(1-distance, upper = TRUE)\n            }\n        } else {\n            if(abs) {\n                distance <- stats::as.dist(1-abs(cor(t(tamr$xv))))\n            } else {\n                distance <- stats::as.dist(1-cor(t(tamr$xv)))\n            }\n        }\n    }\n    if(is.element(\"fastcluster\", installed.packages()[, 1])) {\n        y <- fastcluster::hclust(distance, method = cluster.method)\n    } else {\n        y <- stats::hclust(distance, method = cluster.method)\n    }\n\n    ct <- cutree(y, h = distance.threshold)\n    ctf <- factor(ct, levels = sort(unique(ct)))\n    xvl <- collapse.aspect.clusters(tamr$xv, tamr$xvw, ct, pick.top = FALSE, scale = TRUE)\n\n    if(plot) {\n        sc <- sample(colors(), length(levels(ctf)), replace = TRUE)\n        view.aspects(tamr$xv, row.clustering = y, row.cols = sc[as.integer(ctf)], ...)\n    }\n\n    # collapsed names\n    if(!is.null(tamr$cnam)) { # already has collapsed names\n        cnam <- tapply(rownames(tamr$xv), ctf, function(xn) unlist(tamr$cnam[xn]))\n    } else {\n        cnam <- tapply(rownames(tamr$xv), ctf, I)\n    }\n    names(cnam) <- rownames(xvl$d)\n\n    if(trim > 0) { xvl$d <- winsorize.matrix(xvl$d, trim) } # trim prior to determining the top sets\n\n    rcmvar <- apply(xvl$d, 1, var)\n    vi <- order(rcmvar, decreasing = TRUE)[1:min(length(rcmvar), top)]\n\n    tamr2 <- tamr\n    tamr2$xv <- xvl$d[vi, ]\n    tamr2$xvw <- xvl$w[vi, ]\n    tamr2$cnam <- cnam[vi]\n    return(tamr2)\n}\n\n\n##' Determine 
optimal cell clustering based on the genes driving the significant aspects\n##'\n##' Determines cell clustering (hclust result) based on a weighted correlation of genes\n##' underlying the top aspects of transcriptional heterogeneity. Branch orientation is optimized\n##' if 'cba' package is installed.\n##'\n##' @param tam result of pagoda.top.aspects() call\n##' @param varinfo result of pagoda.varnorm() call\n##' @param method clustering method ('ward.D' by default)\n##' @param verbose 0 or 1 depending on level of desired verbosity\n##' @param include.aspects whether the aspect patterns themselves should be included alongside with the individual genes in calculating cell distance\n##' @param return.details Boolean of whether to return just the hclust result or a list containing the hclust result plus the distance matrix and gene values\n##'\n##' @return hclust result\n##'\n##' @examples\n##' data(pollen)\n##' cd <- clean.counts(pollen)\n##' \\donttest{\n##' knn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\n##' varinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\n##' pwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)\n##' tam <- pagoda.top.aspects(pwpca, return.table = TRUE, plot=FALSE, z.score=1.96)  # top aspects based on GO only\n##' hc <- pagoda.cluster.cells(tam, varinfo)\n##' plot(hc)\n##' }\n##'\n##' @export\npagoda.cluster.cells <- function(tam, varinfo, method = \"ward.D\", include.aspects = FALSE, verbose = 0, return.details = FALSE) {\n    # gene clustering\n    gw <- tam$gw\n    gw <- gw[(rowSums(varinfo$matw)*varinfo$arv)[names(gw)] > 1]\n\n    gw <- gw/gw\n    mi <- match(names(gw), rownames(varinfo$mat))\n    wgm <- varinfo$mat[mi, ]\n    wgm <- wgm*as.numeric(gw)\n    wgwm <- varinfo$matw[mi, ]\n\n    if(include.aspects) {\n        if(verbose) { message(\"clustering cells based on \", 
nrow(wgm), \" genes and \", nrow(tam$xv), \" aspect patterns\")}\n        wgm <- rbind(wgm, tam$xv)\n        wgwm <- rbind(wgwm, tam$xvw)\n    } else {\n        if(verbose) { message(\"clustering cells based on \", nrow(wgm), \" genes\")}\n    }\n\n    snam <- sample(colnames(wgm))\n\n    dm <- .Call(\"matWCorr\", wgm, wgwm, PACKAGE = \"scde\")\n    dm <- 1-dm\n    rownames(dm) <- colnames(dm) <- colnames(wgm)\n    wcord <- stats::as.dist(dm, upper = TRUE)\n    hc <- hclust(wcord, method = method)\n\n    if(is.element(\"cba\", installed.packages()[, 1])) {\n        co <- cba::order.optimal(wcord, hc$merge)\n        hc$merge <- co$merge\n        hc$order <- co$order\n    }\n    if(return.details) {\n        return(list(clustering = hc, distance = wcord, genes = gw))\n    } else {\n        return(hc)\n    }\n}\n\n\n##' View PAGODA output\n##'\n##' Create static image of PAGODA output visualizing cell hierarchy and top aspects of transcriptional heterogeneity\n##'\n##' @param tamr Combined pathways that show similar expression patterns. Output of \\code{\\link{pagoda.reduce.redundancy}}\n##' @param row.clustering Dendrogram of combined pathways clustering\n##' @param top Restrict output to the top n aspects of heterogeneity\n##' @param ... 
additional arguments are passed to the \\code{\\link{view.aspects}} method during plotting\n##'\n##' @return PAGODA heatmap\n##'\n##' @examples\n##' data(pollen)\n##' cd <- clean.counts(pollen)\n##' \\donttest{\n##' knn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\n##' varinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\n##' pwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)\n##' tam <- pagoda.top.aspects(pwpca, return.table = TRUE, plot=FALSE, z.score=1.96)  # top aspects based on GO only\n##' pagoda.view.aspects(tam)\n##' }\n##'\n##' @export\npagoda.view.aspects <- function(tamr, row.clustering = hclust(dist(tamr$xv)), top = Inf, ...) {\n    if(is.finite(top)) {\n        rcmvar <- apply(tamr$xv, 1, var)\n        vi <- order(rcmvar, decreasing = TRUE)[1:min(length(rcmvar), top)]\n        tamr$xv <- tamr$xv[vi, ]\n        tamr$xvw <- tamr$xvw[vi, ]\n        tamr$cnam <- tamr$cnam[vi]\n    }\n\n    view.aspects(tamr$xv, row.clustering = row.clustering, ... )\n}\n\n\n##' View heatmap\n##'\n##' Internal function to visualize aspects of transcriptional heterogeneity as a heatmap. Used by \\code{\\link{pagoda.view.aspects}}.\n##'\n##' @param mat Numeric matrix\n##' @param row.clustering Row dendrogram\n##' @param cell.clustering Column dendrogram\n##' @param zlim Range of the normalized gene expression levels, inputted as a list: c(lower_bound, upper_bound). Values outside this range will be Winsorized. Useful for increasing the contrast of the heatmap visualizations. Default, set to the 5th and 95th percentiles.\n##' @param row.cols  Matrix of row colors.\n##' @param col.cols  Matrix of column colors. 
Useful for visualizing cell annotations such as batch labels.\n##' @param cols Heatmap colors\n##' @param show.row.var.colors Boolean of whether to show row variance as a color track\n##' @param top Restrict output to the top n aspects of heterogeneity\n##' @param ... additional arguments for heatmap plotting\n##'\n##' @return A heatmap\n##'\nview.aspects <- function(mat, row.clustering = NA, cell.clustering = NA, zlim = c(-1, 1)*quantile(mat, p = 0.95), row.cols = NULL, col.cols = NULL, cols = colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space = \"Lab\")(1024), show.row.var.colors = TRUE, top = Inf, ...) {\n    #row.cols, col.cols are matrices for now\n    rcmvar <- apply(mat, 1, var)\n    mat[mat<zlim[1]] <- zlim[1]\n    mat[mat > zlim[2]] <- zlim[2]\n    if(class(row.clustering) == \"hclust\") { row.clustering <- as.dendrogram(row.clustering) }\n    if(class(cell.clustering) == \"hclust\") { cell.clustering <- as.dendrogram(cell.clustering) }\n    if(show.row.var.colors) {\n        if(is.null(row.cols)) {\n            icols <- colorRampPalette(c(\"white\", \"black\"), space = \"Lab\")(1024)[1023*(rcmvar/max(rcmvar))+1]\n            row.cols <- cbind(var = icols)\n        }\n    }\n    my.heatmap2(mat, Rowv = row.clustering, Colv = cell.clustering, zlim = zlim, RowSideColors = row.cols, ColSideColors = col.cols, col = cols, ...)\n}\n\n\n##' Make the PAGODA app\n##'\n##' Create an interactive user interface to explore output of PAGODA.\n##'\n##' @param tamr Combined pathways that show similar expression patterns. Output of \\code{\\link{pagoda.reduce.redundancy}}\n##' @param tam Combined pathways that are driven by the same gene sets. Output of \\code{\\link{pagoda.reduce.loading.redundancy}}\n##' @param varinfo Variance information. Output of \\code{\\link{pagoda.varnorm}}\n##' @param env Gene sets as an environment variable.\n##' @param pwpca Weighted PC magnitudes for each gene set provided in the \\code{env}. 
Output of \\code{\\link{pagoda.pathway.wPCA}}\n##' @param clpca Weighted PC magnitudes for de novo gene sets identified by clustering on expression. Output of \\code{\\link{pagoda.gene.clusters}}\n##' @param col.cols  Matrix of column colors. Useful for visualizing cell annotations such as batch labels. Default NULL.\n##' @param cell.clustering Dendrogram of cell clustering. Output of \\code{\\link{pagoda.cluster.cells} } . Default   NULL.\n##' @param row.clustering Dendrogram of combined pathways clustering. Default NULL.\n##' @param title Title text to be used in the browser label for the app. Default, set as 'pathway clustering'\n##' @param zlim Range of the normalized gene expression levels, inputted as a list: c(lower_bound, upper_bound). Values outside this range will be Winsorized. Useful for increasing the contrast of the heatmap visualizations. Default, set to the 5th and 95th percentiles.\n##'\n##' @return PAGODA app\n##'\n##' @export\nmake.pagoda.app <- function(tamr, tam, varinfo, env, pwpca, clpca = NULL, col.cols = NULL, cell.clustering = NULL, row.clustering = NULL, title = \"pathway clustering\", zlim = c(-1, 1)*quantile(tamr$xv, p = 0.95)) {\n    # rcm - xv\n    # matvar\n    if(is.null(cell.clustering)) {\n        cell.clustering <- pagoda.cluster.cells(tam, varinfo)\n    }\n    if(is.null(row.clustering)) {\n        row.clustering <- hclust(dist(tamr$xv))\n        row.clustering$order <- rev(row.clustering$order)\n    }\n\n    #fct - which tam row in which tamr$xv cluster.. 
remap tamr$cnams\n    cn <- tamr$cnam\n    fct <- rep(1:length(cn), lapply(cn, length))\n    names(fct) <- unlist(cn)\n    fct <- fct[rownames(tam$xv)]\n    rcm <- tamr$xv\n    rownames(rcm) <- as.character(1:nrow(rcm))\n    fres <- list(hvc = cell.clustering, tvc = row.clustering, rcm = rcm, zlim2 = zlim, matvar = apply(tam$xv, 1, sd), ct = fct, matrcmcor = rep(1, nrow(tam$xv)), cols = colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space = \"Lab\")(1024), colcol = col.cols)\n\n    # gene df\n    gene.df <- data.frame(var = varinfo$arv*rowSums(varinfo$matw))\n    gene.df$gene <- rownames(varinfo$mat)\n    gene.df <- gene.df[order(gene.df$var, decreasing = TRUE), ]\n\n    # prepare pathway df\n    df <- tamr$df\n    if(exists(\"myGOTERM\", envir = globalenv())) {\n        df$desc <- mget(df$name, get(\"myGOTERM\", envir = globalenv()), ifnotfound = \"\")\n    } else {\n        df$desc <- \"\"\n    }\n    min.z <- -9\n    df$z[df$z<min.z] <- min.z\n    df$adj.z[df$adj.z<min.z] <- min.z\n    df$sh.z[df$sh.z<min.z] <- min.z\n    df$adj.sh.z[df$adj.sh.z<min.z] <- min.z\n    df <- data.frame(id = paste(\"#PC\", df$npc, \"# \", df$name, sep = \"\"), npc = df$npc, n = df$n, score = df$score, Z = df$z, aZ = df$adj.z, sh.Z = df$sh.z, sh.aZ = df$adj.sh.z, name = paste(df$name, df$desc))\n\n    df <- df[order(df$score, decreasing = TRUE), ]\n\n    # merge go.env\n    if(!is.null(clpca)) {\n        set.env <- list2env(c(as.list(env), clpca$clusters))\n    } else {\n        set.env <- env\n    }\n    sa <- ViewPagodaApp$new(fres, df, gene.df, varinfo$mat, varinfo$matw, set.env, name = title, trim = 0, batch = varinfo$batch)\n}\n\n##################### Internal functions\n\none.sided.test.id <- function(id, nam1, nam2, ifm, dm, prior, difference.prior = 0.5, bootstrap = TRUE, n.samples = 1e3, show.plots = TRUE, return.posterior = FALSE, return.both = FALSE) {\n    gr <- 10^prior$x - 1\n    gr[gr<0] <- 0\n    lpp <- 
get.rep.set.general.model.logposteriors(ifm[[nam1]], dm[rep(id, length(gr)), names(ifm[[nam1]])], data.frame(fpm = gr), grid.weight = prior$grid.weight)\n    ldp <- get.rep.set.general.model.logposteriors(ifm[[nam2]], dm[rep(id, length(gr)), names(ifm[[nam2]])], data.frame(fpm = gr), grid.weight = prior$grid.weight)\n\n    if(bootstrap) {\n        pjp <- do.call(cbind, lapply(seq_along(n.samples), function(i) {\n            pjp <- rowSums(lpp[, sample(1:ncol(lpp), replace = TRUE)])\n            pjp <- exp(pjp-max(pjp))\n            pjp <- pjp/sum(pjp)\n            return(pjp)\n        }))\n        pjp <- rowSums(pjp)\n        pjp <- log(pjp/sum(pjp))\n\n        djp <- do.call(cbind, lapply(seq_along(n.samples), function(i) {\n            djp <- rowSums(ldp[, sample(1:ncol(ldp), replace = TRUE)])\n            djp <- exp(djp-max(djp))\n            djp <- djp/sum(djp)\n            return(djp)\n        }))\n        djp <- rowSums(djp)\n        djp <- log(djp/sum(djp))\n    } else {\n        pjp <- rowSums(lpp)\n        djp <- rowSums(ldp)\n    }\n\n    dpy <- exp(prior$lp+djp)\n    mpgr <- sum(exp(prior$lp+pjp+log(c(0, cumsum(dpy)[-length(dpy)])))) # m1\n    mpls <- sum(exp(prior$lp+pjp+log(sum(dpy)-cumsum(dpy)))) # m0\n    mpls/mpgr\n\n    pjpc <- exp(prior$lp+pjp)\n    pjpc <- pjpc/sum(pjpc)\n    djpc <- exp(prior$lp+djp)\n    djpc <- djpc/sum(djpc)\n\n    if(show.plots || return.posterior || return.both) {\n        # calculate log-fold-change posterior\n        n <- length(pjpc)\n        rp <- c(unlist(lapply(n:2, function(i) sum(pjpc[1:(n-i+1)]*djpc[i:n]))), unlist(lapply(seq_along(n), function(i) sum(pjpc[i:n]*djpc[1:(n-i+1)]))))\n        rv <- seq(prior$x[1]-prior$x[length(prior$x)], prior$x[length(prior$x)]-prior$x[1], length = length(prior$x)*2-1)\n        fcp <- data.frame(v = rv, p = rp)\n    }\n\n    if(show.plots) {\n        # show each posterior\n        layout(matrix(c(1:3), 3, 1, byrow = TRUE), heights = c(2, 1, 2), widths = c(1), FALSE)\n        par(mar 
= c(2.5, 3.5, 2.5, 3.5), mgp = c(1.5, 0.65, 0), cex = 0.9)\n        jpr <- range(c(0, pjpc), na.rm = TRUE)\n        pp <- exp(lpp)\n        cols <- rainbow(dim(pp)[2], s = 0.8)\n        plot(c(), c(), xlim = range(prior$x), ylim = range(c(0, pp)), xlab = \"expression level\", ylab = \"individual posterior\", main = nam1)\n        lapply(seq_len(ncol(pp)), function(i) lines(prior$x, pp[, i], col = cols[i]))\n        legend(x = ifelse(which.max(na.omit(pjpc)) > length(pjpc)/2, \"topleft\", \"topright\"), bty = \"n\", col = cols, legend = colnames(pp), lty = rep(1, dim(pp)[2]))\n        par(new = TRUE)\n        plot(prior$x, pjpc, axes = FALSE, ylab = \"\", xlab = \"\", ylim = jpr, type = 'l', col = 1, lty = 1, lwd = 2)\n        axis(4, pretty(jpr, 5), col = 1)\n        mtext(\"joint posterior\", side = 4, outer = FALSE, line = 2)\n\n        # ratio plot\n        par(mar = c(2.5, 3.5, 0.5, 3.5), mgp = c(1.5, 0.65, 0), cex = 0.9)\n        plot(fcp$v, fcp$p, xlab = \"log10 expression ratio\", ylab = \"ratio posterior\", type = 'l', lwd = 2, main = \"\")\n        r.mle <- fcp$v[which.max(fcp$p)]\n        r.lb <- max(which(cumsum(fcp$p)<0.025))\n        r.ub <- min(which(cumsum(fcp$p) > (1-0.025)))\n        polygon(c(fcp$v[r.lb], fcp$v[r.lb:r.ub], fcp$v[r.ub]), y = c(-10, fcp$p[r.lb:r.ub], -10), col = \"grey90\")\n        abline(v = r.mle, col = 2, lty = 2)\n        abline(v = c(fcp$v[r.ub], fcp$v[r.lb]), col = 2, lty = 3)\n        box()\n        legend(x = ifelse(r.mle > 0, \"topleft\", \"topright\"), legend = c(paste(\"MLE = \", round(10^(r.mle), 1), \" (\", round(r.mle, 2), \" in log10)\", sep = \"\"), paste(\"95% CI: \", round(10^(fcp$v[r.lb]), 1), \" - \", round(10^(fcp$v[r.ub]), 1), sep = \"\"), paste(\" log10: \", round(fcp$v[r.lb], 2), \" - \", round(fcp$v[r.ub], 2), sep = \"\")), bty = \"n\")\n\n        # distal plot\n        dp <- exp(ldp)\n        par(mar = c(2.5, 3.5, 2.5, 3.5), mgp = c(1.5, 0.65, 0), cex = 0.9)\n        jpr <- range(c(0, djpc), na.rm = 
TRUE)\n        cols <- rainbow(dim(dp)[2], s = 0.8)\n        plot(c(), c(), xlim = range(prior$x), ylim = range(c(0, dp)), xlab = \"expression level\", ylab = \"individual posterior\", main = nam2)\n        lapply(seq_len(ncol(dp)), function(i) lines(prior$x, dp[, i], col = cols[i]))\n        legend(x = ifelse(which.max(na.omit(djpc)) > length(djpc)/2, \"topleft\", \"topright\"), bty = \"n\", col = cols, legend = colnames(dp), lty = rep(1, dim(dp)[2]))\n\n        par(new = TRUE)\n        plot(prior$x, djpc, axes = FALSE, ylab = \"\", xlab = \"\", ylim = jpr, type = 'l', col = 1, lty = 1, lwd = 2)\n        axis(4, pretty(jpr, 5), col = 1)\n        mtext(\"joint posterior\", side = 4, outer = FALSE, line = 2)\n    }\n\n    lbf <- mpls/mpgr\n    lbf <- (difference.prior*lbf)/(difference.prior*lbf+1-difference.prior)\n    #return(c(equal = qnorm(ebf, lower.tail = TRUE), less = qnorm(lbf, lower.tail = TRUE)))\n    if(return.both) {\n        return(list(z = qnorm(lbf, lower.tail = TRUE), post = fcp))\n    } else if(return.posterior) {\n        return(fcp)\n    } else {\n        return(qnorm(lbf, lower.tail = TRUE))\n    }\n}\n\n# counts - data frame with fragment counts (rows - fragments columns -experiments)\n# groups - a two-level factor describing grouping of columns. 
Use NA for observations that should be skipped\n# min.count.threshold - the number of reads used to make an initial guess for the failed component\n# threshold.segmentation - use min.count.threshold to perform very quick identification of the drop-outs\n# threshold.prior - prior that should be associated with threshold segmentation\ncalculate.crossfit.models <- function(counts, groups, min.count.threshold = 4, nrep = 1, verbose = 0, min.prior = 1e-5, n.cores = 12, save.plots = TRUE, zero.lambda = 0.1, old.cfm = NULL, threshold.segmentation = FALSE, threshold.prior = 1-1e-6, max.pairs = 1000, min.pairs.per.cell = 10) {\n    names(groups) <- colnames(counts)\n    # enumerate cross-fit pairs within each group\n    cl <- do.call(cbind, tapply(colnames(counts), groups, function(ids) {\n        cl <- combn(ids, 2)\n        min.pairs.per.cell <- min(length(ids)*(length(ids)-1)/2, min.pairs.per.cell)\n        if(verbose) {\n            cat(\"number of pairs: \", ncol(cl), \"\\n\")\n        }\n        if(ncol(cl) > max.pairs) {\n            if(verbose) {\n                cat(\"reducing to a random sample of \", max.pairs, \" pairs\\n\")\n            }\n\n            # make sure there's at least min.pairs.per.cell pairs for each cell\n            cl <- cl[, unique(c(sample(1:ncol(cl), max.pairs),\n                                unlist(lapply(ids, function(id) sample(which(colSums(cl == id) > 0), min.pairs.per.cell)))))]\n        }\n        cl\n    }))\n\n    orl <- c()\n    if(!is.null(old.cfm)) {\n        # check which pairs have already been fitted in compared in old.cfm\n        pn1 <- unlist(apply(cl, 2, function(ii) paste(ii, collapse = \".vs.\")))\n        pn2 <- unlist(apply(cl, 2, function(ii) paste(rev(ii), collapse = \".vs.\"))) ### %%% use rev() to revert element order\n        vi <- (pn1 %in% names(old.cfm)) | (pn2 %in% names(old.cfm))\n        cl <- cl[, !vi, drop = FALSE]\n        orl <- old.cfm[names(old.cfm) %in% c(pn1, pn2)]\n    }\n    if(verbose) {\n      
  cat(\"total number of pairs: \", ncol(cl), \"\\n\")\n    }\n\n    if(dim(cl)[2] > 0) {\n        if(verbose)  message(paste(\"cross-fitting\", ncol(cl), \"pairs:\"))\n        rl <- papply(seq_len(ncol(cl)), function(cii) {\n            ii <- cl[, cii]\n            df <- data.frame(c1 = counts[, ii[1]], c2 = counts[, ii[2]])\n            vi <- which(rowSums(df) > 0, )\n            if(!threshold.segmentation) {\n                if(verbose) {\n                    message(\"fitting pair [\", paste(ii, collapse = \" \"), \"]\")\n                }\n                mo1 <- FLXMRglmCf(c1~1, family = \"poisson\", components = c(1), mu = log(zero.lambda))\n                mo2 <- FLXMRnb2glmC(c1~1+I(log(c2+1)), components = c(2))\n                mo3 <- FLXMRnb2glmC(c2~1+I(log(c1+1)), components = c(2))\n                mo4 <- FLXMRglmCf(c2~1, family = \"poisson\", components = c(3), mu = log(zero.lambda))\n                m1 <- mc.stepFlexmix(c1~1, data = df[vi, ], k = 3, model = list(mo1, mo2, mo3, mo4), control = list(verbose = verbose, minprior = min.prior), concomitant = FLXPmultinom(~I((log(c1+1)+log(c2+1))/2)+1), cluster = cbind(df$c1[vi]<= min.count.threshold, df$c1[vi] > min.count.threshold & df$c2[vi] > min.count.threshold, df$c2[vi]<= min.count.threshold), nrep = nrep)\n\n                # reduce return size\n                m1@posterior <- lapply(m1@posterior, function(m) {\n                    rownames(m) <- NULL\n                    return(m)\n                })\n                #rownames(m1@concomitant@x) <- NULL\n                m1@concomitant@x <- matrix()\n                m1@model <- lapply(m1@model, function(mod) {\n                    mod@x <- matrix()\n                    mod@y <- matrix()\n                    #rownames(mod@x) <- NULL\n                    #rownames(mod@y) <- NULL\n                    return(mod)\n                })\n\n                #parent.env(environment(m1@components[[1]][[1]]@logLik)) <- globalenv()\n                
#parent.env(environment(m1@components[[1]][[2]]@logLik)) <- globalenv()\n                #parent.env(environment(m1@components[[2]][[1]]@logLik)) <- globalenv()\n                #parent.env(environment(m1@components[[2]][[2]]@logLik)) <- globalenv()\n\n                names(vi) <- NULL\n                pm <- posterior(m1)[, c(1, 3)]\n                rownames(pm) <- NULL\n                cl <- clusters(m1)\n                names(cl) <- NULL\n                gc()\n            } else {\n                # use min.count.threshold to quickly segment the points\n                cl <- rep(2, length(vi))\n                cl[df[vi, 1]<min.count.threshold] <- 1\n                cl[df[vi, 2]<min.count.threshold] <- 3\n                cl[df[vi, 1]<min.count.threshold & df[vi, 2]<min.count.threshold] <- 0\n                names(cl) <- NULL\n                pm <- cbind(ifelse(cl == 1, threshold.prior, 1-threshold.prior), ifelse(cl == 3, threshold.prior, 1-threshold.prior))\n                rownames(pm) <- NULL\n            }\n            rli <- list(ii = ii, clusters = cl, posterior = pm, vi = vi)\n            #message(\"return object size for pair [\", paste(ii, collapse = \" \"), \"] is \", round(object.size(rli)/(1024^2), 3), \" MB\")\n            return(rli)\n        }, n.cores = round(n.cores/nrep))\n        #, mc.preschedule = threshold.segmentation) # mclapply function has preschedule\n        names(rl) <- apply(cl, 2, paste, collapse = \".vs.\")\n        # clean up invalid entries\n        rl <- rl[!unlist(lapply(rl, is.null))]\n        rl <- rl[unlist(lapply(rl, is.list))]\n        #names(rl) <- unlist(lapply(rl, function(d) paste(d$ii, collapse = \".vs.\")))\n    } else {\n        rl <- c()\n    }\n\n    if(!is.null(old.cfm)) rl <- c(rl, orl)\n\n    if(save.plots) {\n        #require(Cairo) require(RColorBrewer)\n        tapply(colnames(counts), groups, function(ids) {\n            cl <- combn(ids, 2)\n            group <- as.character(groups[ids[1]])\n            # 
log-scale hist\n            t.pairs.panel.hist <- function(x, i = NULL, ...) {\n                usr <- par(\"usr\")\n                on.exit(par(usr))\n                par(usr = c(usr[1:2], 0, 1.5) )\n                vi <- x > 0\n                h <- hist(x, plot = FALSE)\n                breaks <- h$breaks\n                nB <- length(breaks)\n                y <- log10(h$counts)\n                y <- y/max(y)\n                rect(breaks[-nB], 0, breaks[-1], y, col = \"gray60\", ...)\n            }\n            t.pairs.smoothScatter.spearman <- function(x, y, i = NULL, j = NULL, cex = 0.8, ...) {\n                vi <- x > 0 | y > 0\n                smoothScatter(x[vi], y[vi], add = TRUE, useRaster = TRUE, ...)\n                legend(x = \"bottomright\", legend = paste(\"sr = \", round(cor(x[vi], y[vi], method = \"spearman\"), 2), sep = \"\"), bty = \"n\", cex = cex)\n            }\n            # component assignment scatter\n            t.panel.component.scatter <- function(x, y, i, j, cex = 0.8, ...) 
{\n                if(!is.null(rl[[paste(ids[i], \"vs\", ids[j], sep = \".\")]])) {\n                    m1 <- rl[[paste(ids[i], \"vs\", ids[j], sep = \".\")]]\n                    # forward plot\n                    vi <- which(x > 0 | y > 0)\n                    ci <- vi[m1$clusters == 1]\n                    if(length(ci) > 3) {\n                        points(x[ci], y[ci], pch = \".\", col = densCols(x[ci], y[ci], colramp = colorRampPalette(brewer.pal(9, \"Reds\")[-(1:3)])), cex = 2)\n                    }\n\n                    ci <- vi[m1$clusters == 3]\n                    if(length(ci) > 3) {\n                        points(x[ci], y[ci], pch = \".\", col = densCols(x[ci], y[ci], colramp = colorRampPalette(brewer.pal(9, \"Greens\")[-(1:3)])), cex = 2)\n                    }\n                    ci <- vi[m1$clusters == 2]\n                    if(length(ci) > 3) {\n                        points(x[ci], y[ci], pch = \".\", col = densCols(x[ci], y[ci], colramp = colorRampPalette(brewer.pal(9, \"Blues\")[-(1:3)])), cex = 2)\n                    }\n                    legend(x = \"topleft\", pch = c(19), col = \"blue\", legend = paste(\"sr = \", round(cor(x[ci], y[ci], method = \"spearman\"), 2), sep = \"\"), bty = \"n\", cex = cex)\n                    legend(x = \"bottomright\", pch = c(rep(19, 3)), col = c(\"red\", \"blue\", \"green\"), legend = paste(round(unlist(tapply(m1$clusters, factor(m1$clusters, levels = c(1, 2, 3)), length))*100/length(vi), 1), \"%\", sep = \"\"), bty = \"n\", cex = cex)\n\n                } else if(!is.null(rl[[paste(ids[j], \"vs\", ids[i], sep = \".\")]])) { # BUGFIX: test the reversed (j.vs.i) key -- the original re-tested the forward key, leaving this reverse-plot branch unreachable\n                    m1 <- rl[[paste(ids[j], \"vs\", ids[i], sep = \".\")]]\n                    # reverse plot\n                    vi <- which(x > 0 | y > 0)\n                    ci <- vi[m1$clusters == 3]\n                    if(length(ci) > 3) {\n                        points(x[ci], y[ci], pch = \".\", col = densCols(x[ci], y[ci], colramp = colorRampPalette(brewer.pal(9, 
\"Reds\")[-(1:3)])), cex = 2)\n                    }\n\n                    ci <- vi[m1$clusters == 1]\n                    if(length(ci) > 3) {\n                        points(x[ci], y[ci], pch = \".\", col = densCols(x[ci], y[ci], colramp = colorRampPalette(brewer.pal(9, \"Greens\")[-(1:3)])), cex = 2)\n                    }\n                    ci <- vi[m1$clusters == 2]\n                    if(length(ci) > 3) {\n                        points(x[ci], y[ci], pch = \".\", col = densCols(x[ci], y[ci], colramp = colorRampPalette(brewer.pal(9, \"Blues\")[-(1:3)])), cex = 2)\n                    }\n                    legend(x = \"topleft\", pch = c(19), col = \"blue\", legend = paste(\"sr = \", round(cor(x[ci], y[ci], method = \"spearman\"), 2), sep = \"\"), bty = \"n\", cex = cex)\n                    legend(x = \"bottomright\", pch = c(rep(19, 3)), col = c(\"red\", \"blue\", \"green\"), legend = paste(round(unlist(tapply(m1$clusters, factor(m1$clusters, levels = c(3, 2, 1)), length))*100/length(vi), 1), \"%\", sep = \"\"), bty = \"n\", cex = cex)\n                } else {\n                    #message(paste(\"ERROR: unable to find model for i = \", i, \"j = \", j))\n                    message(paste(\"INFO: cross-fit plots: skipping model for i = \", i, \"j = \", j, \" (increase max.pairs parameter if needed\"))\n                }\n            }\n            #pdf(file = paste(group, \"crossfits.pdf\", sep = \".\"), width = 3*length(ids), height = 3*length(ids))\n            CairoPNG(filename = paste(group, \"crossfits.png\", sep = \".\"), width = 250*length(ids), height = 250*length(ids))\n            pairs.extended(log10(counts[, ids]+1), lower.panel = t.pairs.smoothScatter.spearman, upper.panel = t.panel.component.scatter, diag.panel = t.pairs.panel.hist, cex = 1.5)\n            dev.off()\n        })\n    }\n\n    return(rl)\n}\n\n# estimates library sizes based on the correlated components\n# min.size.entries - minimal number of entries (genes) used to determine 
scaling factors for individual experiments\n# counts - data frame with fragment counts (rows - fragments columns -experiments)\n# groups - a two-level factor describing grouping of columns. Use NA for observations that should be skipped\n# cfm - cross-fit models (return of calculate.crossfit.models())\n# vil - optional binary matrix (corresponding to counts) with 0s marking likely drop-out observations\n# return value - library size vector in millions of reads\nestimate.library.sizes <- function(counts, cfm, groups, min.size.entries = min(nrow(counts), 2e3), verbose = 0, return.details = FALSE, vil = NULL, ...) {\n    #require(edgeR)\n    names(groups) <- colnames(counts)\n    # determine the set fragments that were not attributed to failure in any cross-comparison\n    if(is.null(vil)) {\n        #x <- lapply(cfm, function(d) { ll <- list(!(1:nrow(counts)) %in% d$vi[which(d$clusters != 1)], !(1:nrow(counts)) %in% d$vi[which(d$clusters != 3)]) names(ll) <- d$ii return(ll) })\n        x <- lapply(cfm, function(d) { ll <- list(!(1:nrow(counts)) %in% d$vi[which(d$clusters > 1)], !(1:nrow(counts)) %in% d$vi[which(d$clusters %% 3  != 0)])\n        names(ll) <- d$ii\n        return(ll)\n        })\n        vil <- do.call(cbind, tapply(unlist(x, recursive = FALSE), factor(unlist(lapply(x, names)), levels = colnames(counts)[!is.na(groups)]), function(l) {\n            x <- rowSums(do.call(cbind, l), na.rm = FALSE) == 0\n            x[is.na(x)] <- FALSE\n            return(x)\n        }))\n    }\n\n    # order entries by the number of non-failed experiments,\n    # select entries for library size estimation\n    ni <- cbind(1:nrow(counts), rowSums(vil))\n    ni <- ni[order(ni[, 2], decreasing = TRUE), ]\n    if(nrow(ni)<min.size.entries) {\n        stop(\"The number of valid genes (\", nrow(ni), \") is lower than the specified min.size.entries (\", min.size.entries, \"). 
Please either increase min.size.entries or lower min.nonfailed parameter to increase the number of valid genes\")\n    }\n    if(ni[min.size.entries, 2]<ncol(vil)) {\n        # if the min.size.entries -th gene has failures, take only min.size.entries genes\n        gis <- ni[1:min.size.entries, 1]\n    } else {\n        # otherwise take all genes that have not failed in any experiment\n        gis <- ni[ni[, 2] == ncol(vil), 1]\n    }\n\n    if(verbose)  message(paste(\"adjusting library size based on\", length(gis), \"entries\"))\n    f <- calcNormFactors(as.matrix(counts[gis, !is.na(groups)]), ...)\n    f <- f/exp(mean(log(f)))\n    ls <- colSums(counts[gis, !is.na(groups)])*f/1e6\n    if(return.details) { return(list(ls = ls, vil = vil)) } else { return(ls) }\n}\n\n# an alternative prior estimation procedure that weights down contributions by failure probability\n# and uses pre-scaled fpkm guesses for magnitude estimates\nestimate.signal.prior <- function(fpkm, fail, length.out = 400, show.plot = FALSE, pseudo.count = 1, bw = 0.1, max.quantile = 0.999, max.value = NULL) {\n    fpkm <- log10(exp(as.matrix(fpkm))+1)\n    wts <- as.numeric(as.matrix(1-fail[, colnames(fpkm)]))\n    wts <- wts/sum(wts)\n    # fit density on a mirror image\n    if(is.null(max.value)) {\n        x <- as.numeric(fpkm)\n        max.value <- as.numeric(quantile(x[x<Inf], p = max.quantile))\n    }\n    md <- density(c(-1*as.numeric(fpkm), as.numeric(fpkm)), bw = bw, weights = c(wts/2, wts/2), n = 2*length.out+1, from = -1*max.value, to = max.value)\n\n    gep <- data.frame(x = md$x[-c(1:length.out)], y = md$y[-c(1:length.out)])\n    gep$y[is.na(gep$y)] <- 0\n    gep$y <- gep$y+pseudo.count/nrow(fpkm) # pseudo-count\n    gep$y <- gep$y/sum(gep$y)\n    if(show.plot) {\n        par(mfrow = c(1, 1), mar = c(3.5, 3.5, 3.5, 0.5), mgp = c(2.0, 0.65, 0), cex = 0.9)\n        plot(gep$x, gep$y, col = 4, panel.first = abline(h = 0, lty = 2), type = 'l', xlab = \"log10( signal+1 )\", ylab = 
\"probability density\", main = \"signal prior\")\n    }\n    gep$lp <- log(gep$y)\n\n    # grid weighting (for normalization)\n    gep$grid.weight <- diff(10^c(gep$x[1], gep$x+c(diff(gep$x)/2, 0))-1)\n\n    return(gep)\n    plot(x)\n}\n\n# counts - data frame with gene counts (rows - genes columns -experiments)\n# groups - a two-level factor describing grouping of columns. Use NA for observations that should be skipped\n# cfm - cross-fit models (return of calculate.crossfit.models())\n# min.nonfailed - minimal number of non-failed observations required for a gene to be used in the final model fitting\n#  A minimum of either the specified value or number of experiments -1 will be used.\ncalculate.individual.models <- function(counts, groups, cfm, nrep = 1, verbose = 0, n.cores = 12, min.nonfailed = 2, min.size.entries = 2e3, zero.count.threshold = 10, save.plots = TRUE, linear.fit = TRUE, return.compressed.models = FALSE,  local.theta.fit = FALSE, theta.fit.range = c(1e-2, 1e2), ...) {\n    names(groups) <- colnames(counts)\n    # determine library size discarding non-zero entries\n    ls <- estimate.library.sizes(counts, cfm, groups, min.size.entries, verbose = verbose, return.details = TRUE)\n\n    # fit three-component models to unique pairs within each group\n    mll <- tapply(colnames(counts), groups, function(ids) {\n        cl <- combn(ids, 2)\n        group <- as.character(groups[ids[1]])\n\n        # incorporate cross-fit pairs from cfm\n        pn1 <- unlist(apply(cl, 2, function(ii) paste(ii, collapse = \".vs.\")))\n        pn2 <- unlist(apply(cl, 2, function(ii) paste(rev(ii), collapse = \".vs.\"))) ### %%% use rev() to revert element order\n        vi <- (pn1 %in% names(cfm)) | (pn2 %in% names(cfm)) # check both reverse and forward pairing\n        #if(!all(vi)) stop(\"unable to find cross-fit models for the following pairs : \", paste(pn1[!vi]))\n        if(!all(vi)) {\n            if(verbose > 0) {\n                if(verbose > 1) {\n                 
   cat(paste(\"WARNING: unable to find cross-fit models for the following pairs : \", paste(pn1[!vi], collapse = \" \")), \"\\n\")\n                } else {\n                    cat(\"WARNING: unable to find cross-fit models for \", sum(!vi), \" out of \", length(vi), \" pairs. Using a subset.\\n\")\n                }\n            }\n            # use a subset\n            if(sum(vi) > 3) {\n                pn1 <- pn1[vi]\n                pn2 <- pn2[vi]\n                vi <- vi[vi]\n            } else {\n                stop(\"less than 3 valid cross-fit pairs are available! giving up.\")\n            }\n        }\n\n        #rl <- cfm[vi]\n        vi.names<-names(cfm)[names(cfm) %in% c(pn1, pn2)] ### a similar selection was done like this in calculate.crossfit.models() function\n        rl <- cfm[vi.names]  ### with this sub-selection we select only sample pairs within the current group (e.g. pairs of ES)\n\n        # determine the set genes that were not attributed to failure in any cross-comparison\n        x <- lapply(rl, function(d) {\n            ll <- list(!(1:nrow(counts)) %in% d$vi[which(d$clusters > 1)], !(1:nrow(counts)) %in% d$vi[which(d$clusters %% 3  != 0)])\n            names(ll) <- d$ii\n            return(ll)\n        })\n        vil <- do.call(cbind, tapply(unlist(x, recursive = FALSE), factor(unlist(lapply(x, names)), levels = ids), function(l) {\n            x <- rowSums(do.call(cbind, l), na.rm = FALSE) == 0\n            x[is.na(x)] <- FALSE\n            return(x)\n        }))\n\n        #x <- lapply(rl, function(d) { ll <- list((d$failures == 1), (d$failures == 2)) names(ll) <- d$ii return(ll) })\n        #vil <- do.call(cbind, tapply(unlist(x, recursive = FALSE), factor(unlist(lapply(x, names)), levels = ids), function(l) { x <- rowSums(do.call(cbind, l), na.rm = FALSE) == 0 x[is.na(x)] <- FALSE return(x) }))\n\n        t.ls <- ls$ls[ids]\n        adjust <- NULL\n        if(!is.null(ls$adjustments)) { ls$adjustments[[groups[ids[1]]]] }\n     
   # fit two-NB2 mixture for each experiment\n        if(verbose) { message(paste(\"fitting\", group, \"models:\")) }\n        gc()\n\n        # pair cell name matrix\n        nm <- do.call(rbind, lapply(rl, function(x) x$ii))\n\n        ml <- papply(seq_along(ids), function(i) { try({\n            if(verbose)  message(paste(i, \":\", ids[i]))\n            # determine genes with sufficient number of non-failed observations in other experiments\n            vi <- which(rowSums(vil[, -i, drop = FALSE]) >= min(length(ids)-1, min.nonfailed))\n            fpm <- rowMeans(t(t(counts[vi, ids[-i], drop = FALSE])/(t.ls[-i])))\n            if(!is.null(adjust)) { fpm <- adjust(fpm)  } # adjust for between-group systematic differences\n            df <- data.frame(count = counts[vi, ids[i]], fpm = fpm)\n\n            # reconstruct failure prior for the cell by averaging across\n            # cross-cell comparisons where the cell did participate\n            cp <- exp(rowMeans(log(cbind(\n                do.call(cbind, lapply(rl[which(nm[, 1] == ids[i])], function(d) {\n                    ivi <- rep(NA, nrow(counts))\n                    ivi[d$vi] <- 1:length(d$vi)\n                    d$posterior[ivi[vi], 1]\n                })),\n                do.call(cbind, lapply(rl[which(nm[, 2] == ids[i])], function(d) {\n                    ivi <- rep(NA, nrow(counts))\n                    ivi[d$vi] <- 1:length(d$vi)\n                    d$posterior[ivi[vi], 2]\n                }))\n            )), na.rm = TRUE))\n            cp <- cbind(cp, 1-cp)\n\n            nai <- which(is.na(cp[, 1]))\n            cp[nai, 1] <- 1-(1e-10)\n            cp[nai, 2] <- (1e-10)\n            if(linear.fit) {\n                m1 <- fit.nb2gth.mixture.model(df, prior = cp, nrep = 1, verbose = verbose, zero.count.threshold = zero.count.threshold, full.theta.range = theta.fit.range, theta.fit.range = theta.fit.range, use.constant.theta.fit = !local.theta.fit, ...)\n            } else {\n                m1 
<- fit.nb2.mixture.model(df, prior = cp, nrep = nrep, verbose = verbose, zero.count.threshold = zero.count.threshold, ...)\n            }\n\n            if(return.compressed.models) {\n                v <- get.compressed.v1.model(m1)\n                cl <- clusters(m1)\n                rm(m1)\n                gc()\n                return(list(model = v, clusters = cl))\n            }\n\n            # otherwise try to reduce the size of a full model\n            # reduce return size\n            #m1@posterior <- lapply(m1@posterior, function(m) { rownames(m) <- NULL return(m)})\n            m1@posterior <- NULL\n            #rownames(m1@concomitant@x) <- NULL\n            m1@concomitant@x <- matrix()\n            m1@model <- lapply(m1@model, function(mod) {\n                mod@x <- matrix()\n                mod@y <- matrix()\n                #rownames(mod@x) <- NULL\n                #rownames(mod@y) <- NULL\n                return(mod)\n            })\n\n            # make a clean copy of the internal environment\n            t.cleanenv <- function(comp) {\n                el <- list2env(as.list(environment(comp@logLik), all.names = TRUE), parent = globalenv())\n                ep <- list2env(as.list(environment(comp@predict), all.names = TRUE), parent = globalenv())\n                pf <- get(\"predict\", envir = el)\n                environment(pf) <- ep\n                assign(\"predict\", pf, envir = el)\n                pf <- get(\"predict\", envir = ep)\n                environment(pf) <- ep\n                assign(\"predict\", pf, envir = ep)\n\n                pf <- get(\"logLik\", envir = el)\n                environment(pf) <- el\n                assign(\"logLik\", pf, envir = el)\n                pf <- get(\"logLik\", envir = ep)\n                environment(pf) <- el\n                assign(\"logLik\", pf, envir = ep)\n\n                environment(comp@logLik) <- el\n                environment(comp@predict) <- ep\n                comp\n            }\n 
           m1@components <- lapply(m1@components, function(cl) lapply(cl, t.cleanenv))\n\n            # clean up the formula environment (was causing multithreading problems)\n            rm(list = ls(env = attr(m1@concomitant@formula, \".Environment\")), envir = attr(m1@concomitant@formula, \".Environment\"))\n            gc()\n            #rm(list = ls(env = attr(m1@formula, \".Environment\")), envir = attr(m1@formula, \".Environment\"))\n            return(m1)\n        })}, n.cores = n.cores) # end cell iteration\n\n        # check if there were errors in the multithreaded portion\n        vic <- which(unlist(lapply(seq_along(ml), function(i) {\n            if(class(ml[[i]]) == \"try-error\") {\n                message(\"ERROR encountered in building a model for cell \", ids[i], \" - skipping the cell. Error:\")\n                message(ml[[i]])\n                #tryCatch(stop(paste(\"ERROR encountered in building a model for cell \", ids[i])), error = function(e) stop(e))\n                return(FALSE);\n            }\n            return(TRUE);\n        })))\n        ml <- ml[vic]; names(ml) <- ids[vic];\n\n        if(length(vic)<length(ids)) {\n          message(\"ERROR fitting of \", (length(ids)-length(vic)), \" out of \", length(ids), \" cells resulted in errors reporting remaining \", length(vic), \" cells\")\n        }\n\n        if(save.plots && length(ml)>0) {\n            # model fits\n            #CairoPNG(filename = paste(group, \"model.fits.png\", sep = \".\"), width = 1024, height = 300*length(ids))\n            pdf(file = paste(group, \"model.fits.pdf\", sep = \".\"), width = ifelse(linear.fit, 15, 13), height = 4)\n            #l <- layout(matrix(seq(1, 4*length(ids)), nrow = length(ids), byrow = TRUE), rep(c(1, 1, 1, 0.5), length(ids)), rep(1, 4*length(ids)), FALSE)\n            l <- layout(matrix(seq(1, 4), nrow = 1, byrow = TRUE), rep(c(1, 1, 1, ifelse(linear.fit, 1, 0.5)), 1), rep(1, 4), FALSE)\n            par(mar = c(3.5, 3.5, 3.5, 0.5), 
mgp = c(2.0, 0.65, 0), cex = 0.9)\n            invisible(lapply(seq_along(vic), function(j) {\n                i <- vic[j];\n                vi <- which(rowSums(vil[, -i, drop = FALSE]) >= min(length(ids)-1, min.nonfailed))\n                df <- data.frame(count = counts[vi, ids[i]], fpm = rowMeans(t(t(counts[vi, ids[-i], drop = FALSE])/(t.ls[-i]))))\n                plot.nb2.mixture.fit(ml[[j]], df, en = ids[i], do.par = FALSE, compressed.models = return.compressed.models)\n            }))\n            dev.off()\n        }\n\n        return(ml)\n\n    }) # end group iteration\n\n    if(return.compressed.models) {\n        # make a joint model matrix\n        jmm <- data.frame(do.call(rbind, lapply(mll, function(tl) do.call(rbind, lapply(tl, function(m) m$model)))))\n        rownames(jmm) <- unlist(lapply(mll, names))\n        # reorder in the original cell order\n        attr(jmm, \"groups\") <- rep(names(mll), unlist(lapply(mll, length)))\n        return(jmm)\n    } else {\n        return(mll)\n    }\n}\n\n\n#######\n## V1 optimized methods\n#######\n\n# gets an array summary of gam model structure (assumes a flat ifm list)\nget.compressed.v1.models <- function(ifml) {\n    data.frame(do.call(rbind, lapply(ifml, get.compressed.v1.model)))\n}\n\n# get a vector representation of a given model\nget.compressed.v1.model <- function(m1) {\n    if(class(m1@model[[2]]) == \"FLXMRnb2gthC\") { # linear fit model\n        v <- c(m1@concomitant@coef[c(1:2), 2], get(\"coef\", environment(m1@components[[1]][[1]]@predict)))\n        names(v) <- c(\"conc.b\", \"conc.a\", \"fail.r\")\n        vth <- m1@components[[2]][[2]]@parameters$coef\n        # translate mu regression from linear to log model\n        v <- c(v, c(\"corr.b\" = log(as.numeric(vth[\"corr.a\"])), \"corr.a\" = 1), vth[-match(\"corr.a\", names(vth))], \"conc.a2\" = m1@concomitant@coef[3, 2])\n    } else { # original publication model\n        v <- c(m1@concomitant@coef[, 2], get(\"coef\", 
environment(m1@components[[1]][[1]]@predict)), m1@components[[2]][[2]]@parameters$coef, get(\"theta\", environment(m1@components[[2]][[2]]@predict)))\n        names(v) <- c(\"conc.b\", \"conc.a\", \"fail.r\", \"corr.b\", \"corr.a\", \"corr.theta\")\n    }\n    v\n}\n\n# calculates posterior matrices (log scale) for a set of ifm models\ncalculate.posterior.matrices <- function(dat, ifm, prior, n.cores = 32, inner.cores = 4, outer.cores = round(n.cores/inner.cores)) {\n    marginals <- data.frame(fpm = 10^prior$x - 1)\n    marginals$fpm[marginals$fpm<0] <- 0\n    lapply(ifm, function(group.ifm) {\n        papply(sn(names(group.ifm)), function(nam) {\n            df <- get.exp.logposterior.matrix(group.ifm[[nam]], dat[, nam], marginals, n.cores = inner.cores, grid.weight = prior$grid.weight)\n            rownames(df) <- rownames(dat)\n            colnames(df) <- as.character(prior$x)\n            return(df)\n        }, n.cores = n.cores)\n    })\n}\n\nsample.posterior <- function(dat, ifm, prior, n.samples = 1, n.cores = 32) {\n    marginals <- data.frame(fpm = 10^prior$x - 1)\n    lapply(ifm, function(group.ifm) {\n        papply(sn(names(group.ifm)), function(nam) {\n            get.exp.sample(group.ifm[[nam]], dat[, nam], marginals, prior.x = prior$x, n = n.samples)\n        }, n.cores = n.cores)\n    })\n}\n\n# calculate joint posterior matrix for a given group of experiments\n# lmatl - list of posterior matrices (log scale) for individual experiments\ncalculate.joint.posterior.matrix <- function(lmatl, n.samples = 100, bootstrap = TRUE, n.cores = 15) {\n    if(bootstrap) {\n        jpl <- papply(seq_len(n.cores), function(i) jpmatLogBoot(Matl = lmatl, Nboot = ceiling(n.samples/n.cores), Seed = i), n.cores = n.cores)\n        jpl <- Reduce(\"+\", jpl)\n        jpl <- jpl/rowSums(jpl)\n    } else {\n        jpl <- Reduce(\"+\", lmatl)\n        jpl <- exp(jpl-log.row.sums(jpl))\n    }\n    rownames(jpl) <- rownames(lmatl[[1]])\n    colnames(jpl) <- 
colnames(lmatl[[1]])\n    jpl\n}\n\n# calculate joint posterior of a group defined by a composition vector\n# lmatll - list of posterior matrix lists (as obtained from calculate.posterior.matrices)\n# composition - a named vector, indicating the number of samples that should be drawn from each element of lmatll to compose a group\ncalculate.batch.joint.posterior.matrix <- function(lmatll, composition, n.samples = 100, n.cores = 15) {\n    # reorder composition vector to match lmatll names\n    jpl <- papply(seq_len(n.cores), function(i) jpmatLogBatchBoot(lmatll, composition[names(lmatll)], ceiling(n.samples/n.cores), i), n.cores = n.cores)\n    jpl <- Reduce(\"+\", jpl)\n    jpl <- jpl/rowSums(jpl)\n    #jpl <- jpmatLogBatchBoot(lmatll, composition[names(lmatll)], n.samples, n.cores)\n    rownames(jpl) <- rownames(lmatll[[1]][[1]])\n    colnames(jpl) <- colnames(lmatll[[1]][[1]])\n    jpl\n}\n\n# calculates the likelihood of expression difference based on\n# two posterior matrices (not adjusted for prior)\ncalculate.ratio.posterior <- function(pmat1, pmat2, prior, n.cores = 15, skip.prior.adjustment = FALSE) {\n    n <- length(prior$x)\n    if(!skip.prior.adjustment) {\n        pmat1 <- t(t(pmat1)*prior$y)\n        pmat2 <- t(t(pmat2)*prior$y)\n    }\n\n    chunk <- function(x, n) split(x, sort(rank(x) %% n))\n    if(n.cores > 1) {\n        x <- do.call(rbind, papply(chunk(1:nrow(pmat1), n.cores*5), function(ii) matSlideMult(pmat1[ii, , drop = FALSE], pmat2[ii, , drop = FALSE]), n.cores = n.cores))\n    } else {\n        x <- matSlideMult(pmat1, pmat2)\n    }\n    x <- x/rowSums(x)\n\n    rv <- seq(prior$x[1]-prior$x[length(prior$x)], prior$x[length(prior$x)]-prior$x[1], length = length(prior$x)*2-1)\n    colnames(x) <- as.character(rv)\n    rownames(x) <- rownames(pmat1)\n    return(x)\n}\n\n# quick utility function to get the difference Z score from the ratio posterior\nget.ratio.posterior.Z.score <- function(rpost, min.p = 1e-15) {\n    rpost <- rpost+min.p\n    
rpost <- rpost/rowSums(rpost)\n    zi <- which.min(abs(as.numeric(colnames(rpost))))\n    gs <- rowSums(rpost[, 1:(zi-1), drop = FALSE])\n    zl <- pmin(0, qnorm(gs, lower.tail = FALSE))\n    zg <- pmax(0, qnorm(gs+rpost[, zi, drop = FALSE], lower.tail = FALSE))\n    z <- ifelse(abs(zl) > abs(zg), zl, zg)\n}\n\n# calculate a joint posterior matrix with bootstrap\njpmatLogBoot <- function(Matl, Nboot, Seed) {\n    .Call(\"jpmatLogBoot\", Matl, Nboot, Seed, PACKAGE = \"scde\")\n}\n\n# similar to the above, but compiles joint by sampling a pre-set\n# number of different types (defined by Comp factor)\njpmatLogBatchBoot <- function(Matll, Comp, Nboot, Seed) {\n    .Call(\"jpmatLogBatchBoot\", Matll, Comp, Nboot, Seed, PACKAGE = \"scde\")\n}\n\nmatSlideMult <- function(Mat1, Mat2) {\n    .Call(\"matSlideMult\", Mat1, Mat2, PACKAGE = \"scde\")\n}\n\ncalculate.failure.p <- function(dat, ifm, n.cores = 32) {\n    lapply(ifm, function(group.ifm) {\n        lapply(sn(names(group.ifm)), function(nam) {\n            get.concomitant.prob(group.ifm[[nam]], counts = dat[, nam])\n        })\n    })\n}\n\n# calculate failure probabilities across all cells for a given set\n# of levels (lfpm - log(fpm) vector for all genes\ncalculate.failure.lfpm.p <- function(lfpm, ifm, n.cores = 32) {\n    lapply(ifm, function(group.ifm) {\n        lapply(sn(names(group.ifm)), function(nam) {\n            get.concomitant.prob(group.ifm[[nam]], lfpm = lfpm)\n        })\n    })\n}\n\n# get expected fpm from counts\nget.fpm.estimates <- function(m1, counts) {\n    if(class(m1@components[[2]][[2]]) == \"FLXcomponentE\") {\n        # gam do inverse interpolation\n        b1 <- get(\"b1\", envir = environment(m1@components[[2]][[2]]@predict))\n        z <- approx(x = b1$fitted.values, y = b1$model$x, xout = counts, rule = 1:2)$y\n        z[is.na(z)] <- -Inf\n        z\n    } else {\n        # linear model\n        par <- m1@components[[2]][[2]]@parameters\n        if(!is.null(par[[\"linear\"]])) {\n      
      log((counts-par$coef[[1]])/par$coef[[2]])\n        } else {\n            (log(counts)-par$coef[[1]])/par$coef[[2]]\n        }\n    }\n}\n\n\n#######\n## INTERNAL FUNCTIONS\n#######\n\n# clean up stale web server reference\n.onAttach <- function(...) {\n\n    if(exists(\"___scde.server\", envir = globalenv())) {\n        old.server <- get(\"___scde.server\", envir = globalenv())\n        n.apps <- length(old.server$appList)-1\n        # TODO fix server rescue...\n        packageStartupMessage(\"scde: found stale web server instance with \", n.apps, \" apps. removing.\")\n        # remove\n        rm(\"___scde.server\", envir = globalenv())\n        return(TRUE)\n\n        if(n.apps > 0) {\n            require(Rook)\n            require(rjson)\n            packageStartupMessage(\"scde: found stale web server instance with \", n.apps, \" apps. restarting.\")\n            rm(\"___scde.server\", envir = globalenv()) # remove old instance (apparently saved Rook servers can't just be restarted ... we'll make a new one and re-add all of the apps\n\n            tryCatch( {\n                server <- get.scde.server(ip = old.server$listenAddr, port = old.server$listenPort) # launch a new server\n                if(!is.null(server)) {\n                    lapply(old.server$appList[-1], function(sa) {\n                        server$add(app = sa$app, name = sa$name)\n                    })\n                }\n            }, error = function(e) message(e))\n\n        } else {\n            packageStartupMessage(\"scde: found stale web server instance with \", n.apps, \" apps. 
removing.\")\n            # remove\n            rm(\"___scde.server\", envir = globalenv())\n        }\n    }\n}\n\n.onUnload <- function(libpath) {\n    library.dynam.unload(\"scde\", libpath, verbose = TRUE)\n}\n\n# rdf : count/fpm data frame\nfit.nb2.mixture.model <- function(rdf, zero.count.threshold = 10, prior = cbind(rdf$count<= zero.count.threshold, rdf$count > zero.count.threshold), nrep = 3, iter = 50, verbose = 0, background.rate = 0.1, ...) {\n    #mo1 <- FLXMRnb2glmC(count~1, components = c(1), theta.range = c(0.5, Inf))\n    #mo1 <- FLXMRglmCf(count~1, components = c(1), family = \"poisson\", mu = 0.01)\n    #mo1 <- FLXMRglmC(count~1, components = c(1), family = \"poisson\")\n    mo1 <- FLXMRglmCf(count~1, family = \"poisson\", components = c(1), mu = log(background.rate))\n    mo2 <- FLXMRnb2glmC(count~1+I(log(fpm)), components = c(2), theta.range = c(0.5, Inf))\n\n    m1 <- mc.stepFlexmix(count~1, data = rdf, k = 2, model = list(mo1, mo2), control = list(verbose = verbose, minprior = 0, iter = iter), concomitant = FLXPmultinom(~I(log(fpm))+1), cluster = prior, nrep = nrep, ...)\n\n    # check if the theta was underfit\n    if(get(\"theta\", envir = environment(m1@components[[2]][[2]]@logLik)) == 0.5) {\n        # refit theta\n        sci <- clusters(m1) == 2\n        fit <- glm.nb.fit(m1@model[[2]]@x[sci, , drop = FALSE], m1@model[[2]]@y[sci], weights = rep(1, sum(sci)), offset = c(), init.theta = 0.5)\n        assign(\"theta\", value = fit$theta, envir = environment(m1@components[[2]][[2]]@logLik))\n        m1@components[[2]][[2]]@parameters$coef <- fit$coefficients\n        assign(\"coef\", value = fit$coefficients, envir = environment(m1@components[[2]][[2]]@logLik))\n        message(\"WARNING: theta was underfit, new theta = \", fit$theta)\n    }\n\n    return(m1)\n}\n\nfit.nb2gth.mixture.model <- function(rdf, zero.count.threshold = 10, prior = as.integer(rdf$count >= zero.count.threshold | 
rdf$fpm<median(rdf$fpm[rdf$count<zero.count.threshold]))+1, nrep = 0, verbose = 0 , full.theta.range = c(1e-2, 1e2), theta.fit.range = full.theta.range, theta.sp = 1e-2, use.constant.theta.fit = FALSE, alpha.weight.power = 1/2, iter = 50) {\n    #mo1 <- FLXMRglmC(count~1, components = c(1), family = \"poisson\")\n    #matrix(cbind(ifelse(rdf$count<= zero.count.threshold, 0.95, 0.05), ifelse(rdf$count > zero.count.threshold, 0.95, 0.05)))\n    mo1 <- FLXMRglmCf(count~1, family = \"poisson\", components = c(1), mu = log(0.1))\n    mo2 <- FLXMRnb2gthC(count~0+fpm, components = c(2), full.theta.range = full.theta.range, theta.fit.range = theta.fit.range, theta.fit.sp = theta.sp, constant.theta = use.constant.theta.fit, alpha.weight.power = alpha.weight.power)\n    m1 <- mc.stepFlexmix(count~1, data = rdf, k = 2, model = list(mo1, mo2), control = list(verbose = verbose, minprior = 0, iter = iter), concomitant = FLXPmultinom(~I(log(fpm))+I(log(fpm)^2)+1), cluster = prior, nrep = nrep)\n    return(m1)\n}\n\n# rdf : count/fpm data frame\n# en : experiment name for plotting\n# n.zero.windows - number of windows to visualize failure model fit\n# m1 - fitted model\nplot.nb2.mixture.fit <- function(m1, rdf, en, do.par = TRUE, n.zero.windows = 50, compressed.models = FALSE, bandwidth = 0.05) {\n    #require(Cairo) require(RColorBrewer)\n    if(do.par) {\n        CairoPNG(filename = paste(en, \"model.fit.png\", sep = \".\"), width = 800, height = 300)\n        l <- layout(matrix(c(1:4), 1, 4, byrow = TRUE), c(1, 1, 1, 0.5), rep(1, 4), FALSE)\n        par(mar = c(3.5, 3.5, 3.5, 0.5), mgp = c(2.0, 0.65, 0), cex = 0.9)\n    }\n    smoothScatter(log10(rdf$fpm+1), log10(rdf$count+1), xlab = \"expected FPM\", ylab = \"observed counts\", main = paste(en, \"scatter\", sep = \" : \"), bandwidth = bandwidth)\n\n    plot(c(), c(), xlim = range(log10(rdf$fpm+1)), ylim = range(log10(rdf$count+1)), xlab = \"expected FPM\", ylab = \"observed counts\", main = paste(en, \"components\", sep = \" 
: \"))\n    if(compressed.models) {\n        vpi <- m1$clusters == 1\n    } else {\n        vpi <- clusters(m1) == 1\n    }\n    if(sum(vpi) > 2){\n        points(log10(rdf$fpm[vpi]+1), log10(rdf$count[vpi]+1), pch = \".\", col = densCols(log10(rdf$fpm[vpi]+1), log10(rdf$count[vpi]+1), colramp = colorRampPalette(brewer.pal(9, \"Reds\")[-(1:3)])), cex = 2)\n    }\n    if(sum(!vpi) > 2){\n        points(log10(rdf$fpm[!vpi]+1), log10(rdf$count[!vpi]+1), pch = \".\", col = densCols(log10(rdf$fpm[!vpi]+1), log10(rdf$count[!vpi]+1), colramp = colorRampPalette(brewer.pal(9, \"Blues\")[-(1:3)])), cex = 2)\n        # show fit\n        fpmo <- order(rdf$fpm[!vpi], decreasing = FALSE)\n        if(compressed.models) {\n            #rf <- scde.failure.probability(data.frame(t(m1$model)), magnitudes = log(rdf$fpm))\n            lines(log10(rdf$fpm[!vpi]+1)[fpmo], log10(exp(m1$model[[\"corr.a\"]]*log(rdf$fpm[!vpi])[fpmo]+m1$model[[\"corr.b\"]])+1))\n            if(\"corr.ltheta.b\" %in% names(m1$model)) {\n                # show 95% CI for the non-constant theta fit\n                xval <- range(log(rdf$fpm[!vpi]))\n                xval <- seq(xval[1], xval[2], length.out = 100)\n                thetas <- get.corr.theta(m1$model, xval)\n                #thetas <- exp(m1$model[[\"corr.ltheta.i\"]]+m1$model[[\"corr.ltheta.lfpm\"]]*xval)\n                #thetas <- (1+exp((m1$model[\"corr.ltheta.lfpm.m\"] - xval)/m1$model[\"corr.ltheta.lfpm.s\"]))/m1$model[\"corr.ltheta.a\"]\n                alpha <- 0.05\n                yval <- exp(m1$model[[\"corr.a\"]]*xval + m1$model[[\"corr.b\"]])\n                lines(log10(exp(xval)+1), log10(qnbinom(alpha/2, size = thetas, mu = yval)+1), col = 1, lty = 2)\n                lines(log10(exp(xval)+1), log10(qnbinom(1-alpha/2, size = thetas, mu = yval)+1), col = 1, lty = 2)\n                lines(log10(exp(xval)+1), log10(qnbinom(alpha/2, size = m1$model[[\"corr.theta\"]], mu = yval)+1), col = 8, lty = 2)\n                
lines(log10(exp(xval)+1), log10(qnbinom(1-alpha/2, size = m1$model[[\"corr.theta\"]], mu = yval)+1), col = 8, lty = 2)\n            }\n        } else {\n            lines(log10(rdf$fpm[!vpi]+1)[fpmo], log10(m1@components[[2]][[2]]@predict(cbind(1, log(rdf$fpm[!vpi])))+1)[fpmo], col = 4)\n        }\n    }\n    legend(x = \"topleft\", col = c(\"red\", \"blue\"), pch = 19, legend = c(\"failure component\", \"correlated component\"), bty = \"n\", cex = 0.9)\n\n    # zero fit\n    if(n.zero.windows > nrow(rdf)) { n.zero.windows <- nrow(rdf) }\n    bw <- floor(nrow(rdf)/n.zero.windows)\n    if(compressed.models) {\n        rdf$cluster <- m1$clusters\n    } else {\n        rdf$cluster <- clusters(m1)\n    }\n    rdf <- rdf[order(rdf$fpm, decreasing = FALSE), ]\n    fdf <- data.frame(y = rowMeans(matrix(log10(rdf$fpm[1:(n.zero.windows*bw)]+1), ncol = bw, byrow = TRUE)), zf = rowMeans(matrix(as.integer(rdf$cluster[1:(n.zero.windows*bw)] == 1), ncol = bw, byrow = TRUE)))\n    plot(zf~y, fdf, ylim = c(0, 1), xlim = range(na.omit(log10(rdf$fpm+1))), xlab = \"expected FPM\", ylab = \"fraction of failures\", main = \"failure model\", pch = 16, cex = 0.5)\n    ol <- order(rdf$fpm, decreasing = TRUE)\n    if(compressed.models) {\n        fp <- scde.failure.probability(data.frame(t(m1$model)), magnitudes = log(rdf$fpm))\n        lines(log10(rdf$fpm[ol]+1), fp[ol], col = 2)\n    } else {\n        mt <- terms(m1@concomitant@formula, data = rdf)\n        mf <- model.frame(delete.response(mt), data = rdf, na.action = NULL)\n        cm0 <- exp(model.matrix(mt, data = mf) %*% m1@concomitant@coef)\n        cm0 <- cm0/rowSums(cm0)\n        lines(log10(rdf$fpm[ol]+1), cm0[ol, 1], col = 2)\n    }\n\n\n    # show thetas\n    #tl <- c(fail = get(\"theta\", envir = environment(m1@components[[1]][[1]]@logLik)), corr = get(\"theta\", envir = environment(m1@components[[2]][[2]]@logLik)))\n    if(compressed.models) {\n        if(\"corr.ltheta.b\" %in% names(m1$model)) {\n            p <- 
exp(m1$model[[\"corr.a\"]]*log(rdf$fpm[!vpi])+m1$model[[\"corr.b\"]])\n            alpha <- ((rdf$count[!vpi]/p-1)^2 - 1/p)\n            trng <- log(range(c(m1$model[[\"corr.theta\"]], thetas))) + 0.5*c(-1, 1)\n            # restrict the alpha to the confines of the estimated theta values\n            alpha[alpha > exp(-trng[1])] <- exp(-trng[1])\n            alpha[alpha<exp(-trng[2])] <- exp(-trng[2])\n\n            smoothScatter(log10(rdf$fpm[!vpi]+1), -log10(alpha), ylim = trng*log10(exp(1)), xlab = \"FPM\", ylab = \"log10(theta)\", main = \"overdispersion\", bandwidth = bandwidth)\n            xval <- range(log(rdf$fpm[!vpi]))\n            xval <- seq(xval[1], xval[2], length.out = 100)\n            #thetas <- exp(m1$model[[\"corr.ltheta.i\"]]+m1$model[[\"corr.ltheta.lfpm\"]]*xval)\n            thetas <- get.corr.theta(m1$model, xval)\n            #plot(log10(exp(xval)+1), log(thetas), ylim = trng, type = 'l', xlab = \"FPM\", ylab = \"log(theta)\", main = \"overdispersion\")\n            lines(log10(exp(xval)+1), log10(thetas))\n            abline(h = log10(m1$model[[\"corr.theta\"]]), col = 1, lty = 2)\n        } else {\n            tl <- c(fail = c(), corr = m1$model[[\"corr.theta\"]])\n            barplot(tl, beside = TRUE, las = 2, col = c(\"dodgerblue1\", \"indianred1\"), ylab = \"magnitude\", main = \"theta\")\n        }\n\n    } else {\n        tl <- c(fail = c(0), corr = get(\"theta\", envir = environment(m1@components[[2]][[2]]@logLik)))\n        barplot(tl, beside = TRUE, las = 2, col = c(\"dodgerblue1\", \"indianred1\"), ylab = \"magnitude\", main = \"theta\")\n    }\n    box()\n    if(do.par) {   dev.off() }\n}\n\n## from nb2.crossmodels.r\nmc.stepFlexmix <- function(..., nrep = 5, n.cores = nrep, return.all = FALSE) {\n    if(nrep < 2) {\n        return(flexmix(...))\n    } else {\n        ml <- papply(seq_len(nrep), function(m) {\n            x = try(flexmix(...))\n        }, n.cores = n.cores)\n        if(return.all) { return(ml) }\n        ml <- 
ml[unlist(lapply(ml, function(x) !is(x, \"try-error\")))]\n        logLiks <- unlist(lapply(ml, logLik))\n        ml[[which.max(logLiks)]]\n    }\n}\n\n# df: count matrix\n# xr: expression level for each row in the matrix\n# ml: fitted model list for a replicate\nget.rep.set.posteriors <- function(xr, df, ml, rescale = TRUE) {\n    pl <- do.call(cbind, lapply(seq_along(ml), function(i) {\n        edf <- data.frame(y = df[, i], xr = xr)\n        m1 <- ml[[i]]\n        x <- FLXgetModelmatrix(m1@model[[1]], edf, m1@model[[1]]@formula)\n        #cx <- FLXgetModelmatrix(m1@concomitant, edf, m1@concomitant@formula)\n        cm <- (1/(1+exp(-1*(x@x %*% m1@concomitant@coef[, -1]))))\n        p1 <- (1-cm)*exp(FLXdeterminePostunscaled(x, m1@components[[1]]))\n        p2 <- cm*exp(FLXdeterminePostunscaled(x, m1@components[[2]]))\n        tpr <- as.numeric(p1+p2)\n        if(rescale) {tpr <- tpr/sum(tpr) }\n        return(tpr)\n    }))\n    colnames(pl) <- names(ml)\n\n    return(pl)\n}\n\n# evaluates likelihood for a list of models and a set of\n# corresponding counts\n# ml - model list\n# counts - observed count matrix corresponding to the models\n# marginals - marginal info, to which model-specific count will be appended\nget.rep.set.general.model.posteriors <- function(ml, counts, marginals, grid.weight = rep(1, nrow(marginals)), rescale = TRUE, min.p = 0) {\n    pl <- do.call(cbind, lapply(seq_along(ml), function(i) {\n        marginals$count <- counts[, i]\n        rowSums(get.component.model.lik(ml[[i]], marginals))+min.p\n    }))\n    if(rescale) {\n        #pl <- pl*grid.weight+min.p\n        pl <- t(t(pl)/colSums(pl))\n    }\n    colnames(pl) <- names(ml)\n    return(pl)\n}\n\nget.rep.set.general.model.logposteriors <- function(ml, counts, marginals, grid.weight = rep(1, nrow(marginals)), rescale = TRUE) {\n    pl <- do.call(rbind, lapply(seq_along(ml), function(i) {\n        marginals$count <- counts[, i]\n        log.row.sums(get.component.model.loglik(ml[[i]], 
marginals))\n    }))\n    if(rescale) {\n        pl <- pl-log.row.sums(pl)\n    }\n    rownames(pl) <- names(ml)\n    return(t(pl))\n}\n\n# evaluate likelihood on a mixed model with a binomial concomitant\n# returns posterior probability for each component: rowSums(return) gives\n# total likelihood. (note it's not on a log scale!)\nget.component.model.lik <- function(m1, newdata) {\n    # core models\n    cp <- exp(do.call(\"+\", lapply(seq_along(m1@model), function(i) {\n        y <- posterior(m1@model[[i]], newdata, lapply(m1@components, \"[[\", i))\n    })))\n    # concomitant\n\n    # no groups!\n    mt <- terms(m1@concomitant@formula, data = newdata)\n    mf <- model.frame(delete.response(mt), data = newdata, na.action = NULL)\n    cm0 <- exp(model.matrix(mt, data = mf) %*% m1@concomitant@coef)\n    cm0 <- cm0/rowSums(cm0)\n    cm0[!is.finite(cm0)] <- 1\n    return(cp*cm0)\n}\n\n# same as above, but keeping log resolution\nget.component.model.loglik <- function(m1, newdata) {\n    # core models\n    cp <- do.call(\"+\", lapply(seq_along(m1@model), function(i) {\n        y <- posterior(m1@model[[i]], newdata, lapply(m1@components, \"[[\", i))\n    }))\n    cp[!is.finite(cp)] <- sign(cp[!is.finite(cp)])*.Machine$double.xmax\n    # concomitant\n    # no groups!\n    mt <- terms(m1@concomitant@formula, data = newdata)\n    mf <- model.frame(delete.response(mt), data = newdata, na.action = NULL)\n    cm0 <- model.matrix(mt, data = mf) %*% m1@concomitant@coef\n    cm0[is.nan(cm0)] <- 1\n    cm0[!is.finite(cm0)] <- sign(cm0[!is.finite(cm0)])*.Machine$double.xmax\n\n    cm0 <- cm0-log.row.sums(cm0)\n    return(cp+cm0)\n}\n\n# returns a matrix of posterior values, with rows corresponding to genes, and\n# columns to marginal values (prior fpkm grid)\n# m1 - model\n# counts - vector of per-gene counts for a given experiment\n# marginals - fpm data frame\nget.exp.posterior.matrix <- function(m1, counts, marginals, grid.weight = rep(1, nrow(marginals)), rescale = TRUE, 
n.cores = 32, min.p = 0) {\n    uc <- unique(counts)\n    #message(paste(\"get.exp.posterior.matrix() :\", round((1-length(uc)/length(counts))*100, 3), \"% savings\"))\n    cat(\".\")\n    df <- do.call(rbind, papply(uc, function(x) {\n        rowSums(get.component.model.lik(m1, cbind(marginals, count = rep(x, nrow(marginals)))))+min.p\n    }, n.cores = n.cores))\n    if(rescale) {\n        #df <- t(t(df)*grid.weight)+min.p\n        df <- df/rowSums(df)\n    }\n    df <- df[match(counts, uc), , drop = FALSE]\n    rownames(df) <- names(counts)\n    df\n}\n\nget.exp.logposterior.matrix <- function(m1, counts, marginals, grid.weight = rep(1, nrow(marginals)), rescale = TRUE, n.cores = 32) {\n    uc <- unique(counts)\n    #message(paste(\"get.exp.logposterior.matrix() :\", round((1-length(uc)/length(counts))*100, 3), \"% savings\"))\n    cat(\".\")\n    df <- do.call(rbind, papply(uc, function(x) {\n        log.row.sums(get.component.model.loglik(m1, cbind(marginals, count = rep(x, nrow(marginals)))))\n    }, n.cores = n.cores))\n    if(rescale) {\n        df <- df-log.row.sums(df)\n    }\n    df <- df[match(counts, uc), , drop = FALSE]\n    rownames(df) <- names(counts)\n    df\n}\n\n# similar to get.exp.posterior.matrix(), but returns inverse ecdf list\n# note that x must be supplied\nget.exp.posterior.samples <- function(pmatl, prior, n.samples = 1, n.cores = 32) {\n    sl <- papply(seq_along(pmatl), function(i) t(apply(pmatl[[i]], 1, function(d) approxfun(cumsum(d), prior$x, rule = 2)(runif(n.samples)))), n.cores = n.cores)\n    names(sl) <- names(pmatl)\n    sl\n}\n# similar to get.exp.posterior.matrix(), but returns inverse ecdf list\n# note that x must be supplied\nget.exp.sample <- function(m1, counts, marginals, prior.x, n, rescale = TRUE) {\n    do.call(rbind, papply(counts, function(x) {\n        tpr <- log.row.sums(get.component.model.loglik(m1, cbind(marginals, count = rep(x, nrow(marginals)))))\n        if(rescale)  {\n            tpr <- 
exp(tpr-max(tpr))\n            tpr <- tpr/sum(tpr)\n        }\n        return(approxfun(cumsum(tpr), prior.x, rule = 2)(runif(n)))\n    }, n.cores=1))\n}\n\n# gets a probability of failed detection for a given observation\n# optional vector of fpm values (log) can be supplied to evaluate mixing probability\n# at a point other than MLE fpm\nget.concomitant.prob <- function(m1, counts = NULL, lfpm = NULL) {\n    if(is.null(lfpm)) {\n        lfpm <- get.fpm.estimates(m1, counts)\n    }\n    newdata <- data.frame(fpm = exp(lfpm))\n    mt <- terms(m1@concomitant@formula, data = newdata)\n    mf <- model.frame(delete.response(mt), data = newdata, na.action = NULL)\n    cm0 <- exp(model.matrix(mt, data = mf) %*% m1@concomitant@coef)\n    cm0[is.nan(cm0)] <- 1\n    cm0 <- cm0/rowSums(cm0)\n    return(as.numeric(cm0[, 1]))\n}\n\n# copied from flexmix\nlog.row.sums <- function(m) {\n    M <- m[cbind(seq_len(nrow(m)), max.col(m, ties.method = \"first\"))] # \"random\" doesn't work!\n    M + log(rowSums(exp(m - M)))\n}\n\n\n#######\n## from nb1glm.R\n#######\n\n# nb2 glm implementation\nsetClass(\"FLXMRnb2glm\", contains = \"FLXMRglm\", package = \"flexmix\")\n\nFLXMRnb2glm <- function(formula = . ~ .,  offset = NULL, init.theta = NULL, theta.range = c(0, 1e3), ...) 
{\n    #require(MASS)\n    family <- \"negative.binomial\"\n    glmrefit <- function(x, y, w) {\n        #message(\"FLXRnb2glm:refit:nb2\")\n        fit <- c(glm.nb.fit(x, y, weights = w, offset = offset, init.theta = init.theta, theta.range = theta.range),\n                 list(call = sys.call(), offset = offset, control = eval(formals(glm.fit)$control), method = \"glm.fit\")\n        )\n        fit$df.null <- sum(w) + fit$df.null - fit$df.residual - fit$rank\n        fit$df.residual <- sum(w) - fit$rank\n        fit$x <- x\n        fit\n    }\n\n    z <- new(\"FLXMRnb2glm\", weighted = TRUE, formula = formula,\n             name = \"FLXMRnb2glm\", offset = offset,\n             family = family, refit = glmrefit)\n    z@preproc.y <- function(x) {\n        if (ncol(x)  >  1)\n            stop(paste(\"for the\", family, \"family y must be univariate\"))\n        x\n    }\n\n\n    z@defineComponent <- expression({\n        predict <- function(x, ...) {\n            dotarg = list(...)\n            #message(\"FLXRnb2glm:predict:nb2\")\n            if(\"offset\" %in% names(dotarg)) offset <- dotarg$offset\n            p <- x%*%coef\n            if (!is.null(offset)) p <- p + offset\n            negative.binomial(theta)$linkinv(p)\n        }\n        logLik <- function(x, y, ...) 
{\n            r <- dnbinom(y, size = theta, mu = predict(x, ...), log = TRUE)\n            #message(paste(\"FLXRnb2glm:loglik:nb2\", theta))\n            return(r)\n        }\n\n        new(\"FLXcomponent\",\n            parameters = list(coef = coef),\n            logLik = logLik, predict = predict,\n            df = df)\n    })\n\n    z@fit <- function(x, y, w){\n        #message(\"FLXRnb2glm:fit:nb2\")\n        w[y<= 1] <- w[y<= 1]/1e6 # focus the fit on non-failed genes\n        fit <- glm.nb.fit(x, y, weights = w, offset = offset, init.theta = init.theta, theta.range = theta.range)\n        # an ugly hack to restrict to non-negative slopes\n        cf <- coef(fit)\n        if(cf[2]<0) { cf <- c(mean(y*w)/sum(w), 0) }\n        with(list(coef = cf, df = ncol(x), theta = fit$theta, offset = offset), eval(z@defineComponent))\n    }\n\n    return(z)\n}\n\n# component-specific version of the nb2glm\n# nb2 glm implementation\nsetClass(\"FLXMRnb2glmC\", representation(vci = \"ANY\"), contains = \"FLXMRnb2glm\", package = \"flexmix\")\n\n# components is used to specify the indices of the components on which likelihood will be\n# evaluated. Others will return as loglik of 0\nFLXMRnb2glmC <- function(... 
, components = NULL) {\n    #require(MASS)\n    z <- new(\"FLXMRnb2glmC\", FLXMRnb2glm(...), vci = components)\n    z\n}\n\n# nb2 glm implementation\nsetClass(\"FLXMRnb2gam\", contains = \"FLXMRglm\", package = \"flexmix\")\n\nsetClass(\"FLXcomponentE\",\n         representation(refitTheta = \"function\",\n                        theta.fit = \"ANY\"),\n         contains = \"FLXcomponent\", package = \"flexmix\")\n\n# nb2 implementation with a simple trimmed-mean/median slope, and a gam theta fit\nsetClass(\"FLXMRnb2gth\", contains = \"FLXMRglm\", package = \"flexmix\")\n\n# get values of theta for a given set of models and expression (log-scale) magnitudes\nget.corr.theta <- function(model, lfpm, theta.range = NULL) {\n    if(\"corr.ltheta.b\" %in% names(model)) {\n        #th <- exp(-1*(model[[\"corr.ltheta.a\"]]/(1+exp((model[[\"corr.ltheta.lfpm.m\"]] - lfpm)/model[[\"corr.ltheta.lfpm.s\"]])) + log(model[[\"corr.ltheta.b\"]])))\n        th <- exp(-1*(model[[\"corr.ltheta.b\"]]+(model[[\"corr.ltheta.t\"]]-model[[\"corr.ltheta.b\"]])/(1+10^((model[[\"corr.ltheta.m\"]]-lfpm)*model[[\"corr.ltheta.s\"]]))^model[[\"corr.ltheta.r\"]]))\n    } else {\n        if(length(lfpm) > 1) {\n            th <- rep(model[[\"corr.theta\"]], length(lfpm))\n        } else {\n            th <- model[[\"corr.theta\"]]\n        }\n    }\n    if(!is.null(theta.range)) {\n        th[th<theta.range[1]] <- theta.range[1]\n        th[th > theta.range[2]] <- theta.range[2]\n        th[is.nan(th)] <- theta.range[1]\n    }\n    th\n}\n\nFLXMRnb2gth <- function(formula = . ~ .,  offset = NULL, full.theta.range = c(1e-3, 1e3), theta.fit.range = full.theta.range*c(1e-1, 1e1), theta.fit.sp = c(-1), constant.theta = FALSE, slope.mean.trim = 0.4, alpha.weight.power = 1/2, ...) 
{\n    if(slope.mean.trim<0) { slope.mean.trim <- 0 }\n    if(slope.mean.trim > 0.5) { slope.mean.trim <- 0.5 }\n\n    family <- \"negative.binomial\"\n    glmrefit <- function(x, y, w) {\n        message(\"ERROR: FLXRnb2gth:glmrefit: NOT IMPLEMENTED\")\n        return(NULL)\n    }\n\n    z <- new(\"FLXMRnb2gth\", weighted = TRUE, formula = formula,\n             name = \"FLXMRnb2gth\", offset = offset,\n             family = family, refit = glmrefit)\n    z@preproc.y <- function(x) {\n        if (ncol(x)  >  1)\n            stop(paste(\"for the\", family, \"family y must be univariate\"))\n        x\n    }\n\n    z@defineComponent <- expression({\n        predict <- function(x, ...) {\n            dotarg = list(...)\n            #message(\"FLXRnb2gth:predict:nb2\")\n            coef[\"corr.a\"]*x\n        }\n        logLik <- function(x, y, ...) {\n            dotarg = list(...)\n            #message(\"FLXRnb2gth:logLik\")\n            if(constant.theta) {\n                th <- coef[\"corr.theta\"]\n            } else {\n                #th <- exp(coef[\"corr.ltheta.i\"] + coef[\"corr.ltheta.lfpm\"]*log(x))\n                #th <- exp(-1*(coef[\"corr.ltheta.a\"]/(1+exp((coef[\"corr.ltheta.lfpm.m\"] - log(x))/coef[\"corr.ltheta.lfpm.s\"])) + log(coef[\"corr.ltheta.b\"])))\n                th <- get.corr.theta(coef, log(x))\n\n            }\n            # restrict theta to the pre-defined range\n            th[th  >  full.theta.range[2]] <- full.theta.range[2]\n            th[th < full.theta.range[1]] <- full.theta.range[1]\n            # evaluate NB\n            r <- dnbinom(y, size = th, mu = coef[\"corr.a\"]*x, log = TRUE)\n        }\n\n        new(\"FLXcomponent\",\n            parameters = list(coef = coef, linear = TRUE),\n            logLik = logLik, predict = predict, df = df)\n    })\n\n    z@fit <- function(x, y, w){\n        # message(\"FLXRnb2gth:fit\")\n\n        # estimate slope using weighted trimmed mean\n        #w[y == 0] <- w[y == 0]/1e6\n        
#r <- y/x\n        #ro <- order(r)\n        ## cumulative weight sum along the ratio order (to figure out where to trim)\n        #cs <- cumsum(w[ro])/sum(w)\n        #lb <- min(which(cs > slope.mean.trim))\n        #ub <- max(which(cs<(1-slope.mean.trim)))\n        #ro <- ro[lb:ub]\n        ## slope fit\n        #a <- weighted.mean(r[ro], w[ro])\n\n        a <- as.numeric(coef(glm(y~0+x, family = poisson(link = \"identity\"), start = weighted.mean(y/x, w), weights = w))[1])\n\n        # predicted values\n        p <- a*x\n\n        #te <- p^2/((y-p)^2 - p) # theta point estimates\n        #te[te<theta.fit.range[1]] <- theta.fit.range[1] te[te > theta.fit.range[2]] <- theta.fit.range[2]\n        alpha <- ((y/p-1)^2 - 1/p)\n        alpha[alpha<1/theta.fit.range[2]] <- 1/theta.fit.range[2]\n        alpha[alpha > 1/theta.fit.range[1]] <- 1/theta.fit.range[1]\n\n        #theta <- MASS::theta.ml(y, p, sum(w), w)\n        theta <- MASS::theta.md(y, p, sum(w)-1, w)\n        theta <- pmin(pmax(theta.fit.range[1], theta), theta.fit.range[2])\n        if(constant.theta) {\n            v <- c(\"corr.a\" = a, \"corr.theta\" = theta)\n        } else {\n            # fit theta linear model in the log space\n            #theta.l <- glm(log(te)~log(x), weights = w)\n            #ac <- tryCatch( {\n\n            mw <- w*(as.numeric(alpha)^(alpha.weight.power))\n            lx <- log(x)\n            lx.rng <- range(lx)\n            mid.s <- (sum(lx.rng))/2\n            low <- log(x) < mid.s\n            lalpha <- log(alpha)\n            bottom.s <- quantile(lalpha[low], 0.025, na.rm = TRUE)\n            top.s <- quantile(lalpha[!low], 0.975, na.rm = TRUE)\n\n            wsr <- function(p, x, y, w = rep(1, length(y))) {\n                # borrowing nplr approach here\n                bottom <- p[1]\n                top <- p[2]\n                xmid <- p[3]\n                scal <- p[4]\n                r <- p[5]\n                yfit <- bottom+(top-bottom)/(1+10^((xmid-x)*scal))^r\n  
              #exp(-1*(model[[\"corr.ltheta.b\"]]+(model[[\"corr.ltheta.t\"]]-model[[\"corr.ltheta.b\"]])/(1+10^((model[[\"corr.ltheta.m\"]]-lfpm)*model[[\"corr.ltheta.s\"]]))^model[[\"corr.ltheta.r\"]]))\n                residuals <- (y - yfit)^2\n                return(sum(w*residuals))\n            }\n\n            #po <- nlm(f = wsr, p = c(bottom.s, top.s, mid.s, s = -1, r = 0.5), x = log(x), y = lalpha, w = w*(as.numeric(alpha)^(1/2)))\n            #ac <- po$estimate\n\n            #po <- nlminb(objective = wsr, start = c(bottom.s, top.s, mid.s, s = -1, r = 0.5), x = log(x), y = lalpha, w = mw)\n            po <- nlminb(objective = wsr, start = c(bottom.s, top.s, mid.s, s = -1, r = 0.5), x = log(x), y = lalpha, w = mw, lower = c(-100, -10, -100, -100, 0.1), upper = c(10, 100, 100, 0, 20))\n            ac <- po$par\n\n            #smoothScatter(log(x), log(alpha))\n            #p <- ac points(log(x), p[1]+(p[2]-p[1])/(1+10^((p[3]-log(x))*p[4]))^p[5], col = 2, pch = \".\")\n            #browser()\n            #}, error = function(e) {\n            #  message(\"encountered error trying to fit logistic model with guessed parameters.\")\n            #  # fit with fewer parameters?\n            #})\n\n            v <- c(a, theta, ac)\n            names(v) <- c(\"corr.a\", \"corr.theta\", \"corr.ltheta.b\", \"corr.ltheta.t\", \"corr.ltheta.m\", \"corr.ltheta.s\", \"corr.ltheta.r\")\n        }\n\n        with(list(coef = v, full.theta.range = full.theta.range, df = ncol(x), offset = offset), eval(z@defineComponent))\n    }\n\n    return(z)\n}\n\n# component-specific version of the nb2gth\n# nb2 gam implementation\nsetClass(\"FLXMRnb2gthC\", representation(vci = \"ANY\"), contains = \"FLXMRnb2gth\", package = \"flexmix\")\n\n# components is used to specify the indices of the components on which likelihood will be\n# evaluated. Others will return as loglik of 0\nFLXMRnb2gthC <- function(... 
, components = NULL) {\n    #require(mgcv)\n    z <- new(\"FLXMRnb2gthC\", FLXMRnb2gth(...), vci = components)\n    z\n}\n\nsetMethod(\"FLXdeterminePostunscaled\", signature(model = \"FLXMRnb2glmC\"), function(model, components, ...) {\n    if(is.null(model@vci)) {\n        #message(\"FLXMRnb2glmC:FLXdeterminePostunscaled - applying to all components\")\n        m <- matrix(sapply(components, function(x) x@logLik(model@x, model@y)), nrow = nrow(model@y))\n    } else {\n        #message(paste(\"FLXMRnb2glmC:FLXdeterminePostunscaled - applying to components\", paste(model@vci, collapse = \" \")))\n        m <- matrix(do.call(cbind, lapply(seq_along(components), function(i) {\n            if(i %in% model@vci) {\n                components[[i]]@logLik(model@x, model@y)\n            } else {\n                rep(0, nrow(model@y))\n            }\n        })), nrow = nrow(model@y))\n    }\n})\n\nsetMethod(\"FLXmstep\", signature(model = \"FLXMRnb2glmC\"), function(model, weights, ...) {\n    # make up a dummy component return\n    coef <- rep(0, ncol(model@x))\n    names(coef) <- colnames(model@x)\n    control <- eval(formals(glm.fit)$control)\n    comp.1 <- with(list(coef = coef, df = 0, offset = NULL,\n                        family = model@family), eval(model@defineComponent))\n\n    # iterate over components\n    unlist(lapply(seq_len(ncol(weights)), function(i) {\n        if(i %in% model@vci) {\n            #message(paste(\"FLXMRnb2glmC:FLXmstep - running m-step for component\", i))\n            FLXmstep(as(model, \"FLXMRnb2glm\"), weights[, i, drop = FALSE])\n        } else {\n            #message(paste(\"FLXMRnb2glmC:FLXmstep - dummy return for component\", i))\n            list(comp.1)\n        }\n    }), recursive = FALSE)\n})\n\n# same for gth\nsetMethod(\"FLXdeterminePostunscaled\", signature(model = \"FLXMRnb2gthC\"), function(model, components, ...) 
{\n    if(is.null(model@vci)) {\n        #message(\"FLXMRnb2gthC:FLXdeterminePostunscaled - applying to all components\")\n        m <- matrix(sapply(components, function(x) x@logLik(model@x, model@y)), nrow = nrow(model@y))\n    } else {\n        #message(paste(\"FLXMRnb2gthC:FLXdeterminePostunscaled - applying to components\", paste(model@vci, collapse = \" \")))\n        m <- matrix(do.call(cbind, lapply(seq_along(components), function(i) {\n            if(i %in% model@vci) {\n                components[[i]]@logLik(model@x, model@y)\n            } else {\n                rep(0, nrow(model@y))\n            }\n        })), nrow = nrow(model@y))\n    }\n})\n\nsetMethod(\"FLXmstep\", signature(model = \"FLXMRnb2gthC\"), function(model, weights, ...) {\n    # make up a dummy component return\n    coef <- rep(0, ncol(model@x))\n    names(coef) <- colnames(model@x)\n    control <- eval(formals(glm.fit)$control)\n    comp.1 <- with(list(q1 = list(coefficients = c(1)), coef = coef, df = 0, offset = NULL,\n                        family = model@family), eval(model@defineComponent))\n\n    # iterate over components\n    unlist(lapply(seq_len(ncol(weights)), function(i) {\n        if(i %in% model@vci) {\n            #message(paste(\"FLXMRnb2gthC:FLXmstep - running m-step for component\", i))\n            FLXmstep(as(model, \"FLXMRnb2gth\"), weights[, i, drop = FALSE])\n        } else {\n            #message(paste(\"FLXMRnb2gthC:FLXmstep - dummy return for component\", i))\n            list(comp.1)\n        }\n    }), recursive = FALSE)\n})\n\n# component-specific version of the nb2glm\n# nb2 glm implementation\nsetClass(\"FLXMRglmC\", representation(vci = \"ANY\"), contains = \"FLXMRglm\", package = \"flexmix\")\n\n# components is used to specify the indices of the components on which likelihood will be\n# evaluated. Others will return as loglik of 0\nFLXMRglmC <- function(... 
, components = NULL) {\n    #require(MASS)\n    z <- new(\"FLXMRglmC\", FLXMRglm(...), vci = components)\n    z\n}\n\nsetMethod(\"FLXdeterminePostunscaled\", signature(model = \"FLXMRglmC\"), function(model, components, ...) {\n    if(is.null(model@vci)) {\n        #message(\"FLXMRnb2glmC:FLXdeterminePostunscaled - applying to all components\")\n        m <- matrix(sapply(components, function(x) x@logLik(model@x, model@y)), nrow = nrow(model@y))\n    } else {\n        #message(paste(\"FLXMRnb2glmC:FLXdeterminePostunscaled - applying to components\", paste(model@vci, collapse = \" \")))\n        m <- matrix(do.call(cbind, lapply(seq_along(components), function(i) {\n            if(i %in% model@vci) {\n                components[[i]]@logLik(model@x, model@y)\n            } else {\n                rep(0, nrow(model@y))\n            }\n        })), nrow = nrow(model@y))\n    }\n    #message(\"FLXMRnb2glmC:FLXdeterminePostunscaled : \")\n    #message(m)\n    #browser()\n    m\n})\n\nsetMethod(\"FLXmstep\", signature(model = \"FLXMRglmC\"), function(model, weights, ...) {\n    # make up a dummy component return\n    coef <- rep(0, ncol(model@x))\n    names(coef) <- colnames(model@x)\n    control <- eval(formals(glm.fit)$control)\n    comp.1 <- with(list(coef = coef, df = 0, offset = NULL,\n                        family = model@family), eval(model@defineComponent))\n\n    # iterate over components\n    unlist(lapply(seq_len(ncol(weights)), function(i) {\n        if(i %in% model@vci) {\n            #message(paste(\"FLXMRglmC:FLXmstep - running m-step for component\", i))\n            FLXmstep(as(model, \"FLXMRglm\"), weights[, i, drop = FALSE])\n        } else {\n            #message(paste(\"FLXMRglmC:FLXmstep - dummy return for component\", i))\n            list(comp.1)\n        }\n    }), recursive = FALSE)\n})\n\n# mu-fixed version\nsetClass(\"FLXMRglmCf\", representation(mu = \"numeric\"), contains = \"FLXMRglmC\", package = \"flexmix\")\n\nFLXMRglmCf <- function(... 
, family = c(\"binomial\", \"poisson\"), mu = 0) {\n    #require(MASS)\n    family <- match.arg(family)\n    z <- new(\"FLXMRglmCf\", FLXMRglmC(..., family = family), mu = mu)\n    z\n}\n\nsetMethod(\"FLXmstep\", signature(model = \"FLXMRglmCf\"), function(model, weights, ...) {\n    # make up a dummy component return\n    coef <- c(model@mu, rep(0, ncol(model@x)-1))\n    names(coef) <- colnames(model@x)\n    control <- eval(formals(glm.fit)$control)\n    comp.1 <- with(list(coef = coef, df = 0, offset = NULL,\n                        family = model@family), eval(model@defineComponent))\n\n    # iterate over components\n    unlist(lapply(seq_len(ncol(weights)), function(i) {\n        list(comp.1)\n    }), recursive = FALSE)\n})\n\n# a magnitude-weighted version of the FLXPmultinom (to down-weight low-fpkm points during concomitant fit)\n# alternatively: some kind of non-decreasing function could be used\nsetClass(\"FLXPmultinomW\", contains = \"FLXPmultinom\")\n\nFLXPmultinomW <- function(formula = ~1) {\n    z <- new(\"FLXPmultinom\", name = \"FLXPmultinom\", formula = formula)\n    multinom.fit <- function(x, y, w, ...) {\n        r <- ncol(x)\n        p <- ncol(y)\n        if (p < 2) stop(\"Multinom requires at least two components.\")\n        mask <- c(rep(0, r + 1), rep(c(0, rep(1, r)), p - 1))\n        #if(missing(w)) w <- rep(1, nrow(y))\n        #w <- round(exp(x[, 2]))\n        nnet::nnet.default(x, y, w, mask = mask, size = 0,\n                           skip = TRUE, softmax = TRUE, censored = FALSE,\n                           rang = 0, trace = FALSE, ...)\n    }\n    z@fit <- function(x, y, w, ...) multinom.fit(x, y, w, ...)$fitted.values\n    z@refit <- function(x, y, w, ...) 
{\n        if (missing(w) || is.null(w)) w <- rep(1, nrow(y))\n        #w <- round(exp(x[, 2]))\n        fit <- nnet::multinom(y ~ 0 + x, weights = w, data = list(y = y, x = x), Hess = TRUE, trace = FALSE)\n        fit$coefnames <- colnames(x)\n        fit$vcoefnames <- fit$coefnames[seq_along(fit$coefnames)]\n        dimnames(fit$Hessian) <- lapply(dim(fit$Hessian) / ncol(x), function(i) paste(rep(seq_len(i) + 1, each = ncol(x)), colnames(x), sep = \":\"))\n        fit\n    }\n    z\n}\n\n# variation of negative.binomial family that keeps theta value accessible\nnegbin.th <- function (theta = stop(\"'theta' must be specified\"), link = \"log\")\n{\n    linktemp <- substitute(link)\n    if (!is.character(linktemp))\n        linktemp <- deparse(linktemp)\n    if (linktemp %in% c(\"log\", \"identity\", \"sqrt\"))\n        stats <- make.link(linktemp)\n    else if (is.character(link)) {\n        stats <- make.link(link)\n        linktemp <- link\n    }\n    else {\n        if (inherits(link, \"link-glm\")) {\n            stats <- link\n            if (!is.null(stats$name))\n                linktemp <- stats$name\n        }\n        else stop(linktemp, \" link not available for negative binomial family available links are \\\"identity\\\", \\\"log\\\" and \\\"sqrt\\\"\")\n    }\n    .Theta <- theta\n    env <- new.env(parent = .GlobalEnv)\n    assign(\".Theta\", theta, envir = env)\n    variance <- function(mu) mu + mu^2/.Theta\n    validmu <- function(mu) all(mu  >  0)\n    dev.resids <- function(y, mu, wt) 2 * wt * (y * log(pmax(1,\n                                                             y)/mu) - (y + .Theta) * log((y + .Theta)/(mu + .Theta)))\n    aic <- function(y, n, mu, wt, dev) {\n        term <- (y + .Theta) * log(mu + .Theta) - y * log(mu) +\n            lgamma(y + 1) - .Theta * log(.Theta) + lgamma(.Theta) -\n            lgamma(.Theta + y)\n        2 * sum(term * wt)\n    }\n    initialize <- expression({\n        if (any(y < 0)) stop(\"negative values 
not allowed for the negative binomial family\")\n        n <- rep(1, nobs)\n        mustart <- y + (y ==  0)/6\n    })\n    simfun <- function(object, nsim) {\n        ftd <- fitted(object)\n        val <- rnegbin(nsim * length(ftd), ftd, .Theta)\n    }\n    environment(variance) <- environment(validmu) <- environment(dev.resids) <- environment(aic) <- environment(simfun) <- env\n    famname <- paste(\"Negative Binomial(\", format(round(theta,\n                                                        4)), \")\", sep = \"\")\n    structure(list(family = famname, link = linktemp, linkfun = stats$linkfun,\n                   linkinv = stats$linkinv, variance = variance, dev.resids = dev.resids,\n                   aic = aic, mu.eta = stats$mu.eta, initialize = initialize,\n                   validmu = validmu, valideta = stats$valideta, simulate = simfun, theta = theta),\n              class = \"family\")\n}\n\n\n# .fit version of the glm.nb\nglm.nb.fit <- function(x, y, weights = rep(1, nobs), control = list(trace = 0, maxit = 20), offset = rep(0, nobs), etastart = NULL, start = NULL, mustart = NULL, init.theta = NULL, link = \"log\", method = \"glm.fit\", intercept = TRUE, theta.range = c(0, 1e5), ...) 
{\n    method <- \"custom.glm.fit\"\n    #require(MASS)\n    loglik <- function(n, th, mu, y, w) sum(w * (lgamma(th +\n                                                            y) - lgamma(th) - lgamma(y + 1) + th * log(th) + y *\n                                                     log(mu + (y ==  0)) - (th + y) * log(th + mu)))\n    # link <- substitute(link)\n\n    Call <- match.call()\n    control <- do.call(\"glm.control\", control)\n    n <- length(y)\n    # family for the initial guess\n    fam0 <- if (missing(init.theta) | is.null(init.theta))\n        do.call(\"poisson\", list(link = link))\n    else\n        do.call(\"negative.binomial\", list(theta = init.theta, link = link))\n\n    # fit function\n    if (!missing(method)) {\n        #message(paste(\"glm.nb.fit: method = \", method))\n        if (!exists(method, mode = \"function\"))\n            stop(\"unimplemented method: \", sQuote(method))\n        glm.fitter <- get(method)\n    }\n    else {\n        #message(\"glm.nb.fit: using default glm.fit\")\n        method <- \"glm.fit\"\n        glm.fitter <- stats::glm.fit\n        #glm.fitter <- custom.glm.fit\n    }\n\n    if (control$trace  >  1) {\n        message(\"Initial fit:\")\n    }\n    fit <- glm.fitter(x = x, y = y, weights = weights, start = start, etastart = etastart, mustart = mustart, offset = offset, family = fam0, control = control, intercept = intercept)\n    class(fit) <- c(\"glm\", \"lm\")\n\n    mu <- fit$fitted.values\n    th <- as.vector(theta.ml(y, mu, sum(weights), weights, limit = control$maxit, trace = control$trace  >  2))\n    if(!is.null(theta.range)) {\n        if(th<theta.range[1]) {\n            if (control$trace  >  1)\n                message(\"adjusting theta from \", signif(th), \" to \", signif(theta.range[1]), \" to fit the specified range\")\n            th <- theta.range[1]\n        } else if(th > theta.range[2]) {\n            if (control$trace  >  1)\n                message(\"adjusting theta from \", 
signif(th), \" to \", signif(theta.range[2]), \" to fit the specified range\")\n            th <- theta.range[2]\n        }\n    }\n    if (control$trace  >  1)\n        message(\"Initial value for theta:\", signif(th))\n    fam <- do.call(\"negative.binomial\", list(theta = th, link = link))\n    iter <- 0\n    d1 <- sqrt(2 * max(1, fit$df.residual))\n    d2 <- del <- 1\n    g <- fam$linkfun\n    Lm <- loglik(n, th, mu, y, weights)\n    Lm0 <- Lm + 2 * d1\n    while ((iter <- iter + 1) <=  control$maxit && (abs(Lm0 - Lm)/d1 + abs(del)/d2)  >  control$epsilon) {\n        eta <- g(mu)\n        fit <- glm.fitter(x = x, y = y, weights = weights, etastart = eta, offset = offset, family = fam, control = list(maxit = control$maxit*10, epsilon = control$epsilon, trace = control$trace  >  1), intercept = intercept)\n        t0 <- th\n        th <- theta.ml(y, mu, sum(weights), weights, limit = control$maxit, trace = control$trace  >  2)\n        if(!is.null(theta.range)) {\n            if(th<theta.range[1]) {\n                if (control$trace  >  1)\n                    message(\"adjusting theta from \", signif(th), \" to \", signif(theta.range[1]), \" to fit the specified range\")\n                th <- theta.range[1]\n            } else if(th > theta.range[2]) {\n                if (control$trace  >  1)\n                    message(\"adjusting theta from \", signif(th), \" to \", signif(theta.range[2]), \" to fit the specified range\")\n                th <- theta.range[2]\n            }\n        }\n        fam <- do.call(\"negative.binomial\", list(theta = th, link = link))\n        mu <- fit$fitted.values\n        del <- t0 - th\n        Lm0 <- Lm\n        Lm <- loglik(n, th, mu, y, weights)\n        if (control$trace) {\n            Ls <- loglik(n, th, y, y, weights)\n            Dev <- 2 * (Ls - Lm)\n            message(\"Theta(\", iter, \")  = \", signif(th), \", 2(Ls - Lm)  = \",  signif(Dev))\n        }\n    }\n    if (!is.null(attr(th, \"warn\")))\n        
fit$th.warn <- attr(th, \"warn\")\n    if (iter  >  control$maxit) {\n        warning(\"alternation limit reached\")\n        fit$th.warn <- gettext(\"alternation limit reached\")\n    }\n    if (length(offset) && intercept) {\n        null.deviance <- if (\"(Intercept)\" %in% colnames(x))\n            glm.fitter(x[, \"(Intercept)\", drop = FALSE], y, weights = weights, offset = offset, family = fam, control = list(maxit = control$maxit*10, epsilon = control$epsilon, trace = control$trace  >   1), intercept = TRUE)$deviance\n        else\n            fit$deviance\n        fit$null.deviance <- null.deviance\n    }\n    class(fit) <- c(\"negbin.th\", \"glm\", \"lm\")\n    Call$init.theta <- signif(as.vector(th), 10)\n    Call$link <- link\n    fit$call <- Call\n    fit$x <- x\n    fit$y <- y\n    fit$theta <- as.vector(th)\n    fit$SE.theta <- attr(th, \"SE\")\n    fit$twologlik <- as.vector(2 * Lm)\n    fit$aic <- -fit$twologlik + 2 * fit$rank + 2\n    fit$method <- method\n    fit$control <- control\n    fit$offset <- offset\n    fit\n}\n\ncustom.glm.fit <- function (x, y, weights = rep(1, nobs), start = NULL, etastart = NULL,\n                            mustart = NULL, offset = rep(0, nobs), family = gaussian(),\n                            control = list(), intercept = TRUE, alpha = 0)\n{\n    control <- do.call(\"glm.control\", control)\n    x <- as.matrix(x)\n    xnames <- dimnames(x)[[2L]]\n    ynames <- if (is.matrix(y))\n        rownames(y)\n    else names(y)\n    conv <- FALSE\n    nobs <- NROW(y)\n    nvars <- ncol(x)\n    EMPTY <- nvars ==  0\n    if (is.null(weights))\n        weights <- rep.int(1, nobs)\n    if (is.null(offset))\n        offset <- rep.int(0, nobs)\n    variance <- family$variance\n    linkinv <- family$linkinv\n    if (!is.function(variance) || !is.function(linkinv))\n        stop(\"'family' argument seems not to be a valid family object\",\n             call. 
= FALSE)\n    dev.resids <- family$dev.resids\n    aic <- family$aic\n    mu.eta <- family$mu.eta\n    unless.null <- function(x, if.null) if (is.null(x))\n        if.null\n    else x\n    valideta <- unless.null(family$valideta, function(eta) TRUE)\n    validmu <- unless.null(family$validmu, function(mu) TRUE)\n    if (is.null(mustart)) {\n        eval(family$initialize)\n    }\n    else {\n        mukeep <- mustart\n        eval(family$initialize)\n        mustart <- mukeep\n    }\n    if (EMPTY) {\n        eta <- rep.int(0, nobs) + offset\n        if (!valideta(eta))\n            stop(\"invalid linear predictor values in empty model\",\n                 call. = FALSE)\n        mu <- linkinv(eta)\n        if (!validmu(mu))\n            stop(\"invalid fitted means in empty model\", call. = FALSE)\n        dev <- sum(dev.resids(y, mu, weights))\n        w <- ((weights * mu.eta(eta)^2)/variance(mu))^0.5\n        residuals <- (y - mu)/mu.eta(eta)\n        good <- rep(TRUE, length(residuals))\n        boundary <- conv <- TRUE\n        coef <- numeric(0L)\n        iter <- 0L\n    }\n    else {\n        coefold <- NULL\n        eta <- if (!is.null(etastart))\n            etastart\n        else if (!is.null(start))\n            if (length(start)  !=  nvars)\n                stop(gettextf(\"length of 'start' should equal %d and correspond to initial coefs for %s\",\n                              nvars, paste(deparse(xnames), collapse = \", \")),\n                     domain = NA)\n        else {\n            coefold <- start\n            offset + as.vector(if (NCOL(x) ==  1)\n                x * start\n                else x %*% start)\n        }\n        else family$linkfun(mustart)\n        mu <- linkinv(eta)\n        if (!(validmu(mu) && valideta(eta)))\n            stop(\"cannot find valid starting values: please specify some\",\n                 call. 
= FALSE)\n        devold <- sum(dev.resids(y, mu, weights))\n        boundary <- conv <- FALSE\n        for (iter in 1L:control$maxit) {\n            good <- weights  >  0\n            varmu <- variance(mu)[good]\n            if (any(is.na(varmu)))\n                stop(\"NAs in V(mu)\")\n            if (any(varmu ==  0))\n                stop(\"0s in V(mu)\")\n            mu.eta.val <- mu.eta(eta)\n            if (any(is.na(mu.eta.val[good])))\n                stop(\"NAs in d(mu)/d(eta)\")\n            good <- (weights  >  0) & (mu.eta.val  !=  0)\n            if (all(!good)) {\n                conv <- FALSE\n                warning(\"no observations informative at iteration \",\n                        iter)\n                break\n            }\n            z <- (eta - offset)[good] + (y - mu)[good]/mu.eta.val[good]\n            #z <- (eta - offset)[good] + (family$linkfun(y[good]) - family$linkfun(mu[good]))/family$linkfun(mu.eta.val[good])\n\n            # attempting to be robust here, trowing out fraction with highest abs(z)\n            if(alpha > 0) {\n                qv <- quantile(abs(z), probs = c(alpha/2, 1.0-alpha/2))\n                gvi <- which(good)[which(abs(z)<qv[1] | abs(z) > qv[2])]\n                good[gvi] <- FALSE\n                if (all(!good)) {\n                    conv <- FALSE\n                    warning(\"no observations informative at iteration \",\n                            iter)\n                    break\n                }\n                z <- (eta - offset)[good] + (y - mu)[good]/mu.eta.val[good]\n            }\n\n            w <- mu.eta.val[good]*sqrt(weights[good]/variance(mu)[good])\n\n            ngoodobs <- as.integer(nobs - sum(!good))\n            fit <- .Fortran(\"dqrls\", qr = x[good, ] * w, n = ngoodobs,\n                            p = nvars, y = w * z, ny = 1L, tol = min(1e-07,\n                                                                     control$epsilon/1000), coefficients = double(nvars),\n              
              residuals = double(ngoodobs), effects = double(ngoodobs),\n                            rank = integer(1L), pivot = 1L:nvars, qraux = double(nvars),\n                            work = double(2 * nvars)) # , PACKAGE = \"base\"\n            #browser()\n            if (any(!is.finite(fit$coefficients))) {\n                conv <- FALSE\n                warning(gettextf(\"non-finite coefficients at iteration %d\",\n                                 iter), domain = NA)\n                break\n            }\n            if (nobs < fit$rank)\n                stop(gettextf(\"X matrix has rank %d, but only %d observations\",\n                              fit$rank, nobs), domain = NA)\n            start[fit$pivot] <- fit$coefficients\n            eta <- drop(x %*% start)\n            mu <- linkinv(eta <- eta + offset)\n            dev <- sum(dev.resids(y, mu, weights))\n            if (control$trace)\n                cat(\"Deviance  = \", dev, \"Iterations -\", iter,\n                    \"\\n\")\n            boundary <- FALSE\n            if (!is.finite(dev)) {\n                if (is.null(coefold))\n                    stop(\"no valid set of coefficients has been found: please supply starting values\",\n                         call. = FALSE)\n                warning(\"step size truncated due to divergence\",\n                        call. = FALSE)\n                ii <- 1\n                while (!is.finite(dev)) {\n                    if (ii  >  control$maxit)\n                        stop(\"inner loop 1 cannot correct step size\",\n                             call. 
= FALSE)\n                    ii <- ii + 1\n                    start <- (start + coefold)/2\n                    eta <- drop(x %*% start)\n                    mu <- linkinv(eta <- eta + offset)\n                    dev <- sum(dev.resids(y, mu, weights))\n                }\n                boundary <- TRUE\n                if (control$trace)\n                    cat(\"Step halved: new deviance  = \", dev, \"\\n\")\n            }\n            # require deviance to go down\n            if ((!is.null(coefold)) & (dev - devold)/(0.1 + abs(dev))  >  3*control$epsilon) {\n                warning(\"step size truncated due to increasing divergence\", call. = FALSE)\n                ii <- 1\n                while ((dev - devold)/(0.1 + abs(dev))  >  3*control$epsilon) {\n                    if (ii  >  control$maxit)   {\n                        warning(\"inner loop 1 cannot correct step size\", call. = FALSE)\n                        break\n                    }\n                    ii <- ii + 1\n                    start <- (start + coefold)/2\n                    eta <- drop(x %*% start)\n                    mu <- linkinv(eta <- eta + offset)\n                    dev <- sum(dev.resids(y, mu, weights))\n                }\n                boundary <- TRUE\n                if (control$trace)\n                    cat(\"Step halved: new deviance  = \", dev, \"\\n\")\n            }\n            if (!(valideta(eta) && validmu(mu))) {\n                if (is.null(coefold))\n                    stop(\"no valid set of coefficients has been found: please supply starting values\",\n                         call. = FALSE)\n                warning(\"step size truncated: out of bounds\",\n                        call. = FALSE)\n                ii <- 1\n                while (!(valideta(eta) && validmu(mu))) {\n                    if (ii  >  control$maxit)\n                        stop(\"inner loop 2 cannot correct step size\",\n                             call. 
= FALSE)\n                    ii <- ii + 1\n                    start <- (start + coefold)/2\n                    eta <- drop(x %*% start)\n                    mu <- linkinv(eta <- eta + offset)\n                }\n                boundary <- TRUE\n                dev <- sum(dev.resids(y, mu, weights))\n                if (control$trace)\n                    cat(\"Step halved: new deviance  = \", dev, \"\\n\")\n            }\n            if (abs(dev - devold)/(0.1 + abs(dev)) < control$epsilon) {\n                conv <- TRUE\n                coef <- start\n                break\n            }\n\n            devold <- dev\n            coef <- coefold <- start\n        }\n        if (!conv)\n            warning(\"glm.fit: algorithm did not converge\", call. = FALSE)\n        if (boundary)\n            warning(\"glm.fit: algorithm stopped at boundary value\",\n                    call. = FALSE)\n        eps <- 10 * .Machine$double.eps\n        if (family$family ==  \"binomial\") {\n            if (any(mu  >  1 - eps) || any(mu < eps))\n                warning(\"glm.fit: fitted probabilities numerically 0 or 1 occurred\",\n                        call. = FALSE)\n        }\n        if (family$family ==  \"poisson\") {\n            if (any(mu < eps))\n                warning(\"glm.fit: fitted rates numerically 0 occurred\",\n                        call. 
= FALSE)\n        }\n        if (fit$rank < nvars)\n            coef[fit$pivot][seq.int(fit$rank + 1, nvars)] <- NA\n        xxnames <- xnames[fit$pivot]\n        residuals <- (y - mu)/mu.eta(eta)\n        fit$qr <- as.matrix(fit$qr)\n        nr <- min(sum(good), nvars)\n        if (nr < nvars) {\n            Rmat <- diag(nvars)\n            Rmat[1L:nr, 1L:nvars] <- fit$qr[1L:nr, 1L:nvars]\n        }\n        else Rmat <- fit$qr[1L:nvars, 1L:nvars]\n        Rmat <- as.matrix(Rmat)\n        Rmat[row(Rmat)  >  col(Rmat)] <- 0\n        names(coef) <- xnames\n        colnames(fit$qr) <- xxnames\n        dimnames(Rmat) <- list(xxnames, xxnames)\n    }\n    names(residuals) <- ynames\n    names(mu) <- ynames\n    names(eta) <- ynames\n    wt <- rep.int(0, nobs)\n    wt[good] <- w^2\n    names(wt) <- ynames\n    names(weights) <- ynames\n    names(y) <- ynames\n    if (!EMPTY)\n        names(fit$effects) <- c(xxnames[seq_len(fit$rank)], rep.int(\"\",\n                                                                    sum(good) - fit$rank))\n    wtdmu <- if (intercept)\n        sum(weights * y)/sum(weights)\n    else linkinv(offset)\n    nulldev <- sum(dev.resids(y, wtdmu, weights))\n    n.ok <- nobs - sum(weights ==  0)\n    nulldf <- n.ok - as.integer(intercept)\n    rank <- if (EMPTY)\n        0\n    else fit$rank\n    resdf <- n.ok - rank\n    aic.model <- aic(y, length(y), mu, weights, dev) + 2 * rank\n    list(coefficients = coef, residuals = residuals, fitted.values = mu,\n         effects = if (!EMPTY) fit$effects, R = if (!EMPTY) Rmat,\n         rank = rank, qr = if (!EMPTY) structure(fit[c(\"qr\", \"rank\",\n                                                       \"qraux\", \"pivot\", \"tol\")], class = \"qr\"), family = family,\n         linear.predictors = eta, deviance = dev, aic = aic.model,\n         null.deviance = nulldev, iter = iter, weights = wt, prior.weights = weights,\n         df.residual = resdf, df.null = nulldf, y = y, converged = conv,\n         
boundary = boundary)\n}\n\n# copied from limma\nweighted.median.scde <- function (x, w, na.rm = FALSE)\n    #       Weighted median\n    #       Gordon Smyth\n    #       30 June 2005\n{\n    if (missing(w))\n        w <- rep.int(1, length(x))\n    else {\n        if(length(w)  !=  length(x)) stop(\"'x' and 'w' must have the same length\")\n        if(any(is.na(w))) stop(\"NA weights not allowed\")\n        if(any(w<0)) stop(\"Negative weights not allowed\")\n    }\n    if(is.integer(w))\n        w <- as.numeric(w)\n    if(na.rm) {\n        w <- w[i <- !is.na(x)]\n        x <- x[i]\n    }\n    if(all(w == 0)) {\n        warning(\"All weights are zero\")\n        return(NA)\n    }\n    o <- order(x)\n    x <- x[o]\n    w <- w[o]\n    p <- cumsum(w)/sum(w)\n    n <- sum(p<0.5)\n    if(p[n+1]  >  0.5)\n        x[n+1]\n    else\n        (x[n+1]+x[n+2])/2\n}\n\n# FROM common.r\nsn <- function(x) {\n    names(x) <- x\n    return(x)\n}\n\n# panel routines for pairs()\npairs.panel.hist <- function(x, i = NULL, ...) {\n    usr <- par(\"usr\")\n    on.exit(par(usr))\n    par(usr = c(usr[1:2], 0, 1.5) )\n    h <- hist(x, plot = FALSE)\n    breaks <- h$breaks\n    nB <- length(breaks)\n    y <- h$counts\n    y <- y/max(y)\n    rect(breaks[-nB], 0, breaks[-1], y, col = \"gray70\", ...)\n}\npairs.panel.cor <- function(x, y, digits = 2, prefix = \"\", cex.cor, i = NULL, j = NULL) {\n    usr <- par(\"usr\")\n    on.exit(par(usr))\n    par(usr = c(0, 1, 0, 1))\n    r <- abs(cor(x, y, method = \"pearson\"))\n    #r <- abs(cor(x, y, method = \"spearman\"))\n    txt <- format(c(r, 0.123456789), digits = digits)[1]\n    txt <- paste(prefix, txt, sep = \"\")\n    if(missing(cex.cor)) { cex <- 0.6/strwidth(txt) }\n    #text(0.5, 0.5, txt, cex = cex * r)\n    text(0.5, 0.5, txt, cex = cex)\n}\npairs.panel.scatter <- function(x, y, i = NULL, j = NULL, ...) 
{\n    vi <- x > 0 | y > 0\n    points(x[vi], y[vi], pch = \".\", col = densCols(x[vi], y[vi], colramp = colorRampPalette(brewer.pal(9, \"Blues\")[-(1:2)])), cex = 2)\n}\npairs.panel.smoothScatter <- function(x, y, i = NULL, j = NULL, ...) {\n    vi <- x > 0 | y > 0\n    smoothScatter(x[vi], y[vi], add = TRUE, ...)\n}\n\n# a slight modification of pairs that passes i/j indices to the panel methods\npairs.extended <- function (x, labels, panel = points, ...,\n                            lower.panel = panel, upper.panel = panel,\n                            diag.panel = NULL, text.panel = textPanel,\n                            label.pos = 0.5 + has.diag/3,\n                            cex.labels = NULL, font.labels = 1,\n                            row1attop = TRUE, gap = 1)\n{\n    textPanel <- function(x = 0.5, y = 0.5, txt, cex, font) {\n        text(x, y, txt, cex = cex, font = font)\n    }\n\n    localAxis <- function(side, x, y, xpd, bg, col = NULL, main, oma, ...) {\n        ## Explicitly ignore any color argument passed in as\n        ## it was most likely meant for the data points and\n        ## not for the axis.\n        if(side %%2 ==  1) { Axis(x, side = side, xpd = NA, ...) }\n        else { Axis(y, side = side, xpd = NA, ...) }\n    }\n\n    localPlot <- function(..., main, oma, font.main, cex.main) { plot(...) }\n    localLowerPanel <- function(..., main, oma, font.main, cex.main) { lower.panel(...) }\n    localUpperPanel <- function(..., main, oma, font.main, cex.main) { upper.panel(...) }\n    localDiagPanel <- function(..., main, oma, font.main, cex.main) { diag.panel(...) 
}\n\n    dots <- list(...)\n    nmdots <- names(dots)\n    if (!is.matrix(x)) {\n        x <- as.data.frame(x)\n        for(i in seq_along(names(x))) {\n            if(is.factor(x[[i]]) || is.logical(x[[i]])) {\n                x[[i]] <- as.numeric(x[[i]])\n            }\n            if(!is.numeric(unclass(x[[i]]))) {\n                stop(\"non-numeric argument to 'pairs'\")\n            }\n        }\n    } else if(!is.numeric(x)) {\n        stop(\"non-numeric argument to 'pairs'\")\n    }\n    panel <- match.fun(panel)\n    if((has.lower <- !is.null(lower.panel)) && !missing(lower.panel)) {\n        lower.panel <- match.fun(lower.panel)\n    }\n    if((has.upper <- !is.null(upper.panel)) && !missing(upper.panel)) {\n        upper.panel <- match.fun(upper.panel)\n    }\n    if((has.diag  <- !is.null( diag.panel)) && !missing( diag.panel)) {\n        diag.panel <- match.fun( diag.panel)\n    }\n\n    if(row1attop) {\n        tmp <- lower.panel\n        lower.panel <- upper.panel\n        upper.panel <- tmp\n        tmp <- has.lower\n        has.lower <- has.upper\n        has.upper <- tmp\n    }\n\n    nc <- ncol(x)\n    if (nc < 2) stop(\"only one column in the argument to 'pairs'\")\n    has.labs <- TRUE\n    if (missing(labels)) {\n        labels <- colnames(x)\n        if (is.null(labels)) {\n            labels <- paste(\"var\", 1L:nc)\n        }\n    } else if(is.null(labels)) {\n        has.labs <- FALSE\n    }\n    oma <- if(\"oma\" %in% nmdots) {\n        dots$oma\n    } else {\n        NULL\n    }\n    main <- if(\"main\" %in% nmdots) {\n        dots$main\n    } else {\n        NULL\n    }\n    if (is.null(oma)) {\n        oma <- c(4, 4, 4, 4)\n        if (!is.null(main)) {\n            oma[3L] <- 6\n        }\n    }\n    opar <- par(mfrow = c(nc, nc), mar = rep.int(gap/2, 4), oma = oma)\n    on.exit(par(opar))\n\n    for (i in if(row1attop) 1L:nc else nc:1L)\n        for (j in 1L:nc) {\n            localPlot(x[, j], x[, i], xlab = \"\", ylab = \"\", axes 
= FALSE, type = \"n\", ...)\n            if(i ==  j || (i < j && has.lower) || (i  >  j && has.upper) ) {\n                box()\n                if(i ==  1  && (!(j %% 2) || !has.upper || !has.lower ))\n                    localAxis(1 + 2*row1attop, x[, j], x[, i], ...)\n                if(i ==  nc && (  j %% 2  || !has.upper || !has.lower ))\n                    localAxis(3 - 2*row1attop, x[, j], x[, i], ...)\n                if(j ==  1  && (!(i %% 2) || !has.upper || !has.lower ))\n                    localAxis(2, x[, j], x[, i], ...)\n                if(j ==  nc && (  i %% 2  || !has.upper || !has.lower ))\n                    localAxis(4, x[, j], x[, i], ...)\n                mfg <- par(\"mfg\")\n                if(i ==  j) {\n                    if (has.diag) localDiagPanel(as.vector(x[, i]), i = i, ...)\n                    if (has.labs) {\n                        par(usr = c(0, 1, 0, 1))\n                        if(is.null(cex.labels)) {\n                            l.wid <- strwidth(labels, \"user\")\n                            cex.labels <- max(0.8, min(2, .9 / max(l.wid)))\n                        }\n                        text.panel(0.5, label.pos, labels[i],\n                                   cex = cex.labels, font = font.labels)\n                    }\n                } else if(i < j)\n                    localLowerPanel(as.vector(x[, j]), as.vector(x[, i]), i = i, j = j, ...)\n                else\n                    localUpperPanel(as.vector(x[, j]), as.vector(x[, i]), i = i, j = j, ...)\n                if (any(par(\"mfg\")  !=  mfg))\n                    stop(\"the 'panel' function made a new plot\")\n            } else {\n                par(new = FALSE)\n            }\n        }\n    if (!is.null(main)) {\n        font.main <- if(\"font.main\" %in% nmdots) {\n            dots$font.main\n        } else {\n            par(\"font.main\")\n        }\n        cex.main <- if(\"cex.main\" %in% nmdots) {\n            dots$cex.main\n        } else 
{\n            par(\"cex.main\")\n        }\n        mtext(main, 3, 3, TRUE, 0.5, cex = cex.main, font = font.main)\n    }\n    invisible(NULL)\n}\n\n\n# given a set of pdfs (columns), calculate summary statistics (mle, 95% CI, Z-score deviations from 0)\nquick.distribution.summary <- function(s.bdiffp) {\n    diffv <- as.numeric(colnames(s.bdiffp))\n    dq <- t(apply(s.bdiffp, 1, function(p) {\n        mle <- which.max(p)\n        p <- cumsum(p)\n        return(diffv[c(lb = max(c(1, which(p<0.025))), mle, min(c(length(p), which(p > (1-0.025)))))])\n    }))/log10(2)\n    colnames(dq) <- c(\"lb\", \"mle\", \"ub\")\n    cq <- rep(0, nrow(dq))\n    cq[dq[, 1] > 0] <- dq[dq[, 1] > 0, 1]\n    cq[dq[, 3]<0] <- dq[dq[, 3]<0, 3]\n    z <- get.ratio.posterior.Z.score(s.bdiffp)\n    za <- sign(z)*qnorm(p.adjust(pnorm(abs(z), lower.tail = FALSE), method = \"BH\"), lower.tail = FALSE)\n    data.frame(dq, \"ce\" = as.numeric(cq), \"Z\" = as.numeric(z), \"cZ\" = as.numeric(za))\n}\n\n\n#######\n## INTERNAL PAGODA ROUTINES\n#######\n\n# performs weighted centering of mat rows (mat - rowSums(mat*weights)/rowSums(weights))\n# possibly accounting for batch effects (i.e. 
centering each batch separately\nweightedMatCenter <- function(mat, matw, batch = NULL) {\n    if(is.null(batch)) {\n        return(mat-rowSums(mat*matw)/rowSums(matw))\n    } else {\n        cmat <- mat\n        invisible(tapply(seq_len(ncol(mat)), as.factor(batch), function(ii) {\n            cmat[, ii] <<- cmat[, ii, drop = FALSE] - rowSums(cmat[, ii, drop = FALSE]*matw[, ii, drop = FALSE])/rowSums(matw[, ii, drop = FALSE])\n        }))\n        return(cmat)\n    }\n}\n\n# per-experiment/per-gene weighted variance estimate\n# weight matrix should have the same dimensions as the data matrix\nweightedMatVar <- function(mat, matw, batch = NULL, center = TRUE, min.weight = 0, normalize.weights = TRUE) {\n    # normalize weights\n    #matw <- matw/rowSums(matw)\n    #matw <- matw/rowSums(matw)*ncol(matw)\n    if(center) {\n        mat <- weightedMatCenter(mat, matw, batch)\n    }\n\n    #weightedMatVar.Rcpp(mat, matw)\n    #return(rowSums(mat*mat*matw) / (1-rowSums(matw*matw)))\n    #return(rowSums(mat*mat*matw))\n\n    #return(rowSums(mat*mat*matw) * rowSums(matw) /pmax(rowSums(matw)^2 - rowSums(matw*matw), rep(min.weight, nrow(matw))))\n\n    v<- rowSums(mat*mat*matw)\n    if(normalize.weights) { v <- v/rowSums(matw) }\n    v\n}\n\n# GEV t() function\ngev.t <- function(x, loc, scale, shape = rep(0, length(loc)), log = FALSE) {\n    if(log) {\n        pmin(0, ifelse(shape == 0, -(x-loc)/scale, (-1/shape)*log(pmax(0, 1+shape*(x-loc)/scale))))\n    } else {\n        pmin(1, ifelse(shape == 0, exp(-(x-loc)/scale), ((pmax(0, 1+shape*(x-loc)/scale))^(-1/shape))))\n    }\n}\n# returns upper tail of GEV in log scale\npgev.upper.log <- function(x, loc, scale, shape = rep(0, length(loc))) {\n    tv <- gev.t(x, loc, scale, shape, log = TRUE)\n    tv[tv >  -5 & tv<0] <- log(-expm1(-exp(tv[tv >  -5 & tv<0])))\n    tv\n}\n\n# BH P-value adjustment with a log option\nbh.adjust <- function(x, log = FALSE) {\n    nai <- which(!is.na(x))\n    ox <- x\n    x<-x[nai]\n    id <- 
order(x, decreasing = FALSE)\n    if(log) {\n        q <- x[id] + log(length(x)/seq_along(x))\n    } else {\n        q <- x[id]*length(x)/seq_along(x)\n    }\n    a <- rev(cummin(rev(q)))[order(id)]\n    ox[nai]<-a\n    ox\n}\n\npathway.pc.correlation.distance <- function(pcc, xv, n.cores = 10, target.ndf = NULL) {\n    # all relevant gene names\n    rotn <- unique(unlist(lapply(pcc[gsub(\"^#PC\\\\d+# \", \"\", rownames(xv))], function(d) rownames(d$xp$rotation))))\n    # prepare an ordered (in terms of genes) and centered version of each component\n    pl <- papply(rownames(xv), function(nam) {\n        pnam <- gsub(\"^#PC\\\\d+# \", \"\", nam)\n        pn <- as.integer(gsub(\"^#PC(\\\\d+)# .*\", \"\\\\1\", nam))\n        rt <- pcc[[pnam]]$xp$rotation[, pn]\n        # order names/values according to increasing name match index\n        mi <- match(names(rt), rotn)\n        mo <- order(mi, decreasing = FALSE)\n        rt <- as.numeric(rt)-mean(rt)\n        return(list(i = mi[mo], v = rt[mo]))\n    }, n.cores = n.cores)\n\n    x <- .Call(\"plSemicompleteCor2\", pl, PACKAGE = \"scde\")\n\n    if(!is.null(target.ndf)) {\n        r <- x$r[upper.tri(x$r)]\n        n <- x$n[upper.tri(x$n)]\n        suppressWarnings(tv <- r*sqrt((n-2)/(1-r^2)))\n        z <- pt(tv, df = n-2, lower.tail = FALSE, log.p = TRUE)\n        nr <- qt(z, df = target.ndf-2, lower.tail = FALSE, log.p = TRUE)\n        nr <- nr/sqrt(target.ndf-2+nr^2)\n        nr[is.nan(nr)] <- r[is.nan(nr)]\n\n        cr <- x$r\n        cr[upper.tri(cr)] <- nr\n        cr[lower.tri(cr)] <- t(cr)[lower.tri(cr)]\n    } else {\n        cr <- x$r\n    }\n\n    rownames(cr) <- colnames(cr) <- rownames(xv)\n    d <- stats::as.dist(1-abs(cr))\n    d[d<0] <- 0\n    d\n\n}\n\ncollapse.aspect.clusters <- function(d, dw, ct, scale = TRUE, pick.top = FALSE) {\n    xvm <- do.call(rbind, tapply(seq_len(nrow(d)), factor(ct, levels = sort(unique(ct))), function(ii) {\n        if(length(ii) == 1) return(d[ii, ])\n        if(pick.top) 
{\n            return(d[ii[which.max(apply(d[ii, ], 1, var))], ])\n        }\n        xp <- pcaMethods::pca(t(d[ii, ]), nPcs = 1, center = TRUE, scale = \"none\")\n        xv <- pcaMethods::scores(xp)[, 1]\n        if(sum(abs(diff(xv))) > 0 && cor(xv, colMeans(d[ii, ]*abs(pcaMethods::loadings(xp)[, 1])))<0) { xv <- -1*xv }\n        #set scale at top pathway?\n        if(sum(abs(diff(xv))) > 0) {\n            if(scale) {\n                xv <- xv*sqrt(max(apply(d[ii, ], 1, var)))/sqrt(var(xv))\n            }\n            if(sum(abs(xv)) == 0) { xv <- abs(rnorm(length(xv), sd = 1e-6)) }\n        } else {\n            xv <- abs(rnorm(length(xv), sd = 1e-6))\n        }\n        #xv <- xv/sqrt(length(ii))\n        xv\n    }))\n    rownames(xvm) <- unlist(tapply(seq_len(nrow(d)), factor(ct, levels = sort(unique(ct))), function(ii) {\n        if(length(ii) == 1) return(rownames(d)[ii])\n        return(rownames(d)[ii[which.max(apply(d[ii, ], 1, var))]])\n    }))\n\n    xvmw <- do.call(rbind, tapply(seq_len(nrow(d)), factor(ct, levels = sort(unique(ct))), function(ii) {\n        w <- colSums(dw[ii, , drop = FALSE]*apply(d[ii, , drop = FALSE], 1, sd))\n        w <- w/sum(w)\n    }))\n\n    return(list(d = xvm, w = xvmw))\n}\n# convert R color to a web hex representation\ncol2hex <- function(col) {\n    unlist(lapply(col, function(c) {\n        c <- col2rgb(c)\n        sprintf(\"#%02X%02X%02X\", c[1], c[2], c[3])\n    }))\n}\n\nmy.heatmap2 <- function (x, Rowv = NULL, Colv = if(symm)\"Rowv\" else NULL,\n                         distfun = dist, hclustfun = hclust,\n                         reorderfun = function(d, w) reorder(d, w),\n                         add.expr, symm = FALSE, revC = identical(Colv, \"Rowv\"),\n                         scale = c(\"none\", \"row\", \"column\"), na.rm = TRUE,\n                         margins = c(5, 5), internal.margin = 0.5, ColSideColors, RowSideColors,\n                         cexRow = 0.2 + 1/log10(nr), cexCol = 0.2 + 1/log10(nc),\n     
                    labRow = NULL, labCol = NULL,  xlab = NULL, ylab = NULL,\n                         keep.dendro = FALSE,\n                         grid = FALSE, grid.col = 1, grid.lwd = 1,\n                         verbose = getOption(\"verbose\"), Colv.vsize = 0.15, Rowv.hsize = 0.15, ColSideColors.unit.vsize = \"0.08\", RowSideColors.hsize = 0.03, lasCol = 2, lasRow = 2, respect = FALSE, box = FALSE, zlim = NULL, ...)\n{\n    scale <- if(symm && missing(scale)) \"none\" else match.arg(scale)\n    if(length(di <- dim(x))  !=  2 || !is.numeric(x))\n        stop(\"'x' must be a numeric matrix\")\n    nr <- di[1]\n    nc <- di[2]\n    if(nr < 1 || nc <=  1)\n        stop(\"'x' must have at least one row and 2 columns\")\n    if(!is.numeric(margins) || length(margins)  !=  2)\n        stop(\"'margins' must be a numeric vector of length 2\")\n\n    if(is.null(zlim)) {\n        zlim <- range(x[is.finite(x)])\n    } else {\n        x[x<zlim[1]] <- zlim[1]\n        x[x > zlim[2]] <- zlim[2]\n    }\n\n    doRdend <- !identical(Rowv, NA)\n    doCdend <- !identical(Colv, NA)\n    ## by default order by row/col means\n    if(is.null(Rowv)) {\n        Rowv <- rowMeans(x, na.rm = na.rm)\n    }\n    if(is.null(Colv)) {\n        Colv <- colMeans(x, na.rm = na.rm)\n    }\n\n    ## get the dendrograms and reordering indices\n\n    if(doRdend) {\n        if(inherits(Rowv, \"dendrogram\"))\n            ddr <- Rowv\n        else {\n            hcr <- hclustfun(distfun(x))\n            ddr <- as.dendrogram(hcr)\n            if(!is.logical(Rowv) || Rowv)\n                ddr <- reorderfun(ddr, Rowv)\n        }\n        if(nr  !=  length(rowInd <- order.dendrogram(ddr)))\n            stop(\"row dendrogram ordering gave index of wrong length\")\n    }\n    else rowInd <- 1:nr\n\n    if(doCdend) {\n        if(inherits(Colv, \"dendrogram\"))\n            ddc <- Colv\n        else if(identical(Colv, \"Rowv\")) {\n            if(nr  !=  nc)\n                stop('Colv = \"Rowv\" but 
nrow(x)  !=  ncol(x)')\n            ddc <- ddr\n        }\n        else {\n            hcc <- hclustfun(distfun(if(symm)x else t(x)))\n            ddc <- as.dendrogram(hcc)\n            if(!is.logical(Colv) || Colv)\n                ddc <- reorderfun(ddc, Colv)\n        }\n        if(nc  !=  length(colInd <- order.dendrogram(ddc)))\n            stop(\"column dendrogram ordering gave index of wrong length\")\n    }\n    else colInd <- 1:nc\n\n    ## reorder x\n    x <- x[rowInd, colInd, drop = FALSE]\n\n    labRow <-\n        if(is.null(labRow))\n            if(is.null(rownames(x))) (1:nr)[rowInd] else rownames(x)\n    else labRow[rowInd]\n    labCol <-\n        if(is.null(labCol))\n            if(is.null(colnames(x))) (1:nc)[colInd] else colnames(x)\n    else labCol[colInd]\n\n    if(scale ==  \"row\") {\n        x <- sweep(x, 1, rowMeans(x, na.rm = na.rm))\n        sx <- apply(x, 1, sd, na.rm = na.rm)\n        x <- sweep(x, 1, sx, \"/\")\n    }\n    else if(scale ==  \"column\") {\n        x <- sweep(x, 2, colMeans(x, na.rm = na.rm))\n        sx <- apply(x, 2, sd, na.rm = na.rm)\n        x <- sweep(x, 2, sx, \"/\")\n    }\n\n    ## Calculate the plot layout\n    ds <- dev.size(units = \"cm\")\n\n\n    lmat <- rbind(c(NA, 3), 2:1)\n    if(doRdend) {\n        lwid <- c(if(is.character(Rowv.hsize)) Rowv.hsize else lcm(Rowv.hsize*ds[1]), 1)\n    } else {\n        lmat[2, 1] <- NA\n        lmat[1, 2] <- 2\n        lwid <- c(0, 1)\n    }\n    if(doCdend) {\n        lhei <- c(if(is.character(Colv.vsize)) Colv.vsize else lcm(Colv.vsize*ds[2]), 1)\n    } else {\n        lmat[1, 2] <- NA\n        lhei <- c(0, 1)\n    }\n    #lwid <- c(if(doRdend) lcm(Rowv.hsize*ds[1]) else \"0.5 cm\", 1)\n    #lhei <- c((if(doCdend) lcm(Colv.vsize*ds[2]) else \"0.5 cm\"), 1)\n    if(!missing(ColSideColors) && !is.null(ColSideColors)) { ## add middle row to layout\n\n        if(is.matrix(ColSideColors)) {\n            if(ncol(ColSideColors) != nc)\n                stop(\"'ColSideColors' 
matrix must have the same number of columns as length ncol(x)\")\n            if(is.character(ColSideColors.unit.vsize)) {\n                ww <- paste(as.numeric(gsub(\"(\\\\d+\\\\.?\\\\d*)(.*)\", \"\\\\1\", ColSideColors.unit.vsize, perl = TRUE))*nrow(ColSideColors), gsub(\"(\\\\d+\\\\.?\\\\d*)(.*)\", \"\\\\2\", ColSideColors.unit.vsize, perl = TRUE), sep = \"\")\n            } else {\n                ww <- lcm(ColSideColors.unit.vsize*ds[2]*nrow(ColSideColors))\n            }\n            lmat <- rbind(lmat[1, ]+1, c(NA, 1), lmat[2, ]+1)\n            lhei <- c(lhei[1], ww, lhei[2])\n        } else {\n            if(!is.character(ColSideColors) || length(ColSideColors)  !=  nc)\n                stop(\"'ColSideColors' must be a character vector of length ncol(x)\")\n            if(is.character(ColSideColors.unit.vsize)) {\n                ww <- paste(as.numeric(gsub(\"(\\\\d+\\\\.?\\\\d*)(.*)\", \"\\\\1\", ColSideColors.unit.vsize, perl = TRUE)), gsub(\"(\\\\d+\\\\.?\\\\d*)(.*)\", \"\\\\2\", ColSideColors.unit.vsize, perl = TRUE), sep = \"\")\n            } else {\n                ww <- lcm(ColSideColors.unit.vsize*ds[2])\n            }\n            lmat <- rbind(lmat[1, ]+1, c(NA, 1), lmat[2, ]+1)\n            lhei <- c(lhei[1], ww, lhei[2])\n        }\n    }\n    if(!missing(RowSideColors) && !is.null(RowSideColors)) { ## add middle column to layout\n        if(!is.character(RowSideColors) || length(RowSideColors)  !=  nr)\n            stop(\"'RowSideColors' must be a character vector of length nrow(x)\")\n        lmat <- cbind(lmat[, 1]+1, c(rep(NA, nrow(lmat)-1), 1), lmat[, 2]+1)\n        lwid <- c(lwid[1], if(is.character(RowSideColors.hsize)) RowSideColors.hsize else lcm(RowSideColors.hsize*ds[1]), lwid[2])\n    }\n    lmat[is.na(lmat)] <- 0\n    if(verbose) {\n        cat(\"layout: widths = \", lwid, \", heights = \", lhei, \" lmat = \\n\")\n        print(lmat)\n    }\n\n    ## Graphics `output' -----------------------\n\n    op <- par(no.readonly = TRUE)\n 
   #on.exit(par(op))\n    layout(lmat, widths = lwid, heights = lhei, respect = respect)\n    ## draw the side bars\n    if(!missing(RowSideColors) && !is.null(RowSideColors)) {\n        par(mar = c(margins[1], 0, 0, internal.margin))\n        image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE)\n        if(box) { box() }\n    }\n    if(!missing(ColSideColors) && !is.null(ColSideColors)) {\n        par(mar = c(internal.margin, 0, 0, margins[2]))\n        if(is.matrix(ColSideColors)) {\n            image(t(matrix(1:length(ColSideColors), byrow = TRUE, nrow = nrow(ColSideColors), ncol = ncol(ColSideColors))), col = as.vector(t(ColSideColors[, colInd, drop = FALSE])), axes = FALSE)\n            if(box) { box() }\n        } else {\n            image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE)\n            if(box) { box() }\n        }\n    }\n    ## draw the main carpet\n    par(mar = c(margins[1], 0, 0, margins[2]))\n    if(!symm || scale  !=  \"none\")\n        x <- t(x)\n    if(revC) { # x columns reversed\n        iy <- nr:1\n        ddr <- rev(ddr)\n        x <- x[, iy, drop = FALSE]\n    } else iy <- 1:nr\n\n    image(1:nc, 1:nr, x, xlim = 0.5+ c(0, nc), ylim = 0.5+ c(0, nr),\n          axes = FALSE, xlab = \"\", ylab = \"\", zlim = zlim, ...)\n    if(box) { box() }\n    axis(1, 1:nc, labels =  labCol, las =  lasCol, line =  -0.5, tick =  0, cex.axis =  cexCol)\n    if(!is.null(xlab)) mtext(xlab, side = 1, line = margins[1] - 1.25)\n    axis(4, iy, labels =  labRow, las =  lasRow, line =  -0.5, tick =  0, cex.axis =  cexRow)\n    if(!is.null(ylab)) mtext(ylab, side = 4, line = margins[2] - 1.25, las = lasRow)\n    if (!missing(add.expr))\n        eval(substitute(add.expr))\n\n\n    if(grid) {\n        abline(v = c(1:nc)-0.5, col = grid.col, lwd = grid.lwd)\n        abline(h = c(1:nr)-0.5, col = grid.col, lwd = grid.lwd)\n        box(col = grid.col, lwd = grid.lwd)\n    }\n\n    ## the two dendrograms :\n    if(doRdend) {\n        par(mar = 
c(margins[1], internal.margin, 0, 0))\n        plot(ddr, horiz = TRUE, axes = FALSE, yaxs = \"i\", leaflab = \"none\", xaxs = \"i\")\n    }\n\n    if(doCdend) {\n        par(mar = c(0, 0, internal.margin, margins[2]))\n        plot(ddc, axes = FALSE, xaxs = \"i\", leaflab = \"none\", yaxs = \"i\")\n    }\n    invisible(list(rowInd = rowInd, colInd = colInd,\n                   Rowv = if(keep.dendro && doRdend) ddr,\n                   Colv = if(keep.dendro && doCdend) ddc ))\n}\n\n\n# rook class for browsing differential expression results\n\nViewDiff <- setRefClass(\n    'ViewDiff',\n    fields = c('gt', 'models', 'counts', 'prior', 'groups', 'batch', 'geneLookupURL'),\n    methods = list(\n\n        initialize = function(results, models, counts, prior, groups = NULL, batch = NULL, geneLookupURL = NULL) {\n            if(!is.null(results$results)) {\n                gt <<- results$results\n            } else {\n                gt <<- results\n            }\n            # add raw names if this wasn't a batch-corrected sample\n            if(\"mle\" %in% colnames(gt)) {\n                colnames(gt) <<- paste(colnames(gt), \"raw\", sep = \"_\")\n            }\n            if(!is.null(results$batch.adjusted)) {\n                df <- results$batch.adjusted\n                colnames(df) <- paste(colnames(df), \"cor\", sep = \"_\")\n                gt <<- cbind(gt, df)\n            }\n            if(!is.null(results$batch.effect)) {\n                df <- results$batch.effect\n                colnames(df) <- paste(colnames(df), \"bat\", sep = \"_\")\n                gt <<- cbind(gt, df)\n            }\n            colnames(gt) <<- tolower(colnames(gt))\n\n            # append expression levels to the results\n            if(!is.null(results$joint.posteriors)) {\n                gt$lev1 <<- log10(as.numeric(colnames(results$joint.posteriors[[1]]))[max.col(results$joint.posteriors[[1]])]+1)\n                gt$lev2 <<- 
log10(as.numeric(colnames(results$joint.posteriors[[2]]))[max.col(results$joint.posteriors[[2]])]+1)\n            }\n            gt$gene <<- rownames(gt)\n            gt <<- data.frame(gt)\n\n            # guess gene lookup for common cases\n            if(is.null(geneLookupURL)) {\n                # human\n                if( any(grepl(\"ENSG\\\\d+\", gt$gene[1])) || any(c(\"CCLU1\", \"C22orf45\") %in% gt$gene)) {\n                    geneLookupURL <<- \"http://useast.ensembl.org/Homo_sapiens/Gene/Summary?g = {0}\"\n                } else if( any(grepl(\"ENSMUSG\\\\d+\", gt$gene[1]))) {\n                    geneLookupURL <<- \"http://useast.ensembl.org/Mus_musculus/Gene/Summary?g = {0}\"\n                } else if( any(c(\"Foxp2\", \"Sept1\", \"Lrrc34\") %in% gt$gene)) {\n                    # mouse MGI\n                    geneLookupURL <<- \"http://www.informatics.jax.org/searchtool/Search.do?query = {0}\"\n                } else if( any(grepl(\"FBgn\\\\d+\", gt$gene[1])) || any(c(\"CG3680\", \"CG8290\") %in% gt$gene)) {\n                    # flybase\n                    geneLookupURL <<- \"http://flybase.org/cgi-bin/uniq.html?db = fbgn&GeneSearch = {0}&context = {1}&species = Dmel&cs = yes&caller = genejump\"\n                } else {\n                    # default, forward to ensemble search\n                    geneLookupURL <<- \"http://useast.ensembl.org/Multi/Search/Results?q = {0}site = ensembl\"\n                }\n            } else {\n                geneLookupURL <<- geneLookupURL\n            }\n\n\n\n            gt <<- gt[gt$z_raw != \"NA\", ]\n            gt <<- gt[!is.na(gt$z_raw), ]\n\n\n            models <<- models\n            counts <<- counts\n            prior <<- prior\n            if(is.null(groups)) { # recover groups from models\n                groups <<- as.factor(attr(models, \"groups\"))\n                if(is.null(groups)) stop(\"ERROR: groups factor is not provided, and models structure is lacking groups attribute\")\n            
    names(groups) <<- rownames(models)\n            } else {\n                groups <<- groups\n            }\n            if(length(levels(groups)) != 2) {\n                stop(paste(\"ERROR: wrong number of levels in the grouping factor (\", paste(levels(groups), collapse = \" \"), \"), but must be two.\", sep = \"\"))\n            }\n\n            batch <<- batch\n            callSuper()\n        },\n        call = function(env){\n            path <- env[['PATH_INFO']]\n            req <- Request$new(env)\n            res <- Response$new()\n\n            switch(path,\n                   # INDEX\n                   '/index.html' = {\n                       body <- paste('<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\" >\n                                     <html >\n                                     <head >\n                                     <meta http-equiv = \"Content-Type\" content = \"text/html charset = iso-8859-1\" >\n                                     <title > SCDE: ', paste(levels(groups), collapse = \" vs. 
\"), '</title >\n                                     <!-- ExtJS -- >\n                                     <link rel = \"stylesheet\" type = \"text/css\" href = \"http://pklab.med.harvard.edu/sde/extjs/resources/ext-theme-neptune/ext-theme-neptune-all.css\" / >\n\n                                     <!-- Shared -- >\n                                     <link rel = \"stylesheet\" type = \"text/css\" href = \"http://pklab.med.harvard.edu/sde/ext-4.2.1.883/examples/shared/example.css\" / >\n\n                                     <link rel = \"stylesheet\" type = \"text/css\" href = \"http://pklab.med.harvard.edu/sde/additional.css\" / >\n                                     <!-- GC -- >\n\n                                     <style type = \"text/css\" >\n                                     .x-panel-framed {\n                                     padding: 0\n                                     }\n                                     </style >\n                                     <script type = \"text/javascript\" src = \"http://pklab.med.harvard.edu/sde/ext-4.2.1.883/ext-all.js\" > </script >\n\n                                     <script type = \"text/javascript\" > var geneLookupURL = \"', geneLookupURL, '\"</script >\n                                     <script type = \"text/javascript\" src = \"http://pklab.med.harvard.edu/sde/viewembed.js\" > </script >\n\n                                     </head >\n                                     <body style = \"margin-top:0padding-top:10px\" >\n                                     <div id = \"example-grid\" > </div >\n                                     </body >\n                                     </html >\n                                     ', sep = \"\")\n                       res$header('\"Content-Type\": \"text/html\"')\n                       res$write(body)\n                   },\n                   # GENE TABLE\n                   '/genetable.json' = {\n                       lgt <- gt\n              
         if(!is.null(req$params()$filter)) {\n                           fl <- rjson::fromJSON(URLdecode(req$params()$filter))\n                           for( fil in fl) {\n                               lgt <- lgt[grep(fil$value, lgt[, fil$property], perl = TRUE, ignore.case = TRUE), ]\n                           }\n                       }\n                       start <- ifelse(is.null(req$params()$start), 1, as.integer(req$params()$start)+1)\n                       limit <- ifelse(is.null(req$params()$limit), 1000, as.integer(req$params()$limit))\n                       dir <- ifelse(is.null(req$params()$dir), \"DESC\", req$params()$dir)\n                       trows <- nrow(lgt)\n                       if(trows > 0) {\n                           if(!is.null(req$params()$sort)) {\n                               if(req$params()$sort %in% colnames(lgt)) {\n                                   lgt <- lgt[order(lgt[, req$params()$sort], decreasing = (dir == \"DESC\")), ]\n                               }\n                           } else { # default sort\n                               if(is.null(lgt$z_cor)) { lgt <- lgt[order(abs(lgt$z_raw), decreasing = TRUE), ] } else { lgt <- lgt[order(abs(lgt$z_cor), decreasing = TRUE), ] }\n                           }\n                       }\n                       lgt <- format(lgt[min(start, nrow(lgt)):min((start+limit), nrow(lgt)), ], nsmall = 2, digits = 2)\n                       ol <- apply(lgt, 1, function(x) as.list(x))\n                       names(ol) <- NULL\n                       s <- rjson::toJSON(list(totalCount = trows, genes = ol))\n                       res$header('\"Content-Type\": \"application/json\"')\n                       if(!is.null(req$params()$callback)) {\n                           res$write(paste(req$params()$callback, \"(\", s, \")\", sep = \"\"))\n                       } else {\n                           res$write(s)\n                       }\n\n                   },\n                   
# POSTERIOR PLOT\n                   '/posterior.png' = {\n                       gene <- ifelse(is.null(req$params()$gene), sample(gt$gene), req$params()$gene)\n                       bootstrap <- ifelse(is.null(req$params()$bootstrap), TRUE, req$params()$bootstrap == \"T\")\n                       show.individual.posteriors <- ifelse(is.null(req$params()$show.individual.posteriors), TRUE, req$params()$show.individual.posteriors == \"true\")\n\n                       t <- tempfile()\n                       #require(Cairo)\n                       CairoPNG(filename = TRUE, width = 350, height = 560)\n                       scde.test.gene.expression.difference(gene = gene, models = models, counts = counts, groups = groups, prior = prior, batch = batch, ratio.range = c(-10, 10), show.individual.posteriors = show.individual.posteriors, verbose = FALSE)\n                       dev.off()\n                       res$header('Content-type', 'image/png')\n                       res$body <- t\n                       names(res$body) <- 'file'\n                   },\n                   # GENE EXPRESSION LEVELS\n                   '/elevels.html' = {\n                       geneName <- ifelse(is.null(req$params()$geneName), gt$gene[[1]], req$params()$geneName)\n                       gc <- counts[rownames(counts) == geneName, , drop = FALSE]\n                       fpm <- exp(scde.expression.magnitude(models, counts = gc))\n                       df <- rbind(FPM = gc, level = fpm)\n                       df <- round(df, 2)\n                       # order columns according to groups\n                       df <- df[, unlist(tapply(seq_len(ncol(df)), groups, I))]\n                       cell.col <- rep(c(\"#E9A994\", \"#66CCFF\"), as.integer(table(groups)))\n\n                       render.row <- function(nam, val, col) {\n                           paste(\"<tr > \", \"<th > \", nam, \"</th > \", paste(\"<td bgcolor = \", col, \" > \", val, \"</td > \", sep = \"\", collapse = \" 
\"), \"</tr > \", sep = \"\")\n                       }\n\n                       sh <- paste(\"<tr > \", paste(\"<th > \", c(\" \", colnames(df)), \"</th > \", sep = \"\", collapse = \" \"), \"</tr > \")\n                       #sb <- paste(render.row(\"cells\", colnames(df), cell.col), render.row(\"FPKM\", df[1, ], cell.col), render.row(\"mode\", df[2, ], cell.col), collapse = \"\\n\")\n                       sb <- paste(render.row(\"counts\", df[1, ], cell.col), render.row(\"FPM\", df[2, ], cell.col), collapse = \"\\n\")\n                       res$header('\"Content-Type\": \"text/html\"')\n                       res$write(paste(\"<table id = \\\"elevels\\\" > \", sh, sb, \"</table > \"))\n                   },\n{\n    res$write('default')\n}\n                       )\n            res$finish()\n        }\n            )\n    )\n\nt.view.pathways <- function(pathways, mat, matw, env, proper.names = rownames(mat), colcols = NULL, zlim = NULL, labRow = NA, vhc = NULL, cexCol = 1, cexRow = 1, n.pc = 1, nstarts = 50, row.order = NULL, show.Colv = TRUE, plot = TRUE, trim = 1.1/ncol(mat), bwpca = TRUE, ...) 
{\n    # retrieve gis\n    lab <- which(proper.names %in% na.omit(unlist(mget(pathways, envir = env, ifnotfound = NA))))\n\n    if(length(lab) == 0) {\n        # try genes\n        lab <- which(proper.names %in% pathways)\n    }\n    if(length(lab) == 0)\n        return(NULL)\n    #t.quick.show.mat(mat[lab, ], normalize.rows = TRUE)\n    #table(rownames(mat) %in% mget(pathways, envir = env))\n\n    if(trim > 0) {\n        mat <- winsorize.matrix(mat, trim = trim)\n    }\n    d <- mat[lab, , drop = FALSE]\n    dw <- matw[lab, , drop = FALSE]\n    if(length(lab) > 2) {\n        xp <- bwpca(t(d), t(dw), npcs = n.pc, center = FALSE, nstarts = 3)\n    } else {\n        xp <- list()\n    }\n\n    d <- d-rowMeans(d)\n    dd <- as.dist(1-abs(cor(t(as.matrix(d)))))\n    dd[is.na(dd)] <- 1\n    if(is.null(row.order)) {\n        if(length(lab) > 2) {\n            if(is.element(\"fastcluster\", installed.packages()[, 1])) {\n                hc <- fastcluster::hclust(dd, method = \"ward.D\")\n            } else {\n                hc <- stats::hclust(dd, method = \"ward.D\")\n            }\n            row.order <- hc$order\n        } else {\n            row.order <- c(seq_along(lab))\n        }\n    }\n\n    if(is.null(vhc)) {\n        vd <- as.dist(1-cor(as.matrix(d)))\n        vd[is.na(vd)] <- 1\n        if(is.element(\"fastcluster\", installed.packages()[, 1])) {\n            vhc <- fastcluster::hclust(vd, method = \"ward.D\")\n        } else {\n            vhc <- stats::hclust(vd, method = \"ward.D\")\n        }\n\n    }\n\n    if(is.null(zlim)) { zlim <- quantile(d, p = c(0.01, 0.99)) }\n    vmap <- d\n    vmap[vmap<zlim[1]] <- zlim[1]\n    vmap[vmap > zlim[2]] <- zlim[2]\n    rownames(vmap) <- rownames(d)\n\n    aval <- colSums(d*dw)/colSums(dw)\n    if(!is.null(xp$scores)) {\n        oc <- xp$scores[, n.pc]\n        if(cor(oc, aval, method = \"spearman\")<0) {\n            oc <- -1*oc\n            xp$scores[, n.pc] <- -1*xp$scores[, n.pc]\n            xp$rotation[, n.pc] 
<- -1*xp$rotation[, n.pc]\n        }\n        xp$oc <- oc\n        z <- colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space = \"Lab\")(100)[round(oc/max(abs(oc))*49)+50]\n        ld <- xp$rotation[row.order, , drop = FALSE]\n        ld <- colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space = \"Lab\")(100)[round(ld/max(abs(ld))*49)+50]\n    } else {\n        ld <- z <- NULL\n    }\n\n    aval <- colorRampPalette(c(\"blue\", \"white\", \"red\"), space = \"Lab\")(100)[round(aval/max(abs(aval))*49)+50]\n\n\n\n    # oc2 <- xp$scores[, 2]\n    # oc2 <- colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space = \"Lab\")(100)[round(oc2/max(abs(oc2))*49)+50]\n    # oc3 <- xp$scores[, 3]\n    # oc3 <- colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space = \"Lab\")(100)[round(oc3/max(abs(oc3))*49)+50]\n\n\n    #z <- do.call(rbind, list(aval, oc))\n    #z <- rbind(oc3, oc2, oc)\n\n    col <- colorRampPalette(c(\"blue\", \"white\", \"red\"), space = \"Lab\")(256)\n    if(plot) {\n        if(!is.null(colcols)) { z <- rbind(colcols, z) }\n        if(show.Colv) { Colv <- as.dendrogram(vhc) } else { Colv <- NA }\n        my.heatmap2(vmap[row.order, , drop = FALSE], Rowv = NA, Colv = Colv, zlim = zlim, col = col, scale = \"none\", RowSideColors = ld, ColSideColors = z, labRow = labRow, cexCol = cexCol, cexRow = cexRow, ...)\n    }\n    xp$vhc <- vhc\n    xp$lab <- lab\n    xp$row.order <- row.order\n    xp$col <- col\n    xp$oc.col <- colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space = \"Lab\")(256)\n    xp$vmap <- vmap\n    xp$zlim <- zlim\n    return(invisible(xp))\n}\n\n##' View pathway or gene weighted PCA\n##'\n##' Takes in a list of pathways (or a list of genes), runs weighted PCA, optionally showing the result.\n##' @param pathways character vector of pathway or gene names\n##' @param varinfo output of pagoda.varnorm()\n##' @param goenv environment mapping pathways to genes\n##' @param n.genes number 
of genes to show\n##' @param two.sided whether the set of shown genes should be split among highest and lowest loading (T) or if genes with highest absolute loading (F) should be shown\n##' @param n.pc optional integer vector giving the number of principal component to show for each listed pathway\n##' @param colcols optional column color matrix\n##' @param zlim optional z color limit\n##' @param showRowLabels controls whether row labels are shown in the plot\n##' @param cexCol column label size (cex)\n##' @param cexRow row label size (cex)\n##' @param nstarts number of random starts for the wPCA\n##' @param cell.clustering cell clustering\n##' @param show.cell.dendrogram whether cell dendrogram should be shown\n##' @param plot whether the plot should be shown\n##' @param box whether to draw a box around the plotted matrix\n##' @param trim optional Winsorization trim that should be applied\n##' @param return.details whether the function should return the matrix as well as full PCA info instead of just PC1 vector\n##' @param ... additional arguments are passed to the \\code{c.view.pathways}\n##' @return cell scores along the first principal component of shown genes (returned as invisible)\n##' @export\npagoda.show.pathways <- function(pathways, varinfo, goenv = NULL, n.genes = 20, two.sided = FALSE, n.pc = rep(1, length(pathways)), colcols = NULL, zlim = NULL, showRowLabels = FALSE, cexCol = 1, cexRow = 1, nstarts = 10, cell.clustering = NULL, show.cell.dendrogram = TRUE, plot = TRUE, box = TRUE, trim = 0, return.details = FALSE , ...) 
{\n    labRow <- NA\n    if(showRowLabels) { labRow <- NULL }\n    x <- c.view.pathways(pathways, varinfo$mat, varinfo$matw, goenv, batch = varinfo$batch, n.genes = n.genes, two.sided = two.sided, n.pc = n.pc, colcols = colcols, zlim = zlim, labRow = labRow, cexCol = cexCol, cexRow = cexRow, trim = trim, show.Colv = show.cell.dendrogram, plot = plot, vhc = cell.clustering, labCol = NA, box = TRUE, ...)\n    if(return.details) {\n        invisible(x)\n    } else {\n        invisible(x$scores[, 1])\n    }\n}\n\n# takes in a list of pathways with a list of corresponding PC numbers\n# recalculates PCs for each individual pathway, weighting gene loading in each pathway and then by total\n# pathway variance over the number of genes (rough approximation)\nc.view.pathways <- function(pathways, mat, matw, goenv = NULL, batch = NULL, n.genes = 20, two.sided = TRUE, n.pc = rep(1, length(pathways)), colcols = NULL, zlim = NULL, labRow = NA, vhc = NULL, cexCol = 1, cexRow = 1, nstarts = 50, row.order = NULL, show.Colv = TRUE, plot = TRUE, trim = 1.1/ncol(mat), showPC = TRUE,  ...) {\n    # are these genes or pathways being passed?\n    if(!is.null(goenv)) {\n        x <- pathways %in% ls(goenv)\n    } else {\n        x <- rep(FALSE, length(pathways))\n    }\n    if(sum(x) > 0) { # some pathways matched\n      if(!all(x)) {\n        message(\"WARNING: partial match to pathway names. The following entries did not match: \", paste(pathways[!x], collapse = \" \"))\n        }\n        # look up genes for each pathway\n        pathways <- pathways[x]\n        p.genes <- mget(pathways, goenv, ifnotfound = NA)\n    } else { # try as genes\n        x <- pathways %in% rownames(mat)\n        if(sum(x) > 0) {\n            if(!all(x)) {\n                message(\"WARNING: partial match to gene names. 
The following entries did not match: \", paste(pathways[!x], collapse = \" \"))\n            }\n            p.genes <- list(\"genes\" = pathways[x])\n            pathways <- c(\"genes\");\n        } else { # neither genes nor pathways are passed\n            stop(\"ERROR: provided names do not match either gene nor pathway names (if the pathway environment was provided)\")\n        }\n    }\n    gvi <- rownames(mat) %in% unlist(p.genes)\n    if(trim > 0) {\n        mat <- winsorize.matrix(mat, trim = trim)\n    }\n    # recalculate wPCA for each pathway\n    ppca <- pagoda.pathway.wPCA(varinfo = list(mat = mat[gvi, , drop = FALSE], matw = matw[gvi, , drop = FALSE], batch = batch), setenv = list2env(p.genes), n.cores = 1, n.randomizations = 0, n.starts = 2, n.components = max(n.pc), verbose = FALSE, min.pathway.size = 0, max.pathway.size = Inf, n.internal.shuffles = 0)\n\n    if(length(ppca) > 1) { # if more than one pathway was supplied, combine genes using appropriate loadings and use consensus PCA (1st PC) as a pattern\n        # score top loading genes for each desired PC, scaling by the sd/sqrt(n) (so that ^2 is = var/n)\n        scaled.gene.loadings <- unlist(lapply(seq_along(pathways), function(i) {\n            gl <- ppca[[pathways[i]]]$xp$rotation[, n.pc[i], drop = TRUE]*as.numeric(ppca[[pathways[i]]]$xp$sd)[n.pc[i]]/sqrt(ppca[[pathways[i]]]$n)\n            names(gl) <- rownames(ppca[[pathways[i]]]$xp$rotation)\n            gl\n        }))\n\n\n        if(two.sided) {\n            # positive\n            reduced.gene.loadings <- sort(tapply(scaled.gene.loadings, as.factor(names(scaled.gene.loadings)), max), decreasing = TRUE)\n            selected.genes.pos <- reduced.gene.loadings[1:min(length(reduced.gene.loadings), round(n.genes/2))]\n\n            # negative\n            reduced.gene.loadings <- sort(tapply(scaled.gene.loadings, as.factor(names(scaled.gene.loadings)), min), decreasing = FALSE)\n            selected.genes.neg <- 
reduced.gene.loadings[1:min(length(reduced.gene.loadings), round(n.genes/2))]\n            selected.genes <- c(selected.genes.pos, selected.genes.neg)\n            selected.genes <- selected.genes[match(unique(names(selected.genes)), names(selected.genes))]\n\n        } else {\n            reduced.gene.loadings <- sort(tapply(abs(scaled.gene.loadings), as.factor(names(scaled.gene.loadings)), max), decreasing = TRUE)\n            selected.genes <- reduced.gene.loadings[1:min(length(reduced.gene.loadings), n.genes)]\n        }\n\n        # consensus pattern\n        #lab <- match(names(selected.genes), rownames(mat))\n        lab <- names(selected.genes);\n\n        if(length(lab) == 0)\n            return(NULL)\n        if(length(lab)<3) { return(NULL) }\n        if(trim > 0) {\n            rn <- rownames(mat)\n            cn <- colnames(mat)\n            mat <- winsorize.matrix(mat, trim = trim)\n            rownames(mat) <- rn\n            colnames(mat) <- cn\n        }\n        d <- mat[lab, , drop = FALSE]\n        dw <- matw[lab, , drop = FALSE]\n\n        #d <- d*abs(as.numeric(selected.genes))\n        xp <- bwpca(t(d), t(dw), npcs = 1, center = FALSE, nstarts = 3)\n\n        consensus.npc = 1 # use first PC as a pattern\n    } else { # only one pathway was provided\n        xp <- ppca[[1]]$xp\n        lab <- rownames(xp$rotation)[order(abs(xp$rotation[, n.pc[1]]), decreasing = TRUE)]\n        if(length(lab) > n.genes) {\n            lab <- lab[1:n.genes]\n            xp$rotation <- xp$rotation[lab, , drop = FALSE]\n        }\n\n        d <- mat[lab, , drop = FALSE]\n        dw <- matw[lab, , drop = FALSE]\n        consensus.npc = n.pc[1] # use specified PC as a pattern\n    }\n\n    d <- d-rowMeans(d)\n    dd <- as.dist(1-abs(cor(t(as.matrix(d)))))\n    dd[is.na(dd)] <- 1\n    if(is.null(row.order)) {\n        if(length(lab) > 2) {\n            if(is.element(\"fastcluster\", installed.packages()[, 1])) {\n                hc <- fastcluster::hclust(dd, method 
= \"ward.D\")\n            } else {\n                hc <- stats::hclust(dd, method = \"ward.D\")\n            }\n            row.order <- hc$order\n        } else {\n            row.order <- c(seq_along(lab))\n        }\n    }\n\n    if(is.null(vhc)) {\n        vd <- as.dist(1-cor(as.matrix(d)))\n        vd[is.na(vd)] <- 1\n        if(is.element(\"fastcluster\", installed.packages()[, 1])) {\n            vhc <- fastcluster::hclust(vd, method = \"ward.D\")\n        } else {\n            vhc <- stats::hclust(vd, method = \"ward.D\")\n        }\n    }\n\n    if(is.null(zlim)) { zlim <- quantile(d, p = c(0.01, 0.99)) }\n    vmap <- d\n    vmap[vmap<zlim[1]] <- zlim[1]\n    vmap[vmap > zlim[2]] <- zlim[2]\n    rownames(vmap) <- rownames(d)\n\n    aval <- colSums(d*dw*as.numeric(abs(xp$rotation[, consensus.npc])))/colSums(dw)\n    oc <- xp$scores[, consensus.npc]\n    if(cor(oc, aval, method = \"p\")<0) {\n        oc <- -1*oc\n        xp$scores[, consensus.npc] <- -1*xp$scores[, consensus.npc]\n        xp$rotation[, consensus.npc] <- -1*xp$rotation[, consensus.npc]\n    }\n\n    aval <- colorRampPalette(c(\"blue\", \"white\", \"red\"), space = \"Lab\")(100)[round(aval/max(abs(aval))*49)+50]\n    z <- rbind(colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space = \"Lab\")(100)[round(oc/max(abs(oc))*49)+50])\n\n    ld <- xp$rotation[lab[row.order], consensus.npc]\n    ld <- colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space = \"Lab\")(100)[round(ld/max(abs(ld))*49)+50]\n\n    # oc2 <- xp$scores[, 2]\n    # oc2 <- colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space = \"Lab\")(100)[round(oc2/max(abs(oc2))*49)+50]\n    # oc3 <- xp$scores[, 3]\n    # oc3 <- colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space = \"Lab\")(100)[round(oc3/max(abs(oc3))*49)+50]\n\n    #z <- do.call(rbind, list(aval, oc))\n    #z <- rbind(oc3, oc2, oc)\n    if((!showPC) || length(lab)<= 1) {\n        z <- NULL\n    }\n    col <- 
colorRampPalette(c(\"blue\", \"white\", \"red\"), space = \"Lab\")(256)\n\n    if(!is.null(colcols)) {\n      if(is.null(z)) {\n        z <- colcols;\n      } else {\n        z <- rbind(colcols, z)\n      }\n    }\n\n    if(plot) {\n        if(show.Colv) {\n            my.heatmap2(vmap[row.order, , drop = FALSE], Rowv = NA, Colv = as.dendrogram(vhc), zlim = zlim, col = col, scale = \"none\", RowSideColors = ld, ColSideColors = z, labRow = labRow, cexCol = cexCol, cexRow = cexRow, ...)\n        } else {\n            my.heatmap2(vmap[row.order, vhc$order, drop = FALSE], Rowv = NA, Colv = NA, zlim = zlim, col = col, scale = \"none\", RowSideColors = ld, ColSideColors = z[,vhc$order], labRow = labRow, cexCol = cexCol, cexRow = cexRow, ...)\n        }\n\n    }\n    xp$vhc <- vhc\n    xp$lab <- lab\n    xp$row.order <- row.order\n    xp$oc <- oc\n    xp$col <- col\n    xp$oc.col <- colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space = \"Lab\")(256)\n    xp$vmap <- vmap\n    xp$zlim <- zlim\n    xp$consensus.pc <- consensus.npc\n    return(invisible(xp))\n}\n\n# returns enriched categories for a given gene list as compared with a given universe\n# returns a list with over and under fields containing list of over and underrepresented terms\ncalculate.go.enrichment <- function(genelist, universe, pvalue.cutoff = 1e-3, mingenes = 3, env = go.env, subset = NULL, list.genes = FALSE, over.only = FALSE) {\n    genelist <- unique(genelist)\n    all.genes <- unique(ls(env))\n    # determine sizes\n    universe <- unique(c(universe, genelist))\n    universe <- universe[universe != \"\"]\n    genelist <- genelist[genelist != \"\"]\n    ns <- length(intersect(genelist, all.genes))\n    us <- length(intersect(universe, all.genes))\n    #pv <- lapply(go.map, function(gl) { nwb <- length(intersect(universe, gl[[1]])) if(nwb<mingenes) { return(0.5)} else { p <- phyper(length(intersect(genelist, gl[[1]])), nwb, us-nwb, ns) return(ifelse(p > 0.5, 1.0-p, p)) }})\n\n    # 
compile count vectors\n    stab <- table(unlist(mget(as.character(genelist), env, ifnotfound = NA), recursive = TRUE))\n    utab <- table(unlist(mget(as.character(universe), env, ifnotfound = NA), recursive = TRUE))\n    if(!is.null(subset)) {\n        stab <- stab[names(stab) %in% subset]\n        utab <- utab[names(utab) %in% subset]\n    }\n\n    tabmap <- match(rownames(stab), rownames(utab))\n\n    cv <- data.frame(cbind(utab, rep(0, length(utab))))\n    names(cv) <- c(\"u\", \"s\")\n    cv$s[match(rownames(stab), rownames(utab))] <- as.vector(stab)\n    cv <- na.omit(cv)\n    cv <- cv[cv$u > mingenes, ]\n\n    if(over.only) {\n        lpr <- phyper(cv$s-1, cv$u, us-cv$u, ns, lower.tail = FALSE, log.p = TRUE)\n    } else {\n        pv <- phyper(cv$s, cv$u, us-cv$u, ns, lower.tail = FALSE)\n        lpr <- ifelse(pv<0.5, phyper(cv$s-1, cv$u, us-cv$u, ns, lower.tail = FALSE, log.p = TRUE), phyper(cv$s+1, cv$u, us-cv$u, ns, lower.tail = TRUE, log.p = TRUE))\n    }\n    lpr <- phyper(cv$s-1, cv$u, us-cv$u, ns, lower.tail = FALSE, log.p = TRUE)\n    lpra <- bh.adjust(lpr, log = TRUE)\n    z <- qnorm(lpr, lower.tail = FALSE, log.p = TRUE)\n    za <- qnorm(lpra, lower.tail = FALSE, log.p = TRUE)\n    # correct for multiple hypothesis\n    mg <- length(which(cv$u > mingenes))\n    if(over.only) {\n        if(pvalue.cutoff<1) {\n            ovi <- which(lpra<= log(pvalue.cutoff))\n            uvi <- c()\n        } else {\n            ovi <- which((lpr+mg)<= log(pvalue.cutoff))\n            uvi <- c()\n        }\n    } else {\n        if(pvalue.cutoff<1) {\n            ovi <- which(pv<0.5 & lpra<= log(pvalue.cutoff))\n            uvi <- which(pv > 0.5 & lpra<= log(pvalue.cutoff))\n        } else {\n            ovi <- which(pv<0.5 & (lpr+mg)<= log(pvalue.cutoff))\n            uvi <- which(pv > 0.5 & (lpr+mg)<= log(pvalue.cutoff))\n        }\n    }\n    ovi <- ovi[order(lpr[ovi])]\n    uvi <- uvi[order(lpr[uvi])]\n\n    #return(list(over = data.frame(t = rownames(cv)[ovi], 
o = cv$s[ovi], u = cv$u[ovi], p = pr[ovi]*mg), under = data.frame(t = rownames(cv)[uvi], o = cv$s[uvi], u = cv$u[uvi], p = pr[uvi]*mg)))\n    if(list.genes) {\n        x <- mget(as.character(genelist), env, ifnotfound = NA)\n        df <- data.frame(id = rep(names(x), unlist(lapply(x, function(d) length(na.omit(d))))), go = na.omit(unlist(x)), stringsAsFactors = FALSE)\n        ggl <- tapply(df$id, as.factor(df$go), I)\n        ovg <- as.character(unlist(lapply(ggl[rownames(cv)[ovi]], paste, collapse = \" \")))\n        uvg <- as.character(unlist(lapply(ggl[rownames(cv)[uvi]], paste, collapse = \" \")))\n        return(list(over = data.frame(t = rownames(cv)[ovi], o = cv$s[ovi], u = cv$u[ovi], Za = za, fe = cv$s[ovi]/(ns*cv$u[ovi]/us), genes = ovg), under = data.frame(t = rownames(cv)[uvi], o = cv$s[uvi], u = cv$u[uvi], Za = za, fe = cv$s[uvi]/(ns*cv$u[uvi]/us), genes = uvg)))\n    } else {\n        return(list(over = data.frame(t = rownames(cv)[ovi], o = cv$s[ovi], u = cv$u[ovi], p.raw = exp(lpr[ovi]), fdr = exp(lpra)[ovi], Z = z[ovi], Za = za[ovi], fe = cv$s[ovi]/(ns*cv$u[ovi]/us), fer = cv$s[ovi]/(length(genelist)*cv$u[ovi]/length(universe))), under = data.frame(t = rownames(cv)[uvi], o = cv$s[uvi], u = cv$u[uvi], p.raw = exp(lpr[uvi]), fdr = exp(lpra)[uvi], Z = z[uvi], Za = za[uvi], fe = cv$s[uvi]/(ns*cv$u[uvi]/us))))\n    }\n}\n\n##' wrapper around different mclapply mechanisms\n##'\n##' Abstracts out mclapply implementation, and defaults to lapply when only one core is requested (helps with debugging)\n##' @param ... parameters to pass to lapply, mclapply, bplapply, etc.\n##' @param n.cores number of cores. If 1 core is requested, will default to lapply\npapply <- function(...,n.cores=n) {\n  if(n.cores>1) {\n    # bplapply implementation\n    bplapply(... 
, BPPARAM = MulticoreParam(workers = n.cores))\n  } else { # fall back on lapply\n    lapply(...);\n  }\n}\n\n\n##' A Reference Class to represent the PAGODA application\n##'\n##' This ROOK application class enables communication with the client-side ExtJS framework and Inchlib HTML5 canvas libraries to create the graphical user interface for PAGODA\n##' Refer to the code in \\code{\\link{make.pagoda.app}} for usage example\n##'\n##' @field results Output of the pathway clustering and redundancy reduction\n##' @field genes List of genes to display in the Detailed clustering panel\n##' @field pathways\n##' @field mat Matrix of posterior mode count estimates\n##' @field matw Matrix of weights associated with each estimate in \\code{mat}\n##' @field goenv Gene set list as an environment\n##' @field renv Global environment\n##' @field name Name of the application page; for display as the page title\n##' @field trim Trim quantity used for Winsorization for visualization\n##' @field batch Any batch or other known confounders to be included in the visualization as a column color track\n##'\nViewPagodaApp <- setRefClass(\n    'ViewPagodaApp',\n    fields = c('results', 'genes', 'pathways', 'mat', 'matw', 'goenv', 'renv', 'name', 'trim', 'batch'),\n    methods = list(\n\n        initialize = function(results, pathways, genes, mat, matw, goenv, batch = NULL, name = \"pathway overdispersion\", trim = 1.1/ncol(mat)) {\n            results <<- results\n            genes <<- genes\n            genes$svar <<- genes$var/max(genes$var)\n            genes <<- genes\n            mat <<- mat\n            matw <<- matw\n            batch <<- batch\n            goenv <<- goenv\n            pathways <<- pathways\n            name <<- name\n            trim <<- trim\n            # reverse lookup environment\n            renvt <- new.env(parent = globalenv())\n            xn <- ls(envir = goenv)\n            xl <- mget(xn, envir = goenv)\n            gel <- tapply(rep(xn, unlist(lapply(xl, 
length))), unlist(xl), I)\n            gel <- gel[nchar(names(gel)) > 0]\n            x <- lapply(names(gel), function(n) assign(n, gel[[n]], envir = renvt))\n            renv <<- renvt\n            rm(xn, xl, x, gel, renvt)\n            gc()\n            callSuper()\n        },\n        getgenecldata = function(genes = NULL, gcl = NULL, ltrim = 0) { # helper function to get the heatmap data for a given set of genes\n            if(is.null(gcl)) {\n                gcl <- t.view.pathways(genes, mat = mat, matw = matw, env = goenv, vhc = results$hvc, plot = FALSE, trim = ltrim)\n            }\n\n            matrix <- gcl$vmap[rev(gcl$row.order), results$hvc$order, drop = FALSE]\n            matrix <- list(data = as.numeric(t(matrix)),\n                           dim = dim(matrix),\n                           rows = rownames(matrix),\n                           cols = colnames(matrix),\n                           colors = gcl$col,\n                           domain = seq.int(gcl$zlim[1], gcl$zlim[2], length.out = length(gcl$col))\n            )\n\n            ol <- list(matrix = matrix)\n            if(nrow(gcl$vmap) > 2) {\n                rcmvar <- matrix(gcl$rotation[rev(gcl$row.order), , drop = FALSE], ncol = 1)\n                rowcols <- list(data = as.numeric(t(rcmvar)),\n                                dim = dim(rcmvar),\n                                colors = gcl$oc.col,\n                                domain = seq.int(-1*max(abs(rcmvar)), max(abs(rcmvar)), length.out = length(gcl$oc.col))\n                )\n\n                colcols <- matrix(gcl$oc[results$hvc$order], nrow = 1)\n                colcols <- list(data = as.numeric(t(colcols)),\n                                dim = dim(colcols),\n                                colors = gcl$oc.col,\n                                domain = seq.int(-1*max(abs(colcols)), max(abs(colcols)), length.out = length(gcl$oc.col))\n                )\n                ol <- c(ol, list(rowcols = rowcols, colcols = 
colcols))\n            }\n            ol\n        },\n        call = function(env){\n            path <- env[['PATH_INFO']]\n            req <- Request$new(env)\n            res <- Response$new()\n            switch(path,\n                   # INDEX\n                   '/index.html' = {\n                       body <- paste('<!DOCTYPE html >\n                                     <meta charset = \"utf-8\" >\n                                     <html >\n                                     <head >\n                                     <title > ', name, '</title >\n                                     <meta http-equiv = \"Content-Type\" content = \"text/html charset = iso-8859-1\" >\n                                     <link rel = \"stylesheet\" type = \"text/css\" href = \"http://pklab.med.harvard.edu/sde/extjs/resources/ext-theme-neptune/ext-theme-neptune-all.css\" / >\n                                     <link rel = \"stylesheet\" type = \"text/css\" href = \"http://pklab.med.harvard.edu/sde/extjs/examples/shared/example.css\" / >\n                                     <link rel = \"stylesheet\" type = \"text/css\" href = \"http://pklab.med.harvard.edu/sde/pathcl.css\" / >\n                                     <head profile = \"http://www.w3.org/2005/10/profile\" >\n                                     <link rel = \"icon\" type = \"image/png\" href = \"http://pklab.med.harvard.edu/sde/pagoda.png\" >\n                                     <script type = \"text/javascript\" src = \"http://pklab.med.harvard.edu/sde/extjs/ext-all.js\" > </script >\n                                     <script type = \"text/javascript\" src = \"http://pklab.med.harvard.edu/sde/jquery-1.11.1.min.js\" > </script >\n                                     <script src = \"http://d3js.org/d3.v3.min.js\" charset = \"utf-8\" > </script >\n                                     <script type = \"text/javascript\" src = \"http://pklab.med.harvard.edu/sde/pathcl.js\" > </script >\n                     
                </head >\n                                     <body > </body >\n                                     </html >\n                                     ', sep = \"\")\n                       res$header('\"Content-Type\": \"text/html\"')\n                       res$write(body)\n                   },\n                   '/pathcl.json' = { # report pathway clustering heatmap data\n                       # column dendrogram\n                       t <- paste(tempfile(), \"svg\", sep = \".\")\n                       svg(file = t, width = 1, height = 1) # will be rescaled later\n                       par(mar = rep(0, 4), mgp = c(2, 0.65, 0), cex = 1, oma = rep(0, 4))\n                       #plot(results$hvc, main = \"\", sub = \"\", xlab = \"\", ylab = \"\", axes = FALSE, labels = FALSE, xaxs = \"i\", yaxs = \"i\", hang = 0.02)\n                       plot(as.dendrogram(results$hvc), axes = FALSE, yaxs = \"i\", xaxs = \"i\", xlab = \"\", ylab = \"\", sub = \"\", main = \"\", leaflab = \"none\")\n                       dev.off()\n                       x <- readLines(t)\n                       treeg <- paste(x[-c(1, 2, length(x))], collapse = \"\")\n\n                       matrix <- results$rcm[rev(results$tvc$order), results$hvc$order]\n                       matrix <- list(data = as.numeric(t(matrix)),\n                                      dim = dim(matrix),\n                                      rows = rownames(matrix),\n                                      cols = colnames(matrix),\n                                      colors = results$cols,\n                                      domain = seq.int(results$zlim2[1], results$zlim2[2], length.out = length(results$cols)),\n                                      range = range(matrix)\n                       )\n\n\n                       icols <- colorRampPalette(c(\"white\", \"black\"), space = \"Lab\")(256)\n                       rcmvar <- matrix(apply(results$rcm[rev(results$tvc$order), , drop = FALSE], 
1, var), ncol = 1)\n                       rowcols <- list(data = as.numeric(t(rcmvar)),\n                                       # TODO: add annotation\n                                       dim = dim(rcmvar),\n                                       colors = icols,\n                                       domain = seq.int(0, max(rcmvar), length.out = length(icols))\n                       )\n                       colcols <- list(data = unlist(lapply(as.character(t(results$colcol[nrow(results$colcol):1, results$hvc$order, drop = FALSE])), col2hex)),\n                                       dim = dim(results$colcol)\n                       )\n                       ol <- list(matrix = matrix, rowcols = rowcols, colcols = colcols, coldend = treeg, trim = trim)\n                       s <- toJSON(ol)\n                       res$header('Content-Type', 'application/javascript')\n                       if(!is.null(req$params()$callback)) {\n                           res$write(paste(req$params()$callback, \"(\", s, \")\", sep = \"\"))\n                       } else {\n                           res$write(s)\n                       }\n                   },\n                   '/genecl.json' = { # report heatmap data for a selected set of genes\n                       selgenes <- fromJSON(req$POST()$genes)\n                       ltrim <- ifelse(is.null(req$params()$trim), 1.1/ncol(mat), as.numeric(req$params()$trim))\n                       ol <- getgenecldata(selgenes, ltrim = ltrim)\n                       s <- toJSON(ol)\n                       res$header('Content-Type', 'application/javascript')\n                       if(!is.null(req$params()$callback)) {\n                           res$write(paste(req$params()$callback, \"(\", s, \")\", sep = \"\"))\n                       } else {\n                           res$write(s)\n                       }\n                   },\n                   '/pathwaygenes.json' = { # report heatmap data for a selected set of genes\n   
                    ngenes <- ifelse(is.null(req$params()$ngenes), 20, as.integer(req$params()$ngenes))\n                       twosided <- ifelse(is.null(req$params()$twosided), FALSE, as.logical(req$params()$twosided))\n                       ltrim <- ifelse(is.null(req$params()$trim), 1.1/ncol(mat), as.numeric(req$params()$trim))\n                       pws <- fromJSON(req$POST()$genes)\n                       n.pcs <- as.integer(gsub(\"^#PC(\\\\d+)# .*\", \"\\\\1\", pws))\n                       n.pcs[is.na(n.pcs)]<-1\n                       x <- c.view.pathways(gsub(\"^#PC\\\\d+# \", \"\", pws), mat, matw, goenv = goenv, n.pc = n.pcs, n.genes = ngenes, two.sided = twosided, vhc = results$hvc, plot = FALSE, trim = ltrim, batch = batch)\n                       #x <- t.view.pathways(gsub(\"^#PC\\\\d+# \", \"\", pws), mat, matw, env = goenv, vhc = results$hvc, plot = FALSE, trim = ltrim, n.pc = 1)\n                       ##rsc <- as.vector(rowSums(matw[rownames(x$rotation), ]))*x$rotation[, 1]\n                       #rsc <- x$rotation[, 1]\n                       #if(twosided) {\n                       #  extgenes <- unique(c(names(sort(rsc))[1:min(length(rsc), round(ngenes/2))], names(rev(sort(rsc)))[1:min(length(rsc), round(ngenes/2))]))\n                       # } else {\n                       #   extgenes <- names(sort(abs(rsc), decreasing = TRUE))[1:min(length(rsc), ngenes)]\n                       #}\n                       #ol <- getgenecldata(extgenes, ltrim = ltrim)\n                       ol <- getgenecldata(genes = NULL, gcl = x, ltrim = ltrim)\n                       s <- toJSON(ol)\n                       res$header('Content-Type', 'application/javascript')\n                       if(!is.null(req$params()$callback)) {\n                           res$write(paste(req$params()$callback, \"(\", s, \")\", sep = \"\"))\n                       } else {\n                           res$write(s)\n                       }\n                   },\n               
    '/patterngenes.json' = { # report heatmap of genes most closely matching a given pattern\n                       ngenes <- ifelse(is.null(req$params()$ngenes), 20, as.integer(req$params()$ngenes))\n                       twosided <- ifelse(is.null(req$params()$twosided), FALSE, as.logical(req$params()$twosided))\n                       ltrim <- ifelse(is.null(req$params()$trim), 1.1/ncol(mat), as.numeric(req$params()$trim))\n                       pat <- fromJSON(req$POST()$pattern)\n                       # reorder the pattern back according to column clustering\n                       pat[results$hvc$order] <- pat\n                       patc <- .Call(\"matCorr\", as.matrix(t(mat)), as.matrix(pat, ncol = 1) , PACKAGE = \"scde\")\n                       if(twosided) { patc <- abs(patc) }\n                       mgenes <- rownames(mat)[order(as.numeric(patc), decreasing = TRUE)[1:ngenes]]\n                       ol <- getgenecldata(mgenes, ltrim = ltrim)\n                       ol$pattern <- pat\n                       s <- toJSON(ol)\n                       res$header('Content-Type', 'application/javascript')\n                       if(!is.null(req$params()$callback)) {\n                           res$write(paste(req$params()$callback, \"(\", s, \")\", sep = \"\"))\n                       } else {\n                           res$write(s)\n                       }\n                   },\n                   '/clinfo.json' = {\n                       pathcl <- ifelse(is.null(req$params()$pathcl), 1, as.integer(req$params()$pathcl))\n                       ii <- which(results$ct == pathcl)\n                       tpi <- order(results$matvar[ii], decreasing = TRUE)\n                       #tpi <- tpi[seq(1, min(length(tpi), 15))]\n                       npc <- gsub(\"^#PC(\\\\d+)#.*\", \"\\\\1\", names(ii[tpi]))\n                       nams <- gsub(\"^#PC\\\\d+# \", \"\", names(ii[tpi]))\n                       if(exists(\"myGOTERM\", envir = globalenv())) {\n      
                     tpn <- paste(nams, mget(nams, get(\"myGOTERM\", envir = globalenv()), ifnotfound = \"\"), sep = \" \")\n                       } else {\n                           tpn <- names(ii[tpi])\n                       }\n\n                       lgt <- data.frame(do.call(rbind, lapply(seq_along(tpn), function(i) c(id = names(ii[tpi[i]]), name = tpn[i], npc = npc[i], od = as.numeric(results$matvar[ii[tpi[i]]])/max(results$matvar), sign = as.numeric(results$matrcmcor[ii[tpi[i]]]), initsel = as.integer(results$matvar[ii[tpi[i]]] >= results$matvar[ii[tpi[1]]]*0.8)))))\n\n                       # process additional filters\n                       if(!is.null(req$params()$filter)) {\n                           fl <- fromJSON(URLdecode(req$params()$filter))\n                           for( fil in fl) {\n                               lgt <- lgt[grep(fil$value, lgt[, fil$property], perl = TRUE, ignore.case = TRUE), ]\n                           }\n                       }\n                       start <- ifelse(is.null(req$params()$start), 1, as.integer(req$params()$start)+1)\n                       limit <- ifelse(is.null(req$params()$limit), 100, as.integer(req$params()$limit))\n                       dir <- ifelse(is.null(req$params()$dir), \"DESC\", req$params()$dir)\n                       trows <- nrow(lgt)\n                       if(trows > 0) {\n                           if(!is.null(req$params()$sort)) {\n                               if(req$params()$sort %in% colnames(lgt)) {\n                                   lgt <- lgt[order(lgt[, req$params()$sort], decreasing = (dir == \"DESC\")), ]\n                               }\n                           }\n                       }\n                       lgt <- lgt[min(start, nrow(lgt)):min((start+limit), nrow(lgt)), ]\n                       lgt$od <- format(lgt$od, nsmall = 2, digits = 2)\n                       ol <- apply(lgt, 1, function(x) as.list(x))\n                       names(ol) <- NULL\n     
                  s <- toJSON(list(totalCount = trows, genes = ol))\n\n                       res$header('Content-Type', 'application/javascript')\n                       if(!is.null(req$params()$callback)) {\n                           res$write(paste(req$params()$callback, \"(\", s, \")\", sep = \"\"))\n                       } else {\n                           res$write(s)\n                       }\n                   },\n                   '/genes.json' = {\n                       lgt <- genes\n                       if(!is.null(req$params()$filter)) {\n                           fl <- fromJSON(URLdecode(req$params()$filter))\n                           for( fil in fl) {\n                               lgt <- lgt[grep(fil$value, lgt[, fil$property], perl = TRUE, ignore.case = TRUE), ]\n                           }\n                       }\n                       start <- ifelse(is.null(req$params()$start), 1, as.integer(req$params()$start)+1)\n                       limit <- ifelse(is.null(req$params()$limit), 1000, as.integer(req$params()$limit))\n                       dir <- ifelse(is.null(req$params()$dir), \"DESC\", req$params()$dir)\n                       trows <- nrow(lgt)\n                       if(trows > 0) {\n                           if(!is.null(req$params()$sort)) {\n                               if(req$params()$sort %in% colnames(lgt)) {\n                                   lgt <- lgt[order(lgt[, req$params()$sort], decreasing = (dir == \"DESC\")), ]\n                               }\n                           } else { # default sort\n                               # already done\n                           }\n                       }\n                       lgt <- format(lgt[min(start, nrow(lgt)):min((start+limit), nrow(lgt)), ], nsmall = 2, digits = 2)\n                       ol <- apply(lgt, 1, function(x) as.list(x))\n                       names(ol) <- NULL\n                       s <- toJSON(list(totalCount = trows, genes = ol))\n       
                res$header('Content-Type', 'application/javascript')\n                       if(!is.null(req$params()$callback)) {\n                           res$write(paste(req$params()$callback, \"(\", s, \")\", sep = \"\"))\n                       } else {\n                           res$write(s)\n                       }\n\n                   },\n                   '/pathways.json' = {\n                       lgt <- pathways\n                       if(!is.null(req$params()$filter)) {\n                           fl <- fromJSON(URLdecode(req$params()$filter))\n                           for( fil in fl) {\n                               lgt <- lgt[grep(fil$value, lgt[, fil$property], perl = TRUE, ignore.case = TRUE), ]\n                           }\n                       }\n                       start <- ifelse(is.null(req$params()$start), 1, as.integer(req$params()$start)+1)\n                       limit <- ifelse(is.null(req$params()$limit), 1000, as.integer(req$params()$limit))\n                       dir <- ifelse(is.null(req$params()$dir), \"DESC\", req$params()$dir)\n                       trows <- nrow(lgt)\n                       if(trows > 0) {\n                           if(!is.null(req$params()$sort)) {\n                               if(req$params()$sort %in% colnames(lgt)) {\n                                   lgt <- lgt[order(lgt[, req$params()$sort], decreasing = (dir == \"DESC\")), ]\n                               }\n                           } else { # default sort\n                               # already done\n                           }\n                       }\n                       lgt <- format(lgt[min(start, nrow(lgt)):min((start+limit), nrow(lgt)), ], nsmall = 2, digits = 2)\n                       ol <- apply(lgt, 1, function(x) as.list(x))\n                       names(ol) <- NULL\n                       s <- toJSON(list(totalCount = trows, genes = ol))\n                       res$header('Content-Type', 
'application/javascript')\n                       if(!is.null(req$params()$callback)) {\n                           res$write(paste(req$params()$callback, \"(\", s, \")\", sep = \"\"))\n                       } else {\n                           res$write(s)\n                       }\n\n                   },\n                   '/testenr.json' = { # run an enrichment test\n                       selgenes <- fromJSON(req$POST()$genes)\n                       lgt <- calculate.go.enrichment(selgenes, rownames(mat), pvalue.cutoff = 0.99, env = renv, over.only = TRUE)$over\n                       if(exists(\"myGOTERM\", envir = globalenv())) {\n                           lgt$nam <- paste(lgt$t, mget(as.character(lgt$t), get(\"myGOTERM\", envir = globalenv()), ifnotfound = \"\"), sep = \" \")\n                       } else {\n                           lgt$name <- lgt$t\n                       }\n                       lgt <- data.frame(id = paste(\"#PC1#\", lgt$t), name = lgt$nam, o = lgt$o, u = lgt$u, Z = lgt$Z, Za = lgt$Za, fe = lgt$fe, stringsAsFactors = FALSE)\n\n                       if(!is.null(req$params()$filter)) {\n                           fl <- fromJSON(URLdecode(req$params()$filter))\n                           for( fil in fl) {\n                               lgt <- lgt[grep(fil$value, lgt[, fil$property], perl = TRUE, ignore.case = TRUE), ]\n                           }\n                       }\n                       start <- ifelse(is.null(req$params()$start), 1, as.integer(req$params()$start)+1)\n                       limit <- ifelse(is.null(req$params()$limit), 1000, as.integer(req$params()$limit))\n                       dir <- ifelse(is.null(req$params()$dir), \"DESC\", req$params()$dir)\n                       trows <- nrow(lgt)\n                       if(trows > 0) {\n                           if(!is.null(req$params()$sort)) {\n                               if(req$params()$sort %in% colnames(lgt)) {\n                                   lgt <- 
lgt[order(lgt[, req$params()$sort], decreasing = (dir == \"DESC\")), ]\n                               }\n                           }\n                       }\n                       lgt <- format(lgt[min(start, nrow(lgt)):min((start+limit), nrow(lgt)), ], nsmall = 2, digits = 2)\n                       ol <- apply(lgt, 1, function(x) as.list(x))\n                       names(ol) <- NULL\n                       s <- toJSON(list(totalCount = trows, genes = ol))\n                       res$header('Content-Type', 'application/javascript')\n                       if(!is.null(req$params()$callback)) {\n                           res$write(paste(req$params()$callback, \"(\", s, \")\", sep = \"\"))\n                       } else {\n                           res$write(s)\n                       }\n\n                   },\n                   '/celltable.txt' = {\n                       matrix <- results$rcm[rev(results$tvc$order), results$hvc$order]\n                       body <- paste(capture.output(write.table(round(matrix, 1), sep = \"\\t\")), collapse = \"\\n\")\n                       res$header('Content-Type', 'text/plain')\n                       #res$header('\"Content-disposition\": attachment')\n                       res$write(body)\n                   },\n{\n    res$header('Location', 'index.html')\n    res$write('Redirecting to <a href = \"index.html\" > index.html</a >  for interactive browsing.')\n}\n                       )\n            res$finish()\n        }\n            )\n    )\n"
  },
  {
    "path": "README.md",
    "content": "[![](https://img.shields.io/badge/release%20version-2.27.1-green.svg)](https://www.bioconductor.org/packages/scde)\n\n# Overview of SCDE\n\nThe `scde` package implements a set of statistical methods for analyzing single-cell RNA-seq data. `scde` fits individual error models for single-cell RNA-seq measurements. These models can then be used for assessment of differential expression between groups of cells, as well as other types of analysis. The `scde` package also contains the `pagoda` framework which applies pathway and gene set overdispersion analysis to identify aspects of transcriptional heterogeneity among single cells. \n  \nThe overall approach to the differential expression analysis is detailed in the following publication:  \n[\"Bayesian approach to single-cell differential expression analysis\" (Kharchenko PV, Silberstein L, Scadden DT, Nature Methods, doi:10.1038/nmeth.2967)](http://www.nature.com/nmeth/journal/v11/n7/abs/nmeth.2967.html)\n\nThe overall approach to pathways and gene set overdispersion analysis is detailed in the following publication:\n[\"Characterizing transcriptional heterogeneity through pathway and gene set overdispersion analysis\" (Fan J, Salathia N, Liu R, Kaeser G, Yung Y, Herman J, Kaper F, Fan JB, Zhang K, Chun J, and Kharchenko PV, Nature Methods, doi:10.1038/nmeth.3734)](http://www.nature.com/nmeth/journal/vaop/ncurrent/full/nmeth.3734.html)\n\n**For additional installation information, tutorials, and more, please visit [the SCDE website ☞](http://hms-dbmi.github.io/scde/) and [the Bioconductor package page](https://bioconductor.org/packages/release/bioc/html/scde.html)**\n\n**Note:** We recommend that users also refer to the package [pagoda2](https://github.com/kharchenkolab/pagoda2). While we do continue to maintain the [Bioconductor package scde](https://bioconductor.org/packages/release/bioc/html/scde.html), we don't have the bandwidth to address all bugs and feature requests reported in this repo. 
\n\n# Sample analyses and images\n\n## Single cell error modeling\n<table>\n  <tr>\n    <td width=400px>\n      <img src=\"https://github.com/hms-dbmi/scde/blob/develop/inst/figures/pagoda-cell.model.fits-0.png\" width=\"400px\">\n    </td>\n    <td>\n      <code>scde</code> fits individual error models for single cells using counts derived from single-cell RNA-seq data to estimate drop-out and amplification biases on gene expression magnitude.\n    </td>\n  </tr>\n</table>\n\n## Differential expression analysis\n<table>\n  <tr>\n    <td width=250px>\n      <img src=\"https://github.com/hms-dbmi/scde/blob/develop/inst/figures/scde-diffexp3-1.png\" width=\"250px\">\n    </td>\n    <td>\n      <pre>\n             lb   mle     ub    ce     Z    cZ\nDppa5a    8.075 9.965 11.541 8.075 7.160 5.968\nPou5f1    5.357 7.208  9.178 5.357 7.160 5.968\nGm13242   5.672 7.681  9.768 5.672 7.159 5.968\nTdh       5.829 8.075 10.281 5.829 7.159 5.968\nIft46     5.435 7.366  9.217 5.435 7.150 5.968</pre>\n      <br>\n      <code>scde</code> compares groups of single cells and tests for differential expression, taking into account variability in the single cell RNA-seq data due to drop-out and amplification biases in order to identify more robustly differentially expressed genes. \n    </td>\n  </tr>\n</table>\n\n## Pathway and gene set overdispersion analysis\n<table>\n  <tr>\n    <td width=400px>\n      <img src=\"https://github.com/hms-dbmi/scde/blob/develop/inst/figures/PAGODA.gif\" width=\"400px\"> \n    </td>\n    <td>\n      <code>scde</code> contains <code>pagoda</code> routines that characterize aspects of transcriptional heterogeneity in populations of single cells using pre-defined gene sets as well as 'de novo' gene sets derived from the data. Significant aspects are used to cluster cells into subpopulations. A graphical user interface can be deployed to interactively explore results. 
See examples from the PAGODA publication <a href=\"http://pklab.med.harvard.edu/scde/pagoda.links.html\">here</a>. See analysis of the PBMC data from 10x Genomics <a href=\"http://pklab.med.harvard.edu/cgi-bin/R/rook/10x.pbmc/index.html\">here</a>.\n    </td>\n  </tr>\n</table>\n    \n---\n    \n`scde` is maintained by [Jean Fan](https://github.com/jefworks) and [Evan Biederstedt](https://github.com/evanbiederstedt) of the [Kharchenko Lab](http://pklab.med.harvard.edu/) at the [Department of Biomedical Informatics at Harvard Medical School](https://github.com/hms-dbmi).\n\n---\n\n# Contributing\n\nWe welcome any bug reports, enhancement requests, and other contributions. To submit a bug report or enhancement request, please use the [`scde` GitHub issues tracker](https://github.com/hms-dbmi/scde/issues). For more substantial contributions, please fork this repo, push your changes to your fork, and submit a pull request with a good commit message. For more general discussions or troubleshooting, please consult the [`scde` Google Group](http://hms-dbmi.github.io/scde/help.html).  \n\n\n## Citation\nIf you find `scde` useful for your publication, please cite:\n\n```\nKharchenko P, Fan J, Biederstedt E (2023). scde: Single Cell Differential Expression. \nR package version 2.27.1, http://pklab.med.harvard.edu/scde.\n```\n"
  },
  {
    "path": "inst/NEWS",
    "content": "Version 2.0.0\n\nVersion 1.1.0\n\t- added support for batch bias correction (see scde.expression.difference and scde.test.gene.expression.difference).\n\t- added a browser interface to view differential expression results. see scde.browse.diffexp().\n\t- added ability to fit models to a reference transcriptome (usually a bulk measurement). see scde.fit.models.to.reference().\n"
  },
  {
    "path": "inst/diffexp.Rmd",
    "content": "---\ntitle: \"Getting Started with `scde`\"\nauthor: \"Peter Kharchenko, Jean Fan\"\ndate: '`r Sys.Date()`'\noutput:\n  md_document:\n    variant: markdown_github\nvignette: |\n  %\\VignetteIndexEntry{Vignette Title} \\usepackage[utf8]{inputenc}\n  %\\VignetteEngine{knitr::rmarkdown}\n---\n\n# Single-Cell Differential Expression Analysis\n\nIn this vignette, we show you how perform single cell differential expression analysis using single cell RNA-seq data with the `scde` package.\n\nThe `scde` package implements routines for fitting individual error models for single-cell RNA-seq measurements. Briefly, the read counts observed for each gene are modeled using a mixture of a negative binomial (NB) distribution (for the amplified/detected transcripts) and low-level Poisson distribution (for the unobserved or background-level signal of genes that failed to amplify or were not detected for other reasons). These models can then be used to identify robustly differentially expressed genes between groups of cells. For more information, please refer to the original manuscript by [_Kharchenko et al._](http://www.ncbi.nlm.nih.gov/pubmed/24836921).\n\n## Preparing data\n\nThe analysis starts with a matrix of read counts. Depending on the protocol, these may be raw numbers of reads mapped to each gene, or count values adjusted for potential biases (sequence dependency, splice variant coverage, etc. - the values must be integers). The `scde` package includes a subset of the ES/MEF cell dataset published by [_Islam et al._](http://www.ncbi.nlm.nih.gov/pubmed/24363023). The subset includes first 20 ES and MEF cells. 
Here we load the cells and define a factor separating ES and MEF cell types:\n\n```{r, include = FALSE}\nlibrary(knitr)\nopts_chunk$set(\n    warning = FALSE,\n    message = FALSE,\n    fig.show = 'hold',\n    fig.path = 'figures/scde-',\n    cache.path = 'cache/scde-',\n    cache = TRUE\n)\n\nlibrary(scde)\n```\n\n```{r, data}\n# load example dataset\ndata(es.mef.small)\n\n# factor determining cell types\nsg <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", colnames(es.mef.small)), levels = c(\"ESC\", \"MEF\"))\n# the group factor should be named accordingly\nnames(sg) <- colnames(es.mef.small)  \ntable(sg)\n\n# clean up the dataset\ncd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\n```\n\n## Fitting error models\n\nAs a next step we fit the error models on which all subsequent calculations will rely. The fitting process relies on a subset of robust genes that are detected in multiple cross-cell comparisons. Here we supply the `groups = sg` argument, so that the error models for the two cell types are fit independently (using two different sets of \"robust\" genes). If the `groups` argument is omitted, the models will be fit using a common set. \n\nNote this step takes a considerable amount of time unless multiple cores are used. 
\n```{r, fit, eval = FALSE}\n# EVALUATION NOT NEEDED\n# calculate models\no.ifm <- scde.error.models(counts = cd, groups = sg, n.cores = 1, threshold.segmentation = TRUE, save.crossfit.plots = FALSE, save.model.plots = FALSE, verbose = 1)\n```\n\nFor the purposes of this vignette, the model has been precomputed and can simply be loaded.\n\n```{r, fit2, results = 'hide'}\ndata(o.ifm)\n```\n\nThe `o.ifm` is a dataframe with error model coefficients for each cell (rows).\n```{r, fit3, results = 'hide'}\nhead(o.ifm)\n```\n\nHere, `corr.a` and `corr.b` are slope and intercept of the correlated component fit, `conc.*` refer to the concomitant fit, `corr.theta` is the NB over-dispersion, and `fail.r` is the background Poisson rate (fixed).\n\nParticularly poor cells may result in abnormal fits, most commonly showing negative `corr.a`, and should be removed.\n\n```{r, fit4}\n# filter out cells that don't show positive correlation with\n# the expected expression magnitudes (very poor fits)\nvalid.cells <- o.ifm$corr.a > 0\ntable(valid.cells)\no.ifm <- o.ifm[valid.cells, ]\n```\n\nHere, all the fits were valid.\n\nFinally, we need to define an expression magnitude prior for the genes. Its main function, however, is to define a grid of expression magnitude values on which the numerical calculations will be carried out.\n\n```{r, prior}\n# estimate gene expression prior\no.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n```\n\nHere we used a grid of 400 points, and let the maximum expression magnitude be determined by the default 0.999 quantile (use `max.value` parameter to specify the maximum expression magnitude explicitly - on log10 scale).\n\n## Testing for differential expression\n\nTo test for differential expression, we first define a factor that specifies which two groups of cells are to be compared. The factor elements correspond to the rows of the model matrix (`o.ifm`), and can contain `NA` values (i.e. 
cells that won't be included in either group). Here we key off the ES and MEF names.\n\n```{r, diffexp}\n# define two groups of cells\ngroups <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", rownames(o.ifm)), levels  =  c(\"ESC\", \"MEF\"))\nnames(groups) <- row.names(o.ifm)\n# run differential expression tests on all genes.\nediff <- scde.expression.difference(o.ifm, cd, o.prior, groups  =  groups, n.randomizations  =  100, n.cores  =  1, verbose  =  1)\n# top upregulated genes (tail would show top downregulated ones)\nhead(ediff[order(ediff$Z, decreasing  =  TRUE), ])\n```\n\n```{r, diffexp2, eval = FALSE}\n# write out a table with all the results, showing most significantly different genes (in both directions) on top\nwrite.table(ediff[order(abs(ediff$Z), decreasing = TRUE), ], file = \"results.txt\", row.names = TRUE, col.names = TRUE, sep = \"\\t\", quote = FALSE)\n```\n\nOne can also browse the differentially expressed genes, along with the associated posterior distributions, using a web browser application. The command below will start an internal web server accessible on port 1299 and call up a browser to point to the resulting page (note: internal browser doesn't work under RStudio)\n```{r, browse,eval=FALSE}\nscde.browse.diffexp(ediff, o.ifm, cd, o.prior, groups = groups, name = \"diffexp1\", port = 1299)\n```\n\nAlternatively we can run the differential expression on a single gene, and visualize the results:\n\n```{r, diffexp3, cache = TRUE, fig.height = 9, fig.width = 6, fig.show='hold', fig.path='figures/scde-'}\nscde.test.gene.expression.difference(\"Tdh\", models = o.ifm, counts = cd, prior = o.prior)\n```\n\nThe top and the bottom plots show expression posteriors derived from individual cells (colored lines) and joint posteriors (black lines). 
The middle plot shows posterior of the expression fold difference between the two cell groups, highlighting the 95% credible interval by the red shading.\n\n## Correcting for batch effects\n\nWhen the data combines cells that were measured in different batches, it is sometimes necessary to explicitly account for the expression differences that could be explained by the batch composition of the cell groups being compared. The example below makes up a random batch composition for the ES/MEF cells, and re-test the expression difference.\n\n```{r, seed, include = FALSE}\nset.seed(1)\n```\n\n```{r, batch, fig.height = 9, fig.width = 6}\nbatch <- as.factor(ifelse(rbinom(nrow(o.ifm), 1, 0.5) == 1, \"batch1\", \"batch2\"))\n# check the interaction between batches and cell types (shouldn't be any)\ntable(groups, batch)\n# test the Tdh gene again\nscde.test.gene.expression.difference(\"Tdh\", models = o.ifm, counts = cd, prior = o.prior, batch = batch)\n```\n\nIn the plot above, the grey lines are used to show posterior distributions based on the batch composition alone. The expression magnitude posteriors (top and bottom plots) look very similar, and as a result the log2 expression ratio posterior is close to 0. The thin black line shows log2 expression ratio posterior before correction. 
The batch correction doesn't shift the location, but increases uncertainty in the ratio estimate (since we're controlling for another factor).\n\nSimilarly, batch correction can be performed when calculating expression differences for the entire dataset:\n\n```{r, batch2}\n# test for all of the genes\nediff.batch <- scde.expression.difference(o.ifm, cd, o.prior, groups = groups, batch = batch, n.randomizations = 100, n.cores = 1, return.posteriors = TRUE, verbose = 1)\n```\n\n### More detailed functions\n\nThe `scde.expression.difference` method can return a more extensive set of results, including joint posteriors and the expression fold difference posteriors for all of the examined genes:\n```{r, detailed1, include=FALSE, eval=FALSE}\n# recalculate difference and return with joint posteriors and difference posterior\nediff.details <- scde.expression.difference(o.ifm, cd, o.prior, n.randomizations = 100, n.cores = 1, verbose = 1, return.posteriors = TRUE)\n```\n\nThe joint posteriors can also be obtained explicitly for a particular set of cells:\n```{r, detailed2, eval=FALSE}\n# calculate joint posterior for ESCs (set return.individual.posterior.modes=T if you need p.modes)\njp <- scde.posteriors(models = o.ifm[grep(\"ESC\",rownames(o.ifm)), ], cd, o.prior, n.cores = 1)\n```\n\nThe error models fit the intercept and the slope of the NB \"correlated\" component, providing more consistent expression magnitude estimates among the cells. 
These can be obtain\ned with a quick helper function:\n```{r, detailed3}\n# get expression magntiude estimates\no.fpm <- scde.expression.magnitude(o.ifm, counts = cd)\n```\n\nDrop-out probabilities (as a function of expression magnitudes) for different cells are useful for assessing the quality of the measurements: \n```{r, detailed4, fig.width=4, fig.height=4}\n# get failure probabilities on the expresison range\no.fail.curves <- scde.failure.probability(o.ifm, magnitudes = log((10^o.prior$x)-1))\npar(mfrow = c(1,1), mar = c(3.5,3.5,0.5,0.5), mgp = c(2.0,0.65,0), cex = 1)\nplot(c(), c(), xlim=range(o.prior$x), ylim=c(0,1), xlab=\"expression magnitude (log10)\", ylab=\"drop-out probability\")\ninvisible(apply(o.fail.curves[, grep(\"ES\",colnames(o.fail.curves))], 2, function(y) lines(x = o.prior$x, y = y,col = \"orange\")))\ninvisible(apply(o.fail.curves[, grep(\"MEF\", colnames(o.fail.curves))], 2, function(y) lines(x = o.prior$x, y = y, col = \"dodgerblue\")))\n```\n\nThe drop-out probabilities (at a given expression magnitude, or at an observed count) can be useful in subsequent analysis\n```{r, detailed5}\n# get failure probabilities on the expresison range\no.fail.curves <- scde.failure.probability(o.ifm, magnitudes = log((10^o.prior$x)-1))\n# get self-fail probabilities (at a given observed count)\np.self.fail <- scde.failure.probability(models = o.ifm, counts = cd)\n```\n\n## Adjusted distance meaures\n\nThe dependency of drop-out probability on the average expression magntiude captured by the cell-speicifc models can be used to adjust cell-to-cell similarity measures, for insta\nnce in the context of cell clustering. Several such measures are explored below.\n\n### Direct drop-out\n\nDirect weighting downweights the contribution of a given gene to the cell-to-cell distance based on the probability that the given measurement is a drop-out event (i.e. belongs to the drop-out component) - the \"self-fail\" probability shown in the previous section. 
To estimate the adjusted distance, we will simulate the drop-out events, replacing them with `NA` values, and calculating correlation using the remaining points: \n```{r, adjusted1, results='hide', eval=FALSE}\np.self.fail <- scde.failure.probability(models = o.ifm, counts = cd)\n# simulate drop-outs\n# note: using 10 sampling rounds for illustration here. ~500 or more should be used.\nn.simulations <- 10; k <- 0.9;\ncell.names <- colnames(cd); names(cell.names) <- cell.names;\ndl <- mclapply(1:n.simulations,function(i) {\n  scd1 <- do.call(cbind,lapply(cell.names,function(nam) {\n    x <- cd[,nam];\n    # replace predicted drop outs with NAs\n    x[!as.logical(rbinom(length(x),1,1-p.self.fail[,nam]*k))] <- NA;\n    x;\n    }))\n  rownames(scd1) <- rownames(cd); \n  # calculate correlation on the complete observation pairs\n  cor(log10(scd1+1),use=\"pairwise.complete.obs\");\n}, mc.cores = 1)\n# calculate average distance across sampling rounds\ndirect.dist <- as.dist(1-Reduce(\"+\",dl)/length(dl))\n```\n\n### Reciprocal weighting\nThe reciprocal weighting of the Pearson correlation will give increased weight to pairs of observations where a gene expressed (on average) at a level x1 observed in a cell c1 would not be likely to fail in a cell c2, and vice versa:\n```{r, adjusted2, results='hide', eval=FALSE}\n# load boot package for the weighted correlation implementation\nrequire(boot)\nk <- 0.95;\nreciprocal.dist <- as.dist(1 - do.call(rbind, mclapply(cell.names, function(nam1) {\n  unlist(lapply(cell.names, function(nam2) {\n    # reciprocal probabilities\n    f1 <- scde.failure.probability(models = o.ifm[nam1,,drop = FALSE], magnitudes = o.fpm[, nam2])\n    f2 <- scde.failure.probability(models = o.ifm[nam2,,drop = FALSE], magnitudes = o.fpm[, nam1])\n    # weight factor\n    pnf <- sqrt((1-f1)*(1-f2))*k +(1-k); \n    boot::corr(log10(cbind(cd[, nam1], cd[, nam2])+1), w = pnf)\n    }))\n},mc.cores = 1)), upper = FALSE)\n```\n\n### Mode-relative weighting\nA more 
reliable reference magnitude against which drop-out likelihood could be assessed would be an estimate of the average expression magnitude, such as joint posterior mode. Below we estimate `p.mode.fail`, a probability that a drop-out event could be observed at the level of average expression magntiude in a given cell. For each measurement we then reduce it weight if it indeed dropped out in a cell where we expect it to drop-out given its average expression magnitude `(p.self.fail*p.mode.fail)`. However we do want to give high weight to measurements where the drop-out was not observed, even though it was exected based on the average expression magnitude, so the overall weight expression is `(1-p.self.fail*sqrt(p.self.fail*p.mode.fail))` (other formulations are clearly possible here).\n```{r, adjusted3, results='hide', eval=FALSE}\n# reclculate posteriors with the individual posterior modes \njp <- scde.posteriors(models = o.ifm, cd, o.prior, return.individual.posterior.modes = TRUE, n.cores = 1)\n# find joint posterior modes for each gene - a measure of MLE of group-average expression\njp$jp.modes <- log(as.numeric(colnames(jp$jp)))[max.col(jp$jp)]\np.mode.fail <- scde.failure.probability(models = o.ifm, magnitudes = jp$jp.modes)\n# weight matrix\nmatw <- 1-sqrt(p.self.fail*sqrt(p.self.fail*p.mode.fail))\n# magnitude matrix (using individual posterior modes here)\nmat <- log10(exp(jp$modes)+1);\n# weighted distance\nmode.fail.dist <- as.dist(1-do.call(rbind,mclapply(cell.names,function(nam1) {\n  unlist(lapply(cell.names,function(nam2) {\n    corr(cbind(mat[, nam1], mat[, nam2]), w = sqrt(sqrt(matw[, nam1]*matw[, nam2])))\n  }))\n}, mc.cores = 1)), upper = FALSE)\n```\n"
  },
  {
    "path": "inst/diffexp.md",
    "content": "Single-Cell Differential Expression Analysis\n============================================\n\nIn this vignette, we show you how perform single cell differential expression analysis using single cell RNA-seq data with the `scde` package.\n\nThe `scde` package implements routines for fitting individual error models for single-cell RNA-seq measurements. Briefly, the read counts observed for each gene are modeled using a mixture of a negative binomial (NB) distribution (for the amplified/detected transcripts) and low-level Poisson distribution (for the unobserved or background-level signal of genes that failed to amplify or were not detected for other reasons). These models can then be used to identify robustly differentially expressed genes between groups of cells. For more information, please refer to the original manuscript by [*Kharchenko et al.*](http://www.ncbi.nlm.nih.gov/pubmed/24836921).\n\nPreparing data\n--------------\n\nThe analysis starts with a matrix of read counts. Depending on the protocol, these may be raw numbers of reads mapped to each gene, or count values adjusted for potential biases (sequence dependency, splice variant coverage, etc. - the values must be integers). The `scde` package includes a subset of the ES/MEF cell dataset published by [*Islam et al.*](http://www.ncbi.nlm.nih.gov/pubmed/24363023). The subset includes first 20 ES and MEF cells. 
Here we load the cells and define a factor separating ES and MEF cell types:\n\n``` r\n# load example dataset\ndata(es.mef.small)\n\n# factor determining cell types\nsg <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", colnames(es.mef.small)), levels = c(\"ESC\", \"MEF\"))\n# the group factor should be named accordingly\nnames(sg) <- colnames(es.mef.small)  \ntable(sg)\n```\n\n    ## sg\n    ## ESC MEF \n    ##  20  20\n\n``` r\n# clean up the dataset\ncd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\n```\n\nFitting error models\n--------------------\n\nAs a next step we fit the error models on which all subsequent calculations will rely. The fitting process relies on a subset of robust genes that are detected in multiple cross-cell comparisons. Here we supply the `groups = sg` argument, so that the error models for the two cell types are fit independently (using two different sets of \"robust\" genes). If the `groups` argument is omitted, the models will be fit using a common set.\n\nNote this step takes a considerable amount of time unless multiple cores are used.\n\n``` r\n# EVALUATION NOT NEEDED\n# calculate models\no.ifm <- scde.error.models(counts = cd, groups = sg, n.cores = 1, threshold.segmentation = TRUE, save.crossfit.plots = FALSE, save.model.plots = FALSE, verbose = 1)\n```\n\nFor the purposes of this vignette, the model has been precomputed and can simply be loaded.\n\n``` r\ndata(o.ifm)\n```\n\nThe `o.ifm` is a dataframe with error model coefficients for each cell (rows).\n\n``` r\nhead(o.ifm)\n```\n\nHere, `corr.a` and `corr.b` are slope and intercept of the correlated component fit, `conc.*` refer to the concomitant fit, `corr.theta` is the NB over-dispersion, and `fail.r` is the background Poisson rate (fixed).\n\nParticularly poor cells may result in abnormal fits, most commonly showing negative `corr.a`, and should be removed.\n\n``` r\n# filter out cells that don't show positive correlation with\n# the expected 
expression magnitudes (very poor fits)\nvalid.cells <- o.ifm$corr.a > 0\ntable(valid.cells)\n```\n\n    ## valid.cells\n    ## TRUE \n    ##   40\n\n``` r\no.ifm <- o.ifm[valid.cells, ]\n```\n\nHere, all the fits were valid.\n\nFinally, we need to define an expression magnitude prior for the genes. Its main function, however, is to define a grid of expression magnitude values on which the numerical calculations will be carried out.\n\n``` r\n# estimate gene expression prior\no.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n```\n\nHere we used a grid of 400 points, and let the maximum expression magnitude be determined by the default 0.999 quantile (use `max.value` parameter to specify the maximum expression magnitude explicitly - on log10 scale).\n\nTesting for differential expression\n-----------------------------------\n\nTo test for differential expression, we first define a factor that specifies which two groups of cells are to be compared. The factor elements correspond to the rows of the model matrix (`o.ifm`), and can contain `NA` values (i.e. cells that won't be included in either group). 
Here we key off the ES and MEF names.\n\n``` r\n# define two groups of cells\ngroups <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", rownames(o.ifm)), levels  =  c(\"ESC\", \"MEF\"))\nnames(groups) <- row.names(o.ifm)\n# run differential expression tests on all genes.\nediff <- scde.expression.difference(o.ifm, cd, o.prior, groups  =  groups, n.randomizations  =  100, n.cores  =  1, verbose  =  1)\n```\n\n    ## comparing groups:\n    ## \n    ## ESC MEF \n    ##  20  20 \n    ## calculating difference posterior\n    ## summarizing differences\n\n``` r\n# top upregulated genes (tail would show top downregulated ones)\nhead(ediff[order(ediff$Z, decreasing  =  TRUE), ])\n```\n\n    ##                     lb      mle        ub       ce        Z       cZ\n    ## Dppa5a        8.075220 9.984631 11.575807 8.075220 7.160813 5.989598\n    ## Pou5f1        5.370220 7.200073  9.189043 5.370220 7.160328 5.989598\n    ## Gm13242       5.688455 7.677425  9.785734 5.688455 7.159979 5.989598\n    ## Tdh           5.807793 8.075220 10.302866 5.807793 7.159589 5.989598\n    ## Ift46         5.449779 7.359190  9.228822 5.449779 7.150242 5.989598\n    ## 4930509G22Rik 5.409999 7.478528  9.785734 5.409999 7.115605 5.978296\n\n``` r\n# write out a table with all the results, showing most significantly different genes (in both directions) on top\nwrite.table(ediff[order(abs(ediff$Z), decreasing = TRUE), ], file = \"results.txt\", row.names = TRUE, col.names = TRUE, sep = \"\\t\", quote = FALSE)\n```\n\nOne can also browse the differentially expressed genes, along with the associated posterior distributions, using a web browser application. 
The command below will start an internal web server accessible on port 1299 and call up a browser to point to the resulting page (note: internal browser doesn't work under RStudio)\n\n``` r\nscde.browse.diffexp(ediff, o.ifm, cd, o.prior, groups = groups, name = \"diffexp1\", port = 1299)\n```\n\nAlternatively we can run the differential expression on a single gene, and visualize the results:\n\n``` r\nscde.test.gene.expression.difference(\"Tdh\", models = o.ifm, counts = cd, prior = o.prior)\n```\n\n    ##           lb     mle       ub       ce        Z       cZ\n    ## Tdh 5.728235 8.03544 10.30287 5.728235 7.151425 7.151425\n\n![](figures/scde-diffexp3-1.png)\n\nThe top and the bottom plots show expression posteriors derived from individual cells (colored lines) and joint posteriors (black lines). The middle plot shows posterior of the expression fold difference between the two cell groups, highlighting the 95% credible interval by the red shading.\n\nCorrecting for batch effects\n----------------------------\n\nWhen the data combines cells that were measured in different batches, it is sometimes necessary to explicitly account for the expression differences that could be explained by the batch composition of the cell groups being compared. 
The example below makes up a random batch composition for the ES/MEF cells, and re-test the expression difference.\n\n``` r\nbatch <- as.factor(ifelse(rbinom(nrow(o.ifm), 1, 0.5) == 1, \"batch1\", \"batch2\"))\n# check the interaction between batches and cell types (shouldn't be any)\ntable(groups, batch)\n```\n\n    ##       batch\n    ## groups batch1 batch2\n    ##    ESC     11      9\n    ##    MEF      8     12\n\n``` r\n# test the Tdh gene again\nscde.test.gene.expression.difference(\"Tdh\", models = o.ifm, counts = cd, prior = o.prior, batch = batch)\n```\n\n    ##           lb      mle       ub       ce        Z       cZ\n    ## Tdh 3.659705 7.796764 12.01338 3.659705 3.782082 3.782082\n\n![](figures/scde-batch-1.png)\n\nIn the plot above, the grey lines are used to show posterior distributions based on the batch composition alone. The expression magnitude posteriors (top and bottom plots) look very similar, and as a result the log2 expression ratio posterior is close to 0. The thin black line shows log2 expression ratio posterior before correction. The batch correction doesn't shift the location, but increases uncertainty in the ratio estimate (since we're controlling for another factor).\n\nSimilarly, batch correction can be performed when calculating expression differences for the entire dataset:\n\n``` r\n# test for all of the genes\nediff.batch <- scde.expression.difference(o.ifm, cd, o.prior, groups = groups, batch = batch, n.randomizations = 100, n.cores = 1, return.posteriors = TRUE, verbose = 1)\n```\n\n    ## controlling for batch effects. 
interaction:\n    ##       batch\n    ## groups batch1 batch2\n    ##    ESC     11      9\n    ##    MEF      8     12\n    ## calculating batch posteriors\n    ## calculating batch differences\n    ## calculating difference posterior\n    ## summarizing differences\n    ## adjusting for batch effects\n\n### More detailed functions\n\nThe `scde.expression.difference` method can return a more extensive set of results, including joint posteriors and the expression fold difference posteriors for all of the examined genes:\n\nThe joint posteriors can also be obtained explicitly for a particular set of cells:\n\n``` r\n# calculate joint posterior for ESCs (set return.individual.posterior.modes=T if you need p.modes)\njp <- scde.posteriors(models = o.ifm[grep(\"ESC\",rownames(o.ifm)), ], cd, o.prior, n.cores = 1)\n```\n\nThe error models fit the intercept and the slope of the NB \"correlated\" component, providing more consistent expression magnitude estimates among the cells. These can be obtained with a quick helper function:\n\n``` r\n# get expression magnitude estimates\no.fpm <- scde.expression.magnitude(o.ifm, counts = cd)\n```\n\nDrop-out probabilities (as a function of expression magnitudes) for different cells are useful for assessing the quality of the measurements:\n\n``` r\n# get failure probabilities on the expression range\no.fail.curves <- scde.failure.probability(o.ifm, magnitudes = log((10^o.prior$x)-1))\npar(mfrow = c(1,1), mar = c(3.5,3.5,0.5,0.5), mgp = c(2.0,0.65,0), cex = 1)\nplot(c(), c(), xlim=range(o.prior$x), ylim=c(0,1), xlab=\"expression magnitude (log10)\", ylab=\"drop-out probability\")\ninvisible(apply(o.fail.curves[, grep(\"ES\",colnames(o.fail.curves))], 2, function(y) lines(x = o.prior$x, y = y,col = \"orange\")))\ninvisible(apply(o.fail.curves[, grep(\"MEF\", colnames(o.fail.curves))], 2, function(y) lines(x = o.prior$x, y = y, col = \"dodgerblue\")))\n```\n\n![](figures/scde-detailed4-1.png)\n\nThe drop-out probabilities (at a given 
expression magnitude, or at an observed count) can be useful in subsequent analysis\n\n``` r\n# get failure probabilities on the expression range\no.fail.curves <- scde.failure.probability(o.ifm, magnitudes = log((10^o.prior$x)-1))\n# get self-fail probabilities (at a given observed count)\np.self.fail <- scde.failure.probability(models = o.ifm, counts = cd)\n```\n\nAdjusted distance measures\n-------------------------\n\nThe dependency of drop-out probability on the average expression magnitude captured by the cell-specific models can be used to adjust cell-to-cell similarity measures, for instance in the context of cell clustering. Several such measures are explored below.\n\n### Direct drop-out\n\nDirect weighting downweights the contribution of a given gene to the cell-to-cell distance based on the probability that the given measurement is a drop-out event (i.e. belongs to the drop-out component) - the \"self-fail\" probability shown in the previous section. To estimate the adjusted distance, we will simulate the drop-out events, replacing them with `NA` values, and calculating correlation using the remaining points:\n\n``` r\np.self.fail <- scde.failure.probability(models = o.ifm, counts = cd)\n# simulate drop-outs\n# note: using 10 sampling rounds for illustration here. 
~500 or more should be used.\nn.simulations <- 10; k <- 0.9;\ncell.names <- colnames(cd); names(cell.names) <- cell.names;\ndl <- mclapply(1:n.simulations,function(i) {\n  scd1 <- do.call(cbind,lapply(cell.names,function(nam) {\n    x <- cd[,nam];\n    # replace predicted drop outs with NAs\n    x[!as.logical(rbinom(length(x),1,1-p.self.fail[,nam]*k))] <- NA;\n    x;\n    }))\n  rownames(scd1) <- rownames(cd); \n  # calculate correlation on the complete observation pairs\n  cor(log10(scd1+1),use=\"pairwise.complete.obs\");\n}, mc.cores = 1)\n# calculate average distance across sampling rounds\ndirect.dist <- as.dist(1-Reduce(\"+\",dl)/length(dl))\n```\n\n### Reciprocal weighting\n\nThe reciprocal weighting of the Pearson correlation will give increased weight to pairs of observations where a gene expressed (on average) at a level x1 observed in a cell c1 would not be likely to fail in a cell c2, and vice versa:\n\n``` r\n# load boot package for the weighted correlation implementation\nrequire(boot)\nk <- 0.95;\nreciprocal.dist <- as.dist(1 - do.call(rbind, mclapply(cell.names, function(nam1) {\n  unlist(lapply(cell.names, function(nam2) {\n    # reciprocal probabilities\n    f1 <- scde.failure.probability(models = o.ifm[nam1,,drop = FALSE], magnitudes = o.fpm[, nam2])\n    f2 <- scde.failure.probability(models = o.ifm[nam2,,drop = FALSE], magnitudes = o.fpm[, nam1])\n    # weight factor\n    pnf <- sqrt((1-f1)*(1-f2))*k +(1-k); \n    boot::corr(log10(cbind(cd[, nam1], cd[, nam2])+1), w = pnf)\n    }))\n},mc.cores = 1)), upper = FALSE)\n```\n\n### Mode-relative weighting\n\nA more reliable reference magnitude against which drop-out likelihood could be assessed would be an estimate of the average expression magnitude, such as joint posterior mode. Below we estimate `p.mode.fail`, a probability that a drop-out event could be observed at the level of average expression magntiude in a given cell. 
For each measurement we then reduce its weight if it indeed dropped out in a cell where we expect it to drop-out given its average expression magnitude `(p.self.fail*p.mode.fail)`. However we do want to give high weight to measurements where the drop-out was not observed, even though it was expected based on the average expression magnitude, so the overall weight expression is `(1-p.self.fail*sqrt(p.self.fail*p.mode.fail))` (other formulations are clearly possible here).\n\n``` r\n# recalculate posteriors with the individual posterior modes \njp <- scde.posteriors(models = o.ifm, cd, o.prior, return.individual.posterior.modes = TRUE, n.cores = 1)\n# find joint posterior modes for each gene - a measure of MLE of group-average expression\njp$jp.modes <- log(as.numeric(colnames(jp$jp)))[max.col(jp$jp)]\np.mode.fail <- scde.failure.probability(models = o.ifm, magnitudes = jp$jp.modes)\n# weight matrix\nmatw <- 1-sqrt(p.self.fail*sqrt(p.self.fail*p.mode.fail))\n# magnitude matrix (using individual posterior modes here)\nmat <- log10(exp(jp$modes)+1);\n# weighted distance\nmode.fail.dist <- as.dist(1-do.call(rbind,mclapply(cell.names,function(nam1) {\n  unlist(lapply(cell.names,function(nam2) {\n    corr(cbind(mat[, nam1], mat[, nam2]), w = sqrt(sqrt(matw[, nam1]*matw[, nam2])))\n  }))\n}, mc.cores = 1)), upper = FALSE)\n```\n"
  },
  {
    "path": "inst/experimental.Rmd",
    "content": "---\ntitle: \"Experimental use of PAGODA\"\nauthor: \"Jean Fan\"\ndate: '`r Sys.Date()`'\noutput:\n  md_document:\n    variant: markdown_github\nvignette: |\n  %\\VignetteIndexEntry{Vignette Title} \\usepackage[utf8]{inputenc}\n  %\\VignetteEngine{knitr::rmarkdown}\n---\n\n# Experimental use of PAGODA\n\nIn this vignette, we show you how to modify `pagoda` to run on your own normalized gene expression and associated weights matrices. \n\n**NOTE:** Gene expression matrices must be normalized. Variances observed in the normalized data must be representative of biological variation. We strongly recommend taking into consideration magnitude dependencies, batch effects, library size, and other potential technical aspects of variability. \n\n**IMPORTANT:** Depending on your normalization and weights, results may be misleading and/or non-sensical. This vignette is experimental. USE WITH CAUTION. \n\n```{r, include = FALSE}\nlibrary(knitr)\nopts_chunk$set(\n    warning = FALSE,\n    message = FALSE,\n    fig.show = 'hold',\n    fig.path = 'figures/experimental-',\n    cache.path = 'cache/experimental-',\n    cache = TRUE\n)\n\nlibrary(scde)\n```\n\n## Simulating data for demonstration purposes\n\nFor the purposes of demonstration, we will simulate a normalized gene expression matrix. We will generate out data such that there are two major components of variation, supported by every other pathway. 
\n```{r, data}\n# Get gene sets\nlibrary(org.Hs.eg.db)\ngos <- ls(org.Hs.egGO2ALLEGS)\ngos.sub <- mget(gos[1:10], org.Hs.egGO2ALLEGS)\ngo.env <- list2env(gos.sub)\n# Get genes from this universe\ngenes <- unlist(gos.sub)\nN <- length(unique(genes))\n# Simulate cells\nM <- 100\ncells <- paste('cell', 1:M)\n# Simulate normalized gene expression matrix\nmat <- do.call(rbind, lapply(seq_along(gos.sub), function(i) {\n  gs <- gos.sub[[i]]\n  set.seed(i%%2) # make every other pathway split the same cells\n  c <- sample(1:M, M/2, replace=FALSE) \n  s <- length(gs)\n  mat <- matrix(rnorm(s*M,mean=0,sd=5), s, M)\n  mat[, c] <- rnorm(s*c,mean=10,sd=5)\n  rownames(mat) <- gs\n  colnames(mat) <- cells\n  return(mat)\n}))\n# Get rid of duplicate generated gene\nmat <- mat[unique(genes),]\n# Just cluster and visualize gene expression\nheatmap(mat, col = colorRampPalette(c('blue', 'white', 'red'))(100))\n```\n\nWe will set the associated weights to 1 for all observations in all cells and calculated variances for each gene. \n\n```{r, data2}\n# Set all weights to 1\nmatw <- matrix(1, N, M)\nrownames(matw) <- rownames(mat)\ncolnames(matw) <- colnames(mat)\n# Regular variance since equal weights anyway\nvar <- apply(mat, 1, var)\n```\n\n## Using `pagoda` with custom matrices\n\nNow we can put our simulated data into an object to pipe into the `pagoda` pipeline.\n\n```{r, data3}\n# Create varinfo object to pipe into PAGODA\nvarinfo <- list('mat' = mat, 'matw' = matw, 'arv' = var)\n```\n\nWhen we run `pagoda` on the generated data, we indeed recover our two major components of variation. 
\n\n```{r, pagoda}\n# Run PAGODA with generated data\npwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components = 1, batch.center = FALSE, verbose = 1)\ntam <- pagoda.top.aspects(pwpca, n.cells = NULL, z.score = qnorm(0.01/2, lower.tail = FALSE))\ntamr <- pagoda.reduce.loading.redundancy(tam, pwpca)\ntamr2 <- pagoda.reduce.redundancy(tamr, distance.threshold = 0.2)\n\n# Cluster on final\nhc <- hclust(dist(t(tamr2$xv)), method='ward.D')\ncol.cols <- rbind(groups = cutree(hc, 3))\npagoda.view.aspects(tamr2, cell.clustering = hc, col.cols = col.cols)\n```\n\nWe can also create an interactive app to browse our results. \n\n```{r, app, eval=FALSE}\napp <- make.pagoda.app(tamr2, tam, varinfo, go.env, pwpca, col.cols = col.cols, cell.clustering = hc, title = \"Experiment\")\nshow.app(app, \"Experiment\", browse = TRUE, port = 1400)  \n```\n"
  },
  {
    "path": "inst/experimental.md",
    "content": "Experimental use of PAGODA\n==========================\n\nIn this vignette, we show you how to modify `pagoda` to run on your own normalized gene expression and associated weights matrices.\n\n**NOTE:** Gene expression matrices must be normalized. Variances observed in the normalized data must be representative of biological variation. We strongly recommend taking into consideration magnitude dependencies, batch effects, library size, and other potential technical aspects of variability.\n\n**IMPORTANT:** Depending on your normalization and weights, results may be misleading and/or non-sensical. This vignette is experimental. USE WITH CAUTION.\n\nSimulating data for demonstration purposes\n------------------------------------------\n\nFor the purposes of demonstration, we will simulate a normalized gene expression matrix. We will generate out data such that there are two major components of variation, supported by every other pathway.\n\n``` r\n# Get gene sets\nlibrary(org.Hs.eg.db)\ngos <- ls(org.Hs.egGO2ALLEGS)\ngos.sub <- mget(gos[1:10], org.Hs.egGO2ALLEGS)\ngo.env <- list2env(gos.sub)\n# Get genes from this universe\ngenes <- unlist(gos.sub)\nN <- length(unique(genes))\n# Simulate cells\nM <- 100\ncells <- paste('cell', 1:M)\n# Simulate normalized gene expression matrix\nmat <- do.call(rbind, lapply(seq_along(gos.sub), function(i) {\n  gs <- gos.sub[[i]]\n  set.seed(i%%2) # make every other pathway split the same cells\n  c <- sample(1:M, M/2, replace=FALSE) \n  s <- length(gs)\n  mat <- matrix(rnorm(s*M,mean=0,sd=5), s, M)\n  mat[, c] <- rnorm(s*c,mean=10,sd=5)\n  rownames(mat) <- gs\n  colnames(mat) <- cells\n  return(mat)\n}))\n# Get rid of duplicate generated gene\nmat <- mat[unique(genes),]\n# Just cluster and visualize gene expression\nheatmap(mat, col = colorRampPalette(c('blue', 'white', 'red'))(100))\n```\n\n![](figures/experimental-data-1.png)\n\nWe will set the associated weights to 1 for all observations in all cells and calculated 
variances for each gene.\n\n``` r\n# Set all weights to 1\nmatw <- matrix(1, N, M)\nrownames(matw) <- rownames(mat)\ncolnames(matw) <- colnames(mat)\n# Regular variance since equal weights anyway\nvar <- apply(mat, 1, var)\n```\n\nUsing `pagoda` with custom matrices\n-----------------------------------\n\nNow we can put our simulated data into an object to pipe into the `pagoda` pipeline.\n\n``` r\n# Create varinfo object to pipe into PAGODA\nvarinfo <- list('mat' = mat, 'matw' = matw, 'arv' = var)\n```\n\nWhen we run `pagoda` on the generated data, we indeed recover our two major components of variation.\n\n``` r\n# Run PAGODA with generated data\npwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components = 1, batch.center = FALSE, verbose = 1)\ntam <- pagoda.top.aspects(pwpca, n.cells = NULL, z.score = qnorm(0.01/2, lower.tail = FALSE))\ntamr <- pagoda.reduce.loading.redundancy(tam, pwpca)\ntamr2 <- pagoda.reduce.redundancy(tamr, distance.threshold = 0.2)\n\n# Cluster on final\nhc <- hclust(dist(t(tamr2$xv)), method='ward.D')\ncol.cols <- rbind(groups = cutree(hc, 3))\npagoda.view.aspects(tamr2, cell.clustering = hc, col.cols = col.cols)\n```\n\n![](figures/experimental-pagoda-1.png)\n\nWe can also create an interactive app to browse our results.\n\n``` r\napp <- make.pagoda.app(tamr2, tam, varinfo, go.env, pwpca, col.cols = col.cols, cell.clustering = hc, title = \"Experiment\")\nshow.app(app, \"Experiment\", browse = TRUE, port = 1400)  \n```\n"
  },
  {
    "path": "inst/genesets.Rmd",
    "content": "---\ntitle: \"Preparing custom gene sets for `pagoda`\"\nauthor: \"Jean Fan, Jens Preußner\"\ndate: '`r Sys.Date()`'\noutput:\n  md_document:\n    variant: markdown_github\nvignette: |\n  %\\VignetteIndexEntry{Vignette Title} \\usepackage[utf8]{inputenc}\n  %\\VignetteEngine{knitr::rmarkdown}\n---\n\n# Creating custom pathway annotations or gene sets\n\nIn this vignette, we show you how to create and use your own custom pathway annotations or gene sets with pagoda.\n\n```{r, include = FALSE}\nlibrary(knitr)\nopts_chunk$set(\n    warning = FALSE,\n    message = FALSE,\n    fig.show = 'hold',\n    fig.path = 'figures/genesets-',\n    cache.path = 'cache/genesets-',\n    cache = TRUE\n)\nlibrary(scde)\ndata(pollen)\ncd <- pollen\n```\n\n# GO annotations\n\n```{r, go}\n# Use the org.Hs.eg.db package for GO annotations\nlibrary(org.Hs.eg.db)\n# Translate gene names to ids\nids <- unlist(lapply(mget(rownames(cd), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1]))\n# Reverse map\nrids <- names(ids)\nnames(rids) <- ids\n# Convert ids per GO category to gene names\ngo.env <- eapply(org.Hs.egGO2ALLEGS, function(x) as.character(na.omit(rids[x])))\ngo.env <- clean.gos(go.env) # Remove GOs with too few or too many genes\ngo.env <- list2env(go.env)  # Convert to an environment\n\n# Test\nclass(go.env)\nhead(ls(go.env)) # Look at gene set names\nhead(get(ls(go.env)[1], go.env)) # Look at one gene set\n```\n\n# BioMart\n\nAlternatively, we can use Ensembl's BioMart service to get the GO annotations.\n\n```{r, biomart}\nlibrary(biomaRt)\nlibrary(GO.db)\n\n# Initialize the connection to the Ensembl BioMart Service\n# Available datasets can be listed with \n# listDatasets(useMart(\"ENSEMBL_MART_ENSEMBL\", host=\"www.ensembl.org\"))\n# Use mmusculus_gene_ensembl for mouse\nensembl <- useMart(\"ENSEMBL_MART_ENSEMBL\", dataset = \"hsapiens_gene_ensembl\", host=\"www.ensembl.org\")\n\n# Constructs a dataframe with two columns: hgnc_symbol and go_id\n# If rownames are 
Ensembl IDs, use ensembl_gene_id as filter value\ngo <- getBM(attributes = c(\"hgnc_symbol\", \"go_id\"), filters = \"hgnc_symbol\", values = rownames(cd), mart = ensembl)\n\n# Use the GO.db library to add a column with the GO-term to the dataframe\ngo$term <- Term(go$go_id)\n\n# Create a named list of character vectors out of the df\ns = split(go$hgnc_symbol, paste(go$go_id,go$term))\n\n# Saves the list as a R environment\ngo.env <- list2env(s)\n\n# Test\nclass(go.env)\nhead(ls(go.env)) # Look at gene set names\nhead(get(ls(go.env)[1], go.env)) # Look at one gene set\n```\n\n# From GMT\n\nThe GMT file format is a tab delimited file format that describes gene sets. GMT files for Broad's MSigDB and other gene sets can be downloaded from the [Broad Website](http://www.broadinstitute.org/gsea/downloads.jsp).\n\n```{r, gmt}\n## read in Broad gmt format\nlibrary(GSA)\nfilename <- 'https://raw.githubusercontent.com/JEFworks/genesets/master/msigdb.v5.0.symbols.gmt'\ngs <- GSA.read.gmt(filename)\n\n## number of gene sets\nn <- length(gs$geneset.names)\n\n## create environment\nenv <- new.env(parent=globalenv())\ninvisible(lapply(1:n,function(i) {\n  genes <- as.character(unlist(gs$genesets[i]))\n  name <- as.character(gs$geneset.names[i])\n  assign(name, genes, envir = env)\n}))\n\ngo.env <- env\n\n# Test\nclass(go.env)\nhead(ls(go.env)) # Look at gene set names\nhead(get(ls(go.env)[1], go.env)) # Look at one gene set\n```\n"
  },
  {
    "path": "inst/genesets.md",
    "content": "Creating custom pathway annotations or gene sets\n================================================\n\nIn this vignette, we show you how to create and use your own custom pathway annotations or gene sets with pagoda.\n\nGO annotations\n==============\n\n``` r\n# Use the org.Hs.eg.db package for GO annotations\nlibrary(org.Hs.eg.db)\n# Translate gene names to ids\nids <- unlist(lapply(mget(rownames(cd), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1]))\n# Reverse map\nrids <- names(ids)\nnames(rids) <- ids\n# Convert ids per GO category to gene names\ngo.env <- eapply(org.Hs.egGO2ALLEGS, function(x) as.character(na.omit(rids[x])))\ngo.env <- clean.gos(go.env) # Remove GOs with too few or too many genes\ngo.env <- list2env(go.env)  # Convert to an environment\n\n# Test\nclass(go.env)\n```\n\n    ## [1] \"environment\"\n\n``` r\nhead(ls(go.env)) # Look at gene set names\n```\n\n    ## [1] \"GO:0000002\" \"GO:0000003\" \"GO:0000012\" \"GO:0000014\" \"GO:0000018\"\n    ## [6] \"GO:0000022\"\n\n``` r\nhead(get(ls(go.env)[1], go.env)) # Look at one gene set\n```\n\n    ## [1] \"SLC25A4\" \"DNA2\"    \"TYMP\"    \"LIG3\"    \"LIG3\"    \"MEF2A\"\n\nBioMart\n=======\n\nAlternatively, we can use Ensembl's BioMart service to get the GO annotations.\n\n``` r\nlibrary(biomaRt)\nlibrary(GO.db)\n\n# Initialize the connection to the Ensembl BioMart Service\n# Available datasets can be listed with \n# listDatasets(useMart(\"ENSEMBL_MART_ENSEMBL\", host=\"www.ensembl.org\"))\n# Use mmusculus_gene_ensembl for mouse\nensembl <- useMart(\"ENSEMBL_MART_ENSEMBL\", dataset = \"hsapiens_gene_ensembl\", host=\"www.ensembl.org\")\n\n# Constructs a dataframe with two columns: hgnc_symbol and go_id\n# If rownames are Ensembl IDs, use ensembl_gene_id as filter value\ngo <- getBM(attributes = c(\"hgnc_symbol\", \"go_id\"), filters = \"hgnc_symbol\", values = rownames(cd), mart = ensembl)\n\n# Use the GO.db library to add a column with the GO-term to the dataframe\ngo$term <- 
Term(go$go_id)\n\n# Create a named list of character vectors out of the df\ns = split(go$hgnc_symbol, paste(go$go_id,go$term))\n\n# Saves the list as a R environment\ngo.env <- list2env(s)\n\n# Test\nclass(go.env)\n```\n\n    ## [1] \"environment\"\n\n``` r\nhead(ls(go.env)) # Look at gene set names\n```\n\n    ## [1] \" NA\"                                                 \n    ## [2] \"GO:0000002 mitochondrial genome maintenance\"         \n    ## [3] \"GO:0000003 reproduction\"                             \n    ## [4] \"GO:0000009 alpha-1,6-mannosyltransferase activity\"   \n    ## [5] \"GO:0000010 trans-hexaprenyltranstransferase activity\"\n    ## [6] \"GO:0000012 single strand break repair\"\n\n``` r\nhead(get(ls(go.env)[1], go.env)) # Look at one gene set\n```\n\n    ## [1] \"HLA-DOB\" \"CLDN6\"   \"YPEL5\"   \"DYNLRB1\" \"LRRC41\"  \"RSPH3\"\n\nFrom GMT\n========\n\nThe GMT file format is a tab delimited file format that describes gene sets. GMT files for Broad's MSigDB and other gene sets can be downloaded from the [Broad Website](http://www.broadinstitute.org/gsea/downloads.jsp).\n\n``` r\n## read in Broad gmt format\nlibrary(GSA)\nfilename <- 'https://raw.githubusercontent.com/JEFworks/genesets/master/msigdb.v5.0.symbols.gmt'\ngs <- GSA.read.gmt(filename)\n\n## number of gene sets\nn <- length(gs$geneset.names)\n\n## create environment\nenv <- new.env(parent=globalenv())\ninvisible(lapply(1:n,function(i) {\n  genes <- as.character(unlist(gs$genesets[i]))\n  name <- as.character(gs$geneset.names[i])\n  assign(name, genes, envir = env)\n}))\n\ngo.env <- env\n\n# Test\nclass(go.env)\n```\n\n    ## [1] \"environment\"\n\n``` r\nhead(ls(go.env)) # Look at gene set names\n```\n\n    ## [1] \"3_5_CYCLIC_NUCLEOTIDE_PHOSPHODIESTERASE_ACTIVITY\"\n    ## [2] \"3_5_EXONUCLEASE_ACTIVITY\"                        \n    ## [3] \"AAACCAC,MIR-140\"                                 \n    ## [4] \"AAAGACA,MIR-511\"                                 \n    ## [5] 
\"AAAGGAT,MIR-501\"                                 \n    ## [6] \"AAAGGGA,MIR-204,MIR-211\"\n\n``` r\nhead(get(ls(go.env)[1], go.env)) # Look at one gene set\n```\n\n    ## [1] \"PDE3B\"  \"PDE4D\"  \"PDE3A\"  \"PDE10A\" \"PDE4C\"  \"PDE7B\"\n"
  },
  {
    "path": "inst/pagoda.Rmd",
    "content": "---\ntitle: \"Getting Started with `pagoda` Routines\"\nauthor: \"Peter Kharchenko, Jean Fan\"\ndate: '`r Sys.Date()`'\noutput:\n  md_document:\n    variant: markdown_github\nvignette: |\n  %\\VignetteIndexEntry{Vignette Title} \\usepackage[utf8]{inputenc}\n  %\\VignetteEngine{knitr::rmarkdown}\n---\n\n# Pathway and Gene Set Overdispersion Analysis\n\nIn this vignette, we show you how to use `pagoda` routines in the `scde` package to characterize aspects of transcriptional heterogeneity in populations of single cells. \n\nThe `pagoda` routines implemented in the `scde` resolves multiple, potentially overlapping aspects of transcriptional heterogeneity by identifying known pathways or novel gene sets that show significant excess of coordinated variability among the measured cells. Briefly, cell-specific error models derived from `scde` are used to estimate residual gene expression variance, and identify pathways and gene sets that exhibit statistically significant excess of coordinated variability (overdispersion). `pagoda` can be used to effectively recover known subpopulations and discover putative new subpopulations and their corresponding functional characteristics in single-cell samples. For more information, please refer to the original manuscript by [_Fan et al._](http://biorxiv.org/content/early/2015/09/16/026948).\n\n```{r, include = FALSE}\nlibrary(knitr)\nopts_chunk$set(\n    warning = FALSE,\n    message = FALSE,\n    fig.show = 'hold',\n    fig.path = 'figures/pagoda-',\n    cache.path = 'cache/pagoda-',\n    cache = TRUE\n)\nlibrary(scde)\n```\n\n## Preparing data\n\nThe analysis starts with a matrix of read counts. Here, we use the read count table and cell group annotations from [_Pollen et al._](www.ncbi.nlm.nih.gov/pubmed/25086649) can be loaded using the `data(\"pollen\")` call. 
Some additional filters are also applied.\n\n```{r, data}\ndata(pollen)\n# remove poor cells and genes\ncd <- clean.counts(pollen)\n# check the final dimensions of the read count matrix\ndim(cd)\n```\n\nNext, we'll translate group and sample source data from [_Pollen et al._](www.ncbi.nlm.nih.gov/pubmed/25086649) into color codes. These will be used later to compare [_Pollen et al._](www.ncbi.nlm.nih.gov/pubmed/25086649)'s derived annotation with subpopulations identified by `pagoda`:\n\n```{r, colorcodes}\nx <- gsub(\"^Hi_(.*)_.*\", \"\\\\1\", colnames(cd))\nl2cols <- c(\"coral4\", \"olivedrab3\", \"skyblue2\", \"slateblue3\")[as.integer(factor(x, levels = c(\"NPC\", \"GW16\", \"GW21\", \"GW21+3\")))]\n```\n\n## Fitting error models\n\nNext, we'll construct error models for individual cells. Here, we use k-nearest neighbor model fitting procedure implemented by `knn.error.models()` method. This is a relatively noisy dataset (non-UMI), so we raise the `min.count.threshold` to 2 (minimum number of reads for the gene to be initially classified as a non-failed measurement), requiring at least 5 non-failed measurements per gene. We're providing a rough guess to the complexity of the population, by fitting the error models based on 1/4 of most similar cells (i.e. guessing there might be ~4 subpopulations). \n\nNote this step takes a considerable amount of time unless multiple cores are used. We highly recommend use of multiple cores. You can check the number of available cores available using `detectCores()`. 
\n\n```{r, models, eval = FALSE, hide = TRUE}\n# EVALUATION NOT NEEDED\nknn <- knn.error.models(cd, k = ncol(cd)/4, n.cores = 1, min.count.threshold = 2, min.nonfailed = 5, max.model.plots = 10)\n```\n\nFor the purposes of this vignette, the model has been precomputed and can simply be loaded.\n\n```{r, models2, results = 'hide'}\ndata(knn)\n```\n\nThe fitting process above wrote out `cell.models.pdf` file in the current directory showing model fits for the first 10 cells (see `max.model.plots` argument). The fitting process above wrote out `cell.models.pdf` file in the current directory showing model fits for the first 10 cells (see `max.model.plots` argument). Here's an example of such plot:\n\n![cell 3 model](figures/pagoda-cell.model.fits-0.png)\n\nThe two scatter plots on the left show observed (in a given cell) vs. expected (from k similar cells) expression magnitudes for each gene that is being used for model fitting. The second (from the left) scatter plot shows genes belonging to the drop-out component in red. The black dashed lines show 95% confidence band for the amplified genes (the grey dashed lines show confidence band for an alternative constant-theta model). The third plot shows drop-out probability as a function of magnitude, and the fourth plot shows negative binomial theta local regression fit as a function of magnitude (for the amplified component). \n\n## Normalizing variance\n\nIn order to accurately quantify excess variance or overdispersion, we must normalize out expected levels of technical and intrinsic biological noise. Briefly, variance of the NB/Poisson mixture processes derived from the error modeling step are modeled as a chi-squared distribution using adjusted degrees of freedom and observation weights based on the drop-out probability of a given gene. 
Here, we normalize variance, trimming 3 most extreme cells and limiting maximum adjusted variance to 5.\n\n```{r varnorm, fig.height = 3, fig.width = 6}\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = TRUE)\n```\n\nThe plot on the left shows coefficient of variance squared (on log10 scale) as a function of expression magnitude (log10 FPM). The red line shows local regression model for the genome-wide average dependency. The plot on the right shows adjusted variance (derived based on chi-squared probability of observed/genomewide expected ratio for each gene, with degrees of freedom adjusted for each gene). The adjusted variance of 1 means that a given gene exhibits as much variance as expected for a gene of such population average expression magnitude. Genes with high adjusted variance are overdispersed within the measured population and most likely show subpopulation-specific expression:\n\n```{r, varnorm2}\n# list top overdispersed genes\nsort(varinfo$arv, decreasing = TRUE)[1:10]\n```\n\n## Controlling for sequencing depth \n\nEven with all the corrections, sequencing depth or gene coverage is typically still a major aspect of variability. In most studies, we would want to control for that as a technical artifact (exceptions are cell mixtures where subtypes significantly differ in the amount of total mRNA). Below we will control for the gene coverage (estimated as a number of genes with non-zero magnitude per cell) and normalize out that aspect of cell heterogeneity: \n\n```{r, varnorm3}\nvarinfo <- pagoda.subtract.aspect(varinfo, colSums(cd[, rownames(knn)]>0))\n```\n\n## Evaluate overdispersion of pre-defined gene sets\n\nIn order to detect significant aspects of heterogeneity across the population of single cells, 'pagoda' identifies pathways and gene sets that exhibit statistically significant excess of coordinated variability. 
Specifically, for each gene set, we tested whether the amount of variance explained by the first principal component significantly exceed the background expectation. We can test both pre-defined gene sets as well as 'de novo' gene sets whose expression profiles are well-correlated within the given dataset. \n\nFor pre-defined gene sets, we'll use GO annotations. For the purposes of this vignette, in order to make calculations faster, we will only consider the first 100 GO terms plus a few that we care about. Additional tutorials on how to create and use your own gene sets can be found in [a separate tutorial](http://hms-dbmi.github.io/scde/genesets.html). \n\n```{r, goenv}\nlibrary(org.Hs.eg.db)\n# translate gene names to ids\nids <- unlist(lapply(mget(rownames(cd), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1]))\nrids <- names(ids); names(rids) <- ids \n# convert GO lists from ids to gene names\ngos.interest <- unique(c(ls(org.Hs.egGO2ALLEGS)[1:100],\"GO:0022008\",\"GO:0048699\", \"GO:0000280\", \"GO:0007067\")) \ngo.env <- lapply(mget(gos.interest, org.Hs.egGO2ALLEGS), function(x) as.character(na.omit(rids[x]))) \ngo.env <- clean.gos(go.env) # remove GOs with too few or too many genes\ngo.env <- list2env(go.env) # convert to an environment\n```\n\nNow, we can calculate weighted first principal component magnitudes for each GO gene set in the provided environment.\n\n```{r, pathwaySig}\npwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components = 1, n.cores = 1)\n```\n\nWe can now evaluate the statistical significance of the observed overdispersion for each GO gene set.\n\n```{r, topPathways, fig.height = 4, fig.width = 5}\ndf <- pagoda.top.aspects(pwpca, return.table = TRUE, plot = TRUE, z.score = 1.96)\n```\n\nEach point on the plot shows the PC1 variance (lambda1) magnitude (normalized by set size) as a function of set size. 
The red lines show expected (solid) and 95% upper bound (dashed) magnitudes based on the Tracy-Widom model.\n\n```{r, df}\nhead(df)\n```\n\n* The z column gives the Z-score of pathway over-dispersion relative to the genome-wide model (Z-score of 1.96 corresponds to P-value of 5%, etc.). \n* \"z.adj\" column shows the Z-score adjusted for multiple hypothesis (using Benjamini-Hochberg correction). \n* \"score\" gives observed/expected variance ratio\n* \"sh.z\" and \"adj.sh.z\" columns give the raw and adjusted Z-scores of \"pathway cohesion\", which compares the observed PC1 magnitude to the magnitudes obtained when the observations for each gene are randomized with respect to cells. When such Z-score is high (e.g. for GO:0008009) then multiple genes within the pathway contribute to the coordinated pattern.\n\n\n## Evaluate overdispersion of 'de novo' gene sets\n\nWe can also test 'de novo' gene sets whose expression profiles are well-correlated within the given dataset. The following procedure will determine 'de novo' gene clusters in the data, and build a background model for the expectation of the gene cluster weighted principal component magnitudes. Note the higher trim values for the clusters, as we want to avoid clusters that are formed by outlier cells.\n\n```{r, clusterPCA, fig.height = 3, fig.width = 6}\nclpca <- pagoda.gene.clusters(varinfo, trim = 7.1/ncol(varinfo$mat), n.clusters = 50, n.cores = 1, plot = TRUE)\n```\n\nThe plot above shows background distribution of the first principal component (`PC1`) variance (`lambda1`) magnitude. The blue scatterplot on the left shows `lambda1` magnitude vs. cluster size for clusters determined based on randomly-generated matrices of the same size. The black circles show top cluster in each simulation. The red lines show expected magnitude and 95% confidence interval based on Tracy-Widom distribution. 
The right plot shows extreme value distribution fit of residual cluster `PC1` variance magnitude relative to the Gumbel (extreme value) distribution.\n\nNow the set of top aspects can be recalculated taking these `de novo` gene clusters into account:\n\n```{r, topPathways2, fig.height = 4, fig.width = 5}\ndf <- pagoda.top.aspects(pwpca, clpca, return.table = TRUE, plot = TRUE, z.score = 1.96)\nhead(df)\n```\n\nThe gene clusters and their corresponding model expected value and 95% upper bound are shown in green.\n\n\n## Visualize significant aspects of heterogeneity\n\nTo view top heterogeneity aspects, we will first obtain information on all the significant aspects of transcriptional heterogeneity. We will also determine the overall cell clustering based on this full information:\n\n```{r, celclust}\n# get full info on the top aspects\ntam <- pagoda.top.aspects(pwpca, clpca, n.cells = NULL, z.score = qnorm(0.01/2, lower.tail = FALSE))\n# determine overall cell clustering\nhc <- pagoda.cluster.cells(tam, varinfo)\n```\n\nNext, we will reduce redundant aspects in two steps. First we will combine pathways that are driven by the same sets of genes:\n\n```{r, loadingCollapse}\ntamr <- pagoda.reduce.loading.redundancy(tam, pwpca, clpca)\n```\n\nIn the second step we will combine aspects that show similar patterns (i.e. separate the same sets of cells). Here we will plot the cells using the overall cell clustering determined above:\n\n```{r, correlatedCollapse, fig.height = 6, fig.width = 10}\ntamr2 <- pagoda.reduce.redundancy(tamr, distance.threshold = 0.9, plot = TRUE, cell.clustering = hc, labRow = NA, labCol = NA, box = TRUE, margins = c(0.5, 0.5), trim = 0)\n```\n\nIn the plot above, the columns are cells, rows are different significant aspects, clustered by their similarity pattern. The green-to-orange color scheme shows low-to-high weighted PCA scores (aspect patterns), where generally orange indicates higher expression. 
Blocks of color on the left margin show which aspects have been combined by the command above. Here the number of resulting aspects is relatively small. \"top\" argument (i.e. top = 10) can be used to limit further analysis to top N aspects.\n\nWe will view the top aspects, clustering them by pattern similarity (note, to view aspects in the order of increasing `lambda1` magnitude, use `row.clustering = NA`). \n\n```{r, viewAspects, fig.height = 3.5, fig.width = 8}\ncol.cols <- rbind(groups = cutree(hc, 3))\npagoda.view.aspects(tamr2, cell.clustering = hc, box = TRUE, labCol = NA, margins = c(0.5, 20), col.cols = rbind(l2cols))\n```\n\nWhile each row here represents a cluster of pathways, the row names are assigned to be the top overdispersed aspect in each cluster.\n\nTo interactively browse and explore the output, we can create a `pagoda` app:\n\n```{r, pagodaApp, eval = FALSE}\n# compile a browsable app, showing top three clusters with the top color bar\napp <- make.pagoda.app(tamr2, tam, varinfo, go.env, pwpca, clpca, col.cols = col.cols, cell.clustering = hc, title = \"NPCs\")\n# show app in the browser (port 1468)\nshow.app(app, \"pollen\", browse = TRUE, port = 1468) \n```\n\nThe `pagoda` app allows you to view the gene sets grouped within each aspect (row), as well as genes underlying the detected heterogeneity patterns. A screenshot of the app is provided below:\n\n![pagoda app](figures/pagoda-Screen_Shot_2015-06-07_at_4.53.46_PM.png)\n\nSimilar views can be obtained in the R session itself. 
For instance, here we'll view top 10 genes associated with the top two pathways in the neurogenesis cluster: \"neurogenesis\" (GO:0022008) and \"generation of neurons\" (GO:0048699)\n\n```{r, showTopPathwayGenes, fig.height = 3.5, fig.width = 8}\npagoda.show.pathways(c(\"GO:0022008\",\"GO:0048699\"), varinfo, go.env, cell.clustering = hc, margins = c(1,5), show.cell.dendrogram = TRUE, showRowLabels = TRUE, showPC = TRUE)\n```\n\n### Adding 2D embedding\nOne can add a 2D embedding of the cells to aid visualization. The code below uses PAGODA's weighted Pearson correlation distance (that was used to derive hierarchical clustering) to generate a tSNE embedding of the cells:\n\n```{r, tSNE, fig.height=3.5, fig.width=3.5}\nlibrary(Rtsne);\n# recalculate clustering distance .. we'll need to specify return.details=T\ncell.clustering <- pagoda.cluster.cells(tam,varinfo,include.aspects=TRUE,verbose=TRUE,return.details=T)\n# fix the seed to ensure reproducible results\nset.seed(0); \ntSNE.pagoda <- Rtsne(cell.clustering$distance,is_distance=T,initial_dims=100,perplexity=10)\npar(mfrow=c(1,1), mar = c(2.5,2.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);\nplot(tSNE.pagoda$Y,col=adjustcolor(col.cols,alpha=0.5),cex=1,pch=19,xlab=\"\",ylab=\"\")\n```\n\nThe resulting embedding can be passed into the PAGODA app to visualize individual aspects, genes, etc. within this embedding:\n```{r, tSNEapp, eval=FALSE}\napp <- make.pagoda.app(tamr2, tam, varinfo, go.env, pwpca, clpca, col.cols = col.cols, cell.clustering = hc, title = \"NPCs\", embedding = tSNE.pagoda$Y)\n# show app in the browser (port 1468)\nshow.app(app, \"pollen\", browse = TRUE, port = 1468) \n```\n\n## Controlling for undesired aspects of heterogeneity\n\nDepending on the biological setting, certain dominant aspects of transcriptional heterogeneity may not be of interest. 
To explicitly control for these aspects of heterogeneity that are not of interest, we will use `pagoda.subtract.aspect` method that we've previously used to control for residual patterns associated with sequencing depth differences. Here, we illustrate how to control for the mitotic cell cycle pattern (GO:0000280 nuclear division and GO:0007067 mitotic nuclear division) which showed up as one of the four significant aspects in the analysis above.\n\n```{r, controlForCellCycle}\n# get cell cycle signature and view the top genes\ncc.pattern <- pagoda.show.pathways(c(\"GO:0000280\", \"GO:0007067\"), varinfo, go.env, show.cell.dendrogram = TRUE, cell.clustering = hc, showRowLabels = TRUE)\n# subtract the pattern\nvarinfo.cc <- pagoda.subtract.aspect(varinfo, cc.pattern)\n```\n\nNow we can go through the same analysis as shown above, starting with the `pagoda.pathway.wPCA()` call, using `varinfo.cc` instead of `varinfo`, which will control for the cell cycle heterogeneity between the cells.\n"
  },
  {
    "path": "inst/pagoda.md",
    "content": "Pathway and Gene Set Overdispersion Analysis\n============================================\n\nIn this vignette, we show you how to use `pagoda` routines in the `scde` package to characterize aspects of transcriptional heterogeneity in populations of single cells.\n\nThe `pagoda` routines implemented in the `scde` resolves multiple, potentially overlapping aspects of transcriptional heterogeneity by identifying known pathways or novel gene sets that show significant excess of coordinated variability among the measured cells. Briefly, cell-specific error models derived from `scde` are used to estimate residual gene expression variance, and identify pathways and gene sets that exhibit statistically significant excess of coordinated variability (overdispersion). `pagoda` can be used to effectively recover known subpopulations and discover putative new subpopulations and their corresponding functional characteristics in single-cell samples. For more information, please refer to the original manuscript by [*Fan et al.*](http://biorxiv.org/content/early/2015/09/16/026948).\n\nPreparing data\n--------------\n\nThe analysis starts with a matrix of read counts. Here, we use the read count table and cell group annotations from [*Pollen et al.*](www.ncbi.nlm.nih.gov/pubmed/25086649) can be loaded using the `data(\"pollen\")` call. Some additional filters are also applied.\n\n``` r\ndata(pollen)\n# remove poor cells and genes\ncd <- clean.counts(pollen)\n# check the final dimensions of the read count matrix\ndim(cd)\n```\n\n    ## [1] 11310    64\n\nNext, we'll translate group and sample source data from [*Pollen et al.*](www.ncbi.nlm.nih.gov/pubmed/25086649) into color codes. 
These will be used later to compare [*Pollen et al.*](www.ncbi.nlm.nih.gov/pubmed/25086649)'s derived annotation with subpopulations identified by `pagoda`:\n\n``` r\nx <- gsub(\"^Hi_(.*)_.*\", \"\\\\1\", colnames(cd))\nl2cols <- c(\"coral4\", \"olivedrab3\", \"skyblue2\", \"slateblue3\")[as.integer(factor(x, levels = c(\"NPC\", \"GW16\", \"GW21\", \"GW21+3\")))]\n```\n\nFitting error models\n--------------------\n\nNext, we'll construct error models for individual cells. Here, we use k-nearest neighbor model fitting procedure implemented by `knn.error.models()` method. This is a relatively noisy dataset (non-UMI), so we raise the `min.count.threshold` to 2 (minimum number of reads for the gene to be initially classified as a non-failed measurement), requiring at least 5 non-failed measurements per gene. We're providing a rough guess to the complexity of the population, by fitting the error models based on 1/4 of most similar cells (i.e. guessing there might be ~4 subpopulations).\n\nNote this step takes a considerable amount of time unless multiple cores are used. We highly recommend use of multiple cores. You can check the number of available cores using `detectCores()`.\n\n``` r\n# EVALUATION NOT NEEDED\nknn <- knn.error.models(cd, k = ncol(cd)/4, n.cores = 1, min.count.threshold = 2, min.nonfailed = 5, max.model.plots = 10)\n```\n\nFor the purposes of this vignette, the model has been precomputed and can simply be loaded.\n\n``` r\ndata(knn)\n```\n\nThe fitting process above wrote out `cell.models.pdf` file in the current directory showing model fits for the first 10 cells (see `max.model.plots` argument). Here's an example of such plot:\n\n![cell 3 model](figures/pagoda-cell.model.fits-0.png)\n\nThe two scatter plots on the left show observed (in a given cell) vs. 
expected (from k similar cells) expression magnitudes for each gene that is being used for model fitting. The second (from the left) scatter plot shows genes belonging to the drop-out component in red. The black dashed lines show 95% confidence band for the amplified genes (the grey dashed lines show confidence band for an alternative constant-theta model). The third plot shows drop-out probability as a function of magnitude, and the fourth plot shows negative binomial theta local regression fit as a function of magnitude (for the amplified component).\n\nNormalizing variance\n--------------------\n\nIn order to accurately quantify excess variance or overdispersion, we must normalize out expected levels of technical and intrinsic biological noise. Briefly, variance of the NB/Poisson mixture processes derived from the error modeling step are modeled as a chi-squared distribution using adjusted degrees of freedom and observation weights based on the drop-out probability of a given gene. Here, we normalize variance, trimming 3 most extreme cells and limiting maximum adjusted variance to 5.\n\n``` r\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = TRUE)\n```\n\n![](figures/pagoda-varnorm-1.png)\n\nThe plot on the left shows coefficient of variance squared (on log10 scale) as a function of expression magnitude (log10 FPM). The red line shows local regression model for the genome-wide average dependency. The plot on the right shows adjusted variance (derived based on chi-squared probability of observed/genomewide expected ratio for each gene, with degrees of freedom adjusted for each gene). The adjusted variance of 1 means that a given gene exhibits as much variance as expected for a gene of such population average expression magnitude. 
Genes with high adjusted variance are overdispersed within the measured population and most likely show subpopulation-specific expression:\n\n``` r\n# list top overdispersed genes\nsort(varinfo$arv, decreasing = TRUE)[1:10]\n```\n\n    ##      DCX     EGR1      FOS  IGFBPL1   MALAT1    MEF2C    STMN2    TOP2A \n    ## 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 \n    ##   BCL11A     SOX4 \n    ## 4.755811 4.522795\n\nControlling for sequencing depth\n--------------------------------\n\nEven with all the corrections, sequencing depth or gene coverage is typically still a major aspect of variability. In most studies, we would want to control for that as a technical artifact (exceptions are cell mixtures where subtypes significantly differ in the amount of total mRNA). Below we will control for the gene coverage (estimated as a number of genes with non-zero magnitude per cell) and normalize out that aspect of cell heterogeneity:\n\n``` r\nvarinfo <- pagoda.subtract.aspect(varinfo, colSums(cd[, rownames(knn)]>0))\n```\n\nEvaluate overdispersion of pre-defined gene sets\n------------------------------------------------\n\nIn order to detect significant aspects of heterogeneity across the population of single cells, 'pagoda' identifies pathways and gene sets that exhibit statistically significant excess of coordinated variability. Specifically, for each gene set, we tested whether the amount of variance explained by the first principal component significantly exceed the background expectation. We can test both pre-defined gene sets as well as 'de novo' gene sets whose expression profiles are well-correlated within the given dataset.\n\nFor pre-defined gene sets, we'll use GO annotations. For the purposes of this vignette, in order to make calculations faster, we will only consider the first 100 GO terms plus a few that we care about. 
Additional tutorials on how to create and use your own gene sets can be found in [a separate tutorial](http://hms-dbmi.github.io/scde/genesets.html).\n\n``` r\nlibrary(org.Hs.eg.db)\n# translate gene names to ids\nids <- unlist(lapply(mget(rownames(cd), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1]))\nrids <- names(ids); names(rids) <- ids \n# convert GO lists from ids to gene names\ngos.interest <- unique(c(ls(org.Hs.egGO2ALLEGS)[1:100],\"GO:0022008\",\"GO:0048699\", \"GO:0000280\", \"GO:0007067\")) \ngo.env <- lapply(mget(gos.interest, org.Hs.egGO2ALLEGS), function(x) as.character(na.omit(rids[x]))) \ngo.env <- clean.gos(go.env) # remove GOs with too few or too many genes\ngo.env <- list2env(go.env) # convert to an environment\n```\n\nNow, we can calculate weighted first principal component magnitudes for each GO gene set in the provided environment.\n\n``` r\npwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components = 1, n.cores = 1)\n```\n\nWe can now evaluate the statistical significance of the observed overdispersion for each GO gene set.\n\n``` r\ndf <- pagoda.top.aspects(pwpca, return.table = TRUE, plot = TRUE, z.score = 1.96)\n```\n\n![](figures/pagoda-topPathways-1.png)\n\nEach point on the plot shows the PC1 variance (lambda1) magnitude (normalized by set size) as a function of set size. 
The red lines show expected (solid) and 95% upper bound (dashed) magnitudes based on the Tracy-Widom model.\n\n``` r\nhead(df)\n```\n\n    ##          name npc   n    score         z     adj.z sh.z adj.sh.z\n    ## 57 GO:0048699   1 864 2.261079 27.019091 26.894993   NA       NA\n    ## 56 GO:0022008   1 911 2.233116 27.062973 26.913370   NA       NA\n    ## 30 GO:0000226   1 302 1.665067 11.226626 10.989344   NA       NA\n    ## 55 GO:0007067   1 330 1.626072 11.035135 10.814193   NA       NA\n    ## 37 GO:0000280   1 395 1.617566 11.650728 11.397103   NA       NA\n    ## 9  GO:0000070   1  96 1.512958  5.888795  5.555273   NA       NA\n\n-   The z column gives the Z-score of pathway over-dispersion relative to the genome-wide model (Z-score of 1.96 corresponds to P-value of 5%, etc.).\n-   \"z.adj\" column shows the Z-score adjusted for multiple hypothesis (using Benjamini-Hochberg correction).\n-   \"score\" gives observed/expected variance ratio\n-   \"sh.z\" and \"adj.sh.z\" columns give the raw and adjusted Z-scores of \"pathway cohesion\", which compares the observed PC1 magnitude to the magnitudes obtained when the observations for each gene are randomized with respect to cells. When such Z-score is high (e.g. for <GO:0008009>) then multiple genes within the pathway contribute to the coordinated pattern.\n\nEvaluate overdispersion of 'de novo' gene sets\n----------------------------------------------\n\nWe can also test 'de novo' gene sets whose expression profiles are well-correlated within the given dataset. The following procedure will determine 'de novo' gene clusters in the data, and build a background model for the expectation of the gene cluster weighted principal component magnitudes. 
Note the higher trim values for the clusters, as we want to avoid clusters that are formed by outlier cells.\n\n``` r\nclpca <- pagoda.gene.clusters(varinfo, trim = 7.1/ncol(varinfo$mat), n.clusters = 50, n.cores = 1, plot = TRUE)\n```\n\n![](figures/pagoda-clusterPCA-1.png)\n\nThe plot above shows background distribution of the first principal component (`PC1`) variance (`lambda1`) magnitude. The blue scatterplot on the left shows `lambda1` magnitude vs. cluster size for clusters determined based on randomly-generated matrices of the same size. The black circles show top cluster in each simulation. The red lines show expected magnitude and 95% confidence interval based on Tracy-Widom distribution. The right plot shows extreme value distribution fit of residual cluster `PC1` variance magnitude relative to the Gumbel (extreme value) distribution.\n\nNow the set of top aspects can be recalculated taking these `de novo` gene clusters into account:\n\n``` r\ndf <- pagoda.top.aspects(pwpca, clpca, return.table = TRUE, plot = TRUE, z.score = 1.96)\nhead(df)\n```\n\n    ##              name npc   n    score         z     adj.z sh.z adj.sh.z\n    ## 65  geneCluster.8   1 307 3.235994 12.803995 12.496650   NA       NA\n    ## 57     GO:0048699   1 864 2.261079 27.019091 26.894993   NA       NA\n    ## 56     GO:0022008   1 911 2.233116 27.062973 26.913370   NA       NA\n    ## 30     GO:0000226   1 302 1.665067 11.226626 10.989344   NA       NA\n    ## 72 geneCluster.15   1 287 1.642582  6.453696  5.947129   NA       NA\n    ## 55     GO:0007067   1 330 1.626072 11.035135 10.814193   NA       NA\n\n![](figures/pagoda-topPathways2-1.png)\n\nThe gene clusters and their corresponding model expected value and 95% upper bound are shown in green.\n\nVisualize significant aspects of heterogeneity\n----------------------------------------------\n\nTo view top heterogeneity aspects, we will first obtain information on all the significant aspects of transcriptional heterogeneity. 
We will also determine the overall cell clustering based on this full information:\n\n``` r\n# get full info on the top aspects\ntam <- pagoda.top.aspects(pwpca, clpca, n.cells = NULL, z.score = qnorm(0.01/2, lower.tail = FALSE))\n# determine overall cell clustering\nhc <- pagoda.cluster.cells(tam, varinfo)\n```\n\nNext, we will reduce redundant aspects in two steps. First we will combine pathways that are driven by the same sets of genes:\n\n``` r\ntamr <- pagoda.reduce.loading.redundancy(tam, pwpca, clpca)\n```\n\nIn the second step we will combine aspects that show similar patterns (i.e. separate the same sets of cells). Here we will plot the cells using the overall cell clustering determined above:\n\n``` r\ntamr2 <- pagoda.reduce.redundancy(tamr, distance.threshold = 0.9, plot = TRUE, cell.clustering = hc, labRow = NA, labCol = NA, box = TRUE, margins = c(0.5, 0.5), trim = 0)\n```\n\n![](figures/pagoda-correlatedCollapse-1.png)\n\nIn the plot above, the columns are cells, rows are different significant aspects, clustered by their similarity pattern. The green-to-orange color scheme shows low-to-high weighted PCA scores (aspect patterns), where generally orange indicates higher expression. Blocks of color on the left margin show which aspects have been combined by the command above. Here the number of resulting aspects is relatively small. \"top\" argument (i.e. 
top = 10) can be used to limit further analysis to top N aspects.\n\nWe will view the top aspects, clustering them by pattern similarity (note, to view aspects in the order of increasing `lambda1` magnitude, use `row.clustering = NA`).\n\n``` r\ncol.cols <- rbind(groups = cutree(hc, 3))\npagoda.view.aspects(tamr2, cell.clustering = hc, box = TRUE, labCol = NA, margins = c(0.5, 20), col.cols = rbind(l2cols))\n```\n\n![](figures/pagoda-viewAspects-1.png)\n\nWhile each row here represents a cluster of pathways, the row names are assigned to be the top overdispersed aspect in each cluster.\n\nTo interactively browse and explore the output, we can create a `pagoda` app:\n\n``` r\n# compile a browsable app, showing top three clusters with the top color bar\napp <- make.pagoda.app(tamr2, tam, varinfo, go.env, pwpca, clpca, col.cols = col.cols, cell.clustering = hc, title = \"NPCs\")\n# show app in the browser (port 1468)\nshow.app(app, \"pollen\", browse = TRUE, port = 1468) \n```\n\nThe `pagoda` app allows you to view the gene sets grouped within each aspect (row), as well as genes underlying the detected heterogeneity patterns. A screenshot of the app is provided below:\n\n![pagoda app](figures/pagoda-Screen_Shot_2015-06-07_at_4.53.46_PM.png)\n\nSimilar views can be obtained in the R session itself. For instance, here we'll view top 10 genes associated with the top two pathways in the neurogenesis cluster: \"neurogenesis\" (<GO:0022008>) and \"generation of neurons\" (<GO:0048699>)\n\n``` r\npagoda.show.pathways(c(\"GO:0022008\",\"GO:0048699\"), varinfo, go.env, cell.clustering = hc, margins = c(1,5), show.cell.dendrogram = TRUE, showRowLabels = TRUE, showPC = TRUE)\n```\n\n![](figures/pagoda-showTopPathwayGenes-1.png)\n\n### Adding 2D embedding\n\nOne can add a 2D embedding of the cells to aid visualization. 
The code below uses PAGODA's weighted Pearson correlation distance (that was used to derive hierarchical clustering) to generate a tSNE embedding of the cells:\n\n``` r\nlibrary(Rtsne);\n# recalculate clustering distance .. we'll need to specify return.details=T\ncell.clustering <- pagoda.cluster.cells(tam,varinfo,include.aspects=TRUE,verbose=TRUE,return.details=T)\n# fix the seed to ensure reproducible results\nset.seed(0); \ntSNE.pagoda <- Rtsne(cell.clustering$distance,is_distance=T,initial_dims=100,perplexity=10)\npar(mfrow=c(1,1), mar = c(2.5,2.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);\nplot(tSNE.pagoda$Y,col=adjustcolor(col.cols,alpha=0.5),cex=1,pch=19,xlab=\"\",ylab=\"\")\n```\n\n![](figures/pagoda-tSNE-1.png)<!-- -->\n\nThe resulting embedding can be passed into the PAGODA app to visualize individual aspects, genes, etc. within this embedding:\n\n``` r\napp <- make.pagoda.app(tamr2, tam, varinfo, go.env, pwpca, clpca, col.cols = col.cols, cell.clustering = hc, title = \"NPCs\", embedding = tSNE.pagoda$Y)\n# show app in the browser (port 1468)\nshow.app(app, \"pollen\", browse = TRUE, port = 1468) \n```\n\nControlling for undesired aspects of heterogeneity\n--------------------------------------------------\n\nDepending on the biological setting, certain dominant aspects of transcriptional heterogeneity may not be of interest. To explicitly control for these aspects of heterogeneity that are not of interest, we will use `pagoda.subtract.aspect` method that we've previously used to control for residual patterns associated with sequencing depth differences. 
Here, we illustrate how to control for the mitotic cell cycle pattern (<GO:0000280> nuclear division and <GO:0007067> mitotic nuclear division) which showed up as one of the four significant aspects in the analysis above.\n\n``` r\n# get cell cycle signature and view the top genes\ncc.pattern <- pagoda.show.pathways(c(\"GO:0000280\", \"GO:0007067\"), varinfo, go.env, show.cell.dendrogram = TRUE, cell.clustering = hc, showRowLabels = TRUE)\n# subtract the pattern\nvarinfo.cc <- pagoda.subtract.aspect(varinfo, cc.pattern)\n```\n\n![](figures/pagoda-controlForCellCycle-1.png)\n\nNow we can go through the same analysis as shown above, starting with the `pagoda.pathway.wPCA()` call, using `varinfo.cc` instead of `varinfo`, which will control for the cell cycle heterogeneity between the cells.\n"
  },
  {
    "path": "license.txt",
    "content": "Copyright (c) 2015. All Rights Reserved. Created by Jean Fan and Peter Kharchenko. \nHarvard Medical School, Department of Biomedical Informatics (Regents).  \n\nPermission to use, copy, modify, and distribute this software \nand its documentation for educational and research not-for-profit purposes,\nwithout fee and without a signed licensing agreement, is hereby granted,\nprovided that attribution to authors and Regents, as appeared in the paragraph\nabove appears in all copies, modifications, and distributions. Contact The\nauthors for commercial licensing opportunities.\n\nIN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL,\nINCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF\nTHE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN\nADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nREGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO,\nTHE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.\nTHE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED HEREUNDER IS\nPROVIDED \"AS IS\". REGENTS HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT,\nUPDATES, ENHANCEMENTS, OR MODIFICATIONS.\n"
  },
  {
    "path": "man/ViewPagodaApp-class.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\docType{class}\n\\name{ViewPagodaApp-class}\n\\alias{ViewPagodaApp}\n\\alias{ViewPagodaApp-class}\n\\title{A Reference Class to represent the PAGODA application}\n\\description{\nThis ROOK application class enables communication with the client-side ExtJS framework and Inchlib HTML5 canvas libraries to create the graphical user interface for PAGODA\nRefer to the code in \\code{\\link{make.pagoda.app}} for usage example\n}\n\\section{Fields}{\n\n\\describe{\n\\item{\\code{results}}{Output of the pathway clustering and redundancy reduction}\n\n\\item{\\code{genes}}{List of genes to display in the Detailed clustering panel}\n\n\\item{\\code{mat}}{Matrix of posterior mode count estimates}\n\n\\item{\\code{matw}}{Matrix of weights associated with each estimate in \\code{mat}}\n\n\\item{\\code{goenv}}{Gene set list as an environment}\n\n\\item{\\code{renv}}{Global environment}\n\n\\item{\\code{name}}{Name of the application page; for display as the page title}\n\n\\item{\\code{trim}}{Trim quantity used for Winsorization for visualization}\n\n\\item{\\code{batch}}{Any batch or other known confounders to be included in the visualization as a column color track}\n}}\n\n"
  },
  {
    "path": "man/bwpca.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{bwpca}\n\\alias{bwpca}\n\\title{Determine principal components of a matrix using per-observation/per-variable weights}\n\\usage{\nbwpca(mat, matw = NULL, npcs = 2, nstarts = 1, smooth = 0,\n  em.tol = 1e-06, em.maxiter = 25, seed = 1, center = TRUE,\n  n.shuffles = 0)\n}\n\\arguments{\n\\item{mat}{matrix of variables (columns) and observations (rows)}\n\n\\item{matw}{corresponding weights}\n\n\\item{npcs}{number of principal components to extract}\n\n\\item{nstarts}{number of random starts to use}\n\n\\item{smooth}{smoothing span}\n\n\\item{em.tol}{desired EM algorithm tolerance}\n\n\\item{em.maxiter}{maximum number of EM iterations}\n\n\\item{seed}{random seed}\n\n\\item{center}{whether mat should be centered (weighted centering)}\n\n\\item{n.shuffles}{optional number of per-observation randomizations that should be performed in addition to the main calculations to determine the lambda1 (PC1 eigenvalue) magnitude under such randomizations (returned in $randvar)}\n}\n\\value{\na list containing eigenvector matrix ($rotation), projections ($scores), variance (weighted) explained by each component ($var), total (weighted) variance of the dataset ($totalvar)\n}\n\\description{\nImplements a weighted PCA\n}\n\\examples{\nset.seed(0)\nmat <- matrix( c(rnorm(5*10,mean=0,sd=1), rnorm(5*10,mean=5,sd=1)), 10, 10)  # random matrix\nbase.pca <- bwpca(mat)  # non-weighted pca, equal weights set automatically\nmatw <- matrix( c(rnorm(5*10,mean=0,sd=1), rnorm(5*10,mean=5,sd=1)), 10, 10)  # random weight matrix\nmatw <- abs(matw)/max(matw)\nbase.pca.weighted <- bwpca(mat, matw)  # weighted pca\n\n}\n\n"
  },
  {
    "path": "man/clean.counts.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{clean.counts}\n\\alias{clean.counts}\n\\title{Filter counts matrix}\n\\usage{\nclean.counts(counts, min.lib.size = 1800, min.reads = 10,\n  min.detected = 5)\n}\n\\arguments{\n\\item{counts}{read count matrix. The rows correspond to genes, columns correspond to individual cells}\n\n\\item{min.lib.size}{Minimum number of genes detected in a cell. Cells with fewer genes will be removed (default: 1.8e3)}\n\n\\item{min.reads}{Minimum number of reads per gene. Genes with fewer reads will be removed (default: 10)}\n\n\\item{min.detected}{Minimum number of cells a gene must be seen in. Genes not seen in a sufficient number of cells will be removed (default: 5)}\n}\n\\value{\na filtered read count matrix\n}\n\\description{\nFilter counts matrix based on gene and cell requirements\n}\n\\examples{\ndata(pollen)\ndim(pollen)\ncd <- clean.counts(pollen)\ndim(cd)\n\n}\n\n"
  },
  {
    "path": "man/clean.gos.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{clean.gos}\n\\alias{clean.gos}\n\\title{Filter GOs list}\n\\usage{\nclean.gos(go.env, min.size = 5, max.size = 5000, annot = FALSE)\n}\n\\arguments{\n\\item{go.env}{GO or gene set list}\n\n\\item{min.size}{Minimum size for number of genes in a gene set (default: 5)}\n\n\\item{max.size}{Maximum size for number of genes in a gene set (default: 5000)}\n\n\\item{annot}{Whether to append GO annotations for easier interpretation (default: FALSE)}\n}\n\\value{\na filtered GO list\n}\n\\description{\nFilter GOs list and append GO names when appropriate\n}\n\\examples{\n\\donttest{\n# 10 sample GOs\nlibrary(org.Hs.eg.db)\ngo.env <- mget(ls(org.Hs.egGO2ALLEGS)[1:10], org.Hs.egGO2ALLEGS)\n# Filter this list and append names for easier interpretation\ngo.env <- clean.gos(go.env)\n}\n\n}\n\n"
  },
  {
    "path": "man/es.mef.small.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\docType{data}\n\\name{es.mef.small}\n\\alias{es.mef.small}\n\\title{Sample data}\n\\description{\nA subset of the Islam et al. 2011 dataset containing the first 20 ES and 20 MEF cells.\n}\n\\references{\n\\url{http://www.ncbi.nlm.nih.gov/pubmed/21543516}\n}\n\n"
  },
  {
    "path": "man/knn.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\docType{data}\n\\name{knn}\n\\alias{knn}\n\\title{Sample error model}\n\\description{\nSCDE error model generated from the Pollen et al. 2014 dataset.\n}\n\\references{\n\\url{http://www.ncbi.nlm.nih.gov/pubmed/25086649}\n}\n\n"
  },
  {
    "path": "man/knn.error.models.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{knn.error.models}\n\\alias{knn.error.models}\n\\title{Build error models for heterogeneous cell populations, based on K-nearest neighbor cells.}\n\\usage{\nknn.error.models(counts, groups = NULL, k = round(ncol(counts)/2),\n  min.nonfailed = 5, min.count.threshold = 1, save.model.plots = TRUE,\n  max.model.plots = 50, n.cores = parallel::detectCores(),\n  min.size.entries = 2000, min.fpm = 0, cor.method = \"pearson\",\n  verbose = 0, fpm.estimate.trim = 0.25, linear.fit = TRUE,\n  local.theta.fit = linear.fit, theta.fit.range = c(0.01, 100),\n  alpha.weight.power = 1/2)\n}\n\\arguments{\n\\item{counts}{count matrix (integer matrix, rows- genes, columns- cells)}\n\n\\item{groups}{optional groups partitioning known subpopulations}\n\n\\item{k}{number of nearest neighbor cells to use during fitting. If k is set sufficiently high, all of the cells within a given group will be used.}\n\n\\item{min.nonfailed}{minimum number of non-failed measurements (within the k nearest neighbor cells) required for a gene to be taken into account during error fitting procedure}\n\n\\item{min.count.threshold}{minimum number of reads required for a measurement to be considered non-failed}\n\n\\item{save.model.plots}{whether model plots should be saved (file names are (group).models.pdf, or cell.models.pdf if no group was supplied)}\n\n\\item{max.model.plots}{maximum number of models to save plots for (saves time when there are too many cells)}\n\n\\item{n.cores}{number of cores to use through the calculations}\n\n\\item{min.size.entries}{minimum number of genes to use for model fitting}\n\n\\item{min.fpm}{optional parameter to restrict model fitting to genes with group-average expression magnitude above a given value}\n\n\\item{cor.method}{correlation measure to be used in determining k nearest cells}\n\n\\item{verbose}{level of verbosity}\n\n\\item{fpm.estimate.trim}{trim 
fraction to be used in estimating group-average gene expression magnitude for model fitting (0.5 would be median, 0 would turn off trimming)}\n\n\\item{linear.fit}{whether newer linear model fit with zero intercept should be used (T), or the log-fit model published originally (F)}\n\n\\item{local.theta.fit}{whether local theta fitting should be used (only available for the linear fit models)}\n\n\\item{theta.fit.range}{allowed range of the theta values}\n\n\\item{alpha.weight.power}{1/theta weight power used in fitting theta dependency on the expression magnitude}\n}\n\\value{\na data frame with parameters of the fit error models (rows- cells, columns- fitted parameters)\n}\n\\description{\nBuilds cell-specific error models assuming that there are multiple subpopulations present\namong the measured cells. The models for each cell are based on average expression estimates\nobtained from K closest cells within a given group (if groups = NULL, then within the entire\nset of measured cells). The method implements fitting of both the original log-fit models\n(when linear.fit = FALSE), or newer linear-fit models (linear.fit = TRUE, default) with locally\nfit overdispersion coefficient (local.theta.fit = TRUE, default).\n}\n\\examples{\ndata(pollen)\ncd <- clean.counts(pollen)\n\\donttest{\nknn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\n}\n\n}\n\n"
  },
  {
    "path": "man/make.pagoda.app.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{make.pagoda.app}\n\\alias{make.pagoda.app}\n\\title{Make the PAGODA app}\n\\usage{\nmake.pagoda.app(tamr, tam, varinfo, env, pwpca, clpca = NULL,\n  col.cols = NULL, cell.clustering = NULL, row.clustering = NULL,\n  title = \"pathway clustering\", zlim = c(-1, 1) * quantile(tamr$xv, p =\n  0.95))\n}\n\\arguments{\n\\item{tamr}{Combined pathways that show similar expression patterns. Output of \\code{\\link{pagoda.reduce.redundancy}}}\n\n\\item{tam}{Combined pathways that are driven by the same gene sets. Output of \\code{\\link{pagoda.reduce.loading.redundancy}}}\n\n\\item{varinfo}{Variance information. Output of \\code{\\link{pagoda.varnorm}}}\n\n\\item{env}{Gene sets as an environment variable.}\n\n\\item{pwpca}{Weighted PC magnitudes for each gene set provided in the \\code{env}. Output of \\code{\\link{pagoda.pathway.wPCA}}}\n\n\\item{clpca}{Weighted PC magnitudes for de novo gene sets identified by clustering on expression. Output of \\code{\\link{pagoda.gene.clusters}}}\n\n\\item{col.cols}{Matrix of column colors. Useful for visualizing cell annotations such as batch labels. Default NULL.}\n\n\\item{cell.clustering}{Dendrogram of cell clustering. Output of \\code{\\link{pagoda.cluster.cells}}. Default NULL.}\n\n\\item{row.clustering}{Dendrogram of combined pathways clustering. Default NULL.}\n\n\\item{title}{Title text to be used in the browser label for the app. Default, set as 'pathway clustering'}\n\n\\item{zlim}{Range of the normalized gene expression levels, inputted as a list: c(lower_bound, upper_bound). Values outside this range will be Winsorized. Useful for increasing the contrast of the heatmap visualizations. Default, set to the 5th and 95th percentiles.}\n}\n\\value{\nPAGODA app\n}\n\\description{\nCreate an interactive user interface to explore output of PAGODA.\n}\n\n"
  },
  {
    "path": "man/o.ifm.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\docType{data}\n\\name{o.ifm}\n\\alias{o.ifm}\n\\title{Sample error model}\n\\description{\nSCDE error model generated from a subset of the Islam et al. 2011 dataset containing the first 20 ES and 20 MEF cells.\n}\n\\references{\n\\url{http://www.ncbi.nlm.nih.gov/pubmed/21543516}\n}\n\n"
  },
  {
    "path": "man/pagoda.cluster.cells.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{pagoda.cluster.cells}\n\\alias{pagoda.cluster.cells}\n\\title{Determine optimal cell clustering based on the genes driving the significant aspects}\n\\usage{\npagoda.cluster.cells(tam, varinfo, method = \"ward.D\",\n  include.aspects = FALSE, verbose = 0, return.details = FALSE)\n}\n\\arguments{\n\\item{tam}{result of pagoda.top.aspects() call}\n\n\\item{varinfo}{result of pagoda.varnorm() call}\n\n\\item{method}{clustering method ('ward.D' by default)}\n\n\\item{include.aspects}{whether the aspect patterns themselves should be included alongside with the individual genes in calculating cell distance}\n\n\\item{verbose}{0 or 1 depending on level of desired verbosity}\n\n\\item{return.details}{Boolean of whether to return just the hclust result or a list containing the hclust result plus the distance matrix and gene values}\n}\n\\value{\nhclust result\n}\n\\description{\nDetermines cell clustering (hclust result) based on a weighted correlation of genes\nunderlying the top aspects of transcriptional heterogeneity. Branch orientation is optimized\nif 'cba' package is installed.\n}\n\\examples{\ndata(pollen)\ncd <- clean.counts(pollen)\n\\donttest{\nknn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\npwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)\ntam <- pagoda.top.aspects(pwpca, return.table = TRUE, plot=FALSE, z.score=1.96)  # top aspects based on GO only\nhc <- pagoda.cluster.cells(tam, varinfo)\nplot(hc)\n}\n\n}\n\n"
  },
  {
    "path": "man/pagoda.effective.cells.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{pagoda.effective.cells}\n\\alias{pagoda.effective.cells}\n\\title{Estimate effective number of cells based on lambda1 of random gene sets}\n\\usage{\npagoda.effective.cells(pwpca, start = NULL)\n}\n\\arguments{\n\\item{pwpca}{result of the pagoda.pathway.wPCA() call with n.randomizations > 1}\n\n\\item{start}{optional starting value for the optimization (if the NLS breaks, trying high starting values usually fixed the local gradient problem)}\n}\n\\value{\neffective number of cells\n}\n\\description{\nExamines the dependency between the amount of variance explained by the first principal component\nof a gene set and the number of genes in a gene set to determine the effective number of cells\nfor the Tracy-Widom distribution\n}\n\\examples{\ndata(pollen)\ncd <- clean.counts(pollen)\n\\donttest{\nknn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\npwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)\npagoda.effective.cells(pwpca)\n}\n\n}\n\n"
  },
  {
    "path": "man/pagoda.gene.clusters.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{pagoda.gene.clusters}\n\\alias{pagoda.gene.clusters}\n\\title{Determine de-novo gene clusters and associated overdispersion info}\n\\usage{\npagoda.gene.clusters(varinfo, trim = 3.1/ncol(varinfo$mat),\n  n.clusters = 150, n.samples = 60, cor.method = \"p\",\n  n.internal.shuffles = 0, n.starts = 10, n.cores = detectCores(),\n  verbose = 0, plot = FALSE, show.random = FALSE, n.components = 1,\n  method = \"ward.D\", secondary.correlation = FALSE,\n  n.cells = ncol(varinfo$mat), old.results = NULL)\n}\n\\arguments{\n\\item{varinfo}{varinfo adjusted variance info from pagoda.varinfo() (or pagoda.subtract.aspect())}\n\n\\item{trim}{additional Winsorization trim value to be used in determining clusters (to remove clusters that group outliers occurring in a given cell). Use higher values (5-15) if the resulting clusters group outlier patterns}\n\n\\item{n.clusters}{number of clusters to be determined (recommended range is 100-200)}\n\n\\item{n.samples}{number of randomly generated matrix samples to test the background distribution of lambda1 on}\n\n\\item{cor.method}{correlation method (\"pearson\", \"spearman\") to be used as a distance measure for clustering}\n\n\\item{n.internal.shuffles}{number of internal shuffles to perform (only if interested in set coherence, which is quite high for clusters by definition, disabled by default; set to 10-30 shuffles to estimate)}\n\n\\item{n.starts}{number of wPCA EM algorithm starts at each iteration}\n\n\\item{n.cores}{number of cores to use}\n\n\\item{verbose}{verbosity level}\n\n\\item{plot}{whether a plot showing distribution of random lambda1 values should be shown (along with the extreme value distribution fit)}\n\n\\item{show.random}{whether the empirical random gene set values should be shown in addition to the Tracy-Widom analytical approximation}\n\n\\item{n.components}{number of PC to calculate (can be 
increased if the number of clusters is small and some contain strong secondary patterns - rarely the case)}\n\n\\item{method}{clustering method to be used in determining gene clusters}\n\n\\item{secondary.correlation}{whether clustering should be performed on the correlation of the correlation matrix instead}\n\n\\item{n.cells}{number of cells to use for the randomly generated cluster lambda1 model}\n\n\\item{old.results}{optionally, pass old results just to plot the model without recalculating the stats}\n}\n\\value{\na list containing the following fields:\n\\itemize{\n\\item{clusters} {a list of genes in each cluster values}\n\\item{xf} { extreme value distribution fit for the standardized lambda1 of a randomly generated pattern}\n\\item{tci} { index of a top cluster in each random iteration}\n\\item{cl.goc} {weighted PCA info for each real gene cluster}\n\\item{varm} {standardized lambda1 values for each randomly generated matrix cluster}\n\\item{clvlm} {a linear model describing dependency of the cluster lambda1 on a Tracy-Widom lambda1 expectation}\n}\n}\n\\description{\nDetermine de-novo gene clusters, their weighted PCA lambda1 values, and random matrix expectation.\n}\n\\examples{\ndata(pollen)\ncd <- clean.counts(pollen)\n\\donttest{\nknn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\nclpca <- pagoda.gene.clusters(varinfo, trim=7.1/ncol(varinfo$mat), n.clusters=150, n.cores=10, plot=FALSE)\n}\n\n}\n\n"
  },
  {
    "path": "man/pagoda.pathway.wPCA.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{pagoda.pathway.wPCA}\n\\alias{pagoda.pathway.wPCA}\n\\title{Run weighted PCA analysis on pre-annotated gene sets}\n\\usage{\npagoda.pathway.wPCA(varinfo, setenv, n.components = 2,\n  n.cores = detectCores(), min.pathway.size = 10, max.pathway.size = 1000,\n  n.randomizations = 10, n.internal.shuffles = 0, n.starts = 10,\n  center = TRUE, batch.center = TRUE, proper.gene.names = NULL,\n  verbose = 0)\n}\n\\arguments{\n\\item{varinfo}{adjusted variance info from pagoda.varinfo() (or pagoda.subtract.aspect())}\n\n\\item{setenv}{environment listing gene sets (contains variables with names corresponding to gene set name, and values being vectors of gene names within each gene set)}\n\n\\item{n.components}{number of principal components to determine for each gene set}\n\n\\item{n.cores}{number of cores to use}\n\n\\item{min.pathway.size}{minimum number of observed genes that should be contained in a valid gene set}\n\n\\item{max.pathway.size}{maximum number of observed genes in a valid gene set}\n\n\\item{n.randomizations}{number of random gene sets (of the same size) to be evaluated in parallel with each gene set (can be kept at 5 or 10, but should be increased to 50-100 if the significance of pathway overdispersion will be determined relative to random gene set models)}\n\n\\item{n.internal.shuffles}{number of internal (independent row shuffles) randomizations of expression data that should be evaluated for each gene set (needed only if one is interested in gene set coherence P values, disabled by default; set to 10-30 to estimate)}\n\n\\item{n.starts}{number of random starts for the EM method in each evaluation}\n\n\\item{center}{whether the expression matrix should be recentered}\n\n\\item{batch.center}{whether batch-specific centering should be used}\n\n\\item{proper.gene.names}{alternative vector of gene names (replacing rownames(varinfo$mat)) to be 
used in cases when the provided setenv uses different gene names}\n\n\\item{verbose}{verbosity level}\n}\n\\value{\na list of weighted PCA info for each valid gene set\n}\n\\description{\nFor each valid gene set (having appropriate number of genes) in the provided environment (setenv),\nthe method will run weighted PCA analysis, along with analogous analyses of random gene sets of the\nsame size, or shuffled expression magnitudes for the same gene set.\n}\n\\examples{\ndata(pollen)\ncd <- clean.counts(pollen)\n\\donttest{\nknn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\n# create go environment\nlibrary(org.Hs.eg.db)\n# translate gene names to ids\nids <- unlist(lapply(mget(rownames(cd), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1]))\nrids <- names(ids); names(rids) <- ids\ngo.env <- lapply(mget(ls(org.Hs.egGO2ALLEGS), org.Hs.egGO2ALLEGS), function(x) as.character(na.omit(rids[x])))\n# clean GOs\ngo.env <- clean.gos(go.env)\n# convert to an environment\ngo.env <- list2env(go.env)\npwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)\n}\n\n}\n\n"
  },
  {
    "path": "man/pagoda.reduce.loading.redundancy.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{pagoda.reduce.loading.redundancy}\n\\alias{pagoda.reduce.loading.redundancy}\n\\title{Collapse aspects driven by the same combinations of genes}\n\\usage{\npagoda.reduce.loading.redundancy(tam, pwpca, clpca = NULL, plot = FALSE,\n  cluster.method = \"complete\", distance.threshold = 0.01, corr.power = 4,\n  n.cores = detectCores(), abs = TRUE, ...)\n}\n\\arguments{\n\\item{tam}{output of pagoda.top.aspects()}\n\n\\item{pwpca}{output of pagoda.pathway.wPCA()}\n\n\\item{clpca}{output of pagoda.gene.clusters() (optional)}\n\n\\item{plot}{whether to plot the resulting clustering}\n\n\\item{cluster.method}{one of the standard clustering methods to be used (fastcluster::hclust is used if available or stats::hclust)}\n\n\\item{distance.threshold}{similarity threshold for grouping interdependent aspects}\n\n\\item{corr.power}{power to which the product of loading and score correlation is raised}\n\n\\item{n.cores}{number of cores to use during processing}\n\n\\item{abs}{Boolean of whether to use absolute correlation}\n\n\\item{...}{additional arguments are passed to the pagoda.view.aspects() method during plotting}\n}\n\\value{\na list structure analogous to that returned by pagoda.top.aspects(), but with addition of a $cnam element containing a list of aspects summarized by each row of the new (reduced) $xv and $xvw\n}\n\\description{\nExamines PC loading vectors underlying the identified aspects and clusters aspects based\non a product of loading and score correlation (raised to corr.power). 
Clusters of aspects\ndriven by the same genes are determined based on the distance.threshold and collapsed.\n}\n\\examples{\ndata(pollen)\ncd <- clean.counts(pollen)\n\\donttest{\nknn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\npwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)\ntam <- pagoda.top.aspects(pwpca, return.table = TRUE, plot=FALSE, z.score=1.96)  # top aspects based on GO only\ntamr <- pagoda.reduce.loading.redundancy(tam, pwpca)\n}\n\n}\n\n"
  },
  {
    "path": "man/pagoda.reduce.redundancy.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{pagoda.reduce.redundancy}\n\\alias{pagoda.reduce.redundancy}\n\\title{Collapse aspects driven by similar patterns (i.e. separate the same sets of cells)}\n\\usage{\npagoda.reduce.redundancy(tamr, distance.threshold = 0.2,\n  cluster.method = \"complete\", distance = NULL,\n  weighted.correlation = TRUE, plot = FALSE, top = Inf, trim = 0,\n  abs = FALSE, ...)\n}\n\\arguments{\n\\item{tamr}{output of pagoda.reduce.loading.redundancy()}\n\n\\item{distance.threshold}{similarity threshold for grouping interdependent aspects}\n\n\\item{cluster.method}{one of the standard clustering methods to be used (fastcluster::hclust is used if available or stats::hclust)}\n\n\\item{distance}{distance matrix}\n\n\\item{weighted.correlation}{Boolean of whether to use a weighted correlation in determining the similarity of patterns}\n\n\\item{plot}{Boolean of whether to show plot}\n\n\\item{top}{Restrict output to the top n aspects of heterogeneity}\n\n\\item{trim}{Winsorization trim to use prior to determining the top aspects}\n\n\\item{abs}{Boolean of whether to use absolute correlation}\n\n\\item{...}{additional arguments are passed to the pagoda.view.aspects() method during plotting}\n}\n\\value{\na list structure analogous to that returned by pagoda.top.aspects(), but with addition of a $cnam element containing a list of aspects summarized by each row of the new (reduced) $xv and $xvw\n}\n\\description{\nExamines PC loading vectors underlying the identified aspects and clusters aspects based on score correlation. 
Clusters of aspects driven by the same patterns are determined based on the distance.threshold.\n}\n\\examples{\ndata(pollen)\ncd <- clean.counts(pollen)\n\\donttest{\nknn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\npwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)\ntam <- pagoda.top.aspects(pwpca, return.table = TRUE, plot=FALSE, z.score=1.96)  # top aspects based on GO only\ntamr <- pagoda.reduce.loading.redundancy(tam, pwpca)\ntamr2 <- pagoda.reduce.redundancy(tamr, distance.threshold = 0.9, plot = TRUE, labRow = NA, labCol = NA, box = TRUE, margins = c(0.5, 0.5), trim = 0)\n}\n\n}\n\n"
  },
  {
    "path": "man/pagoda.show.pathways.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{pagoda.show.pathways}\n\\alias{pagoda.show.pathways}\n\\title{View pathway or gene weighted PCA}\n\\usage{\npagoda.show.pathways(pathways, varinfo, goenv = NULL, n.genes = 20,\n  two.sided = FALSE, n.pc = rep(1, length(pathways)), colcols = NULL,\n  zlim = NULL, showRowLabels = FALSE, cexCol = 1, cexRow = 1,\n  nstarts = 10, cell.clustering = NULL, show.cell.dendrogram = TRUE,\n  plot = TRUE, box = TRUE, trim = 0, return.details = FALSE, ...)\n}\n\\arguments{\n\\item{pathways}{character vector of pathway or gene names}\n\n\\item{varinfo}{output of pagoda.varnorm()}\n\n\\item{goenv}{environment mapping pathways to genes}\n\n\\item{n.genes}{number of genes to show}\n\n\\item{two.sided}{whether the set of shown genes should be split among highest and lowest loading (T) or if genes with highest absolute loading (F) should be shown}\n\n\\item{n.pc}{optional integer vector giving the number of principal component to show for each listed pathway}\n\n\\item{colcols}{optional column color matrix}\n\n\\item{zlim}{optional z color limit}\n\n\\item{showRowLabels}{controls whether row labels are shown in the plot}\n\n\\item{cexCol}{column label size (cex)}\n\n\\item{cexRow}{row label size (cex)}\n\n\\item{nstarts}{number of random starts for the wPCA}\n\n\\item{cell.clustering}{cell clustering}\n\n\\item{show.cell.dendrogram}{whether cell dendrogram should be shown}\n\n\\item{plot}{whether the plot should be shown}\n\n\\item{box}{whether to draw a box around the plotted matrix}\n\n\\item{trim}{optional Winsorization trim that should be applied}\n\n\\item{return.details}{whether the function should return the matrix as well as full PCA info instead of just PC1 vector}\n\n\\item{...}{additional arguments are passed to the \\code{c.view.pathways}}\n}\n\\value{\ncell scores along the first principal component of shown genes (returned as 
invisible)\n}\n\\description{\nTakes in a list of pathways (or a list of genes), runs weighted PCA, optionally showing the result.\n}\n\n"
  },
  {
    "path": "man/pagoda.subtract.aspect.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{pagoda.subtract.aspect}\n\\alias{pagoda.subtract.aspect}\n\\title{Control for a particular aspect of expression heterogeneity in a given population}\n\\usage{\npagoda.subtract.aspect(varinfo, aspect, center = TRUE)\n}\n\\arguments{\n\\item{varinfo}{normalized variance info (from pagoda.varnorm())}\n\n\\item{aspect}{a vector giving a cell-to-cell variation pattern that should be controlled for (length should be corresponding to ncol(varinfo$mat))}\n\n\\item{center}{whether the matrix should be re-centered following pattern subtraction}\n}\n\\value{\na modified varinfo object with adjusted expression matrix (varinfo$mat)\n}\n\\description{\nSimilar to subtracting n-th principal component, the current procedure determines\n(weighted) projection of the expression matrix onto a specified aspect (some pattern\nacross cells, for instance sequencing depth, or PC corresponding to an undesired process\nsuch as ribosomal pathway variation) and subtracts it from the data so that it is controlled\nfor in the subsequent weighted PCA analysis.\n}\n\\examples{\ndata(pollen)\ncd <- clean.counts(pollen)\n\\donttest{\nknn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\n# create go environment\nlibrary(org.Hs.eg.db)\n# translate gene names to ids\nids <- unlist(lapply(mget(rownames(cd), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1]))\nrids <- names(ids); names(rids) <- ids\ngo.env <- lapply(mget(ls(org.Hs.egGO2ALLEGS), org.Hs.egGO2ALLEGS), function(x) as.character(na.omit(rids[x])))\n# clean GOs\ngo.env <- clean.gos(go.env)\n# convert to an environment\ngo.env <- list2env(go.env)\n# subtract the pattern\ncc.pattern <- pagoda.show.pathways(ls(go.env)[1:2], varinfo, go.env, show.cell.dendrogram = 
TRUE, showRowLabels = TRUE)  # Look at pattern from 2 GO annotations\nvarinfo.cc <- pagoda.subtract.aspect(varinfo, cc.pattern)\n}\n\n}\n\n"
  },
  {
    "path": "man/pagoda.top.aspects.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{pagoda.top.aspects}\n\\alias{pagoda.top.aspects}\n\\title{Score statistical significance of gene set and cluster overdispersion}\n\\usage{\npagoda.top.aspects(pwpca, clpca = NULL, n.cells = NULL,\n  z.score = qnorm(0.05/2, lower.tail = FALSE), return.table = FALSE,\n  return.genes = FALSE, plot = FALSE, adjust.scores = TRUE,\n  score.alpha = 0.05, use.oe.scale = FALSE, effective.cells.start = NULL)\n}\n\\arguments{\n\\item{pwpca}{output of pagoda.pathway.wPCA()}\n\n\\item{clpca}{output of pagoda.gene.clusters() (optional)}\n\n\\item{n.cells}{effective number of cells (if not provided, will be determined using pagoda.effective.cells())}\n\n\\item{z.score}{Z score to be used as a cutoff for statistically significant patterns (defaults to 0.05 P-value}\n\n\\item{return.table}{whether a text table showing}\n\n\\item{return.genes}{whether a set of genes driving significant aspects should be returned}\n\n\\item{plot}{whether to plot the cv/n vs. 
dataset size scatter showing significance models}\n\n\\item{adjust.scores}{whether the normalization of the aspect patterns should be based on the adjusted Z scores - qnorm(0.05/2, lower.tail = FALSE)}\n\n\\item{score.alpha}{significance level of the confidence interval for determining upper/lower bounds}\n\n\\item{use.oe.scale}{whether the variance of the returned aspect patterns should be normalized using observed/expected value instead of the default chi-squared derived variance corresponding to overdispersion Z score}\n\n\\item{effective.cells.start}{starting value for the pagoda.effective.cells() call}\n}\n\\value{\nif return.table = FALSE and return.genes = FALSE (default) returns a list structure containing the following items:\n\\itemize{\n\\item{xv} {a matrix of normalized aspect patterns (rows- significant aspects, columns- cells}\n\\item{xvw} { corresponding weight matrix }\n\\item{gw} { set of genes driving the significant aspects }\n\\item{df} { text table with the significance testing results }\n}\n}\n\\description{\nEvaluates statistical significance of the gene set and cluster lambda1 values, returning\neither a text table of Z scores, etc, a structure containing normalized values of significant\naspects, or a set of genes underlying the significant aspects.\n}\n\\examples{\ndata(pollen)\ncd <- clean.counts(pollen)\n\\donttest{\nknn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\npwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)\ntam <- pagoda.top.aspects(pwpca, return.table = TRUE, plot=FALSE, z.score=1.96)  # top aspects based on GO only\n}\n\n}\n\n"
  },
  {
    "path": "man/pagoda.varnorm.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{pagoda.varnorm}\n\\alias{pagoda.varnorm}\n\\title{Normalize gene expression variance relative to transcriptome-wide expectations}\n\\usage{\npagoda.varnorm(models, counts, batch = NULL, trim = 0, prior = NULL,\n  fit.genes = NULL, plot = TRUE, minimize.underdispersion = FALSE,\n  n.cores = detectCores(), n.randomizations = 100, weight.k = 0.9,\n  verbose = 0, weight.df.power = 1, smooth.df = -1, max.adj.var = 10,\n  theta.range = c(0.01, 100), gene.length = NULL)\n}\n\\arguments{\n\\item{models}{model matrix (select a subset of rows to normalize variance within a subset of cells)}\n\n\\item{counts}{read count matrix}\n\n\\item{batch}{measurement batch (optional)}\n\n\\item{trim}{trim value for Winsorization (optional, can be set to 1-3 to reduce the impact of outliers, can be as large as 5 or 10 for datasets with several thousand cells)}\n\n\\item{prior}{expression magnitude prior}\n\n\\item{fit.genes}{a vector of gene names which should be used to establish the variance fit (default is NULL: use all genes). 
This can be used to specify, for instance, a set spike-in control transcripts such as ERCC.}\n\n\\item{plot}{whether to plot the results}\n\n\\item{minimize.underdispersion}{whether underdispersion should be minimized (can increase sensitivity in datasets with high complexity of population, however cannot be effectively used in datasets where multiple batches are present)}\n\n\\item{n.cores}{number of cores to use}\n\n\\item{n.randomizations}{number of bootstrap sampling rounds to use in estimating average expression magnitude for each gene within the given set of cells}\n\n\\item{weight.k}{k value to use in the final weight matrix}\n\n\\item{verbose}{verbosity level}\n\n\\item{weight.df.power}{power factor to use in determining effective number of degrees of freedom (can be increased for datasets exhibiting particularly high levels of noise at low expression magnitudes)}\n\n\\item{smooth.df}{degrees of freedom to be used in calculating smoothed local regression between coefficient of variation and expression magnitude (and gene length, if provided). 
Leave at -1 for automated guess.}\n\n\\item{max.adj.var}{maximum value allowed for the estimated adjusted variance (capping of adjusted variance is recommended when scoring pathway overdispersion relative to randomly sampled gene sets)}\n\n\\item{theta.range}{valid theta range (should be the same as was set in knn.error.models() call}\n\n\\item{gene.length}{optional vector of gene lengths (corresponding to the rows of counts matrix)}\n}\n\\value{\na list containing the following fields:\n\\itemize{\n\\item{mat} {adjusted expression magnitude values}\n\\item{matw} { weight matrix corresponding to the expression matrix}\n\\item{arv} { a vector giving adjusted variance values for each gene}\n\\item{avmodes} {a vector estimated average expression magnitudes for each gene}\n\\item{modes} {a list of batch-specific average expression magnitudes for each gene}\n\\item{prior} {estimated (or supplied) expression magnitude prior}\n\\item{edf} { estimated effective degrees of freedom}\n\\item{fit.genes} { fit.genes parameter }\n}\n}\n\\description{\nNormalizes gene expression magnitudes to ensure that the variance follows chi-squared statistics\nwith respect to its ratio to the transcriptome-wide expectation as determined by local regression\non expression magnitude (and optionally gene length). Corrects for batch effects.\n}\n\\examples{\ndata(pollen)\ncd <- clean.counts(pollen)\n\\donttest{\nknn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\n}\n\n}\n\n"
  },
  {
    "path": "man/pagoda.view.aspects.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{pagoda.view.aspects}\n\\alias{pagoda.view.aspects}\n\\title{View PAGODA output}\n\\usage{\npagoda.view.aspects(tamr, row.clustering = hclust(dist(tamr$xv)), top = Inf,\n  ...)\n}\n\\arguments{\n\\item{tamr}{Combined pathways that show similar expression patterns. Output of \\code{\\link{pagoda.reduce.redundancy}}}\n\n\\item{row.clustering}{Dendrogram of combined pathways clustering}\n\n\\item{top}{Restrict output to the top n aspects of heterogeneity}\n\n\\item{...}{additional arguments are passed to the \\code{\\link{view.aspects}} method during plotting}\n}\n\\value{\nPAGODA heatmap\n}\n\\description{\nCreate static image of PAGODA output visualizing cell hierarchy and top aspects of transcriptional heterogeneity\n}\n\\examples{\ndata(pollen)\ncd <- clean.counts(pollen)\n\\donttest{\nknn <- knn.error.models(cd, k=ncol(cd)/4, n.cores=10, min.count.threshold=2, min.nonfailed=5, max.model.plots=10)\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = FALSE)\npwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components=1, n.cores=10, n.internal.shuffles=50)\ntam <- pagoda.top.aspects(pwpca, return.table = TRUE, plot=FALSE, z.score=1.96)  # top aspects based on GO only\npagoda.view.aspects(tam)\n}\n\n}\n\n"
  },
  {
    "path": "man/papply.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{papply}\n\\alias{papply}\n\\title{wrapper around different mclapply mechanisms}\n\\usage{\npapply(..., n.cores = n)\n}\n\\arguments{\n\\item{...}{parameters to pass to lapply, mclapply, bplapply, etc.}\n\n\\item{n.cores}{number of cores. If 1 core is requested, will default to lapply}\n}\n\\description{\nAbstracts out mclapply implementation, and defaults to lapply when only one core is requested (helps with debugging)\n}\n\n"
  },
  {
    "path": "man/pollen.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\docType{data}\n\\name{pollen}\n\\alias{pollen}\n\\title{Sample data}\n\\description{\nSingle cell data from Pollen et al. 2014 dataset.\n}\n\\references{\n\\url{www.ncbi.nlm.nih.gov/pubmed/25086649}\n}\n\n"
  },
  {
    "path": "man/scde.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\docType{package}\n\\name{scde}\n\\alias{scde}\n\\alias{scde-package}\n\\title{Single-cell Differential Expression (with Pathway And Gene set Overdispersion Analysis)}\n\\description{\nThe scde package implements a set of statistical methods for analyzing single-cell RNA-seq data.\nscde fits individual error models for single-cell RNA-seq measurements. These models can then be used for\nassessment of differential expression between groups of cells, as well as other types of analysis.\nThe scde package also contains the pagoda framework which applies pathway and gene set overdispersion analysis\nto identify and characterize putative cell subpopulations based on transcriptional signatures.\nSee vignette(\"diffexp\") for a brief tutorial on differential expression analysis.\nSee vignette(\"pagoda\") for a brief tutorial on pathway and gene set overdispersion analysis to identify and characterize cell subpopulations.\nMore extensive tutorials are available at \\url{http://pklab.med.harvard.edu/scde/index.html}.\n (test)\n}\n\\author{\nPeter Kharchenko \\email{Peter_Kharchenko@hms.harvard.edu}\n\nJean Fan \\email{jeanfan@fas.harvard.edu}\n}\n\n"
  },
  {
    "path": "man/scde.browse.diffexp.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{scde.browse.diffexp}\n\\alias{scde.browse.diffexp}\n\\title{View differential expression results in a browser}\n\\usage{\nscde.browse.diffexp(results, models, counts, prior, groups = NULL,\n  batch = NULL, geneLookupURL = NULL, server = NULL, name = \"scde\",\n  port = NULL)\n}\n\\arguments{\n\\item{results}{result object returned by \\code{scde.expression.difference()}. Note to browse group posterior levels, use \\code{return.posteriors = TRUE} in the \\code{scde.expression.difference()} call.}\n\n\\item{models}{model matrix}\n\n\\item{counts}{count matrix}\n\n\\item{prior}{prior}\n\n\\item{groups}{group information}\n\n\\item{batch}{batch information}\n\n\\item{geneLookupURL}{The URL that will be used to construct links to view more information on gene names. By default (if can't guess the organism) the links will forward to ENSEMBL site search, using \\code{geneLookupURL = \"http://useast.ensembl.org/Multi/Search/Results?q = {0}\"}. The \"{0}\" in the end will be substituted with the gene name. For instance, to link to GeneCards, use \\code{\"http://www.genecards.org/cgi-bin/carddisp.pl?gene = {0}\"}.}\n\n\\item{server}{optional previously returned instance of the server, if want to reuse it.}\n\n\\item{name}{app name (needs to be altered only if adding more than one app to the server using \\code{server} parameter)}\n\n\\item{port}{Interactive browser port}\n}\n\\value{\nserver instance, on which $stop() function can be called to kill the process.\n}\n\\description{\nLaunches a browser app that shows the differential expression results, allowing to sort, filter, etc.\nThe arguments generally correspond to the \\code{scde.expression.difference()} call, except that the results of that call are also passed here. 
Requires \\code{Rook} and \\code{rjson} packages to be installed.\n}\n\\examples{\ndata(es.mef.small)\ncd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\nsg <- factor(gsub(\"(MEF|ESC).*\", \"\\\\\\\\1\", colnames(cd)), levels = c(\"ESC\", \"MEF\"))\nnames(sg) <- colnames(cd)\n\\donttest{\no.ifm <- scde.error.models(counts = cd, groups = sg, n.cores = 10, threshold.segmentation = TRUE)\no.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n# make sure groups corresponds to the models (o.ifm)\ngroups <- factor(gsub(\"(MEF|ESC).*\", \"\\\\\\\\1\", rownames(o.ifm)), levels = c(\"ESC\", \"MEF\"))\nnames(groups) <- row.names(o.ifm)\nediff <- scde.expression.difference(o.ifm, cd, o.prior, groups = groups, n.randomizations = 100, n.cores = 10, verbose = 1)\nscde.browse.diffexp(ediff, o.ifm, cd, o.prior, groups = groups, geneLookupURL=\"http://www.informatics.jax.org/searchtool/Search.do?query={0}\")  # creates browser\n}\n\n}\n\n"
  },
  {
    "path": "man/scde.edff.Rd",
    "content": "% Generated by roxygen2 (5.0.0): do not edit by hand\n% Please edit documentation in R/functions.R\n\\docType{data}\n\\name{scde.edff}\n\\alias{scde.edff}\n\\title{Internal model data}\n\\description{\nNumerically-derived correction for NB->chi squared approximation stored as an local regression model\n}\n\n"
  },
  {
    "path": "man/scde.error.models.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{scde.error.models}\n\\alias{scde.error.models}\n\\title{Fit single-cell error/regression models}\n\\usage{\nscde.error.models(counts, groups = NULL, min.nonfailed = 3,\n  threshold.segmentation = TRUE, min.count.threshold = 4,\n  zero.count.threshold = min.count.threshold, zero.lambda = 0.1,\n  save.crossfit.plots = FALSE, save.model.plots = TRUE, n.cores = 12,\n  min.size.entries = 2000, max.pairs = 5000, min.pairs.per.cell = 10,\n  verbose = 0, linear.fit = TRUE, local.theta.fit = linear.fit,\n  theta.fit.range = c(0.01, 100))\n}\n\\arguments{\n\\item{counts}{read count matrix. The rows correspond to genes (should be named), columns correspond to individual cells. The matrix should contain integer counts}\n\n\\item{groups}{an optional factor describing grouping of different cells. If provided, the cross-fits and the expected expression magnitudes will be determined separately within each group. 
The factor should have the same length as ncol(counts).}\n\n\\item{min.nonfailed}{minimal number of non-failed observations required for a gene to be used in the final model fitting}\n\n\\item{threshold.segmentation}{use a fast threshold-based segmentation during cross-fit (default: TRUE)}\n\n\\item{min.count.threshold}{the number of reads to use to guess which genes may have \"failed\" to be detected in a given measurement during cross-cell comparison (default: 4)}\n\n\\item{zero.count.threshold}{threshold to guess the initial value (failed/non-failed) during error model fitting procedure (defaults to the min.count.threshold value)}\n\n\\item{zero.lambda}{the rate of the Poisson (failure) component (default: 0.1)}\n\n\\item{save.crossfit.plots}{whether png files showing cross-fit segmentations should be written out (default: FALSE)}\n\n\\item{save.model.plots}{whether pdf files showing model fits should be written out (default = TRUE)}\n\n\\item{n.cores}{number of cores to use}\n\n\\item{min.size.entries}{minimum number of genes to use when determining expected expression magnitude during model fitting}\n\n\\item{max.pairs}{maximum number of cross-fit comparisons that should be performed per group (default: 5000)}\n\n\\item{min.pairs.per.cell}{minimum number of pairs that each cell should be cross-compared with}\n\n\\item{verbose}{1 for increased output}\n\n\\item{linear.fit}{Boolean of whether to use a linear fit in the regression (default: TRUE).}\n\n\\item{local.theta.fit}{Boolean of whether to fit the overdispersion parameter theta, ie. the negative binomial size parameter, based on local regression (default: set to be equal to the linear.fit parameter)}\n\n\\item{theta.fit.range}{Range of valid values for the overdispersion parameter theta, ie. 
the negative binomial size parameter (default: c(1e-2, 1e2))}\n}\n\\value{\na model matrix, with rows corresponding to different cells, and columns representing different parameters of the determined models\n}\n\\description{\nFit error models given a set of single-cell data (counts) and an optional grouping factor (groups). The cells (within each group) are first cross-compared to determine a subset of genes showing consistent expression. The set of genes is then used to fit a mixture model (Poisson-NB mixture, with expression-dependent concomitant).\n}\n\\details{\nNote: the default implementation has been changed to use linear-scale fit with expression-dependent NB size (overdispersion) fit. This represents an iterative improvement on the originally published model. Use linear.fit=F to revert back to the original fitting procedure.\n}\n\\examples{\ndata(es.mef.small)\ncd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\nsg <- factor(gsub(\"(MEF|ESC).*\", \"\\\\\\\\1\", colnames(cd)), levels = c(\"ESC\", \"MEF\"))\nnames(sg) <- colnames(cd)\n\\donttest{\no.ifm <- scde.error.models(counts = cd, groups = sg, n.cores = 10, threshold.segmentation = TRUE)\n}\n\n}\n\n"
  },
  {
    "path": "man/scde.expression.difference.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{scde.expression.difference}\n\\alias{scde.expression.difference}\n\\title{Test for expression differences between two sets of cells}\n\\usage{\nscde.expression.difference(models, counts, prior, groups = NULL,\n  batch = NULL, n.randomizations = 150, n.cores = 10,\n  batch.models = models, return.posteriors = FALSE, verbose = 0)\n}\n\\arguments{\n\\item{models}{models determined by \\code{\\link{scde.error.models}}}\n\n\\item{counts}{read count matrix}\n\n\\item{prior}{gene expression prior as determined by \\code{\\link{scde.expression.prior}}}\n\n\\item{groups}{a factor determining the two groups of cells being compared. The factor entries should correspond to the rows of the model matrix. The factor should have two levels. NAs are allowed (cells will be omitted from comparison).}\n\n\\item{batch}{a factor (corresponding to rows of the model matrix) specifying batch assignment of each cell, to perform batch correction}\n\n\\item{n.randomizations}{number of bootstrap randomizations to be performed}\n\n\\item{n.cores}{number of cores to utilize}\n\n\\item{batch.models}{(optional) separate models for the batch data (if generated using batch-specific group argument). 
Normally the same models are used.}\n\n\\item{return.posteriors}{whether joint posterior matrices should be returned}\n\n\\item{verbose}{integer verbose level (1 for verbose)}\n}\n\\value{\n\\subsection{default}{\na data frame with the following fields:\n\\itemize{\n\\item{lb, mle, ub} {lower bound, maximum likelihood estimate, and upper bound of the 95% confidence interval for the expression fold change on log2 scale.}\n\\item{ce} { conservative estimate of expression-fold change (equals to the min(abs(c(lb, ub))), or 0 if the CI crosses the 0}\n\\item{Z} { uncorrected Z-score of expression difference}\n\\item{cZ} {expression difference Z-score corrected for multiple hypothesis testing using Holm procedure}\n}\n If batch correction has been performed (\\code{batch} has been supplied), analogous data frames are returned in slots \\code{$batch.adjusted} for batch-corrected results, and \\code{$batch.effect} for the differences explained by batch effects alone.\n}}\n\\subsection{return.posteriors = TRUE}{\nA list is returned, with the default results data frame given in the \\code{$results} slot.\n\\code{difference.posterior} returns a matrix of estimated expression difference posteriors (rows - genes, columns correspond to different magnitudes of fold-change - log2 values are given in the column names)\n\\code{joint.posteriors} a list of two joint posterior matrices (rows - genes, columns correspond to the expression levels, given by prior$x grid)\n}\n}\n\\description{\nUse the individual cell error models to test for differential expression between two groups of cells.\n}\n\\examples{\ndata(es.mef.small)\ncd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\nsg <- factor(gsub(\"(MEF|ESC).*\", \"\\\\\\\\1\", colnames(cd)), levels = c(\"ESC\", \"MEF\"))\nnames(sg) <- colnames(cd)\n\\donttest{\no.ifm <- scde.error.models(counts = cd, groups = sg, n.cores = 10, threshold.segmentation = TRUE)\no.prior <- scde.expression.prior(models = 
o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n# make sure groups corresponds to the models (o.ifm)\ngroups <- factor(gsub(\"(MEF|ESC).*\", \"\\\\\\\\1\", rownames(o.ifm)), levels = c(\"ESC\", \"MEF\"))\nnames(groups) <- row.names(o.ifm)\nediff <- scde.expression.difference(o.ifm, cd, o.prior, groups = groups, n.randomizations = 100, n.cores = 10, verbose = 1)\n}\n\n}\n\n"
  },
  {
    "path": "man/scde.expression.magnitude.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{scde.expression.magnitude}\n\\alias{scde.expression.magnitude}\n\\title{Return scaled expression magnitude estimates}\n\\usage{\nscde.expression.magnitude(models, counts)\n}\n\\arguments{\n\\item{models}{models determined by \\code{\\link{scde.error.models}}}\n\n\\item{counts}{count matrix}\n}\n\\value{\na matrix of expression magnitudes on a log scale (rows - genes, columns - cells)\n}\n\\description{\nReturn point estimates of expression magnitudes of each gene across a set of cells, based on the regression slopes determined during the model fitting procedure.\n}\n\\examples{\ndata(es.mef.small)\ncd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\ndata(o.ifm)  # Load precomputed model. Use ?scde.error.models to see how o.ifm was generated\n# get expression magnitude estimates\nlfpm <- scde.expression.magnitude(o.ifm, cd)\n\n}\n\n"
  },
  {
    "path": "man/scde.expression.prior.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{scde.expression.prior}\n\\alias{scde.expression.prior}\n\\title{Estimate prior distribution for gene expression magnitudes}\n\\usage{\nscde.expression.prior(models, counts, length.out = 400, show.plot = FALSE,\n  pseudo.count = 1, bw = 0.1, max.quantile = 1 - 0.001,\n  max.value = NULL)\n}\n\\arguments{\n\\item{models}{models determined by \\code{\\link{scde.error.models}}}\n\n\\item{counts}{count matrix}\n\n\\item{length.out}{number of points (resolution) of the expression magnitude grid (default: 400). Note: larger numbers will linearly increase memory/CPU demands.}\n\n\\item{show.plot}{show the estimate posterior}\n\n\\item{pseudo.count}{pseudo-count value to use (default 1)}\n\n\\item{bw}{smoothing bandwidth to use in estimating the prior (default: 0.1)}\n\n\\item{max.quantile}{determine the maximum expression magnitude based on a quantile (default : 0.999)}\n\n\\item{max.value}{alternatively, specify the exact maximum expression magnitude value}\n}\n\\value{\na structure describing expression magnitude grid ($x, on log10 scale) and prior ($y)\n}\n\\description{\nUse existing count data to determine a prior distribution of genes in the dataset\n}\n\\examples{\ndata(es.mef.small)\ncd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\ndata(o.ifm)  # Load precomputed model. Use ?scde.error.models to see how o.ifm was generated\no.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n\n}\n\n"
  },
  {
    "path": "man/scde.failure.probability.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{scde.failure.probability}\n\\alias{scde.failure.probability}\n\\title{Calculate drop-out probabilities given a set of counts or expression magnitudes}\n\\usage{\nscde.failure.probability(models, magnitudes = NULL, counts = NULL)\n}\n\\arguments{\n\\item{models}{models determined by \\code{\\link{scde.error.models}}}\n\n\\item{magnitudes}{a vector (\\code{length(counts) == nrows(models)}) or a matrix (columns correspond to cells) of expression magnitudes, given on a log scale}\n\n\\item{counts}{a vector (\\code{length(counts) == nrows(models)}) or a matrix (columns correspond to cells) of read counts from which the expression magnitude should be estimated}\n}\n\\value{\na vector or a matrix of drop-out probabilities\n}\n\\description{\nReturns estimated drop-out probability for each cell (row of \\code{models} matrix), given either an expression magnitude\n}\n\\examples{\ndata(es.mef.small)\ncd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\ndata(o.ifm)  # Load precomputed model. Use ?scde.error.models to see how o.ifm was generated\no.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n# calculate probability of observing a drop out at a given set of magnitudes in different cells\nmags <- c(1.0, 1.5, 2.0)\np <- scde.failure.probability(o.ifm, magnitudes = mags)\n# calculate probability of observing the dropout at a magnitude corresponding to the\n# number of reads actually observed in each cell\nself.p <- scde.failure.probability(o.ifm, counts = cd)\n\n}\n\n"
  },
  {
    "path": "man/scde.fit.models.to.reference.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{scde.fit.models.to.reference}\n\\alias{scde.fit.models.to.reference}\n\\title{Fit scde models relative to provided set of expression magnitudes}\n\\usage{\nscde.fit.models.to.reference(counts, reference, n.cores = 10,\n  zero.count.threshold = 1, nrep = 1, save.plots = FALSE,\n  plot.filename = \"reference.model.fits.pdf\", verbose = 0, min.fpm = 1)\n}\n\\arguments{\n\\item{counts}{count matrix}\n\n\\item{reference}{a vector of expression magnitudes (read counts) corresponding to the rows of the count matrix}\n\n\\item{n.cores}{number of cores to use}\n\n\\item{zero.count.threshold}{read count to use as an initial guess for the zero threshold}\n\n\\item{nrep}{number independent of mixture fit iterations to try (default = 1)}\n\n\\item{save.plots}{whether to write out a pdf file showing the model fits}\n\n\\item{plot.filename}{model fit pdf filename}\n\n\\item{verbose}{verbose level}\n\n\\item{min.fpm}{minimum reference fpm of genes that will be used to fit the models (defaults to 1). Note: fpm is calculated from the reference count vector as reference/sum(reference)*1e6}\n}\n\\value{\nmatrix of scde models\n}\n\\description{\nIf group-average expression magnitudes are available (e.g. 
from bulk measurement), this method can be used\nto fit individual cell error models relative to that reference\n}\n\\examples{\ndata(es.mef.small)\ncd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\n\\donttest{\no.ifm <- scde.error.models(counts = cd, groups = sg, n.cores = 10, threshold.segmentation = TRUE)\no.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n# calculate joint posteriors across all cells\njp <- scde.posteriors(models = o.ifm, cd, o.prior, n.cores = 10, return.individual.posterior.modes = TRUE, n.randomizations = 100)\n# use expected expression magnitude for each gene\nav.mag <- as.numeric(jp$jp \\%*\\% as.numeric(colnames(jp$jp)))\n# translate into counts\nav.mag.counts <- as.integer(round(av.mag))\n# now, fit alternative models using av.mag as a reference (normally this would correspond to bulk RNA expression magnitude)\nref.models <- scde.fit.models.to.reference(cd, av.mag.counts, n.cores = 1)\n}\n\n}\n\n"
  },
  {
    "path": "man/scde.posteriors.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{scde.posteriors}\n\\alias{scde.posteriors}\n\\title{Calculate joint expression magnitude posteriors across a set of cells}\n\\usage{\nscde.posteriors(models, counts, prior, n.randomizations = 100, batch = NULL,\n  composition = NULL, return.individual.posteriors = FALSE,\n  return.individual.posterior.modes = FALSE, ensemble.posterior = FALSE,\n  n.cores = 20)\n}\n\\arguments{\n\\item{models}{models models determined by \\code{\\link{scde.error.models}}}\n\n\\item{counts}{read count matrix}\n\n\\item{prior}{gene expression prior as determined by \\code{\\link{scde.expression.prior}}}\n\n\\item{n.randomizations}{number of bootstrap iterations to perform}\n\n\\item{batch}{a factor describing which batch group each cell (i.e. each row of \\code{models} matrix) belongs to}\n\n\\item{composition}{a vector describing the batch composition of a group to be sampled}\n\n\\item{return.individual.posteriors}{whether expression posteriors of each cell should be returned}\n\n\\item{return.individual.posterior.modes}{whether modes of expression posteriors of each cell should be returned}\n\n\\item{ensemble.posterior}{Boolean of whether to calculate the ensemble posterior (sum of individual posteriors) instead of a joint (product) posterior. (default: FALSE)}\n\n\\item{n.cores}{number of cores to utilize}\n}\n\\value{\n\\subsection{default}{ a posterior probability matrix, with rows corresponding to genes, and columns to expression levels (as defined by \\code{prior$x})\n}\n\\subsection{return.individual.posterior.modes}{ a list is returned, with the \\code{$jp} slot giving the joint posterior matrix, as described above. 
The \\code{$modes} slot gives a matrix of individual expression posterior mode values on log scale (rows - genes, columns -cells)}\n\\subsection{return.individual.posteriors}{ a list is returned, with the \\code{$post} slot giving a list of individual posterior matrices, in a form analogous to the joint posterior matrix, but reported on log scale }\n}\n\\description{\nCalculates expression magnitude posteriors for the individual cells, and then uses bootstrap resampling to calculate a joint expression posterior for all the specified cells. Alternatively during batch-effect correction procedure, the joint posterior can be calculated for a random composition of cells of different groups (see \\code{batch} and \\code{composition} parameters).\n}\n\\examples{\ndata(es.mef.small)\ncd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\ndata(o.ifm)  # Load precomputed model. Use ?scde.error.models to see how o.ifm was generated\no.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n# calculate joint posteriors\njp <- scde.posteriors(o.ifm, cd, o.prior, n.cores = 1)\n\n}\n\n"
  },
  {
    "path": "man/scde.test.gene.expression.difference.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{scde.test.gene.expression.difference}\n\\alias{scde.test.gene.expression.difference}\n\\title{Test differential expression and plot posteriors for a particular gene}\n\\usage{\nscde.test.gene.expression.difference(gene, models, counts, prior,\n  groups = NULL, batch = NULL, batch.models = models,\n  n.randomizations = 1000, show.plots = TRUE, return.details = FALSE,\n  verbose = FALSE, ratio.range = NULL, show.individual.posteriors = TRUE,\n  n.cores = 1)\n}\n\\arguments{\n\\item{gene}{name of the gene to be tested}\n\n\\item{models}{models}\n\n\\item{counts}{read count matrix (must contain the row corresponding to the specified gene)}\n\n\\item{prior}{expression magnitude prior}\n\n\\item{groups}{a two-level factor specifying between which cells (rows of the models matrix) the comparison should be made}\n\n\\item{batch}{optional multi-level factor assigning the cells (rows of the model matrix) to different batches that should be controlled for (e.g. two or more biological replicates). The expression difference estimate will then take into account the likely difference between the two groups that is explained solely by their difference in batch composition. 
Not all batch configuration may be corrected this way.}\n\n\\item{batch.models}{optional set of models for batch comparison (typically the same as models, but can be more extensive, or recalculated within each batch)}\n\n\\item{n.randomizations}{number of bootstrap/sampling iterations that should be performed}\n\n\\item{show.plots}{whether the plots should be shown}\n\n\\item{return.details}{whether the posterior should be returned}\n\n\\item{verbose}{set to T for some status output}\n\n\\item{ratio.range}{optionally specifies the range of the log2 expression ratio plot}\n\n\\item{show.individual.posteriors}{whether the individual cell expression posteriors should be plotted}\n\n\\item{n.cores}{number of cores to use (default = 1)}\n}\n\\value{\nby default returns MLE of log2 expression difference, 95% CI (upper, lower bound), and a Z-score testing for expression difference. If return.details = TRUE, a list is returned containing the above structure, as well as the expression fold difference posterior itself.\n}\n\\description{\nThe function performs differential expression test and optionally plots posteriors for a specified gene.\n}\n\\examples{\ndata(es.mef.small)\ncd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\ndata(o.ifm)  # Load precomputed model. Use ?scde.error.models to see how o.ifm was generated\no.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\nscde.test.gene.expression.difference(\"Tdh\", models = o.ifm, counts = cd, prior = o.prior)\n\n}\n\n"
  },
  {
    "path": "man/show.app.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{show.app}\n\\alias{show.app}\n\\title{View PAGODA application}\n\\usage{\nshow.app(app, name, browse = TRUE, port = NULL, ip = \"127.0.0.1\",\n  server = NULL)\n}\n\\arguments{\n\\item{app}{pagoda app (output of make.pagoda.app()) or another rook app}\n\n\\item{name}{URL path name for this app}\n\n\\item{browse}{whether a call should be made for browser to show the app}\n\n\\item{port}{optional port on which the server should be initiated}\n\n\\item{ip}{IP on which the server should listen (typically localhost)}\n\n\\item{server}{an (optional) Rook server instance (defaults to ___scde.server)}\n}\n\\value{\nRook server instance\n}\n\\description{\nInstalls a given pagoda app (or any other rook app) into a server, optionally\nmaking a call to show it in the browser.\n}\n\\examples{\n\\donttest{\napp <- make.pagoda.app(tamr2, tam, varinfo, go.env, pwpca, clpca, col.cols=col.cols, cell.clustering=hc, title=\"NPCs\")\n# show app in the browser (port 1468)\nshow.app(app, \"pollen\", browse = TRUE, port=1468)\n}\n\n}\n\n"
  },
  {
    "path": "man/view.aspects.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{view.aspects}\n\\alias{view.aspects}\n\\title{View heatmap}\n\\usage{\nview.aspects(mat, row.clustering = NA, cell.clustering = NA, zlim = c(-1,\n  1) * quantile(mat, p = 0.95), row.cols = NULL, col.cols = NULL,\n  cols = colorRampPalette(c(\"darkgreen\", \"white\", \"darkorange\"), space =\n  \"Lab\")(1024), show.row.var.colors = TRUE, top = Inf, ...)\n}\n\\arguments{\n\\item{mat}{Numeric matrix}\n\n\\item{row.clustering}{Row dendrogram}\n\n\\item{cell.clustering}{Column dendrogram}\n\n\\item{zlim}{Range of the normalized gene expression levels, inputted as a list: c(lower_bound, upper_bound). Values outside this range will be Winsorized. Useful for increasing the contrast of the heatmap visualizations. Default, set to the 5th and 95th percentiles.}\n\n\\item{row.cols}{Matrix of row colors.}\n\n\\item{col.cols}{Matrix of column colors. Useful for visualizing cell annotations such as batch labels.}\n\n\\item{cols}{Heatmap colors}\n\n\\item{show.row.var.colors}{Boolean of whether to show row variance as a color track}\n\n\\item{top}{Restrict output to the top n aspects of heterogeneity}\n\n\\item{...}{additional arguments for heatmap plotting}\n}\n\\value{\nA heatmap\n}\n\\description{\nInternal function to visualize aspects of transcriptional heterogeneity as a heatmap. Used by \\code{\\link{pagoda.view.aspects}}.\n}\n\n"
  },
  {
    "path": "man/winsorize.matrix.Rd",
    "content": "% Generated by roxygen2: do not edit by hand\n% Please edit documentation in R/functions.R\n\\name{winsorize.matrix}\n\\alias{winsorize.matrix}\n\\title{Winsorize matrix}\n\\usage{\nwinsorize.matrix(mat, trim)\n}\n\\arguments{\n\\item{mat}{matrix}\n\n\\item{trim}{fraction of outliers (on each side) that should be Winsorized, or (if the value is  >= 1) the number of outliers to be trimmed on each side}\n}\n\\value{\nWinsorized matrix\n}\n\\description{\nSets the ncol(mat)*trim top outliers in each row to the next lowest value same for the lowest outliers\n}\n\\examples{\nset.seed(0)\nmat <- matrix( c(rnorm(5*10,mean=0,sd=1), rnorm(5*10,mean=5,sd=1)), 10, 10)  # random matrix\nmat[1,1] <- 1000  # make outlier\nrange(mat)  # look at range of values\nwin.mat <- winsorize.matrix(mat, 0.1)\nrange(win.mat)  # note outliers removed\n\n}\n\n"
  },
  {
    "path": "src/Makevars",
    "content": "PKG_CXXFLAGS=$(SHLIB_OPENMP_CXXFLAGS) \nPKG_LIBS=-L/usr/lib/ -L\".\" -lpthread -lm `$(R_HOME)/bin/Rscript -e \"Rcpp:::LdFlags()\"` $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS)"
  },
  {
    "path": "src/Makevars.win",
    "content": "PKG_CXXFLAGS=$(SHLIB_OPENMP_CXXFLAGS) \nPKG_LIBS=-L/usr/lib/ -L\".\" -lpthread -lm $(shell \"${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe\" -e \"Rcpp:::LdFlags()\") $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS) \n"
  },
  {
    "path": "src/bwpca.cpp",
    "content": "#include \"bwpca.h\"\n#include <random>\n#include <vector>\n#include <algorithm>\n#include <iostream>\n#include <cmath>\n#include <numeric>\n\n\nusing namespace Rcpp ;\n\n\n#undef DEBUG\n\n\n// compare a and b positions of a data vector\nbool compare_on_other(int a, int b, arma::vec& data) {\n    return data[a]<data[b];\n}\n\n// independently randomize values within a column for a matrix\nvoid set_random_matrix(arma::mat& target,arma::mat& source) {\n    std::vector<int> ind(target.n_rows);\n    for(int j=0;j<target.n_rows;j++) {\n        ind[j]=j;\n    } // set up initial index (1,2,3)\n\n    // The random number generator that we want to use (Mersenne Twister)\n    std::mt19937 rng(std::time(nullptr));\n\n    for(int i=0;i<target.n_cols;i++) {\n        std::shuffle(ind.begin(), ind.end(), rng);  // NOTE: std::random_shuffler() was deprecated in C++14 and completely removed in C++17\n        //std::sort(ind.start(), ind.end(), std::bind(compare_on_other,  _1, _2, rv));\n        for(int j=0;j<target.n_rows;j++) {\n            target(j,i)=source(ind[j],i);\n        }\n        R_CheckUserInterrupt();\n    }\n}\n\n// independently randomize values within a column for two matrices\nvoid set_random_matrices(arma::mat& target1,arma::mat& source1,arma::mat& target2,arma::mat& source2) {\n    std::vector<int> ind(target1.n_rows);\n    for(int j=0;j<target1.n_rows;j++) {\n        ind[j]=j;\n    } // set up initial index (1,2,3)\n\n    // The random number generator that we want to use (Mersenne Twister)\n    std::mt19937 rng(std::time(nullptr));\n\n    for(int i=0;i<target1.n_cols;i++) {\n        std::shuffle(ind.begin(), ind.end(), rng);    // NOTE: std::random_shuffler() was deprecated in C++14 and completely removed in C++17\n        //std::sort(ind.start(), ind.end(), std::bind(compare_on_other,  _1, _2, rv));\n        for(int j=0;j<target1.n_rows;j++) {\n            target1(j,i)=source1(ind[j],i);\n            target2(j,i)=source2(ind[j],i);\n        
}\n        R_CheckUserInterrupt();\n    }\n}\n\nSEXP baileyWPCA(SEXP Mat, SEXP Matw, SEXP Npcs, SEXP Nstarts, SEXP Smooth, SEXP EMtol, SEXP EMmaxiter, SEXP Seed, SEXP Nshuffles){\n    arma::mat m=Rcpp::as<arma::mat>(Mat); // can avoid copy here\n    arma::mat mw=Rcpp::as<arma::mat>(Matw); // can avoid copy here\n\n\n    double tol=Rcpp::as<double>(EMtol);\n    int maxiter=Rcpp::as<int>(EMmaxiter);\n    int nstarts=Rcpp::as<int>(Nstarts);\n    int smooth=Rcpp::as<int>(Smooth);\n    int npcs=Rcpp::as<int>(Npcs);\n    int seed=Rcpp::as<int>(Seed);\n    int nshuffles=Rcpp::as<int>(Nshuffles);\n\n    int d=m.n_cols; // genes\n    int n=m.n_rows; // cells\n\n    if(npcs>d) { npcs=d; } // limit the number of PCs to the number of genes\n\n\n    //arma::mat mwsq_colsum=sum(mw % mw);\n\n#ifdef DEBUG\n    std::cout<<\"starting up\"<<std::endl<<std::flush;\n#endif\n    // set up smoothing coefficients\n    arma::colvec smoothc;\n    if(smooth>0) {\n        int np=smooth/2;\n        int pol_degree=3; int diff_order=0;\n        arma::colvec x(2*np+1); for(int i=0;i<2*np+1;i++) { x[i]=i-np; };\n        arma::mat A(2*np+1,pol_degree+1);\n        for(int j=0; j<pol_degree+1; j++) {\n            A.col(j)=pow(x,j);\n        }\n        arma::mat ATA(A.t() * A);\n        arma::colvec rhs(pol_degree+1,arma::fill::zeros);\n        rhs[diff_order]=pow(-1,diff_order);\n        smoothc=A * solve(ATA,rhs);\n        //std::cout<<\"smoothc:\"; smoothc.print();\n    }\n#ifdef DEBUG\n    //std::cout<<\"calculated smoothing factors\"<<std::endl<<std::flush;\n#endif\n    // multiple random starts\n    arma::mat besteigenv,bestcoef;\n    baileyWPCAround(m,mw,nstarts,npcs,seed,maxiter,tol,smooth,smoothc,bestcoef,besteigenv);\n\n    // calculate amount of weighted variance explained by each estimated component\n    // calculate total weighted variance\n    arma::mat totvm(m % sqrt(mw));\n    totvm %=totvm;\n    //double totvar=accu(sum(totvm,0) / mwsq_colsum);\n    double totvar=accu(totvm);\n#ifdef 
DEBUG\n    std::cout<<\"total weighted variance : \"<<totvar<<std::endl;\n#endif\n\n    double tvarexp=0;\n    arma::vec varexp(npcs);\n    arma::mat dat(m.n_rows,m.n_cols,arma::fill::zeros);\n    for(int k=0; k<npcs; k++) {\n        dat += (bestcoef.col(k) * besteigenv.col(k).t());\n        //std::cout<<k<<\"-th total reconstruction:\"<<std::endl; dat.print();\n#ifdef DEBUG\n        //std::cout<<k<<\"-th total reconstruction delta:\"<<std::endl; (dat-m).print();\n#endif\n        arma::mat delta((dat-m) % sqrt(mw));\n        delta %=delta;\n        //double npres=accu(sum(delta,0) / mwsq_colsum);\n        double npres=accu(delta);\n#ifdef DEBUG\n        std::cout<<k<<\"-th component explains \"<<(totvar-npres-tvarexp)<<\" (\"<<((totvar-npres-tvarexp)/totvar)<<\") variance;  cumulative:\"<<(totvar-npres)<<\" (\"<<((totvar-npres)/totvar)<<\")\"<<std::endl;\n#endif\n        varexp[k]=totvar-npres-tvarexp;\n        tvarexp=totvar-npres;\n\n        R_CheckUserInterrupt();\n    }\n\n    arma::mat pcw=mw * abs(besteigenv);\n\n    if(nshuffles>0) {\n        arma::vec rvars(nshuffles);\n        arma::mat rm(n,d);\n        arma::mat rmw(n,d);\n        arma::mat reigenv,rcoef;\n        for(int i=0;i<nshuffles;i++) {\n            set_random_matrices(rm,m,rmw,mw);\n            baileyWPCAround(rm,rmw,nstarts,npcs,seed+i,maxiter,tol,smooth,smoothc,rcoef,reigenv);\n            dat.fill(0);\n            dat += (rcoef.col(0) * reigenv.col(0).t());\n            arma::mat delta((dat-rm) % sqrt(rmw));\n            delta %=delta;\n            rvars[i]=totvar-accu(delta);\n        }\n\n        return List::create(Named(\"rotation\") = wrap(besteigenv),\n                            Named(\"scores\") = wrap(bestcoef),\n                            Named(\"scoreweights\") = wrap(pcw),\n                            Named(\"var\") = wrap(varexp),\n                            Named(\"totvar\") = wrap(totvar),\n                            Named(\"randvar\") = wrap(rvars));\n\n    } else {\n       
 return List::create(Named(\"rotation\") = wrap(besteigenv),\n                            Named(\"scores\") = wrap(bestcoef),\n                            Named(\"scoreweights\") = wrap(pcw),\n                            Named(\"var\") = wrap(varexp),\n                            Named(\"totvar\") = wrap(totvar));\n    }\n}\n\n\n// internal function performing wPCA itself with nstarts random starts\nvoid baileyWPCAround(arma::mat& m,arma::mat& mw,int nstarts,int npcs,int seed,int maxiter,double tol,int smooth,arma::colvec& smoothc,arma::mat& bestcoef, arma::mat& besteigenv) {\n    int d=m.n_cols; // genes\n    int n=m.n_rows; // cells\n\n    double bestpres(-1);\n\n    for(int nstart=0; nstart<nstarts; nstart++) {\n#ifdef DEBUG\n        std::cout<<\"starting iteration \"<<nstart<<std::endl<<std::flush;\n#endif\n        // random orthonormal start for the\n        arma::arma_rng::set_seed(seed+nstart);\n        arma::mat X = arma::randu<arma::mat>(d,npcs);\n        arma::mat start,eigenv, R;\n        arma::qr_econ(start,R,X);\n        eigenv=start;\n\n        //std::cout<<\"starting eigenv\"<<std::endl<<std::flush;\n        //eigenv.print();\n\n\n        // initial solution for coefficients\n        arma::mat coef(n,npcs);\n\n        double pres(std::numeric_limits<double>::max());\n        double bpres(std::numeric_limits<double>::max());\n        arma::mat beigenv,bcoef; // best models of the current run\n\n        int ii=0; // iteration counter\n        while(ii<maxiter) {\n            //std::cout<<\"iteration\"<<ii<<std::endl<<std::flush;\n\n            // solve for coefficients\n            for(int j=0; j<n; j++) {  // for each observation j\n                // solving for c, m.row(j) == eigenv * c, as a weighted least squares\n                // problem with weights mw.row(j)\n\n                //std::cout<<\"m:\"; m.row(j).print();\n                //std::cout<<\"w:\"; mw.row(j).print();\n                arma::rowvec b=m.row(j) % mw.row(j);\n                
//std::cout<<\"b*w:\"; b.print();\n                b= b * eigenv;\n                //std::cout<<\"b:\"; b.print();\n                arma::mat A(eigenv);\n                A.each_col() %= mw.row(j).t();\n                A=eigenv.t() * A;\n                //std::cout<<\"A:\"; A.print();\n                coef.row(j)=solve(A,b.t()).t();\n            }\n            //std::cout<<\"updated coeff\"<<std::endl<<std::flush;\n            //coef.t().print();\n\n\n            // solve for eigenvectors\n            arma::mat dat=m;\n            for(int k=0; k<npcs; k++) { // for each vector\n                arma::mat cw=mw;\n                cw.each_col() %=coef.col(k);\n                arma::rowvec xcw=sum(dat % cw,0);\n                cw.each_col() %=coef.col(k);\n                xcw /= sum(cw,0);\n                eigenv.col(k)=xcw.t();\n\n                // smoothing\n                if(smooth>0) {\n                    // convolve with smoothing coefficients\n                    int n=(smoothc.n_elem-1)/2;\n                    arma::vec res=conv(eigenv.col(k),smoothc);\n                    eigenv.col(k)=res.subvec(n,res.n_elem-n-1);\n                    //std::cout<<\"smooth res[]:\"; eigenv.col(k).print();\n                }\n\n                // subtract current eigenvector vector from the data\n                if(k!= npcs-1) {\n                    dat -= coef.col(k) * eigenv.col(k).t()  ;\n                    //std::cout<<\"dat:\"; dat.print();\n                }\n            }\n            //std::cout<<\"updated eigenvectors\"<<std::endl<<std::flush;\n\n            // renormalize and re-orthogonalize the eigenvectors\n            eigenv.col(0) /=  sqrt(dot(eigenv.col(0),eigenv.col(0)));\n            for(int k=1; k<npcs; k++) { // for each vector\n                for(int kx=0; kx<k; kx++) {\n                    double c = dot(eigenv.col(k) , eigenv.col(kx));\n                    eigenv.col(k) -= c*eigenv.col(kx);\n                }\n                eigenv.col(k) /= 
sqrt(dot(eigenv.col(k) , eigenv.col(k)));\n            }\n\n            //std::cout<<\"recalculated eigenvectors\"<<std::endl<<std::flush;\n            //eigenv.t().print();\n\n            // recalculate the model fit\n            arma::mat model(coef * eigenv.t());\n            //std::cout<<\"coeffs:\"; coef.print();\n            //std::cout<<\"eigenv:\"; eigenv.print();\n            //std::cout<<\"model:\"; model.print();\n\n            //std::cout<<\"delta:\"<<std::endl; (model-m).print();\n\n            arma::mat delta((model-m) % sqrt(mw));\n            delta %=delta;\n            //double npres=accu(sum(delta,0) / mwsq_colsum);\n            double npres=accu(delta);\n\n#ifdef DEBUG\n            std::cout<<\"iteration \"<<ii<<\" pres=\"<<npres;\n#endif\n            // record model if it improved the overall precision\n            if(npres<bpres) {\n                bpres=npres;\n                bcoef=coef;\n                beigenv=eigenv;\n#ifdef DEBUG\n                std::cout<<\", best so far\";\n#endif\n            }\n\n            if(tol>0 && ii>0 && (pres-npres)/npres < tol) {\n                if(pres>npres) {\n\n#ifdef DEBUG\n                    std::cout<<\", reached required tolerance\"<<std::endl;\n#endif\n                    pres=npres;\n                    break;\n                }\n                // otherwise the total variance is actually increasing, which is not a good thing\n            }\n#ifdef DEBUG\n            std::cout<<std::endl<<std::flush;\n#endif\n\n            ii++;\n            pres=npres;\n        }\n        if(nstart==0 || pres<bestpres) {\n#ifdef DEBUG\n            std::cout<<\"updating the results with \"<<nstart<<\" calculations\"<<std::endl<<std::flush;\n#endif\n            bestpres=bpres;\n            bestcoef=bcoef;\n            besteigenv=beigenv;\n        }\n\n\tR_CheckUserInterrupt();\n\n    }\n}\n"
  },
  {
    "path": "src/bwpca.h",
    "content": "#ifndef _scde_BWPCA_H\n#define _scde_BWPCA_H\n\n#include <RcppArmadillo.h>\n\n// outer function for performing Bailey's weighted PCA\n// Nshuffles - number of internal row-specific randomizations to recalculate the lambda1 (PC1 variance) on\nRcppExport SEXP baileyWPCA(SEXP Mat, SEXP Matw, SEXP Npcs, SEXP Nstarts, SEXP Smooth, SEXP EMtol, SEXP EMmaxiter, SEXP Seed, SEXP Nshuffles);\n\nvoid baileyWPCAround(arma::mat& m,arma::mat& mw,int nstarts,int npcs,int seed,int maxiter,double tol,int smooth,arma::colvec& smoothc,arma::mat& bestcoef, arma::mat& besteigenv);\n#endif\n"
  },
  {
    "path": "src/jpmatLogBoot.cpp",
    "content": "#include \"jpmatLogBoot.h\"\n#include <cmath>\n\nusing namespace Rcpp ;\n\n// maximum and minimum values of negative binomial theta (size) that will be considered\n#define MIN_THETA 1.0e-2\n#define MAX_THETA 1.0e+3\n\n\nSEXP jpmatLogBoot(SEXP Matl, SEXP Nboot, SEXP Seed){\n    Rcpp::List matl(Matl);\n    int nrows=Rcpp::as<Rcpp::NumericMatrix>(matl[0]).nrow();\n    int ncols=Rcpp::as<Rcpp::NumericMatrix>(matl[0]).ncol();\n    int nmat=matl.size();\n    int nboot=as<int>(Nboot);\n    arma::mat jp(nrows,ncols);  // joint posterior across boostraps\n    jp.zeros();\n    arma::mat tjp(nrows,ncols); // for current boostrap posterior\n    int seed=Rcpp::as<int>(Seed);\n    srand(seed);\n\n    for(int i=0;i<nboot;i++) {\n        tjp.zeros();\n        for(int j=0;j<nmat;j++) {\n            int rj;\n            while(nmat <= (rj= rand()/(RAND_MAX/nmat)))\n                ;\n            arma::mat am(Rcpp::as<Rcpp::NumericMatrix>(matl[rj]).begin(),nrows,ncols,false,true);\n            tjp +=am;\n            //R_CheckUserInterrupt();\n        }\n        arma::colvec m=max(tjp,1);\n        tjp.each_col() -= m; // shift for stability\n        tjp=exp(tjp);\n        arma::colvec s=sum(tjp,1);\n        tjp.each_col() /= s;\n        jp+=tjp;\n        //R_CheckUserInterrupt();\n    }\n    return wrap(jp);\n}\n\n\n// similar to above, however the joint is built not over boostrap samples\n// of the same pool, but by sampling a pre-defined (by Comp vector) composition\n// of several different pools (in Matll)\nSEXP jpmatLogBatchBoot(SEXP Matll, SEXP Comp, SEXP Nboot, SEXP Seed){\n    Rcpp::IntegerVector comp=as<Rcpp::IntegerVector>(Comp);\n    Rcpp::NumericMatrix nm0=as<Rcpp::NumericMatrix>(VECTOR_ELT( VECTOR_ELT(Matll,0) ,0));\n    int nrows=nm0.nrow();\n    int ncols=nm0.ncol();\n    int nboot=as<int>(Nboot);\n    arma::mat jp(nrows,ncols);  // joint posterior across boostraps\n    jp.zeros();\n    arma::mat tjp(nrows,ncols); // for current boostrap posterior\n    int 
seed=Rcpp::as<int>(Seed);\n    srand(seed);\n\n    for(int i=0;i<nboot;i++) {\n        tjp.zeros();\n        for(int k=0;k<comp.size();k++) { // over types\n            int nsamp=comp[k];\n            if(nsamp>0) {\n                int nmat=LENGTH( VECTOR_ELT(Matll, k) );\n                for(int j=0;j<nsamp;j++) { // over matrices\n                    int rj;\n                    while(nmat <= (rj= rand()/(RAND_MAX/nmat)))\n                        ;\n                    arma::mat am(Rcpp::as<Rcpp::NumericMatrix>(VECTOR_ELT( VECTOR_ELT(Matll,k) ,rj)).begin(),nrows,ncols,false,true);\n                    tjp +=am;\n                    //R_CheckUserInterrupt();\n                }\n            }\n            //R_CheckUserInterrupt();\n        }\n        arma::colvec m=max(tjp,1);\n        tjp.each_col() -= m; // shift for stability\n        tjp=exp(tjp);\n        arma::colvec s=sum(tjp,1);\n        tjp.each_col() /= s;\n        jp+=tjp;\n        //R_CheckUserInterrupt();\n    }\n    return wrap(jp);\n}\n\n//calculate log joint posteriors from models and unique count lists/integer on magniude grid\n// count columns must match model rows\n// Models - model matrix\n// Ucl - unique count list\n// Uci - unique count index\n// Magnitudes - marginals - (expression magnitudes)\n// Nboot - number of bootstrap iterations\n// Seed - random seed\n// ReturnIndividualPosterior: 0 - nothing, 1 - maxima, 2 - full posterior matrices\n// LocalThetaFit: 0 - constant theta; 1 - theta(fpm) fit\n// SquareLogitConc: 0 - concomitant is a function of magnitude ; 1 - of magnitude and magnitude^2\n// EnsembleProbability: 1 - calculate ensemble (i.e. 
sum) joint probability instead of a joint (i.e product)\nRcppExport SEXP logBootPosterior(SEXP Models, SEXP Ucl, SEXP CountsI, SEXP Magnitudes, SEXP Nboot, SEXP Seed, SEXP ReturnIndividualPosteriors, SEXP LocalThetaFit, SEXP SquareLogitConc, SEXP EnsembleProbability) {\n#define CONCB_I 0\n#define CONCA_I 1\n#define FAILR_I 2\n#define CORRB_I 3\n#define CORRA_I 4\n#define CORRT_I 5\n#define CORRlTB_I 6\n#define CORRlTT_I 7\n#define CORRlTM_I 8\n#define CORRlTS_I 9\n#define CORRlTR_I 10\n#define CONCA2_I 11\n\n    Rcpp::IntegerMatrix counti=Rcpp::as<Rcpp::IntegerMatrix>(CountsI);\n    Rcpp::List ucl(Ucl);\n    int ncells=ucl.size();\n    int returnpost=Rcpp::as<int>(ReturnIndividualPosteriors);\n    int localtheta=Rcpp::as<int>(LocalThetaFit);\n    int squarelogitconc=Rcpp::as<int>(SquareLogitConc);\n    int ensemblep=Rcpp::as<int>(EnsembleProbability);\n    arma::mat models=Rcpp::as<arma::mat>(Models);\n    arma::colvec magnitudes=Rcpp::as<arma::colvec>(Magnitudes);\n    std::vector< arma::mat > ucposteriors;\n    std::vector< std::vector < arma::uword > > ucmaxi;\n    // calculate individual posteriors for each cell, for each unique count value\n    //std::cout<<\"individual posteriors \"<<std::flush;\n    double minlogprob=-1*std::numeric_limits<double>::max()/ncells/1.1;\n    for(int i=0;i<ncells;i++) {\n        Rcpp::IntegerVector uc(Rcpp::as<Rcpp::IntegerVector>(ucl[i]));\n        int ncounts=uc.size();\n        arma::mat pm(magnitudes.n_elem,ncounts);\n        std::vector< arma::uword > maxi;\n        arma::vec mu=magnitudes * models(i,CORRA_I);\n        mu+=models(i,CORRB_I); mu=exp(mu);\n        arma::vec cfp;\n        if(squarelogitconc) {\n            cfp=models(i,CONCA_I) + magnitudes*models(i,CONCA2_I);\n            cfp%=magnitudes;\n        } else {\n            cfp=magnitudes * models(i,CONCA_I);\n        }\n        cfp+=models(i,CONCB_I);\n        cfp=1/(exp(cfp)+1);\n        arma::vec cfpr=1-cfp;\n        cfp=log(cfp); cfpr=log(cfpr);\n        double 
maxcfp=max(cfp);\n        arma::colvec thetas;\n        if(localtheta) { // non-constant theta model - prepare theta values\n            thetas=-1*magnitudes + models(i,CORRlTM_I);\n            thetas*=models(i,CORRlTS_I);\n            thetas=exp10(thetas)+1;\n            thetas=pow(thetas,models(i,CORRlTR_I));\n            thetas=(models(i,CORRlTT_I) - models(i,CORRlTB_I))/thetas;\n            thetas+=models(i,CORRlTB_I);\n            thetas=exp(-1*thetas);\n\n            for(unsigned int k=0;k<thetas.n_elem;k++) {\n                if((!std::isfinite(thetas[k])) || (thetas[k]<MIN_THETA)) {  thetas[k]=MIN_THETA;}\n                if(thetas[k]>MAX_THETA) {  thetas[k]=MAX_THETA;}\n                //R_CheckUserInterrupt();\n            }\n        }\n        //std::cout<<\"cfp=[\"; std::copy(cfp.begin(),cfp.end(),std::ostream_iterator<double>(std::cout,\" \")); std::cout<<\"]\"<<std::endl<<std::flush;\n        //std::cout<<\"thetas=[\"; copy(thetas.begin(),thetas.end(),std::ostream_iterator<double>(std::cout,\" \")); std::cout<<\"]\"<<std::endl<<std::flush;\n\n        for(int j=0;j<ncounts;j++) {\n            // correlated prob\n            arma::vec nbp(mu.n_elem);\n            if(localtheta) { // linear theta model\n                for(unsigned int k=0;k<mu.n_elem;k++) {\n                    double muv=mu[k]; double x=uc[j];\n                    // choose maximum probability when hitting the grid with the maximum\n                    if((k<(mu.n_elem-1) && x>muv && x<mu[k+1]) || (k==(mu.n_elem-1) && x>muv)) { muv=x; }\n                    nbp[k]=Rf_dnbinom(x,thetas[k],thetas[k]/(thetas[k]+muv),true);\n                    //R_CheckUserInterrupt();\n                }\n            } else { // constant theta\n                double theta=models(i,CORRT_I);\n                for(unsigned int k=0;k<mu.n_elem;k++) {\n                    double muv=mu[k]; double x=uc[j];\n                    // choose maximum probability when hitting the grid with the maximum\n                
    if((k<(mu.n_elem-1) && x>muv && x<mu[k+1]) || (k==(mu.n_elem-1) && x>muv)) { muv=x; }\n                    nbp[k]=Rf_dnbinom(x,theta,theta/(theta+muv),true);\n                    //R_CheckUserInterrupt();\n                }\n            }\n            //std::cout<<\"nbp1=[\"; copy(nbp.begin(),nbp.end(),std::ostream_iterator<double>(std::cout,\" \")); std::cout<<\"]\"<<std::endl<<std::flush;\n            nbp+=cfpr;\n            // // failure probability\n            double fp=Rf_dpois(uc[j],exp(models(i,FAILR_I)),true);\n            double maxp=max(nbp);\n            if(maxp<(maxcfp+fp)) { maxp=maxcfp+fp; }\n            nbp=(exp(nbp-maxp) + exp(cfp+fp-maxp));\n            nbp/=sum(nbp);\n            nbp=log(nbp);\n\n            // find max point\n            if(returnpost==1 || returnpost==3) {\n                arma::uword maxij;\n                double maxv=nbp.max(maxij);\n                maxi.push_back(maxij);\n            }\n            // set the lower bound to min/n.cells\n            for(unsigned int k=0;k<nbp.n_elem;k++) { if(nbp[k]<minlogprob) nbp[k]=minlogprob; }\n            pm.col(j)=nbp;\n        }\n\n        ucposteriors.push_back(pm);\n        if(returnpost==1 || returnpost==3) { ucmaxi.push_back(maxi);}\n        //std::cout<<\".\"<<std::flush;\n    }\n    //std::cout<<\" done\"<<std::endl;\n\n    //std::cout<<\"boostrap iterations \"<<std::flush;\n    // calculate joint posterior\n    int ngenes=counti.nrow();\n    arma::mat jp(magnitudes.n_elem,ngenes);  // joint posterior across boostraps\n    jp.zeros();\n    arma::mat tjp(magnitudes.n_elem,ngenes); // for current boostrap posterior\n    int seed=Rcpp::as<int>(Seed);\n    srand(seed);\n    int nboot=as<int>(Nboot);\n\n    if(ensemblep) {\n        for(int j=0;j<ncells;j++) {\n            // exponentiate and normalize ucposteriors\n            arma::mat cellucpost = exp(ucposteriors[j]);\n            arma::rowvec s=sum(cellucpost,0);\n            cellucpost.each_row() /= s;\n            for(int 
k=0;k<ngenes;k++) { // fill in kth gene posterior\n                jp.col(k)+=cellucpost.col(counti(k,j));\n                //R_CheckUserInterrupt();\n            }\n            //R_CheckUserInterrupt();\n        }\n        arma::rowvec s=sum(jp,0); // pre-adjust so that jp is normalized\n        jp.each_row() /= s;\n    } else {\n        if(nboot==0) { // no bootstrapping\n            for(int j=0;j<ncells;j++) {\n                for(int k=0;k<ngenes;k++) { // fill in kth gene posterior\n                    jp.col(k)+=(ucposteriors[j]).col(counti(k,j));\n                }\n            }\n            arma::rowvec m=max(jp,0); // calculate max for each gene (column)\n            jp.each_row() -= m; // shift up for stability prior to exponentiation\n            jp=exp(jp);\n            arma::rowvec s=sum(jp,0); // pre-adjust so that jp is normalized\n            jp.each_row() /= s;\n        } else {\n            for(int i=0;i<nboot;i++) {\n                //std::cout<<\".\"<<std::flush;\n                tjp.zeros();\n                for(int j=0;j<ncells;j++) {\n                    int rj;\n                    while(ncells <= (rj= rand()/(RAND_MAX/ncells)))\n                        ;\n                    for(int k=0;k<ngenes;k++) { // fill in kth gene posterior\n                        tjp.col(k)+=(ucposteriors[rj]).col(counti(k,rj));\n                        //R_CheckUserInterrupt();\n                    }\n                    //R_CheckUserInterrupt();\n                }\n                arma::rowvec m=max(tjp,0); // calculate max for each gene (column)\n                tjp.each_row() -= m; // shift up for stability prior to exponentiation\n                tjp=exp(tjp);\n                arma::rowvec s=sum(tjp,0)*nboot; // pre-adjust for nboot so that jp is normalized\n                tjp.each_row() /= s;\n                jp+=tjp;\n                //R_CheckUserInterrupt();\n            }\n        }\n    }\n    //std::cout<<\" done\"<<std::endl;\n    jp=jp.t();\n\n    
if(returnpost==1) {\n        // make return matrix of individual posterior maxima\n        Rcpp::NumericMatrix modes(ngenes,ncells);\n        for(int i=0;i<ncells;i++) {\n            for(int j=0;j<ngenes;j++) {\n                modes(j,i)=magnitudes[(ucmaxi[i])[counti(j,i)]];\n                //R_CheckUserInterrupt();\n            }\n            //R_CheckUserInterrupt();\n        }\n        return Rcpp::List::create(Rcpp::Named(\"jp\") = wrap(jp),\n                                  Rcpp::Named(\"modes\") = wrap(modes));\n    } else if(returnpost==2) {\n        // make return matrix of individual posteriors\n        Rcpp::List pl(ncells);\n        for(int i=0;i<ncells;i++) {\n            arma::mat ipost(magnitudes.n_elem,ngenes);\n            for(int j=0;j<ngenes;j++) {\n                ipost.col(j)=(ucposteriors[i]).col(counti(j,i));\n                //R_CheckUserInterrupt();\n            }\n            ipost=ipost.t();\n            pl[i]=ipost;\n            //R_CheckUserInterrupt();\n        }\n        return Rcpp::List::create(Rcpp::Named(\"jp\") = wrap(jp),\n                                  Rcpp::Named(\"post\") = wrap(pl));\n    } else if(returnpost==3) {\n        // return both modes and full posteriors\n        Rcpp::NumericMatrix modes(ngenes,ncells);\n        for(int i=0;i<ncells;i++) {\n            for(int j=0;j<ngenes;j++) {\n                modes(j,i)=magnitudes[(ucmaxi[i])[counti(j,i)]];\n                //R_CheckUserInterrupt();\n            }\n            //R_CheckUserInterrupt();\n        }\n        Rcpp::List pl(ncells);\n        for(int i=0;i<ncells;i++) {\n            arma::mat ipost(magnitudes.n_elem,ngenes);\n            for(int j=0;j<ngenes;j++) {\n                ipost.col(j)=(ucposteriors[i]).col(counti(j,i));\n                //R_CheckUserInterrupt();\n            }\n            ipost=ipost.t();\n            pl[i]=ipost;\n            //R_CheckUserInterrupt();\n        }\n        return Rcpp::List::create(Rcpp::Named(\"jp\") = wrap(jp),\n    
                              Rcpp::Named(\"modes\") = wrap(modes),\n                                  Rcpp::Named(\"post\") = wrap(pl));\n    }\n    // returnpost==0 // return joint posteriors only\n    return wrap(jp);\n}\n\n\n// calculate log joint posteriors from models and unique count lists/integer on magniude grid\n// count columns must match model rows\n// ReturnIndividualPosterior: 0 - nothing, 1 - maxima, 2 - full posterior matrices\n\n// similar to above, however the joint is built not over boostrap samples\n// of the same pool, but by sampling a pre-defined (by Composition vector) composition\n// of several different pools (specified by BatchIL)\n// BatchIL - a list of indecies corresponding to the models in each batch (in the order given in Models)\n// Composition - a vector giving the number of cells from each batch (batches are ordered in the same way is BatchIL)\nRcppExport SEXP logBootBatchPosterior(SEXP Models, SEXP Ucl, SEXP CountsI, SEXP Magnitudes, SEXP BatchIL, SEXP Composition, SEXP Nboot, SEXP Seed, SEXP ReturnIndividualPosteriors, SEXP LocalThetaFit, SEXP SquareLogitConc) {\n#define CONCB_I 0\n#define CONCA_I 1\n#define FAILR_I 2\n#define CORRB_I 3\n#define CORRA_I 4\n#define CORRT_I 5\n#define CORRlTB_I 6\n#define CORRlTT_I 7\n#define CORRlTM_I 8\n#define CORRlTS_I 9\n#define CORRlTR_I 10\n#define CONCA2_I 11\n\n    Rcpp::IntegerMatrix counti=Rcpp::as<Rcpp::IntegerMatrix>(CountsI);\n    Rcpp::List ucl(Ucl);\n    int ncells=ucl.size();\n    int returnpost=Rcpp::as<int>(ReturnIndividualPosteriors);\n    int localtheta=Rcpp::as<int>(LocalThetaFit);\n    int squarelogitconc=Rcpp::as<int>(SquareLogitConc);\n    arma::mat models=Rcpp::as<arma::mat>(Models);\n    arma::colvec magnitudes=Rcpp::as<arma::colvec>(Magnitudes);\n    Rcpp::List batchil(BatchIL);\n    Rcpp::IntegerVector comp=as<Rcpp::IntegerVector>(Composition);\n\n    std::vector< arma::mat > ucposteriors;\n    std::vector< std::vector < arma::uword > > ucmaxi;\n    // calculate 
individual posteriors for each cell, for each unique count value\n    //std::cout<<\"individual posteriors \"<<std::flush;\n    double minlogprob=-1*std::numeric_limits<double>::max()/ncells/1.1;\n    for(int i=0;i<ncells;i++) {\n        Rcpp::IntegerVector uc(Rcpp::as<Rcpp::IntegerVector>(ucl[i]));\n        int ncounts=uc.size();\n        arma::mat pm(magnitudes.n_elem,ncounts);\n        std::vector< arma::uword > maxi;\n        arma::vec mu=magnitudes * models(i,CORRA_I);\n        mu+=models(i,CORRB_I); mu=exp(mu);\n        arma::vec cfp;\n        if(squarelogitconc) {\n            cfp=models(i,CONCA_I) + magnitudes*models(i,CONCA2_I);\n            cfp%=magnitudes;\n        } else {\n            cfp=magnitudes * models(i,CONCA_I);\n        }\n        cfp+=models(i,CONCB_I);\n        cfp=1/(exp(cfp)+1);\n        arma::vec cfpr=1-cfp;\n        cfp=log(cfp); cfpr=log(cfpr);\n        double maxcfp=max(cfp);\n        arma::colvec thetas;\n        if(localtheta) { // linear theta model - prepare theta values\n            thetas=-1*magnitudes + models(i,CORRlTM_I);\n            thetas*=models(i,CORRlTS_I);\n            thetas=exp10(thetas)+1;\n            thetas=pow(thetas,models(i,CORRlTR_I));\n            thetas=(models(i,CORRlTT_I) - models(i,CORRlTB_I))/thetas;\n            thetas+=models(i,CORRlTB_I);\n            thetas=exp(-1*thetas);\n\n            for(unsigned int k=0;k<thetas.n_elem;k++) {\n                if((!std::isfinite(thetas[k])) || (thetas[k]<MIN_THETA)) {  thetas[k]=MIN_THETA;}\n                if(thetas[k]>MAX_THETA) {  thetas[k]=MAX_THETA;}\n                //R_CheckUserInterrupt();\n            }\n        }\n        //R_CheckUserInterrupt();\n\n        for(int j=0;j<ncounts;j++) {\n            // correlated prob\n            arma::vec nbp(mu.n_elem);\n            if(localtheta) { // linear theta model\n                for(unsigned int k=0;k<mu.n_elem;k++) {\n                    double muv=mu[k]; double x=uc[j];\n                    // choose 
maximum probability when hitting the grid with the maximum\n                    if((k<(mu.n_elem-1) && x>muv && x<mu[k+1]) || (k==(mu.n_elem-1) && x>muv)) { muv=x; }\n                    nbp[k]=Rf_dnbinom(x,thetas[k],thetas[k]/(thetas[k]+muv),true);\n                    //R_CheckUserInterrupt();\n                }\n            } else { // constant theta\n                double theta=models(i,CORRT_I);\n                for(unsigned int k=0;k<mu.n_elem;k++) {\n                    double muv=mu[k]; double x=uc[j];\n                    // choose maximum probability when hitting the grid with the maximum\n                    if((k<(mu.n_elem-1) && x>muv && x<mu[k+1]) || (k==(mu.n_elem-1) && x>muv)) { muv=x; }\n                    nbp[k]=Rf_dnbinom(x,theta,theta/(theta+muv),true);\n                }\n            }\n            nbp+=cfpr;\n            // failure probability\n            double fp=Rf_dpois(uc[j],exp(models(i,FAILR_I)),true);\n\n            // max logp to shift by\n            double maxp=max(nbp);\n            if(maxp<(maxcfp+fp)) { maxp=maxcfp+fp; }\n            nbp=(exp(nbp-maxp) + exp(cfp+fp-maxp));\n            nbp/=sum(nbp);\n            nbp=log(nbp);\n            // find max point\n            if(returnpost==1) {\n                arma::uword maxij;\n\t\tdouble maxv=nbp.max(maxij);\n                maxi.push_back(maxij);\n            }\n            // set the lower bound to min/n.cells\n            for(unsigned int k=0;k<nbp.n_elem;k++) {\n                if(nbp[k]<minlogprob) nbp[k]=minlogprob;\n                //R_CheckUserInterrupt();\n            }\n            pm.col(j)=nbp;\n            //R_CheckUserInterrupt();\n        }\n        ucposteriors.push_back(pm);\n        if(returnpost==1) { ucmaxi.push_back(maxi);}\n        //std::cout<<\".\"<<std::flush;\n    }\n    //std::cout<<\" done\"<<std::endl;\n\n    //std::cout<<\"sampling iterations \"<<std::flush;\n    // calculate joint posterior\n    int ngenes=counti.nrow();\n    arma::mat 
jp(magnitudes.n_elem,ngenes);  // joint posterior across bootstraps\n    jp.zeros();\n    arma::mat tjp(magnitudes.n_elem,ngenes); // for current bootstrap posterior\n    int seed=Rcpp::as<int>(Seed);\n    srand(seed);\n    int nboot=as<int>(Nboot);\n\n    for(int i=0;i<nboot;i++) { // sampling iteration\n        //std::cout<<\".\"<<std::flush;\n        tjp.zeros();\n        for(int k=0;k<comp.size();k++) { // over batches\n            int nsamp=comp[k];\n            if(nsamp>0) { // sample cells from k-th batch\n                Rcpp::IntegerVector bi(Rcpp::as<Rcpp::IntegerVector>(batchil[k])); // indices of cells within the current batch\n                int ncells=bi.size();\n                for(int j=0;j<nsamp;j++) { // over matrices\n                    int rj;\n                    while(ncells <= (rj= rand()/(RAND_MAX/ncells)))\n                        ;\n                    for(int l=0;l<ngenes;l++) { // fill in l-th gene posterior\n                        tjp.col(l)+=(ucposteriors[bi[rj]]).col(counti(l,bi[rj]));\n                        //R_CheckUserInterrupt();\n                    }\n                    //R_CheckUserInterrupt();\n                }\n            }\n            //R_CheckUserInterrupt();\n        }\n        arma::rowvec m=max(tjp,0);\n        tjp.each_row() -= m; // shift up for stability prior to exponentiation\n        tjp=exp(tjp);\n        arma::rowvec s=sum(tjp,0)*nboot;\n        tjp.each_row() /= s;\n        jp+=tjp;\n    }\n    //std::cout<<\" done\"<<std::endl;\n    jp=jp.t();\n\n    if(returnpost==1) {\n        // make return matrix of individual posterior maxima\n        Rcpp::NumericMatrix modes(ngenes,ncells);\n        for(int i=0;i<ncells;i++) {\n            for(int j=0;j<ngenes;j++) {\n                modes(j,i)=magnitudes[(ucmaxi[i])[counti(j,i)]];\n                //R_CheckUserInterrupt();\n            }\n            //R_CheckUserInterrupt();\n        }\n        return Rcpp::List::create(Rcpp::Named(\"jp\") = wrap(jp),\n         
                         Rcpp::Named(\"modes\") = wrap(modes));\n    } else if(returnpost==2) {\n        // make return matrix of individual posteriors\n        Rcpp::List pl(ncells);\n        for(int i=0;i<ncells;i++) {\n            arma::mat ipost(magnitudes.n_elem,ngenes);\n            for(int j=0;j<ngenes;j++) {\n                ipost.col(j)=(ucposteriors[i]).col(counti(j,i));\n                //R_CheckUserInterrupt();\n            }\n            ipost=ipost.t();\n            pl[i]=ipost;\n            //R_CheckUserInterrupt();\n        }\n        return Rcpp::List::create(Rcpp::Named(\"jp\") = wrap(jp),\n                                  Rcpp::Named(\"post\") = wrap(pl));\n    }\n    // returnpost==0 // return joint posteriors only\n    return wrap(jp);\n}\n"
  },
  {
    "path": "src/jpmatLogBoot.h",
    "content": "#ifndef _scde_JPMATLOGBOOT_H\n#define _scde_JPMATLOGBOOT_H\n\n#include <RcppArmadillo.h>\n\nRcppExport SEXP jpmatLogBoot(SEXP Matl, SEXP Nboot, SEXP Seed) ;\nRcppExport SEXP jpmatLogBatchBoot(SEXP Matll, SEXP Comp, SEXP Nboot, SEXP Seed) ;\nRcppExport SEXP logBootPosterior(SEXP Models, SEXP Ucl, SEXP CountsI, SEXP Magnitudes, SEXP Nboot, SEXP Seed, SEXP ReturnIndividualPosteriors,SEXP LocalThetaFit, SEXP SquareLogitConc, SEXP EnsembleProbability) ;\nRcppExport SEXP logBootBatchPosterior(SEXP Models, SEXP Ucl, SEXP CountsI, SEXP Magnitudes, SEXP BatchIL, SEXP Composition, SEXP Nboot, SEXP Seed, SEXP ReturnIndividualPosteriors, SEXP LocalThetaFit, SEXP SquareLogitConc);\n#endif\n"
  },
  {
    "path": "src/matSlideMult.cpp",
    "content": "#include \"matSlideMult.h\"\n\nusing namespace Rcpp ;\n\nSEXP matSlideMult(SEXP Mat1, SEXP Mat2){\n    arma::mat m1=Rcpp::as<arma::mat>(Mat1);\n    arma::mat m2=Rcpp::as<arma::mat>(Mat2);\n    int n=m1.n_cols;\n    arma::mat rm(m1.n_rows,2*n-1);\n\n    // left half\n    for(int i=n;i>1;i--) {\n        rm.col(n-i)=sum(m1.cols(0,n-i) % m2.cols(i-1,n-1),1);\n        R_CheckUserInterrupt();\n    }\n    // right half\n    for(int i=1;i<=n;i++) {\n        rm.col(n-2+i)=sum(m1.cols(i-1,n-1) % m2.cols(0,n-i),1);\n        R_CheckUserInterrupt();\n    }\n\n    return wrap(rm);\n}\n\n"
  },
  {
    "path": "src/matSlideMult.h",
    "content": "#ifndef _scde_MATSLIDEMULT_H\n#define _scde_MATSLIDEMULT_H\n\n#include <RcppArmadillo.h>\n\nRcppExport SEXP matSlideMult(SEXP Mat1, SEXP Mat2) ;\n\n#endif\n"
  },
  {
    "path": "src/pagoda.cpp",
    "content": "#include \"pagoda.h\"\n#include <cmath>\n\nusing namespace Rcpp;\n\nSEXP winsorizeMatrix(SEXP Mat, SEXP Trim){\n    arma::mat m=Rcpp::as<arma::mat>(Mat);\n    int n=m.n_cols; int k=m.n_rows;\n    int ntr=round(n * Rcpp::as<double>(Trim)); // number of positions to trim (from each side)\n    if(ntr==0) { return wrap(m); } // nothing needs to be done\n    for(int i=0;i<k;i++) { // for every row of the matrix\n        arma::rowvec z= m.row(i);\n        // determine outliers\n        // arma::urowvec o=sort_index(abs(z-median(z)),1);\n        arma::ucolvec o=sort_index(z); // ascending order\n        // determine range\n        double minv=z(o(ntr));\n        double maxv=z(o(n-ntr-1));\n        for(int j=0;j<ntr;j++) {\n            z(o(j))=minv;\n            //R_CheckUserInterrupt();\n        }\n        for(int j=n-ntr;j<n;j++) {\n            z(o(j))=maxv;\n            //R_CheckUserInterrupt();\n        }\n        m.row(i)=z;\n        R_CheckUserInterrupt();\n    }\n    return wrap(m);\n}\n\nSEXP matCorr(SEXP X, SEXP Y) {\n    arma::mat x=Rcpp::as<arma::mat>(X);\n    arma::mat y=Rcpp::as<arma::mat>(Y);\n    arma::mat c=arma::cor(x,y);\n    return wrap(c);\n}\n\n\nSEXP matWCorr(SEXP Mat, SEXP Matw){\n    arma::mat m=Rcpp::as<arma::mat>(Mat);\n    arma::mat w=Rcpp::as<arma::mat>(Matw);\n    int n=m.n_cols; // int k=m.n_rows;\n    arma::mat c(n,n,arma::fill::eye);\n    for(int i=0;i<(n-1);i++) {\n        for(int j=i+1;j<n;j++) {\n            arma::colvec ic=m.col(i);\n            arma::colvec jc=m.col(j);\n            // weight for this i,j\n            arma::colvec jw=w.col(i) % w.col(j); jw=sqrt(jw);\n            jw/=sum(jw);\n            // shift by weighted means\n            ic-=dot(ic,jw); jc-=dot(jc,jw);\n            double nm=dot(ic % jc,jw);\n            ic%=ic; jc%=jc;\n            double dn=dot(ic,jw);\n            dn*=dot(jc,jw);\n            c(j,i)=nm/sqrt(dn);\n            //R_CheckUserInterrupt();\n        }\n        
R_CheckUserInterrupt();\n    }\n    return wrap(c);\n}\n\nSEXP plSemicompleteCor2(SEXP Pl) {\n    Rcpp::List pl(Pl);\n    int np=pl.size();\n    // calculate individual var\n\n    arma::mat cm(np,np);\n    cm.eye();\n    arma::imat cn(np,np,arma::fill::zeros); // number of genes in the union\n    // calculate covariance and correlation\n    for(int i=0;i<np;i++) {\n        Rcpp::NumericVector v1=Rcpp::as<Rcpp::NumericVector>(Rcpp::as<Rcpp::List>(pl[i])[1]);\n        Rcpp::NumericVector i1=Rcpp::as<Rcpp::NumericVector>(Rcpp::as<Rcpp::List>(pl[i])[0]);\n        int v1s=v1.size();\n        for(int j=i+1;j<np;j++) {\n            Rcpp::NumericVector v2=Rcpp::as<Rcpp::NumericVector>(Rcpp::as<Rcpp::List>(pl[j])[1]);\n            Rcpp::NumericVector i2=Rcpp::as<Rcpp::NumericVector>(Rcpp::as<Rcpp::List>(pl[j])[0]);\n            int v2s=v2.size();\n            int sgc=0; // \"same gene\" counter\n            double l12=0; // covar\n            double l11=0; // var1\n            double l22=0; // var2\n            int k2=0;\n            for(int k1=0;k1<v1s;k1++) {\n                int id=i2[k2]-i1[k1]; // gene index difference\n                if(id==0) { // hit the same gene\n                    sgc++;\n                    l12+=v2[k2]*v1[k1];\n                    l11+=v2[k2]*v2[k2];\n                    l22+=v1[k1]*v1[k1];\n                } else if(id<0) { // need to advance k2\n                    do { k2++; } while(i2[k2]<i1[k1] && k2<v2s);\n                    if(k2==v2s) { break; } // v2 ended, done with this pair\n                    if(i2[k2]==i1[k1]) { // hit the same gene\n                        sgc++;\n                        l12+=v2[k2]*v1[k1];\n                        l11+=v2[k2]*v2[k2];\n                        l22+=v1[k1]*v1[k1];\n                    }\n                }\n            }\n            double cv=l11*l22;\n            if(cv>0) { cv=l12/sqrt(cv); }\n            cm(i,j)=cv; cm(j,i)=cv;\n            sgc=v1s+v2s-sgc;\n            cn(i,j)=sgc; 
cn(j,i)=sgc;\n        }\n        R_CheckUserInterrupt();\n    };\n    return List::create(Named(\"r\") = wrap(cm),\n                        Named(\"n\") = wrap(cn));\n}\n"
  },
  {
    "path": "src/pagoda.h",
    "content": "#ifndef _scde_PAGODA_H\n#define _scde_PAGODA_H\n\n#include <RcppArmadillo.h>\nRcppExport SEXP winsorizeMatrix(SEXP Mat, SEXP Trim); \nRcppExport SEXP matWCorr(SEXP Mat, SEXP Matw);\nRcppExport SEXP plSemicompleteCor2(SEXP Pl);\nRcppExport SEXP matCorr(SEXP X, SEXP Y);\n#endif\n"
  },
  {
    "path": "tests/tests.R",
    "content": "# tests for travis.ci\nlibrary(scde)\n\n######\n# Basic diff exp and batch correction tests\n######\n\n# load example dataset\ndata(es.mef.small)\n# factor determining cell types\nsg <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", colnames(es.mef.small)), levels=c(\"ESC\", \"MEF\"))\n# the group factor should be named accordingly\nnames(sg) <- colnames(es.mef.small)\ntable(sg)\n\n# clean up the dataset\ncd <- es.mef.small\n# omit genes that are never detected\ncd <- cd[rowSums(cd)>0, ]\n# omit cells with very poor coverage\ncd <- cd[, colSums(cd)>1e4]\n\n# calculate models\n# takes too long to run on travis...\n# o.ifm <- scde.error.models(counts=cd, groups=sg, n.cores=1, threshold.segmentation=T, save.crossfit.plots=F, save.model.plots=F, verbose=1)\n# devtools::use_data(o.ifm)  # save for later since this step takes a long time\ndata(o.ifm)\n\n# filter out cells that don't show positive correlation with\n# the expected expression magnitudes (very poor fits)\nvalid.cells <- o.ifm$corr.a > 0\ntable(valid.cells)\no.ifm <- o.ifm[valid.cells, ]\n\n# estimate gene expression prior\no.prior <- scde.expression.prior(models=o.ifm, counts=cd, length.out=400, show.plot=F)\n\n# define two groups of cells\ngroups <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", rownames(o.ifm)), levels=c(\"ESC\", \"MEF\"))\nnames(groups) <- row.names(o.ifm)\n# run differential expression tests on all genes.\nediff <- scde.expression.difference(o.ifm, cd, o.prior, groups=groups, n.randomizations=100, n.cores=1, verbose=1)\n# top upregulated genes (tail would show top downregulated ones)\n\nscde.test.gene.expression.difference(\"Tdh\", models=o.ifm, counts=cd, prior=o.prior)\n\nbatch <- as.factor(ifelse(rbinom(nrow(o.ifm), 1, 0.5)==1, \"batch1\", \"batch2\"))\n# check the interaction between batches and cell types (shouldn't be any)\ntable(groups, batch)\n# test the Tdh gene again\nscde.test.gene.expression.difference(\"Tdh\", models=o.ifm, counts=cd, prior=o.prior, batch=batch)\n\n# test 
for all of the genes\nediff.batch <- scde.expression.difference(o.ifm, cd, o.prior, groups=groups, batch=batch, n.randomizations=100, n.cores=1, return.posteriors=T, verbose=1)\n"
  },
  {
    "path": "vignettes/diffexp.Rmd",
    "content": "---\ntitle: \"Getting Started with `scde`\"\nauthor: \"Peter Kharchenko, Jean Fan\"\ndate: '`r Sys.Date()`'\noutput: pdf_document\n---\n\n# Single-Cell Differential Expression Analysis\n\nIn this vignette, we show you how perform single cell differential expression analysis using single cell RNA-seq data with the `scde` package.\n\nThe `scde` package implements routines for fitting individual error models for single-cell RNA-seq measurements. Briefly, the read counts observed for each gene are modeled using a mixture of a negative binomial (NB) distribution (for the amplified/detected transcripts) and low-level Poisson distribution (for the unobserved or background-level signal of genes that failed to amplify or were not detected for other reasons). These models can then be used to identify robustly differentially expressed genes between groups of cells. For more information, please refer to the original manuscript by [_Kharchenko et al._](http://www.ncbi.nlm.nih.gov/pubmed/24836921).\n\n## Preparing data\n\nThe analysis starts with a matrix of read counts. Depending on the protocol, these may be raw numbers of reads mapped to each gene, or count values adjusted for potential biases (sequence dependency, splice variant coverage, etc. - the values must be integers). The `scde` package includes a subset of the ES/MEF cell dataset published by [_Islam et al._](http://www.ncbi.nlm.nih.gov/pubmed/24363023). The subset includes first 20 ES and MEF cells. 
Here we load the cells and define a factor separating ES and MEF cell types:\n\n```{r, include = FALSE}\nlibrary(knitr)\nopts_chunk$set(\n    warning = FALSE,\n    message = FALSE,\n    fig.show = 'hold',\n    fig.path = 'figures/scde-',\n    cache.path = 'cache/scde-',\n    cache = TRUE\n)\n```\n\n```{r, data}\nlibrary(scde)\n\n# load example dataset\ndata(es.mef.small)\n\n# factor determining cell types\nsg <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", colnames(es.mef.small)), levels = c(\"ESC\", \"MEF\"))\n# the group factor should be named accordingly\nnames(sg) <- colnames(es.mef.small)  \ntable(sg)\n\n# clean up the dataset\ncd <- clean.counts(es.mef.small, min.lib.size=1000, min.reads = 1, min.detected = 1)\n```\n\n## Fitting error models\n\nAs a next step we fit the error models on which all subsequent calculations will rely. The fitting process relies on a subset of robust genes that are detected in multiple cross-cell comparisons. Here we supply the `groups = sg` argument, so that the error models for the two cell types are fit independently (using two different sets of \"robust\" genes). If the `groups` argument is omitted, the models will be fit using a common set. \n\nNote this step takes a considerable amount of time unless multiple cores are used. 
\n```{r, fit, eval = FALSE}\n# EVALUATION NOT NEEDED\n# calculate models\no.ifm <- scde.error.models(counts = cd, groups = sg, n.cores = 1, threshold.segmentation = TRUE, save.crossfit.plots = FALSE, save.model.plots = FALSE, verbose = 1)\n```\n\nFor the purposes of this vignette, the model has been precomputed and can simply be loaded.\n\n```{r, fit2, results = 'hide'}\ndata(o.ifm)\n```\n\nThe `o.ifm` is a dataframe with error model coefficients for each cell (rows).\n```{r, fit3, results = 'hide'}\nhead(o.ifm)\n```\n\nHere, `corr.a` and `corr.b` are slope and intercept of the correlated component fit, `conc.*` refer to the concomitant fit, `corr.theta` is the NB over-dispersion, and `fail.r` is the background Poisson rate (fixed).\n\nParticularly poor cells may result in abnormal fits, most commonly showing negative `corr.a`, and should be removed.\n\n```{r, fit4}\n# filter out cells that don't show positive correlation with\n# the expected expression magnitudes (very poor fits)\nvalid.cells <- o.ifm$corr.a > 0\ntable(valid.cells)\no.ifm <- o.ifm[valid.cells, ]\n```\n\nHere, all the fits were valid.\n\nFinally, we need to define an expression magnitude prior for the genes. Its main function, however, is to define a grid of expression magnitude values on which the numerical calculations will be carried out.\n\n```{r, prior}\n# estimate gene expression prior\no.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)\n```\n\nHere we used a grid of 400 points, and let the maximum expression magnitude be determined by the default 0.999 quantile (use `max.value` parameter to specify the maximum expression magnitude explicitly - on log10 scale).\n\n## Testing for differential expression\n\nTo test for differential expression, we first define a factor that specifies which two groups of cells are to be compared. The factor elements correspond to the rows of the model matrix (`o.ifm`), and can contain `NA` values (i.e. 
cells that won't be included in either group). Here we key off the the ES and MEF names.\n\n```{r, diffexp}\n# define two groups of cells\ngroups <- factor(gsub(\"(MEF|ESC).*\", \"\\\\1\", rownames(o.ifm)), levels  =  c(\"ESC\", \"MEF\"))\nnames(groups) <- row.names(o.ifm)\n# run differential expression tests on all genes.\nediff <- scde.expression.difference(o.ifm, cd, o.prior, groups  =  groups, n.randomizations  =  100, n.cores  =  1, verbose  =  1)\n# top upregulated genes (tail would show top downregulated ones)\nhead(ediff[order(ediff$Z, decreasing  =  TRUE), ])\n```\n\n```{r, diffexp2, eval = FALSE}\n# write out a table with all the results, showing most significantly different genes (in both directions) on top\nwrite.table(ediff[order(abs(ediff$Z), decreasing = TRUE), ], file = \"results.txt\", row.names = TRUE, col.names = TRUE, sep = \"\\t\", quote = FALSE)\n```\n\nAlternatively we can run the differential expression on a single gene, and visualize the results:\n\n```{r, diffexp3, cache = TRUE, fig.height = 9, fig.width = 6, fig.show='hold', fig.path='figures/scde-'}\nscde.test.gene.expression.difference(\"Tdh\", models = o.ifm, counts = cd, prior = o.prior)\n```\n\nThe top and the bottom plots show expression posteriors derived from individual cells (colored lines) and joint posteriors (black lines). The middle plot shows posterior of the expression fold difference between the two cell groups, highlighting the 95% credible interval by the red shading.\n\n## Correcting for batch effects\n\nWhen the data combines cells that were measured in different batches, it is sometimes necessary to explicitly account for the expression differences that could be explained by the batch composition of the cell groups being compared. 
The example below makes up a random batch composition for the ES/MEF cells, and re-test the expression difference.\n\n```{r, seed, include = FALSE}\nset.seed(1)\n```\n\n```{r, batch, fig.height = 9, fig.width = 6}\nbatch <- as.factor(ifelse(rbinom(nrow(o.ifm), 1, 0.5) == 1, \"batch1\", \"batch2\"))\n# check the interaction between batches and cell types (shouldn't be any)\ntable(groups, batch)\n# test the Tdh gene again\nscde.test.gene.expression.difference(\"Tdh\", models = o.ifm, counts = cd, prior = o.prior, batch = batch)\n```\n\nIn the plot above, the grey lines are used to show posterior distributions based on the batch composition alone. The expression magnitude posteriors (top and bottom plots) look very similar, and as a result the log2 expression ratio posterior is close to 0. The thin black line shows log2 expression ratio posterior before correction. The batch correction doesn't shift the location, but increases uncertainty in the ratio estimate (since we're controlling for another factor).\n\nSimilarly, batch correction can be performed when calculating expression differences for the entire dataset:\n\n```{r, batch2}\n# test for all of the genes\nediff.batch <- scde.expression.difference(o.ifm, cd, o.prior, groups = groups, batch = batch, n.randomizations = 100, n.cores = 1, return.posteriors = TRUE, verbose = 1)\n```\n\n### More detailed functions\n\nThe `scde.expression.difference` method can return a more extensive set of results, including joint posteriors and the expression fold difference posteriors for all of the exam\nined genes:\n```{r, detailed1, include=FALSE, eval=FALSE}\n# recalculate difference and return with joint posteriors and difference posterior\nediff.details <- scde.expression.difference(o.ifm, cd, o.prior, n.randomizations = 100, n.cores = 1, verbose = 1, return.posteriors = TRUE)\n```\n\nThe joint posteriors can also be obtained explicitly for a particular set of cells:\n```{r, detailed2, eval=FALSE}\n# calculate joint posterior 
for ESCs (set return.individual.posterior.modes=T if you need p.modes)\njp <- scde.posteriors(models = o.ifm[grep(\"ESC\",rownames(o.ifm)), ], cd, o.prior, n.cores = 1)\n```\n\nThe error models fit the intercept and the slope of the NB \"correlated\" component, providing more consistent expression magnitude estimates among the cells. These can be obtained with a quick helper function:\n```{r, detailed3}\n# get expression magnitude estimates\no.fpm <- scde.expression.magnitude(o.ifm, counts = cd)\n```\n\nDrop-out probabilities (as a function of expression magnitudes) for different cells are useful for assessing the quality of the measurements: \n```{r, detailed4, fig.width=4, fig.height=4}\n# get failure probabilities on the expression range\no.fail.curves <- scde.failure.probability(o.ifm, magnitudes = log((10^o.prior$x)-1))\npar(mfrow = c(1,1), mar = c(3.5,3.5,0.5,0.5), mgp = c(2.0,0.65,0), cex = 1)\nplot(c(), c(), xlim=range(o.prior$x), ylim=c(0,1), xlab=\"expression magnitude (log10)\", ylab=\"drop-out probability\")\ninvisible(apply(o.fail.curves[, grep(\"ES\",colnames(o.fail.curves))], 2, function(y) lines(x = o.prior$x, y = y,col = \"orange\")))\ninvisible(apply(o.fail.curves[, grep(\"MEF\", colnames(o.fail.curves))], 2, function(y) lines(x = o.prior$x, y = y, col = \"dodgerblue\")))\n```\n\nThe drop-out probabilities (at a given expression magnitude, or at an observed count) can be useful in subsequent analysis\n```{r, detailed5}\n# get failure probabilities on the expression range\no.fail.curves <- scde.failure.probability(o.ifm, magnitudes = log((10^o.prior$x)-1))\n# get self-fail probabilities (at a given observed count)\np.self.fail <- scde.failure.probability(models = o.ifm, counts = cd)\n```\n\n## Adjusted distance measures\n\nThe dependency of drop-out probability on the average expression magnitude captured by the cell-specific models can be used to adjust cell-to-cell similarity measures, for instance in the context of cell clustering. 
Several such measures are explored below.\n\n### Direct drop-out\n\nDirect weighting downweights the contribution of a given gene to the cell-to-cell distance based on the probability that the given measurement is a drop-out event (i.e. belongs to the drop-out component) - the \"self-fail\" probability shown in the previous section. To estimate the adjusted distance, we will simulate the drop-out events, replacing them with `NA` values, and calculating correlation using the remaining points: \n```{r, adjusted1, results='hide', eval=FALSE}\np.self.fail <- scde.failure.probability(models = o.ifm, counts = cd)\n# simulate drop-outs\n# note: using 10 sampling rounds for illustration here. ~500 or more should be used.\nn.simulations <- 10; k <- 0.9;\ncell.names <- colnames(cd); names(cell.names) <- cell.names;\ndl <- mclapply(1:n.simulations,function(i) {\n  scd1 <- do.call(cbind,lapply(cell.names,function(nam) {\n    x <- cd[,nam];\n    # replace predicted drop outs with NAs\n    x[!as.logical(rbinom(length(x),1,1-p.self.fail[,nam]*k))] <- NA;\n    x;\n    }))\n  rownames(scd1) <- rownames(cd); \n  # calculate correlation on the complete observation pairs\n  cor(log10(scd1+1),use=\"pairwise.complete.obs\");\n}, mc.cores = 1)\n# calculate average distance across sampling rounds\ndirect.dist <- as.dist(1-Reduce(\"+\",dl)/length(dl))\n```\n\n### Reciprocal weighting\nThe reciprocal weighting of the Pearson correlation will give increased weight to pairs of observations where a gene expressed (on average) at a level x1 observed in a cell c1 would not be likely to fail in a cell c2, and vice versa:\n```{r, adjusted2, results='hide', eval=FALSE}\n# load boot package for the weighted correlation implementation\nrequire(boot)\nk <- 0.95;\nreciprocal.dist <- as.dist(1 - do.call(rbind, mclapply(cell.names, function(nam1) {\n  unlist(lapply(cell.names, function(nam2) {\n    # reciprocal probabilities\n    f1 <- scde.failure.probability(models = o.ifm[nam1,,drop = FALSE], 
magnitudes = o.fpm[, nam2])\n    f2 <- scde.failure.probability(models = o.ifm[nam2,,drop = FALSE], magnitudes = o.fpm[, nam1])\n    # weight factor\n    pnf <- sqrt((1-f1)*(1-f2))*k +(1-k); \n    boot::corr(log10(cbind(cd[, nam1], cd[, nam2])+1), w = pnf)\n    }))\n},mc.cores = 1)), upper = FALSE)\n```\n\n### Mode-relative weighting\nA more reliable reference magnitude against which drop-out likelihood could be assessed would be an estimate of the average expression magnitude, such as joint posterior mode. Below we estimate `p.mode.fail`, a probability that a drop-out event could be observed at the level of average expression magnitude in a given cell. For each measurement we then reduce its weight if it indeed dropped out in a cell where we expect it to drop-out given its average expression magnitude `(p.self.fail*p.mode.fail)`. However we do want to give high weight to measurements where the drop-out was not observed, even though it was expected based on the average expression magnitude, so the overall weight expression is `(1-p.self.fail*sqrt(p.self.fail*p.mode.fail))` (other formulations are clearly possible here).\n```{r, adjusted3, results='hide', eval=FALSE}\n# recalculate posteriors with the individual posterior modes \njp <- scde.posteriors(models = o.ifm, cd, o.prior, return.individual.posterior.modes = TRUE, n.cores = 1)\n# find joint posterior modes for each gene - a measure of MLE of group-average expression\njp$jp.modes <- log(as.numeric(colnames(jp$jp)))[max.col(jp$jp)]\np.mode.fail <- scde.failure.probability(models = o.ifm, magnitudes = jp$jp.modes)\n# weight matrix\nmatw <- 1-sqrt(p.self.fail*sqrt(p.self.fail*p.mode.fail))\n# magnitude matrix (using individual posterior modes here)\nmat <- log10(exp(jp$modes)+1);\n# weighted distance\nmode.fail.dist <- as.dist(1-do.call(rbind,mclapply(cell.names,function(nam1) {\n  unlist(lapply(cell.names,function(nam2) {\n    corr(cbind(mat[, nam1], mat[, nam2]), w = sqrt(sqrt(matw[, nam1]*matw[, nam2])))\n  
}))\n}, mc.cores = 1)), upper = FALSE)\n```\n"
  },
  {
    "path": "vignettes/pagoda.Rmd",
    "content": "---\ntitle: \"Getting Started with `pagoda` Routines\"\nauthor: \"Peter Kharchenko, Jean Fan\"\ndate: '`r Sys.Date()`'\noutput: html_document\n---\n\n# Pathway and Gene Set Overdispersion Analysis\n\nIn this vignette, we show you how to use `pagoda` routines in the `scde` package to characterize aspects of transcriptional heterogeneity in populations of single cells. \n\nThe `pagoda` routines implemented in the `scde` resolves multiple, potentially overlapping aspects of transcriptional heterogeneity by identifying known pathways or novel gene sets that show significant excess of coordinated variability among the measured cells. Briefly, cell-specific error models derived from `scde` are used to estimate residual gene expression variance, and identify pathways and gene sets that exhibit statistically significant excess of coordinated variability (overdispersion). `pagoda` can be used to effectively recover known subpopulations and discover putative new subpopulations and their corresponding functional characteristics in single-cell samples. For more information, please refer to the original manuscript by [_Fan et al._](http://biorxiv.org/content/early/2015/09/16/026948).\n\n```{r, include = FALSE}\nlibrary(knitr)\nopts_chunk$set(\n    warning = FALSE,\n    message = FALSE,\n    fig.show = 'hold',\n    fig.path = 'figures/pagoda-',\n    cache.path = 'cache/pagoda-',\n    cache = TRUE\n)\n```\n\n## Preparing data\n\nThe analysis starts with a matrix of read counts. Here, we use the read count table and cell group annotations from [_Pollen et al._](www.ncbi.nlm.nih.gov/pubmed/25086649) can be loaded using the `data(\"pollen\")` call. 
Some additional filters are also applied.\n\n```{r, data}\nlibrary(scde)\n\ndata(pollen)\n# remove poor cells and genes\ncd <- clean.counts(pollen)\n# check the final dimensions of the read count matrix\ndim(cd)\n```\n\nNext, we'll translate group and sample source data from [_Pollen et al._](www.ncbi.nlm.nih.gov/pubmed/25086649) into color codes. These will be used later to compare [_Pollen et al._](www.ncbi.nlm.nih.gov/pubmed/25086649)'s derived annotation with subpopulations identified by `pagoda`:\n\n```{r, colorcodes}\nx <- gsub(\"^Hi_(.*)_.*\", \"\\\\1\", colnames(cd))\nl2cols <- c(\"coral4\", \"olivedrab3\", \"skyblue2\", \"slateblue3\")[as.integer(factor(x, levels = c(\"NPC\", \"GW16\", \"GW21\", \"GW21+3\")))]\n```\n\n## Fitting error models\n\nNext, we'll construct error models for individual cells. Here, we use k-nearest neighbor model fitting procedure implemented by `knn.error.models()` method. This is a relatively noisy dataset (non-UMI), so we raise the `min.count.threshold` to 2 (minimum number of reads for the gene to be initially classified as a non-failed measurement), requiring at least 5 non-failed measurements per gene. We're providing a rough guess to the complexity of the population, by fitting the error models based on 1/4 of most similar cells (i.e. guessing there might be ~4 subpopulations). \n\nNote this step takes a considerable amount of time unless multiple cores are used. We highly recommend use of multiple cores. You can check the number of available cores available using `detectCores()`. 
\n\n```{r, models, eval = FALSE, hide = TRUE}\n# EVALUATION NOT NEEDED\nknn <- knn.error.models(cd, k = ncol(cd)/4, n.cores = 1, min.count.threshold = 2, min.nonfailed = 5, max.model.plots = 10)\n```\n\nFor the purposes of this vignette, the model has been precomputed and can simply be loaded.\n\n```{r, models2, results = 'hide'}\ndata(knn)\n```\n\nThe fitting process above wrote out `cell.models.pdf` file in the current directory showing model fits for the first 10 cells (see `max.model.plots` argument). The fitting process above wrote out `cell.models.pdf` file in the current directory showing model fits for the first 10 cells (see `max.model.plots` argument). Here's an example of such plot:\n\n![cell 3 model](https://github.com/hms-dbmi/scde/raw/master/vignettes/figures/pagoda-cell.model.fits-0.png)\n\nThe two scatter plots on the left show observed (in a given cell) vs. expected (from k similar cells) expression magnitudes for each gene that is being used for model fitting. The second (from the left) scatter plot shows genes belonging to the drop-out component in red. The black dashed lines show 95% confidence band for the amplified genes (the grey dashed lines show confidence band for an alternative constant-theta model). The third plot shows drop-out probability as a function of magnitude, and the fourth plot shows negative binomial theta local regression fit as a function of magnitude (for the amplified component). \n\n## Normalizing variance\n\nIn order to accurately quantify excess variance or overdispersion, we must normalize out expected levels of technical and intrinsic biological noise. Briefly, variance of the NB/Poisson mixture processes derived from the error modeling step are modeled as a chi-squared distribution using adjusted degrees of freedom and observation weights based on the drop-out probability of a given gene. 
Here, we normalize variance, trimming 3 most extreme cells and limiting maximum adjusted variance to 5.\n\n```{r varnorm, fig.height = 3, fig.width = 6}\nvarinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = TRUE)\n```\n\nThe plot on the left shows coefficient of variance squared (on log10 scale) as a function of expression magnitude (log10 FPM). The red line shows local regression model for the genome-wide average dependency. The plot on the right shows adjusted variance (derived based on chi-squared probability of observed/genomewide expected ratio for each gene, with degrees of freedom adjusted for each gene). The adjusted variance of 1 means that a given gene exhibits as much variance as expected for a gene of such population average expression magnitude. Genes with high adjusted variance are overdispersed within the measured population and most likely show subpopulation-specific expression:\n\n```{r, varnorm2}\n# list top overdispersed genes\nsort(varinfo$arv, decreasing = TRUE)[1:10]\n```\n\n## Controlling for sequencing depth \n\nEven with all the corrections, sequencing depth or gene coverage is typically still a major aspect of variability. In most studies, we would want to control for that as a technical artifact (exceptions are cell mixtures where subtypes significantly differ in the amount of total mRNA). Below we will control for the gene coverage (estimated as a number of genes with non-zero magnitude per cell) and normalize out that aspect of cell heterogeneity: \n\n```{r, varnorm3}\nvarinfo <- pagoda.subtract.aspect(varinfo, colSums(cd[, rownames(knn)]>0))\n```\n\n## Evaluate overdispersion of pre-defined gene sets\n\nIn order to detect significant aspects of heterogeneity across the population of single cells, 'pagoda' identifies pathways and gene sets that exhibit statistically significant excess of coordinated variability. 
Specifically, for each gene set, we test whether the amount of variance explained by the first principal component significantly exceeds the background expectation. We can test both pre-defined gene sets as well as 'de novo' gene sets whose expression profiles are well-correlated within the given dataset. \n\nFor pre-defined gene sets, we'll use GO annotations. For the purposes of this vignette, in order to make calculations faster, we will only consider the first 100 GO terms plus a few that we care about. Additional tutorials on how to create and use your own gene sets can be found in [a separate tutorial](http://hms-dbmi.github.io/scde/genesets.html). \n\n```{r, goenv}\nlibrary(org.Hs.eg.db)\n# translate gene names to ids\nids <- unlist(lapply(mget(rownames(cd), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1]))\nrids <- names(ids); names(rids) <- ids \n# convert GO lists from ids to gene names\ngos.interest <- unique(c(ls(org.Hs.egGO2ALLEGS)[1:100],\"GO:0022008\",\"GO:0048699\", \"GO:0000280\", \"GO:0007067\")) \ngo.env <- lapply(mget(gos.interest, org.Hs.egGO2ALLEGS), function(x) as.character(na.omit(rids[x]))) \ngo.env <- clean.gos(go.env) # remove GOs with too few or too many genes\ngo.env <- list2env(go.env) # convert to an environment\n```\n\nNow, we can calculate weighted first principal component magnitudes for each GO gene set in the provided environment.\n\n```{r, pathwaySig}\npwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components = 1, n.cores = 1)\n```\n\nWe can now evaluate the statistical significance of the observed overdispersion for each GO gene set.\n\n```{r, topPathways, fig.height = 4, fig.width = 5}\ndf <- pagoda.top.aspects(pwpca, return.table = TRUE, plot = TRUE, z.score = 1.96)\n```\n\nEach point on the plot shows the PC1 variance (lambda1) magnitude (normalized by set size) as a function of set size. 
The red lines show expected (solid) and 95% upper bound (dashed) magnitudes based on the Tracy-Widom model.\n\n```{r, df}\nhead(df)\n```\n\n* The z column gives the Z-score of pathway over-dispersion relative to the genome-wide model (Z-score of 1.96 corresponds to P-value of 5%, etc.). \n* \"z.adj\" column shows the Z-score adjusted for multiple hypothesis (using Benjamini-Hochberg correction). \n* \"score\" gives observed/expected variance ratio\n* \"sh.z\" and \"adj.sh.z\" columns give the raw and adjusted Z-scores of \"pathway cohesion\", which compares the observed PC1 magnitude to the magnitudes obtained when the observations for each gene are randomized with respect to cells. When such Z-score is high (e.g. for GO:0008009) then multiple genes within the pathway contribute to the coordinated pattern.\n\n\n## Evaluate overdispersion of 'de novo' gene sets\n\nWe can also test 'de novo' gene sets whose expression profiles are well-correlated within the given dataset. The following procedure will determine 'de novo' gene clusters in the data, and build a background model for the expectation of the gene cluster weighted principal component magnitudes. Note the higher trim values for the clusters, as we want to avoid clusters that are formed by outlier cells.\n\n```{r, clusterPCA, fig.height = 3, fig.width = 6}\nclpca <- pagoda.gene.clusters(varinfo, trim = 7.1/ncol(varinfo$mat), n.clusters = 50, n.cores = 1, plot = TRUE)\n```\n\nThe plot above shows background distribution of the first principal component (`PC1`) variance (`lambda1`) magnitude. The blue scatterplot on the left shows `lambda1` magnitude vs. cluster size for clusters determined based on randomly-generated matrices of the same size. The black circles show top cluster in each simulation. The red lines show expected magnitude and 95% confidence interval based on Tracy-Widom distribution. 
The right plot shows extreme value distribution fit of residual cluster `PC1` variance magnitude relative to the Gumbel (extreme value) distribution.\n\nNow the set of top aspects can be recalculated taking these `de novo` gene clusters into account:\n\n```{r, topPathways2, fig.height = 4, fig.width = 5}\ndf <- pagoda.top.aspects(pwpca, clpca, return.table = TRUE, plot = TRUE, z.score = 1.96)\nhead(df)\n```\n\nThe gene clusters and their corresponding model expected value and 95% upper bound are shown in green.\n\n\n## Visualize significant aspects of heterogeneity\n\nTo view top heterogeneity aspects, we will first obtain information on all the significant aspects of transcriptional heterogeneity. We will also determine the overall cell clustering based on this full information:\n\n```{r, celclust}\n# get full info on the top aspects\ntam <- pagoda.top.aspects(pwpca, clpca, n.cells = NULL, z.score = qnorm(0.01/2, lower.tail = FALSE))\n# determine overall cell clustering\nhc <- pagoda.cluster.cells(tam, varinfo)\n```\n\nNext, we will reduce redundant aspects in two steps. First we will combine pathways that are driven by the same sets of genes:\n\n```{r, loadingCollapse}\ntamr <- pagoda.reduce.loading.redundancy(tam, pwpca, clpca)\n```\n\nIn the second step we will combine aspects that show similar patterns (i.e. separate the same sets of cells). Here we will plot the cells using the overall cell clustering determined above:\n\n```{r, correlatedCollapse, fig.height = 6, fig.width = 10}\ntamr2 <- pagoda.reduce.redundancy(tamr, distance.threshold = 0.9, plot = TRUE, cell.clustering = hc, labRow = NA, labCol = NA, box = TRUE, margins = c(0.5, 0.5), trim = 0)\n```\n\nIn the plot above, the columns are cells, rows are different significant aspects, clustered by their similarity pattern. The green-to-orange color scheme shows low-to-high weighted PCA scores (aspect patterns), where generally orange indicates higher expression. 
Blocks of color on the left margin show which aspects have been combined by the command above. Here the number of resulting aspects is relatively small. \"top\" argument (i.e. top = 10) can be used to limit further analysis to top N aspects.\n\nWe will view the top aspects, clustering them by pattern similarity (note, to view aspects in the order of increasing `lambda1` magnitude, use `row.clustering = NA`). \n\n```{r, viewAspects, fig.height = 3.5, fig.width = 8}\ncol.cols <- rbind(groups = cutree(hc, 3))\npagoda.view.aspects(tamr2, cell.clustering = hc, box = TRUE, labCol = NA, margins = c(0.5, 20), col.cols = rbind(l2cols))\n```\n\nWhile each row here represents a cluster of pathways, the row names are assigned to be the top overdispersed aspect in each cluster.\n\nTo interactively browse and explore the output, we can create a `pagoda` app:\n\n```{r, pagodaApp, eval = FALSE}\n# compile a browsable app, showing top three clusters with the top color bar\napp <- make.pagoda.app(tamr2, tam, varinfo, go.env, pwpca, clpca, col.cols = col.cols, cell.clustering = hc, title = \"NPCs\")\n# show app in the browser (port 1468)\nshow.app(app, \"pollen\", browse = TRUE, port = 1468) \n```\n\nThe `pagoda` app allows you to view the gene sets grouped within each aspect (row), as well as genes underlying the detected heterogeneity patterns. A screenshot of the app is provided below:\n\n![pagoda app](https://github.com/hms-dbmi/scde/raw/master/vignettes/figures/pagoda-Screen_Shot_2015-06-07_at_4.53.46_PM.png)\n\nAn interactive version of the full run (all GO terms tested) can also be found here: [http://pklab.med.harvard.edu/cgi-bin/R/rook/pollen.npc/index.html](http://pklab.med.harvard.edu/cgi-bin/R/rook/pollen.npc/index.html)\n\nSimilar views can be obtained in the R session itself. 
For instance, here we'll view top 10 genes associated with the top two pathways in the neurogenesis cluster: \"neurogenesis\" (GO:0022008) and \"generation of neurons\" (GO:0048699)\n\n```{r, showTopPathwayGenes, fig.height = 3.5, fig.width = 8}\npagoda.show.pathways(c(\"GO:0022008\",\"GO:0048699\"), varinfo, go.env, cell.clustering = hc, margins = c(1,5), show.cell.dendrogram = TRUE, showRowLabels = TRUE, showPC = TRUE)\n```\n\n## Controlling for undesired aspects of heterogeneity\n\nDepending on the biological setting, certain dominant aspects of transcriptional heterogeneity may not be of interest. To explicitly control for these aspects of heterogeneity that are not of interest, we will use `pagoda.subtract.aspect` method that we've previously used to control for residual patterns associated with sequencing depth differences. Here, we illustrate how to control for the mitotic cell cycle pattern (GO:0000280 nuclear division and GO:0007067 mitotic nuclear division) which showed up as one of the four significant aspects in the analysis above.\n\n```{r, controlForCellCycle}\n# get cell cycle signature and view the top genes\ncc.pattern <- pagoda.show.pathways(c(\"GO:0000280\", \"GO:0007067\"), varinfo, go.env, show.cell.dendrogram = TRUE, cell.clustering = hc, showRowLabels = TRUE)\n# subtract the pattern\nvarinfo.cc <- pagoda.subtract.aspect(varinfo, cc.pattern)\n```\n\nNow we can go through the same analysis as shown above, starting with the `pagoda.pathway.wPCA()` call, using `varinfo.cc` instead of `varinfo`, which will control for the cell cycle heterogeneity between the cells.\n"
  },
  {
    "path": "web/additional.css",
    "content": "body {\n    margin: 0px;\n    margin-top: 0;\n}\n\ntable#elevels {\n    //background-color:#FFFFFF;\n    //border: solid #000 2px;\n    width: 500px;\n }\n table#elevels th {\n     padding: 5px;\n     text-align: center;\n }\n table#elevels td {\n    padding: 5px;\n    text-align: center;\n    border: solid #000 1px;\n }"
  },
  {
    "path": "web/pathcl.css",
    "content": ".tab-cont {\n    float: left;\n    margin: 0 15px 15px 0;\n}\n\n.tab-icon {\n    background-image: url(extjs/examples/shared/icons/fam/application_view_list.png);\n}\n.pathcl-icon {\n    background-image: url(extjs/examples/shared/icons/fam/text_list_bullets.png);\n}\n\n.colormap {\n    shape-rendering: crispEdges;\n}\n\n.rowLabel {\n    text-anchor: start;\n    dominant-baseline: middle;\n    font: 18px sans-serif;\n}\n\n.geneRowLabel {\n    text-anchor: start;\n    dominant-baseline: middle;\n    font: 12px sans-serif;\n}\n\n/* cross hairs */\n.crosshair line {\n    display:none;\n    pointer-events:none;\n}\n.highlighting .crosshair line {\n  display:inline;\n  stroke-dasharray: 5,5;\n  stroke: black;\n  stroke-width: 1px;\n  pointer-events:none;\n}\nline.highlighting.crosshair {\n    display:inline;\n    stroke-dasharray: 5,5;\n    stroke: black;\n    stroke-width: 1px;\n    pointer-events:none;\n}\ng.crosshair text {\n    display:none;\n    pointer-events:none;\n}\n.highlighting g.crosshair text {\n    display:inline;\n    fill: black;\n    font: 18px sans-serif;\n    font-weight: bold;\n}\n\ndiv.positive .x-progress-bar.x-progress-bar-default{\n    background-color: #FFABAB;\n    background-image: none;\n}\ndiv.x-progress .x-progress-text.x-progress-text-back {\n    color:black;\n}\ndiv.x-progress.x-progress-default {\n    border-color: black; \n    background-color: #EEEEEE;\n\n}\ndiv.negative .x-progress-bar.x-progress-bar-default{\n    background-color: #85C2FF;\n    background-image: none;\n}\n\ndiv.x-progress .x-progress-text {\n    text-align: left;\n    color: black;\n    font-weight: normal;\n    padding-left: 5px;\n}\n\n#pathclsvg path {\n    vector-effect: non-scaling-stroke;\n}\n\nrect.datapt {\n    pointer-events: all;\n}\n\n.axis path,\n.axis line {\n    fill: none;\n    stroke: black;\n    shape-rendering: crispEdges;\n}\n\n.axis text {\n    font-family: sans-serif;\n    font-size: 11px;\n}\n\ncircle {\n    stroke: black;\n    
stroke-width: 0.5;\n    fill-opacity: 0.7;\n}\n\ncircle.selected {\n    stroke: black; \n    stroke-width: 2.0;\n    fill-opacity:1;\n}"
  },
  {
    "path": "web/pathcl.js",
    "content": "Ext.require(['*']);\n\nExt.onReady(function() {\n   \n\n    var cw;\n    var currentPathCl=-1; // currently selected pathway cluster\n\n    Ext.tip.QuickTipManager.init();    \n\n    //Ext.state.Manager.setProvider(Ext.create('Ext.state.CookieProvider'));\n\n    var clusterToolbar = Ext.create('Ext.toolbar.Toolbar', {\n\t    items: [ {\n\t\t    // xtype: 'button', // default for Toolbars\n\t\t    text: 'Show'\n\t\t},'->',{\n\t\t    xtype    : 'textfield',\n\t\t    icon: 'preview.png',\n\t\t    cls: 'x-btn-text-icon',\n\t\t    fieldLabel: 'number of genes',\n\t\t    labelStyle: 'white-space: nowrap;',\n\t\t    name     : 'nGenes',\n\t\t    emptyText: 'number of genes to show'\n\t\t}]\n\t});\n\n    var tutorialWindow = new Ext.Window({\n\tid:'tutorial-window', \n\ttitle: 'Video Tutorial',\n\tlayout:'fit',  \n\tactiveItem: 0,  \n\n\tdefaults: {border:false}, \n\tbbar: Ext.create('Ext.toolbar.Toolbar', {\n\t    padding: 5,\n\t    items   : [{\n\t\txtype: 'checkbox',\n\t\tboxLabel: 'do not automatically show this tutorial video on startup when I visit next time',\n\t\tchecked: Ext.util.Cookies.get(\"hidetutorial\")!=null,\n\t\tname: 'dontshowtutorial',\n\t\tlisteners: {\n\t\t    change: function(field, value) {\n\t\t\tif(value) {\n\t\t\t    var now = new Date();\n\t\t\t    var expiry = new Date(now.getTime() + 365 * 24 * 60 * 60 * 1000);\n\t\t\t    Ext.util.Cookies.set(\"hidetutorial\",true,expiry)\n\t\t\t} else {\n\t\t\t    console.log(\"clearing hidetutorial\");\n\t\t\t}\n\t\t    }\n\t\t}\n\t    },'->',{\n\t\txtype: 'button',\n\t\ttext: 'Close',\n\t\thandler: function() {\n\t\t    tutorialWindow.hide();\n\t\t}\n\t    }\n           ]\n\t}),\n\titems : [{\n\t\t    id: \"video\",\n\t\t    html: '<iframe width=\"720\" height=\"400\" src=\"//www.youtube.com/embed/N2Ritx5yqrc?rel=0\" frameborder=\"0\" allowfullscreen></iframe>'\n\t\t}]\t\n\t});\n\n    \n\n    /* DETAILED CLUSTERING VIEW */\n    var detailMode=1; // 1: gene 2: pathway\n    var 
detailItemList={}; // pathway or gene list\n    var detailNGenes=20; // max genes to show \n\n    var detailPanel = Ext.create('Ext.panel.Panel', {\n\tbodyPadding: 5,\n\tautoScroll: true,\n\tshowGenes: function(ids) {\n\t    if(ids === undefined || ids.length==0) return;\n\t    detailPanel.setLoading(true);\n\t    Ext.Ajax.request({\n\t\turl: 'genecl.json',\n\t\tmethod: 'POST',          \n\t\twaitTitle: 'Connecting',\n\t\twaitMsg: 'Sending data...',                                     \n\t\tparams: {\n\t\t    \"genes\" : encodeURIComponent(JSON.stringify(ids))\n\t\t},\n\t\tscope:this,\n\t\tfailure: function(r,o){console.log('failure:'); console.log(r);},\n\t\tsuccess: function(response) {\n\t\t    var data = Ext.JSON.decode(response.responseText)\n\t\t    detailPanel.ids=ids;\n\t\t    detailPanel.genecldata=data;\n\t\t    detailPanel.mode=1;\n\t\t    Ext.getCmp('expressionDetailsPane').setTitle(\"Expression Details: selected genes\");\n\t\t    detailPanel.redraw(detailPanel.genecldata)\n\t\t    detailPanelGearMenu.getComponent(0).disable()\n\t\t    //detailPanelGearMenu.getComponent(1).disable()\n\t\t    detailPanelGearMenu.getComponent(2).disable()\n\t\t    detailPanelGearMenu.getComponent(4).disable()\n\t\t    detailPanel.setLoading(false);\n\t\t}\n\t    });\n\t},\n\tshowPathways: function(ids) {\n\t    if(ids === undefined || ids.length==0) return;\n\t    detailPanel.setLoading(true);\n\t    var ngenes=detailPanelGearMenu.getComponent(0).getValue();\n\t    var twosided=detailPanelGearMenu.getComponent(4).checked;\n\t    Ext.Ajax.request({\n\t\turl: 'pathwaygenes.json',\n\t\tmethod: 'POST',          \n\t\twaitTitle: 'Connecting',\n\t\twaitMsg: 'Sending data...',                                     \n\t\tparams: {\n\t\t    \"ngenes\" : ngenes,\n\t\t    \"twosided\" : twosided,\n\t\t    \"genes\" : encodeURIComponent(JSON.stringify(ids)),\n\t\t    \"trim\" : hc.trim\n\t\t},\n\t\tscope:this,\n\t\tfailure: function(r,o){console.log('failure:'); 
console.log(r);},\n\t\tsuccess: function(response) {\n\t\t    var data = Ext.JSON.decode(response.responseText)\n\t\t    detailPanel.ids=ids;\n\t\t    detailPanel.genecldata=data;\n\t\t    detailPanel.mode=2; \n\t\t    Ext.getCmp('expressionDetailsPane').setTitle(\"Expression Details: top genes in specified pathways\");\n\t\t    detailPanel.redraw(detailPanel.genecldata)\n\t\t    detailPanelGearMenu.getComponent(0).enable()\n\t\t    //detailPanelGearMenu.getComponent(1).enable()\n\t\t    detailPanelGearMenu.getComponent(2).enable()\n\t\t    detailPanelGearMenu.getComponent(4).enable()\n\t\t    detailPanel.setLoading(false);\n\t\t}\n\t    });\n\t    \n\t},\n\tsearchSimilar: function(pattern) { // request genes most closely matching current data.colcol\n\t    if(!detailPanel.hasOwnProperty('genecldata')) return;\n\t    detailPanel.setLoading(true);\n\t    if(pattern === undefined) {\n\t\t// determine the colcol or single gene\n\t\tif(detailPanel.genecldata.hasOwnProperty('colcols')) {\n\t\t    pattern=detailPanel.genecldata.colcols.data;\n\t\t} else {\n\t\t    pattern=detailPanel.genecldata.matrix.data.slice(0,detailPanel.genecldata.matrix.dim[1]);\n\t\t}\n\t    }\n\t    var ngenes=detailPanelGearMenu.getComponent(0).getValue();\n\t    var twosided=detailPanelGearMenu.getComponent(4).checked;\n\t    Ext.Ajax.request({\n\t\turl: 'patterngenes.json',\n\t\tmethod: 'POST',          \n\t\twaitTitle: 'Connecting',\n\t\twaitMsg: 'Sending data...',                                     \n\t\tparams: {\n\t\t    \"ngenes\" : ngenes,\n\t\t    \"twosided\" : twosided,\n\t\t    \"pattern\" : encodeURIComponent(JSON.stringify(pattern)),\n\t\t    \"trim\" : hc.trim\n\t\t},\n\t\tscope:this,\n\t\tfailure: function(r,o){console.log('failure:'); console.log(r);},\n\t\tsuccess: function(response) {\n\t\t    var data = Ext.JSON.decode(response.responseText)\n\t\t    detailPanel.pattern=pattern;\n\t\t    detailPanel.genecldata=data;\n\t\t    detailPanel.mode=3;\n\t\t    
Ext.getCmp('expressionDetailsPane').setTitle(\"Expression Details: genes matching specified pattern\");\n\t\t    detailPanel.redraw(detailPanel.genecldata)\n\t\t    detailPanelGearMenu.getComponent(0).enable()\n\t\t    //detailPanelGearMenu.getComponent(1).enable()\n\t\t    detailPanelGearMenu.getComponent(2).enable()\n\t\t    detailPanelGearMenu.getComponent(4).enable()\n\t\t    detailPanel.setLoading(false);\n\t\t}\n\t    });\n\t},\n\treload: function() { // goes back to the server to redraw the same ids\n\t    if(!detailPanel.hasOwnProperty('mode')) return;\n\t    switch(detailPanel.mode) {\n\t    case 1: detailPanel.showGenes(detailPanel.ids); break;\n\t    case 2: detailPanel.showPathways(detailPanel.ids);break;\n\t    case 3: detailPanel.searchSimilar(detailPanel.pattern);break;\n\t    default: console.log(\"reload requested with an undefined detailPanel mode\");\n\t    }\n\t},\n\tredraw: function(data) { // redraws the panels without going back to the server\n\t    if(data === undefined) {\n\t\tif(detailPanel.hasOwnProperty('genecldata')) { data=detailPanel.genecldata; } else {  return; }\n\t    }\n\t    //$('.datapt').remove();\n\t    $('#geneclsvg .datapt').remove();\n\t    $('#geneclsvg').remove();\n\t    detailPanel.update(\"\");    \n\t    s=detailPanel.getSize();\n\t    s.height=hc.geneUnitHeight*data.matrix.dim[0]+20; \n\t    if(data.hasOwnProperty('colcols')) { s.height=s.height+hc.ccheight(data); }\n\t    var el = d3.select(detailPanel.getLayout().getElementTarget().dom)\n\t    var svg = el.append(\"svg\").attr(\"id\",\"geneclsvg\").attr(\"width\",(s.width-hc.padding.width)+\"px\").attr(\"height\",(s.height-hc.padding.height)+\"px\").attr('xmlns','http://www.w3.org/2000/svg');\n\t    \n\t    if(data.hasOwnProperty('colcols')) {\n\t\tvar colcol = colcolmap(svg.append(\"g\").attr(\"transform\",\"translate(\"+hc.hmleft()+\",\"+hc.cctop(data)+\")\"), data.colcols, hc.hmwidth(s.width), hc.ccheight(data)); \n\t\tcolcol.append(\"title\").text(\"1st 
principal component (PC) of the selected gene set expression: green - negative, white - neutral, orange - positive\")\n\t    }\n\t    var cmap = colormap(svg.append(\"g\").attr(\"id\",\"gmapg\").attr(\"transform\",\"translate(\"+hc.hmleft()+\",\"+hc.hmtop(data)+\")\"),data.matrix,hc.hmwidth(s.width),hc.hmheight(data,s.height)); \n\t    if(data.hasOwnProperty('rowcols')) {\n\t\tvar rowcols = sidecolormap(svg.append(\"g\").attr(\"transform\",\"translate(\"+hc.margins.left+\",\"+hc.hmtop(data)+\")\"), data.rowcols, hc.rowcolWidth, hc.hmheight(data,s.height));\n\t\trowcols.append(\"title\").text(\"contribution of a gene to the 1st principal component: green - negative, white - neutral, orange - positive\")\n\t    }\n\t    var rowLabelStep = hc.hmheight(data,s.height)/data.matrix.dim[0];\n\t    var rowlabg = svg.append(\"g\")\n\t\t.attr(\"transform\",\"translate(\"+hc.rowlableft(s.width)+\",\"+hc.hmtop(data)+\")\")\n\t\t.append(\"g\")\n\t    if(data.matrix.dim[0]==1) data.matrix.rows=[data.matrix.rows]\n\n\t    var rowlab = rowlabg\n\t\t.selectAll(\".rowLabelg\")\n\t\t.data(data.matrix.rows)\n\t\t.enter()\n\t\t.append(\"text\")\n\t\t.text(function(d) { return(d); })\n\t\t.attr(\"x\",0)\n\t\t.attr(\"y\",function(d,i) { return Math.floor(i*rowLabelStep + rowLabelStep/2); })\n\t\t.classed(\"geneRowLabel\",true)[0]\n\t    \n\n\t    var focus = svg.append(\"g\")\n\t\t.attr(\"class\",\"crosshair\");\n\t    var focushl = focus.append(\"line\").classed(\"crosshair\",true).attr({\"x1\":hc.margins.left,\"y1\":Math.round(hc.hmtop(data)+cmap.y(1)/2),\"x2\":(hc.hmleft()+hc.hmwidth(s.width)),\"y2\":Math.round(hc.hmtop(data)+cmap.y(1)/2)});\n\t    var focusvl = focus.append(\"line\").classed(\"crosshair\",true).attr(\"id\",\"genefocusvl\").attr({\"x1\":Math.round(hc.hmleft()+cmap.x(1)/2),\"y1\":hc.cctop(data),\"x2\":Math.round(hc.hmleft()+cmap.x(1)/2),\"y2\":(hc.hmtop(data)+hc.hmheight(data,s.height))});\n\t    var focustx = 
focus.append(\"text\").text(\"\").attr(\"x\",Math.round(hc.hmleft()+cmap.x(1)/2)).attr(\"y\",Math.round(hc.hmtop(data)+cmap.y(1)/2));\n\n\n\t    cmap.g.append(\"rect\").attr('x',0).attr('y',0).attr('width',hc.hmwidth(s.width)).attr('height',hc.hmheight(data,s.height)).attr('style','fill:white;fill-opacity:0.0;stroke:black;stroke-width:0.5;pointer-events:all;').attr(\"id\",\"gmapev\");\n\t    \n\t    var evg = d3.select(\"#gmapev\");\n\t    evg.on(\"mousemove\", function() {\n\t\tvar coord=d3.mouse(evg.node());\n\t\t// figure out row and column index\n\t\tvar bbox=evg.node().getBBox();\n\t\tvar colIndex=Math.floor(coord[0]/bbox.width*data.matrix.dim[1])\n\t\tvar rowIndex=Math.floor(coord[1]/bbox.height*data.matrix.dim[0])\n\t\td3.select(rowlab[rowIndex]).classed('active', true);\n\t\tvar newy=cmap.y(rowIndex); var newx=cmap.x(colIndex); \n\t\tfocushl.attr(\"transform\",\"translate(0,\"+newy+\")\");\n\t\tfocusvl.attr(\"transform\",\"translate(\"+newx+\",0)\");\n\t\tfocustx.text(\"cell: \"+data.matrix.cols[colIndex]);\n\t\t//focustx.attr(\"transform\",\"translate(\"+coord[0]+\",\"+(heatmapPos[1]+heatmapHeight-10)+\")\");\n\t\tif(newy>bbox.height/2) {  newy=newy-5; } else {   newy=newy+15; }\n\t\tif(newx>bbox.width/2) {\n\t\t    focustx.attr(\"transform\",\"translate(\"+(newx-5)+\",\"+newy+\")\");\n\t\t    focustx.attr(\"text-anchor\",\"end\")\n\t\t} else {\n\t\t    focustx.attr(\"transform\",\"translate(\"+(newx+5)+\",\"+newy+\")\");\n\t\t    focustx.attr(\"text-anchor\",\"start\")\n\t\t}\n\t\td3.select('#pathclfocusvl').attr(\"transform\",\"translate(\"+newx+\",0)\");\n\t    }).on(\"mouseenter\",function() {\n\t\tel.classed('highlighting', true);\n\t\td3.select('#pathclfocusvl').classed(\"highlighting\",true)\n\t    }).on(\"mouseleave\",function() {\n\t\tel.classed('highlighting', false);\n\t\td3.select('#pathclfocusvl').classed(\"highlighting\",false)\n\t    })\n\n\t},\n\tlisteners: {\n\t    resize: function(cmp,width,height,oldWidth,oldHeight,opts) 
{\n\t\tcmp.redraw();\n\t    }\n\t}\n\n    })\n\n\n    // clear filter filed trigger button\n    Ext.define('Ext.ux.CustomTrigger', {\n\textend: 'Ext.form.field.Trigger',\n\talias: 'widget.customtrigger',\n\tinitComponent: function () {\n            var me = this;\n            me.triggerCls = 'x-form-clear-trigger';\n            me.callParent(arguments);\n\t},\n\t// override onTriggerClick\n\tonTriggerClick: function() {\n\t    if(this.getValue()!='') {\n\t\tthis.setRawValue('');\n\t\tthis.fireEvent('change',this,'');\n\t    }\n\t}\n    });\n\n\n    /* PATHWAY CLUSTER INFO */\n\n    Ext.define('clinfo',{\n        extend: 'Ext.data.Model',\n        fields: [\n\t    'name', 'id', {name: 'od',type: 'float'},{name: 'npc', type: 'integer'},{name: 'sign', type: 'integer'},{name: 'initsel', type: 'integer'}\n        ],\n\tidProperty: 'id'\n    });\n\n    var clinfostore = Ext.create('Ext.data.Store', {\n        id: 'clinfostore',\n        model: 'clinfo',\n        remoteSort: true,\n        proxy: {\n            type: 'jsonp',\n            url: 'clinfo.json',\n\t    extraParams: {\n\t\tpathcl: currentPathCl\n\t    },\n\t    reader: {\n\t\troot: 'genes',\n                totalProperty: 'totalCount'\n            },\n\t    simpleSortMode: true,\n        },\n        autoLoad: false,\n\tpageSize: 50,\n        remoteFilter: true,\n\tlisteners: {\n\t    load: function(r) {\n\t\t// use the supplied 'initsel' to set the initial selection\n\t\tclSelectModel.suspendEvent('selectionchange')\n\t\tfor(var i=0;i<r.data.items.length;i++) {\n\t\t    if(r.data.items[i].data.initsel==1) {\n\t\t\tclSelectModel.select(i,true);\n\t\t    }\n\t\t}\n\t\tclSelectModel.resumeEvent('selectionchange');\n\t\tclSelectModel.fireEvent('selectionchange',clSelectModel,clSelectModel.getSelection());\n\t    }\n\t}\n    });\n\n    var clSelectModel = Ext.create('Ext.selection.CheckboxModel', {\n        listeners: {\n            selectionchange: function(sm, selections) {\n\t\tvar ids = 
$.map(selections,function(val,i) {\n\t\t    return(val.data.id)\n\t\t})\n\t\tif(ids.length>0) detailPanel.showPathways(ids);\n            }\n        }\n    });\n    \n    var clInfoGrid = Ext.create('Ext.grid.Panel', {\n        store: clinfostore,\n\tid: \"clinfo\",\n\tselModel: clSelectModel,\n\theight:'100%',\n\tcolumnLines:true,\n\temptyText: 'No Matching Pathways',\n        tbar: Ext.create('Ext.PagingToolbar', {\n            store: clinfostore,\n            displayInfo: false,\n            //displayMsg: 'Displaying genes {0} - {1} of {2}',\n            emptyMsg: \"No pathways to display\",\n\t    items:[\n                {\n\t\t    flex:1,\n\t\t    width: 500,\n\t\t    minWidth: 50,\n\t\t    xtype: 'customtrigger',\n\t\t    emptyText: 'filter by name...',\n\t\t    listeners: {\n\t\t\tchange: {buffer: 200, fn: function(field, value) {\n\t\t\t    if (value.length>0) {\n\t\t\t\tclinfostore.clearFilter(true);\n\t\t\t\tclinfostore.filter({property: 'name', value: value});\n\t\t\t    } else {\n\t\t\t\tclinfostore.clearFilter(false);\n\t\t\t    }\n\t\t\t}}\n\t\t    }\n\t\t}],\n\t    listeners: {\n\t\tafterrender: function() {\n\t\t    this.down('#refresh').hide();\n\t\t}\n\t    }\n\t}),\n        columns: [\n\t\t  //{text: \"name\", flex: 1, dataIndex: 'name', sortable: true},\n\t    {\n                text: 'overdispersion',\n                dataIndex: 'od',\n                flex:1,\n                renderer: function (v, m, r) {\n\t\t    //m.tdAttr='data-qtip=\"'+r.data.name+'\"';\n                    var id = Ext.id();\n                    Ext.defer(function () {\n                        Ext.widget('progressbar', {\n                            renderTo: id,\n\t\t\t    text: r.data.name,\n                            value: v / 1,\n                        });\n                    }, 50);\n\t\t    if(r.data.sign==\"1\") {\n\t\t\treturn Ext.String.format('<div class=\"positive\" id=\"{0}\"></div>', id);\n\t\t    } else {\n\t\t\treturn Ext.String.format('<div 
class=\"negative\" id=\"{0}\"></div>', id);\n\t\t    }\n                }\n            },\n            {text: \"PC\", width: 30, dataIndex: 'npc', sortable: true},\n            /*{text: \"overdispersion\", width: 100, dataIndex: 'od', sortable: true}*/\n        ]\n    })\n\n\n\n    /* GENE SET ENRICHMENT INFO */\n\n    Ext.define('geinfo',{\n        extend: 'Ext.data.Model',\n        fields: [\n\t    'name', 'id', {name: 'fe',type: 'float'},{name: 'o', type: 'integer'},{name: 'u', type: 'integer'},{name: 'Z', type: 'float'},{name: 'Za', type: 'float'}\n        ],\n\tidProperty: 'id'\n    });\n\n    var geinfostore = Ext.create('Ext.data.Store', {\n        id: 'geinfostore',\n        model: 'geinfo',\n        remoteSort: true,\n        proxy: {\n            type: 'ajax',\n            url: 'testenr.json',\n\t    actionMethods: {create: 'POST', read: 'POST', update: 'POST', destroy: 'POST'},\n\t    reader: {\n\t\troot: 'genes',\n                totalProperty: 'totalCount'\n            },\n\t    simpleSortMode: true,\n        },\n        autoLoad: false,\n\tpageSize: 50,\n        remoteFilter: true,\n    });\n\n    var geSelectModel = Ext.create('Ext.selection.CheckboxModel', {\n        listeners: {\n            selectionchange: function(sm, selections) {\n\t\tvar ids = $.map(selections,function(val,i) {\n\t\t    return(val.data.id)\n\t\t})\n\t\tif(ids.length>0) detailPanel.showPathways(ids);\n            }\n        }\n    });\n    \n    var geInfoGrid = Ext.create('Ext.grid.Panel', {\n        store: geinfostore,\n\tid: \"geinfo\",\n\tselModel: geSelectModel,\n\theight:'100%',\n\tcolumnLines:true,\n\temptyText: 'No Enriched Pathways',\n        tbar: Ext.create('Ext.PagingToolbar', {\n            store: geinfostore,\n            displayInfo: false,\n            //displayMsg: 'Displaying genes {0} - {1} of {2}',\n            emptyMsg: \"No pathways to display\",\n\t    items:[\n                {\n\t\t    flex:1,\n\t\t    width: 500,\n\t\t    minWidth: 50,\n\t\t    xtype: 
'customtrigger',\n\t\t    emptyText: 'filter by name...',\n\t\t    listeners: {\n\t\t\tchange: {buffer: 200, fn: function(field, value) {\n\t\t\t    if (value.length>0) {\n\t\t\t\tgeinfostore.clearFilter(true);\n\t\t\t\tgeinfostore.filter({property: 'name', value: value});\n\t\t\t    } else {\n\t\t\t\tgeinfostore.clearFilter(false);\n\t\t\t    }\n\t\t\t}}\n\t\t    }\n\t\t}],\n\t    listeners: {\n\t\tafterrender: function() {\n\t\t    this.down('#refresh').hide();\n\t\t}\n\t    }\n\t}),\n        columns: [\n            {text: \"Pathway\", flex: 1, dataIndex: 'name', sortable: true },\n\t    {text: \"FE\", width: 60, dataIndex: 'fe', sortable: true, tooltip: \"fold enrichment\"},\n            {text: \"Z\", width: 60, dataIndex: 'Z', sortable: true, tooltip: \"enrichment Z-score\"},\n            {text: \"cZ\", width: 60, dataIndex: 'Za', sortable: true, tooltip: \"enrichment Z-score, corrected for multiple hypothesis testing\"},\n            {text: \"n\", width: 50, dataIndex: 'n', sortable: true, hidden: true, tooltip: \"number of genes found in this pathway\"},\n\t    {text: \"u\", width: 50, dataIndex: 'u', sortable: true, hidden: true, tooltip: \"total number of genes annotated for this pathway\"}\n        ]\n    })\n\n\n    /* gene info tab */\n    Ext.define('ginfo',{\n        extend: 'Ext.data.Model',\n        fields: [\n\t  'gene', {name: 'var',type: 'float'},{name: 'svar', type: 'float'}\n        ],\n\tidProperty: 'gene'\n    });\n\n    var ginfostore = Ext.create('Ext.data.Store', {\n        id: 'ginfostore',\n        model: 'ginfo',\n        remoteSort: true,\n        proxy: {\n            type: 'jsonp',\n            url: 'genes.json',\n\t    reader: {\n\t\troot: 'genes',\n                totalProperty: 'totalCount'\n            },\n\t    simpleSortMode: true,\n        },\n        sorters: [{\n            property: 'var',\n            direction: 'DESC'\n        }],\n\tpageSize: 100,\n        remoteFilter: true,\n        autoLoad: true\n    });\n\n\n\n    
var geneSelModel = Ext.create('Ext.selection.CheckboxModel', {\n        listeners: {\n            selectionchange: function(sm, selections) {\n\t\t    var ids = $.map(selections,function(val,i) {\n\t\t\t    return(val.data.gene)\n\t\t\t})\n\t\t    detailPanel.showGenes(ids);\n\t\t    //grid4.down('#removeButton').setDisabled(selections.length === 0);\n            }\n        }\n    });\n\n    var geneGrid = Ext.create('Ext.grid.Panel', {\n        store: ginfostore,\n\tselModel: geneSelModel,\n        columns: [\n            {text: \"Gene\", flex: 1, dataIndex: 'gene', sortable: true,\n\t     renderer: function(value) {\n\t\t return Ext.String.format('<a href=\"http://www.informatics.jax.org/searchtool/Search.do?query={0}\" target=\"_blank\">{1}</a>',value,value)\n\t     }\n\t    },\n            {text: \"Variance\", width: 80, dataIndex: 'var', sortable: true}\n        ],\n\t//features: [filters],\n\theight:'100%',\n\tcolumnLines:true,\n\temptyText: 'No Matching Genes',\n\t//forceFit: true\n        //renderTo:'example-grid',\n        //width: 800,\n        //height: 300\n\t// paging bar on the bottom\n        tbar: Ext.create('Ext.PagingToolbar', {\n            store: ginfostore,\n            displayInfo: false,\n            //displayMsg: 'Displaying genes {0} - {1} of {2}',\n            emptyMsg: \"No genes to display\",\n\t    items:[\n                {\n\t\t    flex:1,\n\t\t    width: 500,\n\t\t    minWidth: 50,\n\t\t    xtype: 'customtrigger',\n\t\t    emptyText: 'filter by gene name...',\n\t\t    listeners: {\n\t\t\tchange: {buffer: 200, fn: function(field, value) {\n\t\t\t    if (value.length>0) {\n\t\t\t\tginfostore.clearFilter(true);\n\t\t\t\tginfostore.filter({property: 'gene', value: value});\n\t\t\t    } else {\n\t\t\t\tginfostore.clearFilter(false);\n\t\t\t    }\n\t\t\t}}\n\t\t    }\n\t\t}],\n\t    listeners: {\n\t\tafterrender: function() {\n\t\t    this.down('#refresh').hide();\n\t\t}\n\t    }\n\t}),\n\tlisteners: {\n\t    viewready: function() {\n\t\t  
  // select top 20 genes \n\t\t    this.selModel.suspendEvent('selectionchange')\n\t\t    for(var i=0;i<Math.min(this.store.data.items.length,20);i++) {\n\t\t\tthis.selModel.select(i,true);\n\t\t    }\n\t\t    this.selModel.resumeEvent('selectionchange');\n\t\t    this.selModel.fireEvent('selectionchange',this,this.selModel.getSelection());\n\t    }\n\t}\n    });\n\n\n\n    /* gene info tab */\n    Ext.define('pinfo',{\n        extend: 'Ext.data.Model',\n        fields: [\n\t  'id','name', {name: 'Z',type: 'float'},{name: 'aZ',type: 'float'},{name: 'score',type: 'float'},{name: 'n', type: 'integer'},{name: 'npc', type: 'integer'}\n        ],\n\tidProperty: 'id'\n    });\n\n    var pinfostore = Ext.create('Ext.data.Store', {\n        id: 'pinfostore',\n        model: 'pinfo',\n        remoteSort: true,\n        proxy: {\n            type: 'jsonp',\n            url: 'pathways.json',\n\t    reader: {\n\t\troot: 'genes',\n                totalProperty: 'totalCount'\n            },\n\t    simpleSortMode: true,\n        },\n\tpageSize: 100,\n        remoteFilter: true,\n        autoLoad: true\n    });\n\n    var pathwaySelModel = Ext.create('Ext.selection.CheckboxModel', {\n        listeners: {\n            selectionchange: function(sm, selections) {\n\t\tvar ids = $.map(selections,function(val,i) {\n\t\t    return(val.data.id)\n\t\t})\n\t\tdetailPanel.showPathways(ids);\n            }\n        }\n    });\n\n    var pathwayGrid = Ext.create('Ext.grid.Panel', {\n        store: pinfostore,\n\tselModel: pathwaySelModel,\n        columns: [\n            {text: \"Pathway\", flex: 1, dataIndex: 'name', sortable: true, tooltip: \"pathway / gene set name\" },\n            {text: \"Z\", width: 60, dataIndex: 'Z', sortable: true, hidden: true, tooltip: \"overdispersion Z score\"},\n            {text: \"cZ\", width: 60, dataIndex: 'aZ', sortable: true, tooltip: \"overdispersion Z score, adjusted for multiple hypothesis\"},\n\t    {text: \"co.Z\", width: 60, dataIndex: 'sh.Z', 
sortable: true, hidden: true, tooltip: \"pathway coherence Z-score\"},\n            {text: \"co.cZ\", width: 60, dataIndex: 'sh.aZ', sortable: true, hidden: true, tooltip: \"pathway coherence Z-score, corrected for multiple hypothesis testing\"},\n            {text: \"n\", width: 60, dataIndex: 'n', sortable: true, hidden: true, tooltip: \"number of genes in the pathway\"},\n\t    {text: \"nPC\", width: 60, dataIndex: 'npc', sortable: true, hidden: true, tooltip: \"principal component number\"},\n\t    {text: \"score\", width: 60, dataIndex: 'score', sortable: true, tooltip: \"observed/expected overdispersion\"}\n        ],\n\t//features: [filters],\n\temptyText: 'No Matching Pathways',\n        height: '100%',\n\tcolumnLines:true,\n\t// paging bar on the bottom\n        tbar: Ext.create('Ext.PagingToolbar', {\n            store: pinfostore,\n            displayInfo: false,\n            //displayMsg: 'Displaying genes {0} - {1} of {2}',\n            emptyMsg: \"No pathways to display\",\n\t    items:[\n                {\n\t\t    flex:1,\n\t\t    width: 500,\n\t\t    minWidth: 50,\n\t\t    xtype: 'customtrigger',\n\t\t    emptyText: 'filter by pathway name...',\n\t\t    listeners: {\n\t\t\tchange: {buffer: 200, fn: function(field, value) {\n\t\t\t    if (value.length>0) {\n\t\t\t\tpinfostore.clearFilter(true);\n\t\t\t\tpinfostore.filter({property: 'name', value: value});\n\t\t\t    } else {\n\t\t\t\tpinfostore.clearFilter(false);\n\t\t\t    }\n\t\t\t}}\n\t\t    }\n\t\t}],\n\t    listeners: {\n\t\tafterrender: function() {\n\t\t    this.down('#refresh').hide();\n\t\t}\n\t    }\n\t})\n    });\n\n\n\n    var infotab = Ext.create('Ext.tab.Panel', {\n\t    //tabPosition: 'right',\n\t    defaults: {\n\t\tbodyPadding: 0,\n\t\tlayout: 'fit',\n\t\ticonCls: 'tab-icon'\n\t    },\n\t    items: [\n\t\t{ title: 'Pathways', items:[pathwayGrid] },\n\t\t{ title: 'Genes', items:[geneGrid] },\n\t\t{ title: 'Cluster', items:[clInfoGrid], itemId: 'clustertab', hidden:true },\n\t\t{ 
title: 'Enrichment', items:[geInfoGrid], itemId: 'enrichmenttab', hidden:true }\n\t    ]\n\t});\n\n    \n    function colcolmap(svg, data, width, height) {\n\t// Check for no data\n\tif (data.length === 0)\n\t    return function() {};\n\tvar cols = data.dim[1]\n\tvar rows = data.dim[0];\n\t\n\tvar merged = data.data;\n\t\n\tvar x = d3.scale.linear().domain([0, cols]).range([0, width]);\n\tvar y = d3.scale.linear().domain([0, rows]).range([0, height]);\n\n\tvar g = svg.append(\"g\").classed(\"colormap\",true);\n\tvar rect = g.selectAll(\"rect\").data(merged);\n\trect.enter().append(\"rect\").classed(\"colcoldatapt\", true);\n\t//rect.exit().remove();\n\tvar cccmap= function(d) { return d; };\n\tif(data.hasOwnProperty('domain')) {\n\t    var color = d3.scale.linear()\n\t\t.domain(data.domain)\n\t\t.range(data.colors);\n\t    cccmap= function(d) { return color(d); };\n\t};\n\n\trect.attr(\"x\", function(d, i) {\n\t\treturn x(i % cols);\n\t    })\n\t    .attr(\"y\", function(d, i) {\n\t\treturn y(Math.floor(i / cols));\n\t    })\n\t    .attr(\"width\", x(1))\n\t    .attr(\"height\", y(1))\n\t    .attr(\"fill\", cccmap);\n\tsvg.append(\"rect\").attr('x',0).attr('y',0).attr('width',width).attr('height',height).attr('style','fill:none;stroke:black;stroke-width:0.5;');\n\treturn g;\n    };\n    function sidecolormap(svg, data, width, height) {\n\t// Check for no data\n\tif(data === undefined) \n\t    return function() {};\n\t    \n\tvar cols = data.dim[1]\n\tvar rows = data.dim[0];\n\t\n\tvar merged = data.data;\n\t\n\tvar x = d3.scale.linear().domain([0, cols]).range([0, width]);\n\tvar y = d3.scale.linear().domain([0, rows]).range([0, height]);\n\tvar color = d3.scale.linear()\n\t    .domain(data.domain)\n\t    .range(data.colors);\n\t\n\n\tvar g = svg.append(\"g\").classed(\"colormap\",true);\n\tvar rect = g.selectAll(\"rect\").data(merged);\n\trect.enter().append(\"rect\").classed(\"datapt\", true);\n\t//rect.exit().remove();\n\trect.property(\"colIndex\", function(d, 
i) { return i % cols; })\n\t    .property(\"rowIndex\", function(d, i) { return Math.floor(i / cols); })\n\t    .attr(\"x\", function(d, i) {\n\t\treturn x(i % cols);\n\t    })\n\t    .attr(\"y\", function(d, i) {\n\t\treturn y(Math.floor(i / cols));\n\t    })\n\t    .attr(\"width\", x(1))\n\t    .attr(\"height\", y(1))\n\t    .attr(\"fill\", function(d) { return color(d); });\n\t    //.append(\"title\").text(function(d) { return d + \"\"; });\n\tsvg.append(\"rect\").attr('x',0).attr('y',0).attr('width',width).attr('height',height).attr('style','fill:none;stroke:black;stroke-width:0.5;');\n\treturn g;\n    };\n\n    function colormap(svg, data, width, height) {\n\t// Check for no data\n\tif (data.length === 0)\n\t    return function() {};\n\tvar cols = data.dim[1]\n\tvar rows = data.dim[0];\n\tvar merged = data.data;\n\t\n\tvar x = d3.scale.linear().domain([0, cols]).range([0, width]);\n\tvar y = d3.scale.linear().domain([0, rows]).range([0, height]);\n\tvar color = d3.scale.linear()\n\t    .domain(data.domain)\n\t    .range(data.colors);\n\t\n\t\n\tvar g = svg.append(\"g\").classed(\"colormap\",true);\n\t//g.append(\"rect\").attr('x',0).attr('y',0).attr('width',width).attr('height',height).attr('style','fill:white;stroke:none;');\n\t//svg.append(\"rect\").attr('x',0).attr('y',0).attr('width',width).attr('height',height).attr('style','fill-opacity:1;fill:black;stroke:none;pointer-events:all;').classed(\"mapcatcher\",true);\n\tvar rect = g.selectAll(\"rect\").data(merged);\n\trect.enter().append(\"rect\").classed(\"datapt\", true);\n\t//rect.exit().remove();\n\trect.attr(\"x\", function(d, i) {\n\t\treturn x(i % cols);\n\t    })\n\t    .attr(\"y\", function(d, i) {\n\t\treturn y(Math.floor(i / cols));\n\t    })\n\t    .attr(\"width\", x(1))\n\t    .attr(\"height\", y(1))\n\t    .attr(\"fill\", function(d) { return color(d); });\n\t    //.append(\"title\").text(function(d) { return d + \"\"; 
});\n\tsvg.append(\"rect\").attr('x',0).attr('y',0).attr('width',width).attr('height',height).attr('style','fill:none;stroke:black;stroke-width:0.5;');\n\treturn {\n\t    x: x,\n\t    y: y,\n\t    g: g\n\t};\n    };\n\n\n\n    function updatePathclInfo(pathcl) {\n\tif(pathcl == currentPathCl) return;\n\tclinfostore.getProxy().setExtraParam(\"pathcl\",pathcl)\n\tclinfostore.load();\n\tinfotab.child(\"#clustertab\").tab.show();\n\tinfotab.setActiveTab(2);\n\tinfotab.getActiveTab().setTitle(\"Cluster \"+pathcl)\n\tcurrentPathCl=pathcl;\n    }\n    \n    /* heatmap config */\n    var hc = {\n\tspacing: 2,\n\tgeneUnitHeight: 15,\n\tmargins: {top:2,right:60,bottom:2,left:1},\n\tpadding: {width:28,height:10},\n\tcolcolUnitHeight: 10,\n\trowcolWidth: 10,\n\tcolDendHeight: 50,\n\ttrim: 0,\n\t// heatmap left position\n\thmleft: function() { return(this.margins.left+this.rowcolWidth+this.spacing)},\n\t// colcol top position\n\tcctop: function(data) { return(data.hasOwnProperty('coldend')? this.margins.top+this.colDendHeight+this.spacing : this.margins.top) },\n\t// colcol height\n\tccheight: function(data) { return(this.colcolUnitHeight*data.colcols.dim[0]) },\n\t// heatmap top position\n\thmtop: function(data) { \n\t    if(data.hasOwnProperty('colcols')) {\n\t\treturn(data.hasOwnProperty('coldend')? this.margins.top+this.colDendHeight+this.spacing*2 + this.ccheight(data) : this.margins.top+this.spacing*2 + this.ccheight(data))\n\t    } else {\n\t\treturn(data.hasOwnProperty('coldend')? 
this.margins.top+this.colDendHeight+this.spacing : this.margins.top)\n\t    }\n\t},\n\t// heatmap hight\n\thmheight: function(data,totalHeight) {\n\t    return((totalHeight-this.padding.height*2) - this.hmtop(data));\n\t},\n\t// heatmap width\n\thmwidth: function(totalWidth) { return(totalWidth-this.margins.left-this.rowcolWidth-this.spacing-this.margins.right-this.padding.width*2)},\n\trowlableft: function(totalWidth) {\n\t    return(this.hmleft()+this.hmwidth(totalWidth)+this.spacing);\n\t},\n\trowlabelsize: function(data,totalHeight) {\n\t    return(Math.round(this.hmheight(data,totalHeight)/data.matrix.dim[0]*0.97*72/96))\n\t    //return(18);\n\t}\n    };\n\n    var clusterPanel = Ext.create('Ext.panel.Panel', {\n\tlayout: 'fit',\n\tbodyPadding: 5,\n\t//ohtml: \"Pathway Clustering\",\n\treload: function() {\n\t    clusterPanel.setLoading(true);\n\t    d3.json('pathcl.json',function(error,data) {\n\t\tif(error) { console.log(error); return;}\n\t\tclusterPanel.pathcldata=data;\n\t\tclusterPanelGearMenu.getComponent(0).suspendEvents();\n\t\tclusterPanelGearMenu.getComponent(0).setValue(data.matrix.domain[data.matrix.domain.length-1]);\n\t\tclusterPanelGearMenu.getComponent(0).setMaxValue(data.matrix.range[1]);\n\t\tclusterPanelGearMenu.getComponent(0).resumeEvents()\n\t\tif(data.hasOwnProperty('trim')) {\n\t\t    hc.trim=data.trim;\n\t\t    detailPanelGearMenu.getComponent(2).suspendEvents();\n\t\t    detailPanelGearMenu.getComponent(2).setValue(data.trim);\n\t\t    detailPanelGearMenu.getComponent(2).resumeEvents();\n\t\t}\n\t\tclusterPanel.redraw(clusterPanel.pathcldata);\n\t\tclusterPanel.setLoading(false);\n\t    })\n\t},\n\tredraw: function(data) {\n\t    if(data === undefined) {\n\t\tif(clusterPanel.hasOwnProperty('pathcldata')) { data=clusterPanel.pathcldata; } else { clusterPanel.reload(); return; }\n\t    }\n\t    $('#pathclsvg .datapt').remove();\n\t    $('#pathclsvg').remove();\n\t    clusterPanel.update(\"\");\n\t    s=clusterPanel.getSize();\n\t    
var el = d3.select(clusterPanel.getLayout().getElementTarget().dom)\n\t    var svg = el.append(\"svg\").attr(\"id\",\"pathclsvg\").attr(\"width\",(s.width-hc.padding.width)+\"px\").attr(\"height\",(s.height-hc.padding.height)+\"px\").attr('xmlns','http://www.w3.org/2000/svg');\n\t    \n\t    var dg = svg.append(\"g\").attr(\"id\",\"coldend\").attr(\"transform\", \"translate(\" + hc.hmleft() + \" \" + hc.margins.top + \") \" +\"scale(\" + (hc.hmwidth(s.width)/72) + \" \" + (hc.colDendHeight/72) + \")\");\n\t    // a workaround to append SVG elements into DOM\n\t    $(el.node()).append('<svg id=\"dummy\" style=\"display:none\"><defs>' + data.coldend + '</defs></svg>');\n\t    $(dg.node()).append($(\"#dummy g\"));\n\t    $(\"#dummy\").remove();\n\t    // colcol\n\t    var colcol = colcolmap(svg.append(\"g\").attr(\"transform\",\"translate(\"+hc.hmleft()+\",\"+hc.cctop(data)+\")\"), data.colcols, hc.hmwidth(s.width), hc.ccheight(data)); \n\t    colcol.append(\"title\").text(\"custom cell classification colors\")\n\t    // main heatmap\n\t    var cmap = colormap(svg.append(\"g\").attr(\"id\",\"cmapg\").attr(\"transform\",\"translate(\"+hc.hmleft()+\",\"+hc.hmtop(data)+\")\"),data.matrix,hc.hmwidth(s.width),hc.hmheight(data,s.height)); \n\t    // side colors\n\t    var rowcols = sidecolormap(svg.append(\"g\").attr(\"transform\",\"translate(\"+hc.margins.left+\",\"+hc.hmtop(data)+\")\"), data.rowcols, hc.rowcolWidth, hc.hmheight(data,s.height));\n\t    rowcols.append(\"title\").text(\"Overdispersion: white - low, black - high\")\n\t    \n\t    // row labels\n\t    var rowlabelsize=hc.rowlabelsize(data,s.height);\n\t    var rowLabelStep = hc.hmheight(data,s.height)/data.matrix.dim[0];\n\t    var rowlabg = svg.append(\"g\")\n\t\t.attr(\"transform\",\"translate(\"+hc.rowlableft(s.width)+\",\"+hc.hmtop(data)+\")\")\n\t\t.append(\"g\")\n\t    var rowlab = 
rowlabg\n\t\t.selectAll(\".rowLabelg\")\n\t\t.data(data.matrix.rows)\n\t\t.enter()\n\t\t.append(\"text\")\n\t\t.text(function(d) { return(d); })\n\t\t.style(\"font-size\",rowlabelsize+\"pt\")\n\t\t.attr(\"x\",0)\n\t\t.attr(\"y\",function(d,i) { return Math.floor(i*rowLabelStep + rowLabelStep/2); })\n\t\t.classed(\"rowLabel\",true)[0]\n\t    \n\t    var focus = svg.append(\"g\").attr(\"class\",\"crosshair\");\n\t    var focushl = focus.append(\"line\").classed(\"crosshair\",true).attr({\"x1\":hc.margins.left,\"y1\":Math.round(hc.hmtop(data)+cmap.y(1)/2),\"x2\":(hc.hmleft()+hc.hmwidth(s.width)),\"y2\":Math.round(hc.hmtop(data)+cmap.y(1)/2)});\n\t    var focusvl = focus.append(\"line\").classed(\"crosshair\",true).attr(\"id\",\"pathclfocusvl\").attr({\"x1\":Math.round(hc.hmleft()+cmap.x(1)/2),\"y1\":hc.cctop(data),\"x2\":Math.round(hc.hmleft()+cmap.x(1)/2),\"y2\":(hc.hmtop(data)+hc.hmheight(data,s.height))});\n\t    var focustx = focus.attr(\"id\",\"pathclfocustx\").append(\"text\").text(\"\").attr(\"x\",Math.round(hc.hmleft()+cmap.x(1)/2)).attr(\"y\",Math.round(hc.hmtop(data)+cmap.y(1)/2));\n\n\t    cmap.g.append(\"rect\").attr('x',0).attr('y',0).attr('width',hc.hmwidth(s.width)).attr('height',hc.hmheight(data,s.height)).attr('style','fill:white;fill-opacity:0.0;stroke:black;stroke-width:0.5;pointer-events:all;').attr(\"id\",\"cmapev\");\n\t    \n\t    var evg = d3.select(\"#cmapev\");\n\t    evg.on(\"click\", function() {\n\t\tvar coord=d3.mouse(evg.node());\n\t\tvar bbox=evg.node().getBBox();\n\t\tvar colIndex=Math.floor(coord[0]/bbox.width*data.matrix.dim[1])\n\t\tvar rowIndex=Math.floor(coord[1]/bbox.height*data.matrix.dim[0])\n\t\tupdatePathclInfo(rowlab[rowIndex].textContent)\n\t    }).on(\"mousemove\", function() {\n\t\tvar coord=d3.mouse(evg.node());\n\t\t// figure out row and column index\n\t\tvar bbox=evg.node().getBBox();\n\t\tvar colIndex=Math.floor(coord[0]/bbox.width*data.matrix.dim[1])\n\t\tvar 
rowIndex=Math.floor(coord[1]/bbox.height*data.matrix.dim[0])\n\t\td3.select(rowlab[rowIndex]).classed('active', true);\n\t\tvar newy=cmap.y(rowIndex); var newx=cmap.x(colIndex); \n\t\tfocushl.attr(\"transform\",\"translate(0,\"+newy+\")\");\n\t\tfocusvl.attr(\"transform\",\"translate(\"+newx+\",0)\");\n\t\tfocustx.text(\"cell: \"+data.matrix.cols[colIndex]);\n\t\t//focustx.attr(\"transform\",\"translate(\"+coord[0]+\",\"+(heatmapPos[1]+heatmapHeight-10)+\")\");\n\t\tif(newy>bbox.height/2) {  newy=newy-5; } else {   newy=newy+15; }\n\t\tif(newx>bbox.width/2) {\n\t\t    focustx.attr(\"transform\",\"translate(\"+(newx-5)+\",\"+newy+\")\");\n\t\t    focustx.attr(\"text-anchor\",\"end\")\n\t\t} else {\n\t\t    focustx.attr(\"transform\",\"translate(\"+(newx+5)+\",\"+newy+\")\");\n\t\t    focustx.attr(\"text-anchor\",\"start\")\n\t\t}\n\t\td3.select('#genefocusvl').attr(\"transform\",\"translate(\"+newx+\",0)\");\n\t    }).on(\"mouseenter\",function() {\n\t\tel.classed('highlighting', true);\n\t\td3.select('#genefocusvl').classed(\"highlighting\",true)\n\t    }).on(\"mouseleave\",function() {\n\t\tel.classed('highlighting', false);\n\t\td3.select('#genefocusvl').classed(\"highlighting\",false)\n\t    })\n\t    \n\t},\n\tlisteners: {\n\t    resize: function(cmp,width,height,oldWidth,oldHeight,opts) {\n\t\tcmp.redraw(cmp.pathcldata);\n\t    }\n\t}\n    });\n\n\n    var clusterPanelGearMenu = Ext.create('Ext.menu.Menu', {\n        id: 'clusterGearMenu',\n        style: {\n            overflow: 'visible'     // For the Combo popup\n        },\n        items: [{\n\t\tfieldLabel: 'Z limit',\n\t\tname: 'zlim',\n\t\txtype: 'numberfield',\n\t        value: -1,\n\t\tdecimalPrecision: 3,\n\t\tminValue: 0.0,\n\t        maxValue: 100,\n\t\twidth: 200,\n\t        disabled: false,\n\t\ttooltip: 'Set the range of overdispersion scores illustrated by colors',\n\t\tlisteners : {\n\t\t    change : {buffer: 800, fn:function(f,v) 
{\n\t\t\tclusterPanel.pathcldata.matrix.domain=$.map($(Array(clusterPanel.pathcldata.matrix.domain.length)),function(val, i) { return (2*i*v/clusterPanel.pathcldata.matrix.domain.length - v); })\n\t\t\tclusterPanel.redraw()\n\t\t    }}\n\t\t}\n\t}\n\t],\n\tlisteners:{\n\t    'mouseleave': {buffer: 1000, fn:function( menu, e, eOpts){\n\t\tmenu.hide();\n\t    }}\n\t}\n    });\n\n    \n    var detailPanelGearMenu = Ext.create('Ext.menu.Menu', {\n        id: 'detailGearMenu',\n        style: {\n            overflow: 'visible'     // For the Combo popup\n        },\n        items: [{\n\t\tfieldLabel: 'N genes',\n\t\txtype: 'numberfield',\n\t        tooltip: 'Number of genes to show in the Expression Details panel',\n\t\tlabel: 'N genes',\n\t\tvalue: 20,\n\t\tminValue: 1,\n\t\tmaxValue: 1000,\n\t\tdisabled: true,\n\t\tlisteners : {\n\t\t    change : {buffer: 800, fn:function(f,v) {detailPanel.reload()}}\n\t\t}\n/*\t    }, '-',{\n\t\ttext: 'Color Z-limit:',\n\t\ttooltip: 'Maximum Z score to determine the color range',\n\t\tcanActivate:false\n\n\t    },{\n\t\txtype: 'slider',\n\t\tlabel: 'N genes',\n\t\tvalue: 0.5,\n\t\tincrement: 1,\n\t\tminValue: 0,\n\t\tmaxValue: 100,\n\t\ttipText: function(thumb){ return Ext.String.format('{0}', (thumb.value/100*3.6).toFixed(2)); },\n*/\n\t    }, {\n\t\tfieldLabel: 'Row height',\n\t\tname: 'rowheight',\n\t\txtype: 'numberfield',\n\t\tvalue: hc.geneUnitHeight,\n\t\tminValue: 3,\n\t\twidth: 200,\n\t\tmaxValue: 100,\n\t\ttooltip: 'Number of pixels to use for each gene row',\n\t\tlisteners : {\n\t\t    change : {buffer: 800, fn:function(f,v) {hc.geneUnitHeight=v; detailPanel.redraw()}}\n\t\t}\n\t    },{\n\t\tfieldLabel: 'Trim',\n\t\tname: 'trim',\n\t\txtype: 'numberfield',\n\t\tvalue: hc.trim,\n\t\tdecimalPrecision: 5,\n\t\tminValue: 0.0,\n\t\tmaxValue: 0.5,\n\t\twidth: 200,\n\t\tdisabled: true,\n\t\ttooltip: 'Winsorization trim fraction',\n\t\tlisteners : {\n\t\t    change : {buffer: 800, fn:function(f,v) {hc.trim=v; 
detailPanel.reload()}}\n\t\t}\n            }, '-',\n\t    {\n                text: 'High/low genes',\n                checked: false,\n\t\ttooltip: 'Whether to include genes from both sides of the PC loading (true) or just high magnitude  (false)',\n\t\tdisabled: true,\n\t\tlisteners : {\n\t\t    checkchange : function() {detailPanel.reload()}\n\t\t}\n\t    }],\n\tlisteners:{\n\t    'mouseleave': {buffer: 1000, fn:function( menu, e, eOpts){\n\t\tmenu.hide();\n\t    }}\n }\n    });\n\n    var ngenesSlider = Ext.create('Ext.slider.Single', {\n\tlabel: 'N genes',\n\ttip: 'number of genes to show',\n\ttipText: function(thumb){ return Ext.String.format('show {0} genes', thumb.value); },\n\twidth: 100,\n\tvalue: 20,\n\tincrement: 1,\n\tminValue: 0,\n\tmaxValue: 500,\n\n    });\n\n    var viewport = Ext.create('Ext.Viewport', {\n        layout: {\n            type: 'border',\n            padding: 5\n        },\n        defaults: {\n            split: true\n        },\n        items: [{ region: 'center',\n\t\t  layout: 'border',\n\t\t  items: [{ region: 'north',\n\t\t\t    layout: 'fit',\n\t\t\t    id: 'clusterPanel',\n\t\t\t    title: 'Pathway Clustering',\n\t\t\t    tools: [\n\t\t\t\t{ type:'gear',\n\t\t\t\t  tooltip: 'Settings',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      clusterPanelGearMenu.showBy(t);\n\t\t\t\t  }\n\t\t\t\t},\n\t\t\t\t{ type:'help',\n\t\t\t\t  tooltip: 'Tutorial',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      tutorialWindow.show(); \n\t\t\t\t  }\n\t\t\t\t},\n\t\t\t\t{ type:'save',\n\t\t\t\t  tooltip: 'Save image as SVG file',\n\t\t\t\t  handler: function(e,el,o,t) {\n\t\t\t\t      var svg=d3.select(\"#pathclsvg\");\n\t\t\t\t      if(!svg.empty()) {\n\t\t\t\t\t  // update some visual attributes\n\t\t\t\t\t  svg.selectAll(\"#coldend * path\").style(\"vector-effect\",\"non-scaling-stroke\");\n\t\t\t\t\t  svg.selectAll(\".rowLabel\").style(\"dominant-baseline\",\"middle\").style(\"font\",\"18px sans-serif\");\n\t\t\t\t\t  
svg.select(\"#pathclfocustx\").text(\"\");\n\t\t\t\t\t  \n\t\t\t\t\t  var b64 = window.btoa(svg.node().parentNode.innerHTML);\n\t\t\t\t\t  writeAndClickLink('data:application/octet-stream;base64,\\n'+b64,'pathway_clusters.svg')\n\t\t\t\t      }\n\t\t\t\t  }\n\t\t\t\t}\n\t\t\t    ],\n\t\t\t    minHeight: 200,\n\t\t\t    height: 300,\n\t\t\t    bodyPadding: 0,\n\t\t\t    split: true,\n\t\t\t    items:[clusterPanel]\n\t\t\t},{ region: 'center',\n\t\t\t    layout: 'fit',\n\t\t\t    id: 'expressionDetailsPane',\n\t\t\t    minHeight: 100,\n\t\t\t    collapsible: false,\n//\t\t\t    headerPosition: 'bottom',\n\t\t\t    title: 'Expression Details',\n\t\t\t    tools: [\n\t\t\t\t{ type:'search',\n\t\t\t\t  tooltip: 'Search for genes matching the current consensus pattern',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      detailPanel.searchSimilar();\n\t\t\t\t  }\n\t\t\t\t},{ type:'collapse',\n\t\t\t\t  tooltip: 'Run GO enrichment analysis on the current gene set',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      if(detailPanel.hasOwnProperty('genecldata')) {\n\t\t\t\t\t  // write out current gene set\n\t\t\t\t\t  geinfostore.getProxy().setExtraParam(\"genes\",JSON.stringify(detailPanel.genecldata.matrix.rows))\n\t\t\t\t\t  geinfostore.load();\n\t\t\t\t\t  infotab.child(\"#enrichmenttab\").tab.show();\n\t\t\t\t\t  // show the info tab\n\t\t\t\t\t  infotab.setActiveTab(3);\n\t\t\t\t      }\n\t\t\t\t  }\n\t\t\t\t},{ type:'gear',\n\t\t\t\t  tooltip: 'Settings',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      detailPanelGearMenu.showBy(t);\n\t\t\t\t  }\n\t\t\t\t},\n\t\t\t\t{ type:'save',\n \t\t\t\t  tooltip: 'Save image as SVG file',\n\t\t\t\t  handler: function(e,el,o,t) {\n\t\t\t\t      var svg=d3.select(\"#geneclsvg\");\n\t\t\t\t      if(!svg.empty()) {\n\t\t\t\t\t  svg.selectAll(\".geneRowLabel\").style(\"dominant-baseline\",\"middle\").style(\"font\",\"12px sans-serif\");;\n\t\t\t\t\t  //var b64 = Base64.encode(el.html());\n\t\t\t\t\t  var b64 = 
window.btoa(svg.node().parentNode.innerHTML);\n\t\t\t\t\t  writeAndClickLink('data:application/octet-stream;base64,\\n'+b64,'genes.svg')\n\t\t\t\t      }\n\t\t\t\t  }\n\t\t\t\t}\n\t\t\t    ],\n\t\t\t    header: true,\n\t\t\t    items:[detailPanel],\n\t\t\t    autoScroll: true,\n\t\t\t    autoShow: true,\n\t\t\t    /*listeners: {\n\t\t\t\tafterrender: function(panel) {\n\t\t\t\t    console.log(\"boo\");\n\t\t\t\t    var header=panel.getHeader();\n\t\t\t\t    header.insert(1,[ngenesSlider]);\n\t\t\t\t}\n\t\t\t    }*/\n\t\t\t}]\n\t\t},{ region: 'east',\n\t\t    collapsible: true,\n\t\t    title: 'Info',\n\t\t    split: true,\n\t\t    layout: 'fit',\n\t\t    width: '30%',\n\t\t    minWidth: 100,\n\t\t    minHeight: 140,\n\t\t    bodyPadding: 0,\n\t\t    items:[infotab]\n\t\t}]\n\t});\n\n\n    if(Ext.util.Cookies.get(\"hidetutorial\")==null) {\n\ttutorialWindow.show(); \n    }\n\n\n});\n\n\n\n// quick helper function to provide an internal download link\nfunction writeAndClickLink(url,download) {\n    var link=d3.select(\"body\").append('a');\n    link.attr('href',url).attr('download',download);\n    var evObj = document.createEvent('MouseEvents');\n    evObj.initMouseEvent('click', true, true);\n    link.node().dispatchEvent(evObj);\n    $(link.node()).remove();\n}\n"
  },
  {
    "path": "web/pathcl_canvas.js",
    "content": "Ext.require(['*']);\n\nExt.onReady(function() {\n   \n\n    var cw;\n    var currentPathCl=-1; // currently selected pathway cluster\n\n    Ext.tip.QuickTipManager.init();    \n\n    //Ext.state.Manager.setProvider(Ext.create('Ext.state.CookieProvider'));\n\n    var clusterToolbar = Ext.create('Ext.toolbar.Toolbar', {\n\t    items: [ {\n\t\t    // xtype: 'button', // default for Toolbars\n\t\t    text: 'Show'\n\t\t},'->',{\n\t\t    xtype    : 'textfield',\n\t\t    icon: 'preview.png',\n\t\t    cls: 'x-btn-text-icon',\n\t\t    fieldLabel: 'number of genes',\n\t\t    labelStyle: 'white-space: nowrap;',\n\t\t    name     : 'nGenes',\n\t\t    emptyText: 'number of genes to show'\n\t\t}]\n\t});\n\n    var tutorialWindow = new Ext.Window({\n\tid:'tutorial-window', \n\ttitle: 'Video Tutorial',\n\tlayout:'fit',  \n\tactiveItem: 0,  \n\n\tdefaults: {border:false}, \n\tbbar: Ext.create('Ext.toolbar.Toolbar', {\n\t    padding: 5,\n\t    items   : [{\n\t\txtype: 'checkbox',\n\t\tboxLabel: 'do not automatically show this tutorial video on startup when I visit next time',\n\t\tchecked: Ext.util.Cookies.get(\"hidetutorial\")!=null,\n\t\tname: 'dontshowtutorial',\n\t\tlisteners: {\n\t\t    change: function(field, value) {\n\t\t\tif(value) {\n\t\t\t    var now = new Date();\n\t\t\t    var expiry = new Date(now.getTime() + 365 * 24 * 60 * 60 * 1000);\n\t\t\t    Ext.util.Cookies.set(\"hidetutorial\",true,expiry)\n\t\t\t} else {\n\t\t\t    console.log(\"clearing hidetutorial\");\n\t\t\t}\n\t\t    }\n\t\t}\n\t    },'->',{\n\t\txtype: 'button',\n\t\ttext: 'Close',\n\t\thandler: function() {\n\t\t    tutorialWindow.hide();\n\t\t}\n\t    }\n           ]\n\t}),\n\titems : [{\n\t\t    id: \"video\",\n\t\t    html: '<iframe width=\"720\" height=\"400\" src=\"//www.youtube.com/embed/N2Ritx5yqrc?rel=0\" frameborder=\"0\" allowfullscreen></iframe>'\n\t\t}]\t\n\t});\n\n    \n\n    /* DETAILED CLUSTERING VIEW */\n    var detailMode=1; // 1: gene 2: pathway\n    var 
detailItemList={}; // pathway or gene list\n    var detailNGenes=20; // max genes to show \n\n    var detailPanel = Ext.create('Ext.panel.Panel', {\n\tlayout: 'fit',\n\tbodyPadding: 5,\n\tautoScroll: false,\n\tshowGenes: function(ids) {\n\t    if(ids === undefined || ids.length==0) return;\n\t    detailPanel.setLoading(true);\n\t    Ext.Ajax.request({\n\t\turl: 'genecl.json',\n\t\tmethod: 'POST',          \n\t\twaitTitle: 'Connecting',\n\t\twaitMsg: 'Sending data...',                                     \n\t\tparams: {\n\t\t    \"genes\" : encodeURIComponent(JSON.stringify(ids))\n\t\t},\n\t\tscope:this,\n\t\tfailure: function(r,o){console.log('failure:'); console.log(r);},\n\t\tsuccess: function(response) {\n\t\t    var data = Ext.JSON.decode(response.responseText)\n\t\t    detailPanel.ids=ids;\n\t\t    if(data.matrix.dim[0]==1) data.matrix.rows=[data.matrix.rows]\n\t\t    detailPanel.genecldata=data;\n\t\t    detailPanel.mode=1;\n\t\t    Ext.getCmp('expressionDetailsPane').setTitle(\"Expression Details: selected genes\");\n\t\t    detailPanel.redraw(detailPanel.genecldata)\n\t\t    detailPanelGearMenu.getComponent(0).disable()\n\t\t    detailPanelGearMenu.getComponent(1).enable()\n\t\t    detailPanelGearMenu.getComponent(3).disable()\n\t\t    detailPanel.setLoading(false);\n\t\t}\n\t    });\n\t},\n\tshowPathways: function(ids) {\n\t    if(ids === undefined || ids.length==0) return;\n\t    detailPanel.setLoading(true);\n\t    var ngenes=detailPanelGearMenu.getComponent(0).getValue();\n\t    var twosided=detailPanelGearMenu.getComponent(3).checked;\n\t    Ext.Ajax.request({\n\t\turl: 'pathwaygenes.json',\n\t\tmethod: 'POST',          \n\t\twaitTitle: 'Connecting',\n\t\twaitMsg: 'Sending data...',                                     \n\t\tparams: {\n\t\t    \"ngenes\" : ngenes,\n\t\t    \"twosided\" : twosided,\n\t\t    \"genes\" : encodeURIComponent(JSON.stringify(ids)),\n\t\t    \"trim\" : hc.trim\n\t\t},\n\t\tscope:this,\n\t\tfailure: 
function(r,o){console.log('failure:'); console.log(r);},\n\t\tsuccess: function(response) {\n\t\t    var data = Ext.JSON.decode(response.responseText)\n\t\t    detailPanel.ids=ids;\n\t\t    if(data.matrix.dim[0]==1) data.matrix.rows=[data.matrix.rows]\n\t\t    detailPanel.genecldata=data;\n\t\t    detailPanel.mode=2; \n\t\t    Ext.getCmp('expressionDetailsPane').setTitle(\"Expression Details: top genes in specified pathways\");\n\t\t    detailPanel.redraw(detailPanel.genecldata)\n\t\t    detailPanelGearMenu.getComponent(0).enable()\n\t\t    detailPanelGearMenu.getComponent(1).enable()\n\t\t    detailPanelGearMenu.getComponent(3).enable()\n\t\t    detailPanel.setLoading(false);\n\t\t}\n\t    });\n\t    \n\t},\n\tsearchSimilar: function(pattern) { // request genes most closely matching current data.colcol\n\t    if(!detailPanel.hasOwnProperty('genecldata')) return;\n\t    detailPanel.setLoading(true);\n\t    if(pattern === undefined) {\n\t\t// determine the colcol or single gene\n\t\tif(detailPanel.genecldata.hasOwnProperty('colcols')) {\n\t\t    pattern=detailPanel.genecldata.colcols.data;\n\t\t} else {\n\t\t    pattern=detailPanel.genecldata.matrix.data.slice(0,detailPanel.genecldata.matrix.dim[1]);\n\t\t}\n\t    }\n\t    var ngenes=detailPanelGearMenu.getComponent(0).getValue();\n\t    var twosided=detailPanelGearMenu.getComponent(3).checked;\n\t    Ext.Ajax.request({\n\t\turl: 'patterngenes.json',\n\t\tmethod: 'POST',          \n\t\twaitTitle: 'Connecting',\n\t\twaitMsg: 'Sending data...',                                     \n\t\tparams: {\n\t\t    \"ngenes\" : ngenes,\n\t\t    \"twosided\" : twosided,\n\t\t    \"pattern\" : encodeURIComponent(JSON.stringify(pattern)),\n\t\t    \"trim\" : hc.trim\n\t\t},\n\t\tscope:this,\n\t\tfailure: function(r,o){console.log('failure:'); console.log(r);},\n\t\tsuccess: function(response) {\n\t\t    var data = Ext.JSON.decode(response.responseText)\n\t\t    detailPanel.pattern=pattern;\n\t\t    if(data.matrix.dim[0]==1) 
data.matrix.rows=[data.matrix.rows]\n\t\t    detailPanel.genecldata=data;\n\t\t    detailPanel.mode=3;\n\t\t    Ext.getCmp('expressionDetailsPane').setTitle(\"Expression Details: genes matching specified pattern\");\n\t\t    detailPanel.redraw(detailPanel.genecldata)\n\t\t    detailPanelGearMenu.getComponent(0).enable()\n\t\t    detailPanelGearMenu.getComponent(1).enable()\n\t\t    detailPanelGearMenu.getComponent(3).enable()\n\t\t    detailPanel.setLoading(false);\n\t\t}\n\t    });\n\t},\n\treload: function() { // goes back to the server to redraw the same ids\n\t    if(!detailPanel.hasOwnProperty('mode')) return;\n\t    switch(detailPanel.mode) {\n\t    case 1: detailPanel.showGenes(detailPanel.ids); break;\n\t    case 2: detailPanel.showPathways(detailPanel.ids);break;\n\t    case 3: detailPanel.searchSimilar(detailPanel.pattern);break;\n\t    default: console.log(\"reload requested with an undefined detailPanel mode\");\n\t    }\n\t},\n\tredraw: function(data) { // redraws the panels without going back to the server\n\t    if(data === undefined) {\n\t\tif(detailPanel.hasOwnProperty('genecldata')) { data=detailPanel.genecldata; } else {  return; }\n\t    }\n\t    \n\t    \n\t    $('#genecl').remove();\n\t    delete detailPanel.evctx;\n\t    $('#geneclev').remove();\n\t    $('#gclCD').remove();\n\t    var s=detailPanel.getSize();\n\t    // adjust height to match maxRowHeight (=15) if needed\n\t    if((s.height-hc.hmtop(data)-hc.margins.bottom)/data.matrix.dim[0] > 15) {\n\t\ts.height=15*data.matrix.dim[0]+hc.margins.bottom+hc.hmtop(data);\n\t    }\n\t    detailPanel.s=s;\n\t    \n\t    detailPanel.body.update('<div id=\"gclCD\" style=\"padding:0 0 0 0; position:relative\"><canvas id=\"genecl\" width='+s.width+' height='+s.height+' style=\"background-color:transparent; position:absolute; left: 0; top: 0; z-index: 0;\"></canvas> <canvas id=\"geneclev\" width='+s.width+' height='+s.height+' style=\"background-color:transparent; position:absolute; left: 0; top: 0; 
z-index: 1;\"></canvas></div>')\n\t    \n\t    var gctx= $('#genecl')[0].getContext('2d');\n\t    if(data.hasOwnProperty('colcols')) {\n\t\t//colcols\n\t\tdrawHeatmap(gctx,data.colcols,hc.hmleft(),hc.cctop(data),hc.hmwidth(s.width), hc.ccheight(data));\n\t\t//rowcols\n\t\tdrawHeatmap(gctx,data.rowcols,hc.margins.left,hc.hmtop(data),hc.rowcolWidth,hc.hmheight(data,s.height),false)\n\t    }\n\n\t    //main heatmap\n\t    drawHeatmap(gctx,data.matrix,hc.hmleft(),hc.hmtop(data),hc.hmwidth(s.width),hc.hmheight(data,s.height),true)\n\t    // event handling\n\t    var evc=document.getElementById(\"geneclev\");\n\t    var evctx= evc.getContext('2d');\n\t    detailPanel.evctx=evctx;\n\t    try {\n\t\tevctx.setLineDash([5]);\n\t    } catch (err) {}\n\t    evctx.fillStyle='black'; \n\t    evctx.font=\"bold 15px Arial\";\n\t    \n\t    var evtRect = evc.getBoundingClientRect();\n\t    \n\t    evc.addEventListener('mousemove', function(evt) {\n\t\tvar mx=evt.clientX-evtRect.left; var my=evt.clientY-evtRect.top;\n\n\t\tevctx.clearRect(0,0,s.width,s.height);\n\t\tvar v=clusterPanel.s;\n\t\tclusterPanel.evctx.clearRect(0,0,v.width,v.height);\n\t\tif(my<=s.height-hc.margins.bottom && mx<=s.width-hc.margins.right && mx>=hc.hmleft() && my>=hc.hmtop(data)) { \n\t\t    //mx=hc.hmleft()+(rx+0.5)*hc.hmwidth(s.width)/data.matrix.dim[1];\n\t\t    evctx.beginPath(); \n\t\t    evctx.moveTo(mx,hc.cctop(data)); evctx.lineTo(mx,s.height-hc.margins.bottom);\n\t\t    var ry=Math.floor((my-hc.hmtop(data))/hc.hmheight(data,s.height)*data.matrix.dim[0]);\n\t\t    //my=hc.hmtop(data)+(ry+0.5)*hc.hmheight(data,s.height)/data.matrix.dim[0];\n\t\t    evctx.moveTo(hc.margins.left,my); evctx.lineTo(s.width-hc.margins.right,my);\n\t\t    var rx=Math.floor((mx-hc.hmleft())/hc.hmwidth(s.width)*data.matrix.dim[1]);\n\n\t\t    // update the line in the cluster panel\n\t\t    clusterPanel.evctx.beginPath();\n\t\t    clusterPanel.evctx.moveTo(mx,hc.cctop(clusterPanel.pathcldata))\n\t\t    
clusterPanel.evctx.lineTo(mx,v.height-hc.margins.bottom)\n\t\t    clusterPanel.evctx.stroke();\n\n\t\t    if(mx>s.width/2) { \n\t\t\tevctx.textAlign=\"end\"; mx-=5; \n\t\t    } else {\n\t\t\tevctx.textAlign=\"start\"; mx+=5;\n\t\t    }\n\t\t    evctx.textBaseline=\"bottom\";\n\t\t    evctx.fillText(\"cell: \"+data.matrix.cols[rx],mx,my-3); \n\t\t    evctx.textBaseline=\"top\";\n\t\t    evctx.fillText(\"gene: \"+data.matrix.rows[ry],mx,my+3); \n\t\t    evctx.stroke();\n\t\t}\n\t    }, false);\n\n\t    evc.addEventListener('mouseout', function(evt) {\n\t\tevctx.clearRect(0,0,s.width,s.height);\n\t\tvar v=clusterPanel.s;\n\t\tclusterPanel.evctx.clearRect(0,0,v.width,v.height);\n\t    });\n\n\t},\n\tlisteners: {\n\t    resize: function(cmp,width,height,oldWidth,oldHeight,opts) {\n\t\tcmp.redraw();\n\t    }\n\t}\n\n    })\n\n\n    // clear filter filed trigger button\n    Ext.define('Ext.ux.CustomTrigger', {\n\textend: 'Ext.form.field.Trigger',\n\talias: 'widget.customtrigger',\n\tinitComponent: function () {\n            var me = this;\n            me.triggerCls = 'x-form-clear-trigger';\n            me.callParent(arguments);\n\t},\n\t// override onTriggerClick\n\tonTriggerClick: function() {\n\t    if(this.getValue()!='') {\n\t\tthis.setRawValue('');\n\t\tthis.fireEvent('change',this,'');\n\t    }\n\t}\n    });\n\n\n    /* PATHWAY CLUSTER INFO */\n\n    Ext.define('clinfo',{\n        extend: 'Ext.data.Model',\n        fields: [\n\t    'name', 'id', {name: 'od',type: 'float'},{name: 'npc', type: 'integer'},{name: 'sign', type: 'integer'},{name: 'initsel', type: 'integer'}\n        ],\n\tidProperty: 'id'\n    });\n\n    var clinfostore = Ext.create('Ext.data.Store', {\n        id: 'clinfostore',\n        model: 'clinfo',\n        remoteSort: true,\n        proxy: {\n            type: 'jsonp',\n            url: 'clinfo.json',\n\t    extraParams: {\n\t\tpathcl: currentPathCl\n\t    },\n\t    reader: {\n\t\troot: 'genes',\n                totalProperty: 'totalCount'\n      
      },\n\t    simpleSortMode: true,\n        },\n        autoLoad: false,\n\tpageSize: 50,\n        remoteFilter: true,\n\tlisteners: {\n\t    load: function(r) {\n\t\t// use the supplied 'initsel' to set the initial selection\n\t\tclSelectModel.suspendEvent('selectionchange')\n\t\tfor(var i=0;i<r.data.items.length;i++) {\n\t\t    if(r.data.items[i].data.initsel==1) {\n\t\t\tclSelectModel.select(i,true);\n\t\t    }\n\t\t}\n\t\tclSelectModel.resumeEvent('selectionchange');\n\t\tclSelectModel.fireEvent('selectionchange',clSelectModel,clSelectModel.getSelection());\n\t    }\n\t}\n    });\n\n    var clSelectModel = Ext.create('Ext.selection.CheckboxModel', {\n        listeners: {\n            selectionchange: function(sm, selections) {\n\t\tvar ids = $.map(selections,function(val,i) {\n\t\t    return(val.data.id)\n\t\t})\n\t\tif(ids.length>0) detailPanel.showPathways(ids);\n            }\n        }\n    });\n    \n    var clInfoGrid = Ext.create('Ext.grid.Panel', {\n        store: clinfostore,\n\tid: \"clinfo\",\n\tselModel: clSelectModel,\n\theight:'100%',\n\tcolumnLines:true,\n\temptyText: 'No Matching Pathways',\n        tbar: Ext.create('Ext.PagingToolbar', {\n            store: clinfostore,\n            displayInfo: false,\n            //displayMsg: 'Displaying genes {0} - {1} of {2}',\n            emptyMsg: \"No pathways to display\",\n\t    items:[\n                {\n\t\t    flex:1,\n\t\t    width: 500,\n\t\t    minWidth: 50,\n\t\t    xtype: 'customtrigger',\n\t\t    emptyText: 'filter by name...',\n\t\t    listeners: {\n\t\t\tchange: {buffer: 200, fn: function(field, value) {\n\t\t\t    if (value.length>0) {\n\t\t\t\tclinfostore.clearFilter(true);\n\t\t\t\tclinfostore.filter({property: 'name', value: value});\n\t\t\t    } else {\n\t\t\t\tclinfostore.clearFilter(false);\n\t\t\t    }\n\t\t\t}}\n\t\t    }\n\t\t}],\n\t    listeners: {\n\t\tafterrender: function() {\n\t\t    this.down('#refresh').hide();\n\t\t}\n\t    }\n\t}),\n        columns: [\n\t\t  //{text: 
\"name\", flex: 1, dataIndex: 'name', sortable: true},\n\t    {\n                text: 'overdispersion',\n                dataIndex: 'od',\n                flex:1,\n                renderer: function (v, m, r) {\n\t\t    //m.tdAttr='data-qtip=\"'+r.data.name+'\"';\n                    var id = Ext.id();\n                    Ext.defer(function () {\n                        Ext.widget('progressbar', {\n                            renderTo: id,\n\t\t\t    text: r.data.name,\n                            value: v / 1,\n                        });\n                    }, 50);\n\t\t    if(r.data.sign==\"1\") {\n\t\t\treturn Ext.String.format('<div class=\"positive\" id=\"{0}\"></div>', id);\n\t\t    } else {\n\t\t\treturn Ext.String.format('<div class=\"negative\" id=\"{0}\"></div>', id);\n\t\t    }\n                }\n            },\n            {text: \"PC\", width: 30, dataIndex: 'npc', sortable: true},\n            /*{text: \"overdispersion\", width: 100, dataIndex: 'od', sortable: true}*/\n        ]\n    })\n\n\n\n    /* GENE SET ENRICHMENT INFO */\n\n    Ext.define('geinfo',{\n        extend: 'Ext.data.Model',\n        fields: [\n\t    'name', 'id', {name: 'fe',type: 'float'},{name: 'o', type: 'integer'},{name: 'u', type: 'integer'},{name: 'Z', type: 'float'},{name: 'Za', type: 'float'}\n        ],\n\tidProperty: 'id'\n    });\n\n    var geinfostore = Ext.create('Ext.data.Store', {\n        id: 'geinfostore',\n        model: 'geinfo',\n        remoteSort: true,\n        proxy: {\n            type: 'ajax',\n            url: 'testenr.json',\n\t    actionMethods: {create: 'POST', read: 'POST', update: 'POST', destroy: 'POST'},\n\t    reader: {\n\t\troot: 'genes',\n                totalProperty: 'totalCount'\n            },\n\t    simpleSortMode: true,\n        },\n        autoLoad: false,\n\tpageSize: 50,\n        remoteFilter: true,\n    });\n\n    var geSelectModel = Ext.create('Ext.selection.CheckboxModel', {\n        listeners: {\n            selectionchange: 
function(sm, selections) {\n\t\tvar ids = $.map(selections,function(val,i) {\n\t\t    return(val.data.id)\n\t\t})\n\t\tif(ids.length>0) detailPanel.showPathways(ids);\n            }\n        }\n    });\n    \n    var geInfoGrid = Ext.create('Ext.grid.Panel', {\n        store: geinfostore,\n\tid: \"geinfo\",\n\tselModel: geSelectModel,\n\theight:'100%',\n\tcolumnLines:true,\n\temptyText: 'No Enriched Pathways',\n        tbar: Ext.create('Ext.PagingToolbar', {\n            store: geinfostore,\n            displayInfo: false,\n            //displayMsg: 'Displaying genes {0} - {1} of {2}',\n            emptyMsg: \"No pathways to display\",\n\t    items:[\n                {\n\t\t    flex:1,\n\t\t    width: 500,\n\t\t    minWidth: 50,\n\t\t    xtype: 'customtrigger',\n\t\t    emptyText: 'filter by name...',\n\t\t    listeners: {\n\t\t\tchange: {buffer: 200, fn: function(field, value) {\n\t\t\t    if (value.length>0) {\n\t\t\t\tgeinfostore.clearFilter(true);\n\t\t\t\tgeinfostore.filter({property: 'name', value: value});\n\t\t\t    } else {\n\t\t\t\tgeinfostore.clearFilter(false);\n\t\t\t    }\n\t\t\t}}\n\t\t    }\n\t\t}],\n\t    listeners: {\n\t\tafterrender: function() {\n\t\t    this.down('#refresh').hide();\n\t\t}\n\t    }\n\t}),\n        columns: [\n            {text: \"Pathway\", flex: 1, dataIndex: 'name', sortable: true },\n\t    {text: \"FE\", width: 60, dataIndex: 'fe', sortable: true, tooltip: \"fold enrichment\"},\n            {text: \"Z\", width: 60, dataIndex: 'Z', sortable: true, tooltip: \"enrichment Z-score\"},\n            {text: \"cZ\", width: 60, dataIndex: 'Za', sortable: true, tooltip: \"enrichment Z-score, corrected for multiple hypothesis testing\"},\n            {text: \"n\", width: 50, dataIndex: 'n', sortable: true, hidden: true, tooltip: \"number of genes found in this pathway\"},\n\t    {text: \"u\", width: 50, dataIndex: 'u', sortable: true, hidden: true, tooltip: \"total number of genes annotated for this pathway\"}\n        ]\n    })\n\n\n   
 /* gene info tab */\n    Ext.define('ginfo',{\n        extend: 'Ext.data.Model',\n        fields: [\n\t  'gene', {name: 'var',type: 'float'},{name: 'svar', type: 'float'}\n        ],\n\tidProperty: 'gene'\n    });\n\n    var ginfostore = Ext.create('Ext.data.Store', {\n        id: 'ginfostore',\n        model: 'ginfo',\n        remoteSort: true,\n        proxy: {\n            type: 'jsonp',\n            url: 'genes.json',\n\t    reader: {\n\t\troot: 'genes',\n                totalProperty: 'totalCount'\n            },\n\t    simpleSortMode: true,\n        },\n        sorters: [{\n            property: 'var',\n            direction: 'DESC'\n        }],\n\tpageSize: 100,\n        remoteFilter: true,\n        autoLoad: true\n    });\n\n\n\n    var geneSelModel = Ext.create('Ext.selection.CheckboxModel', {\n        listeners: {\n            selectionchange: function(sm, selections) {\n\t\t    var ids = $.map(selections,function(val,i) {\n\t\t\t    return(val.data.gene)\n\t\t\t})\n\t\t    detailPanel.showGenes(ids);\n\t\t    //grid4.down('#removeButton').setDisabled(selections.length === 0);\n            }\n        }\n    });\n\n    var geneGrid = Ext.create('Ext.grid.Panel', {\n        store: ginfostore,\n\tselModel: geneSelModel,\n        columns: [\n            {text: \"Gene\", flex: 1, dataIndex: 'gene', sortable: true,\n\t     renderer: function(value) {\n\t\t return Ext.String.format('<a href=\"http://www.informatics.jax.org/searchtool/Search.do?query={0}\" target=\"_blank\">{1}</a>',value,value)\n\t     }\n\t    },\n            {text: \"Variance\", width: 100, dataIndex: 'var', sortable: true}\n        ],\n\t//features: [filters],\n\theight:'100%',\n\tcolumnLines:true,\n\temptyText: 'No Matching Genes',\n\t//forceFit: true\n        //renderTo:'example-grid',\n        //width: 800,\n        //height: 300\n\t// paging bar on the bottom\n        tbar: Ext.create('Ext.PagingToolbar', {\n            store: ginfostore,\n            displayInfo: false,\n            
//displayMsg: 'Displaying genes {0} - {1} of {2}',\n            emptyMsg: \"No genes to display\",\n\t    items:[\n                {\n\t\t    flex:1,\n\t\t    width: 500,\n\t\t    minWidth: 50,\n\t\t    xtype: 'customtrigger',\n\t\t    emptyText: 'filter by gene name...',\n\t\t    listeners: {\n\t\t\tchange: {buffer: 600, fn: function(field, value) {\n\t\t\t    if (value.length>0) {\n\t\t\t\tginfostore.clearFilter(true);\n\t\t\t\tginfostore.filter({property: 'gene', value: value});\n\t\t\t    } else {\n\t\t\t\tginfostore.clearFilter(false);\n\t\t\t    }\n\t\t\t}}\n\t\t    }\n\t\t}],\n\t    listeners: {\n\t\tafterrender: function() {\n\t\t    this.down('#refresh').hide();\n\t\t}\n\t    }\n\t}),\n\tlisteners: {\n\t    viewready: function() {\n\t\t    // select top 20 genes \n\t\t    this.selModel.suspendEvent('selectionchange')\n\t\t    for(var i=0;i<Math.min(this.store.data.items.length,20);i++) {\n\t\t\tthis.selModel.select(i,true);\n\t\t    }\n\t\t    this.selModel.resumeEvent('selectionchange');\n\t\t    this.selModel.fireEvent('selectionchange',this,this.selModel.getSelection());\n\t    }\n\t}\n    });\n\n\n\n    /* gene info tab */\n    Ext.define('pinfo',{\n        extend: 'Ext.data.Model',\n        fields: [\n\t  'id','name', {name: 'Z',type: 'float'},{name: 'aZ',type: 'float'},{name: 'score',type: 'float'},{name: 'n', type: 'integer'},{name: 'npc', type: 'integer'}\n        ],\n\tidProperty: 'id'\n    });\n\n    var pinfostore = Ext.create('Ext.data.Store', {\n        id: 'pinfostore',\n        model: 'pinfo',\n        remoteSort: true,\n        proxy: {\n            type: 'jsonp',\n            url: 'pathways.json',\n\t    reader: {\n\t\troot: 'genes',\n                totalProperty: 'totalCount'\n            },\n\t    simpleSortMode: true,\n        },\n\tpageSize: 100,\n        remoteFilter: true,\n        autoLoad: true\n    });\n\n    var pathwaySelModel = Ext.create('Ext.selection.CheckboxModel', {\n        listeners: {\n            selectionchange: 
function(sm, selections) {\n\t\tvar ids = $.map(selections,function(val,i) {\n\t\t    return(val.data.id)\n\t\t})\n\t\tdetailPanel.showPathways(ids);\n            }\n        }\n    });\n\n    var pathwayGrid = Ext.create('Ext.grid.Panel', {\n        store: pinfostore,\n\tselModel: pathwaySelModel,\n        columns: [\n            {text: \"Pathway\", flex: 1, dataIndex: 'name', sortable: true, tooltip: \"pathway / gene set name\" },\n            {text: \"Z\", width: 60, dataIndex: 'Z', sortable: true, hidden: true, tooltip: \"overdispersion Z score\"},\n            {text: \"cZ\", width: 60, dataIndex: 'aZ', sortable: true, tooltip: \"overdispersion Z score, adjusted for multiple hypothesis\"},\n\t    {text: \"co.Z\", width: 60, dataIndex: 'sh.Z', sortable: true, hidden: true, tooltip: \"pathway coherence Z-score\"},\n            {text: \"co.cZ\", width: 60, dataIndex: 'sh.aZ', sortable: true, hidden: true, tooltip: \"pathway coherence Z-score, corrected for multiple hypothesis testing\"},\n            {text: \"n\", width: 60, dataIndex: 'n', sortable: true, hidden: true, tooltip: \"number of genes in the pathway\"},\n\t    {text: \"nPC\", width: 60, dataIndex: 'npc', sortable: true, hidden: true, tooltip: \"principal component number\"},\n\t    {text: \"score\", width: 60, dataIndex: 'score', sortable: true, tooltip: \"observed/expected overdispersion\"}\n        ],\n\t//features: [filters],\n\temptyText: 'No Matching Pathways',\n        height: '100%',\n\tcolumnLines:true,\n\t// paging bar on the bottom\n        tbar: Ext.create('Ext.PagingToolbar', {\n            store: pinfostore,\n            displayInfo: false,\n            //displayMsg: 'Displaying genes {0} - {1} of {2}',\n            emptyMsg: \"No pathways to display\",\n\t    items:[\n                {\n\t\t    flex:1,\n\t\t    width: 500,\n\t\t    minWidth: 50,\n\t\t    xtype: 'customtrigger',\n\t\t    emptyText: 'filter by pathway name...',\n\t\t    listeners: {\n\t\t\tchange: {buffer: 400, fn: 
function(field, value) {\n\t\t\t    if (value.length>0) {\n\t\t\t\tpinfostore.clearFilter(true);\n\t\t\t\tpinfostore.filter({property: 'name', value: value});\n\t\t\t    } else {\n\t\t\t\tpinfostore.clearFilter(false);\n\t\t\t    }\n\t\t\t}}\n\t\t    }\n\t\t}],\n\t    listeners: {\n\t\tafterrender: function() {\n\t\t    this.down('#refresh').hide();\n\t\t}\n\t    }\n\t})\n    });\n\n\n\n    var infotab = Ext.create('Ext.tab.Panel', {\n\t    //tabPosition: 'right',\n\t    defaults: {\n\t\tbodyPadding: 0,\n\t\tlayout: 'fit',\n\t\ticonCls: 'tab-icon'\n\t    },\n\t    items: [\n\t\t{ title: 'Pathways', items:[pathwayGrid] },\n\t\t{ title: 'Genes', items:[geneGrid] },\n\t\t{ title: 'Cluster', items:[clInfoGrid], itemId: 'clustertab', hidden:true },\n\t\t{ title: 'Enrichment', items:[geInfoGrid], itemId: 'enrichmenttab', hidden:true }\n\t    ]\n\t});\n\n    \n    // draw dendrogram in a given context\n    // ctx - canvas 2d context\n    // d - dendrogram structure (t(merge), order, height)\n    // x,y,width,height - placement\n    function drawDendrogram(ctx, d, x, y, width, height) {\n\tvar nmerges=d.merge.length;\n\tvar xstep=width/d.order.length;\n\tvar lp,lpy,rp,rpy,jx,jy;\n\tvar maxHeight=Math.max.apply(null, d.height);\n\tvar heightScale=height/maxHeight;\n\tctx.beginPath();\n\n\tfor(var i=0, mergex=[]; i<nmerges; i++) {\n\t    jy=y+(maxHeight-d.height[i])*heightScale;\n\t    lp=d.merge[i]; // left position (starts as an index)\n\t    if(lp<0) { // leaf\n\t\tlp=d.order[-1*lp-1]; \n\t\tlp=x+(lp-0.5)*xstep;\n\t\tlpy=y+height;\n\t    } else { // inner node\n\t\tlpy=y+(maxHeight-d.height[lp-1])*heightScale; lp=mergex[lp-1]; \n\t    }\n\t    i++; rp=d.merge[i]; // right position\n\t    if(rp<0) { // leaf\n\t\trp=d.order[-1*rp-1]; \n\t\trp=x+(rp-0.5)*xstep;\n\t\trpy=y+height;\n\t    } else { // inner node\n\t\trpy=y+(maxHeight-d.height[rp-1])*heightScale; rp=mergex[rp-1]; \n\t    }\n\t    jx=(lp+rp)/2;  jy=y+(maxHeight-d.height[(i-1)/2])*heightScale;\n\t    
mergex.push(jx);\n\t    ctx.moveTo(lp,lpy); ctx.lineTo(lp,jy); ctx.lineTo(rp,jy); ctx.lineTo(rp,rpy);\n\t    \n\t}\n\tctx.lineWidth=1.5; ctx.lineJoin='miter'; ctx.strokeStyle='black'; ctx.stroke();\n    }\n\n     // draw heatmap\n    function drawHeatmap(ctx,d,x,y,width,height,rowNames,maxRowHeight,maxFontSize,minFontSize) {\n\trowNames = typeof rowNames !== 'undefined' ? rowNames : false;\n\tmaxFontSize = typeof maxFontSize !== 'undefined' ? maxFontSize : 100;\n\tminFontSize = typeof minFontSize !== 'undefined' ? minFontSize : 6;\n\tmaxRowHeight = typeof maxRowHeight !== 'undefined' ? maxRowHeight : 100;\n\tif(height>d.dim[0]*maxRowHeight) { height=d.dim[0]*maxRowHeight;}\n\tvar rowHeight=height/d.dim[0]; var colWidth=width/d.dim[1];\n\tvar mC=d.hasOwnProperty('colors'); // perform color mapping on the fly\n\tif(rowNames && !d.hasOwnProperty('rows')) { rowNames=false; }\n\tif(rowNames) { \n\t    var fontSize=(rowHeight*0.95); \n\t    if(fontSize>maxFontSize) { fontSize=maxFontSize; };\n\t    if(fontSize>=minFontSize) { \n\t\tctx.font=fontSize+\"px Arial\"; ctx.textAlign='left'; ctx.textBaseline=\"middle\";\n\t    } else { \n\t\trowNames=false;\n\t    }\n\t}\n\tvar zlim=[];\n\tif(mC) {\n\t    if(d.hasOwnProperty('zlim')) {\n\t\tzlim=d.zlim;\n\t    } else {\n\t\tzlim.push(-1*Math.max.apply(null,d.data.map(Math.abs)));\n\t\tzlim.push(-1*zlim[0]);\n\t    }\n\t    zlim.push(d.colors.length/(zlim[1]-zlim[0])); // zlim step\n\t}\n\tvar val=0;\n\tfor(var i=0; i<d.dim[0]; i++) { // rows\n\t    for(var j=0; j<d.dim[1]; j++) { // columns\n\t\tif(mC) { // colors were specified\n\t\t    val=(d.data[i*d.dim[1]+j] - zlim[0])*zlim[2];\n\t\t    if(val<0) { val=0; } else if(val>=d.colors.length) { val=d.colors.length-1; }\n\t\t    val=d.colors[Math.floor(val)];\n\t\t} else { // interpret values as colors directly\n\t\t    val=d.data[i*d.dim[1]+j]\n\t\t}\n\t\tctx.fillStyle=val; ctx.fillRect(x+j*colWidth,y+i*rowHeight,colWidth,rowHeight);\n\n\t    }\n\t    if(rowNames) { 
ctx.fillStyle='black'; ctx.fillText(d.rows[i],x+width+3,y+(i+0.5)*rowHeight); }\n\t}\n\tctx.lineWidth=1; ctx.strokeStyle='black'; ctx.strokeRect(x,y,width,height);\n\t//if(zlim!==undefined) {return(zlim[1])};\n    }\n\n    function updatePathclInfo(pathcl) {\n\tif(pathcl == currentPathCl) return;\n\tclinfostore.getProxy().setExtraParam(\"pathcl\",pathcl)\n\tclinfostore.load();\n\tinfotab.child(\"#clustertab\").tab.show();\n\tinfotab.setActiveTab(2);\n\tinfotab.getActiveTab().setTitle(\"Aspect \"+pathcl)\n\tcurrentPathCl=pathcl;\n    }\n    \n    /* heatmap config */\n    var hc = {\n\tspacing: 5,\n\tdendSpacing: 1,\n\tgeneUnitHeight: 15,\n\tmargins: {top:2,right:80,bottom:15,left:1},\n\tcolcolUnitHeight: 10,\n\trowcolWidth: 10,\n\tcolDendHeight: 50,\n\ttrim: 0,\n\t// heatmap left position\n\thmleft: function() { return(this.margins.left+this.rowcolWidth+this.spacing)},\n\t// colcol top position\n\tcctop: function(data) { return(data.hasOwnProperty('coldend')? this.margins.top+this.colDendHeight+this.dendSpacing : this.margins.top) },\n\t// colcol height\n\tccheight: function(data) { return(this.colcolUnitHeight*data.colcols.dim[0]) },\n\t// heatmap top position\n\thmtop: function(data) { \n\t    if(data.hasOwnProperty('colcols')) {\n\t\treturn(data.hasOwnProperty('coldend')? this.margins.top+this.colDendHeight+this.spacing+this.dendSpacing + this.ccheight(data) : this.margins.top+this.spacing+this.dendSpacing + this.ccheight(data))\n\t    } else {\n\t\treturn(data.hasOwnProperty('coldend')? 
this.margins.top+this.colDendHeight+this.spacing : this.margins.top)\n\t    }\n\t},\n\t// heatmap hight\n\thmheight: function(data,totalHeight) {\n\t    return((totalHeight-this.margins.bottom) - this.hmtop(data));\n\t},\n\t// heatmap width\n\thmwidth: function(totalWidth) { return(totalWidth-this.margins.left-this.rowcolWidth-this.spacing-this.margins.right)},\n\trowlableft: function(totalWidth) {\n\t    return(this.hmleft()+this.hmwidth(totalWidth)+this.spacing);\n\t},\n\trowlabelsize: function(data,totalHeight) {\n\t    return(Math.round(this.hmheight(data,totalHeight)/data.matrix.dim[0]*0.97*72/96))\n\t    //return(18);\n\t}\n    };\n\n    var clusterPanel = Ext.create('Ext.panel.Panel', {\n\tlayout: 'fit',\n\tbodyPadding: 5,\n\t//ohtml: \"Pathway Clustering\",\n\treload: function() {\n\t    clusterPanel.setLoading(true);\n\t    Ext.Ajax.request({\n\t\turl: 'pathcl.json',\n\t\tmethod: 'POST',          \n\t\twaitTitle: 'Connecting',\n\t\twaitMsg: 'Sending data...',                                     \n\t\tscope:this,\n\t\tfailure: function(r,o){console.log('failure:'); console.log(r);},\n\t\tsuccess: function(response) {\n\t\t    var data = Ext.JSON.decode(response.responseText)\n\t\t    if(data.matrix.dim[0]==1) data.matrix.rows=[data.matrix.rows]\n\t\t    if(data.colcols.dim[0]==1) data.colcols.rows=[data.colcols.rows]\n\t\t    clusterPanel.pathcldata=data;\n\t\t    clusterPanelGearMenu.getComponent(0).suspendEvents();\n\t\t    clusterPanelGearMenu.getComponent(0).setValue(data.matrix.zlim[1]);\n\t\t    //clusterPanelGearMenu.getComponent(0).setMaxValue(data.matrix.range[1]);\n\t\t    clusterPanelGearMenu.getComponent(0).resumeEvents()\n\t\t    if(data.hasOwnProperty('trim')) {\n\t\t\thc.trim=data.trim;\n\t\t\tdetailPanelGearMenu.getComponent(1).suspendEvents();\n\t\t\tdetailPanelGearMenu.getComponent(1).setValue(data.trim);\n\t\t\tdetailPanelGearMenu.getComponent(1).resumeEvents();\n\t\t    }\n\t\t    clusterPanel.redraw(clusterPanel.pathcldata);\n\t\t    
clusterPanel.setLoading(false);\n\t\t}\n\t    })\n\t},\n\tredraw: function(data) {\n\t    if(data === undefined) {\n\t\tif(clusterPanel.hasOwnProperty('pathcldata')) { data=clusterPanel.pathcldata; } else { clusterPanel.reload(); return; }\n\t    }\n\t    \n\t    $('#pathcl').remove();\n\t    $('#pathclev').remove();\n\t    delete clusterPanel.evctx;\n\t    $('#pclCD').remove();\n\t    //clusterPanel.update(\"\");\n\t    var s=clusterPanel.getSize();\n\t    clusterPanel.s=s;\n\t    clusterPanel.body.update('<div id=\"pclCD\" style=\"padding:0 0 0 0; position:relative\"><canvas id=\"pathcl\" width='+s.width+' height='+s.height+' style=\"background-color:transparent; position:absolute; left: 0; top: 0; z-index: 0;\"></canvas> <canvas id=\"pathclev\" width='+s.width+' height='+s.height+' style=\"background-color:transparent; position:absolute; left: 0; top: 0; z-index: 1;\"></canvas></div>')\n\t    \n\t    \n            var ctx= $('#pathcl')[0].getContext('2d');\n\t    drawDendrogram(ctx,data.coldend,hc.hmleft(),hc.margins.top,hc.hmwidth(s.width),hc.colDendHeight);\n\t    \n\t    //colcols\n\t    drawHeatmap(ctx,data.colcols,hc.hmleft(),hc.cctop(data),hc.hmwidth(s.width), hc.ccheight(data));\n\t    \n\t    //rowcols\n\t    drawHeatmap(ctx,data.rowcols,hc.margins.left,hc.hmtop(data),hc.rowcolWidth,hc.hmheight(data,s.height))\n\n\t    //main heatmap\n\t    drawHeatmap(ctx,data.matrix,hc.hmleft(),hc.hmtop(data),hc.hmwidth(s.width),hc.hmheight(data,s.height),rowNames=true)\n\t    \n\t    // event handling\n\t    var evc=document.getElementById(\"pathclev\");\n\t    var evctx= evc.getContext('2d');\n\t    clusterPanel.evctx=evctx;\n\n\t    //evctx.strokeRect(0,0,s.width-10,s.height-10);\n\t    try {\n\t\tevctx.setLineDash([5]);\n\t    } catch (err) {}\n\t    evctx.fillStyle='black'; \n\t    evctx.font=\"bold 15px Arial\";\n\n\t    var evtRect = evc.getBoundingClientRect();\n\t    \n\t    evc.addEventListener('click', function(evt) {\n\t\tvar mx=evt.clientX-evtRect.left; 
var my=evt.clientY-evtRect.top;\n\t\tif(my<=s.height-hc.margins.bottom && mx<=s.width-hc.margins.right && mx>=hc.hmleft()) { \n\t\t    if(my>=hc.hmtop(data)) {\n\t\t\tvar ry=Math.floor((my-hc.hmtop(data))/hc.hmheight(data,s.height)*data.matrix.dim[0]);\n\t\t\tevctx.strokeStyle='red'; evctx.lineWidth=2;\n\t\t\ttry { evctx.setLineDash([0]); } catch(err) {}\n\t\t\t//evctx.strokeRect(hc.hmleft(),hc.hmtop(data)+ry*hc.hmheight(data,s.height)/data.matrix.dim[0],hc.hmwidth(s.width),hc.hmheight(data,s.height)/data.matrix.dim[0]); \n\t\t\tevctx.strokeRect(0,hc.hmtop(data)+ry*hc.hmheight(data,s.height)/data.matrix.dim[0],s.width,hc.hmheight(data,s.height)/data.matrix.dim[0]); \n\t\t\tevctx.strokeStyle='black'; evctx.lineWidth=1;\n\t\t\ttry { evctx.setLineDash([5]); } catch(err) {}\n\t\t\tupdatePathclInfo(data.matrix.rows[ry])\n\t\t    }\n\t\t}\n\t    }, false);\n\t    evc.addEventListener('mousemove', function(evt) {\n\t\tvar mx=evt.clientX-evtRect.left; var my=evt.clientY-evtRect.top;\n\n\t\tevctx.clearRect(0,0,s.width,s.height);\n\t\tif(detailPanel.hasOwnProperty('evctx')) {\n\t\t    var v=detailPanel.s;\n\t\t    detailPanel.evctx.clearRect(0,0,v.width,v.height);\n\t\t}\n\t\tif(my<=s.height-hc.margins.bottom && mx<=s.width-hc.margins.right && mx>=hc.hmleft() && my>=hc.cctop(data)) { \n\t\t    //mx=hc.hmleft()+(rx+0.5)*hc.hmwidth(s.width)/data.matrix.dim[1];\n\t\t    evctx.beginPath(); \n\t\t    evctx.moveTo(mx,hc.cctop(data)); evctx.lineTo(mx,s.height-hc.margins.bottom);\n\t\t    // update the line in the gene panel\n\t\t    if(detailPanel.hasOwnProperty('evctx')) {\n\t\t\tdetailPanel.evctx.beginPath();\n\t\t\tdetailPanel.evctx.moveTo(mx,hc.cctop(detailPanel.genecldata))\n\t\t\tdetailPanel.evctx.lineTo(mx,v.height-hc.margins.bottom)\n\t\t\tdetailPanel.evctx.stroke();\n\t\t    }\n\t\t    var rx=Math.floor((mx-hc.hmleft())/hc.hmwidth(s.width)*data.matrix.dim[1]);\n\t\t    if(mx>s.width/2) { \n\t\t\tevctx.textAlign=\"end\"; mx-=5; \n\t\t    } else 
{\n\t\t\tevctx.textAlign=\"start\"; mx+=5;\n\t\t    }\n\t\t    evctx.textBaseline=\"bottom\";\n\t\t    evctx.fillText(\"cell: \"+data.matrix.cols[rx],mx,my-3); \n\n\t\t    if(my>=hc.hmtop(data)) {\n\t\t\tvar ry=Math.floor((my-hc.hmtop(data))/hc.hmheight(data,s.height)*data.matrix.dim[0]);\n\t\t\t//my=hc.hmtop(data)+(ry+0.5)*hc.hmheight(data,s.height)/data.matrix.dim[0];\n\t\t\tevctx.moveTo(hc.margins.left,my); evctx.lineTo(s.width-hc.margins.right,my);\n\t\t\t\n\t\t\t//evctx.fillText(\"cell: \"+data.matrix.cols[rx],mx+5,my-5); \n\t\t\tevctx.textBaseline=\"top\";\n\t\t\tevctx.fillText(\"aspect: \"+data.matrix.rows[ry],mx,my+3);\n\n\t\t    } else {\n\t\t\tevctx.moveTo(hc.hmleft(),my); evctx.lineTo(s.width-hc.margins.right,my);\n\t\t\tvar ry=Math.floor((my-hc.cctop(data))/hc.ccheight(data)*data.colcols.dim[0]);\n\t\t\tvar val=data.colcols.rows[ry]; \n\t\t\tif(val!==undefined) {\n\t\t\t    evctx.textBaseline=\"top\";\n\t\t\t    evctx.fillText(\"metadata: \"+val,mx,my+3);\n\t\t\t}\n\t\t    }\n\t\t    evctx.stroke();\n\t\t}\n\t    }, false);\n\t    \n\n\t    evc.addEventListener('mouseout', function(evt) {\n\t\tevctx.clearRect(0,0,s.width,s.height);\n\t\tvar v=detailPanel.getSize();\n\t\tif(detailPanel.hasOwnProperty('evctx')) {\n\t\t    detailPanel.evctx.clearRect(0,0,v.width,v.height);\n\t\t}\n\t    });\n\t    \n\t},\n\tlisteners: {\n\t    resize: function(cmp,width,height,oldWidth,oldHeight,opts) {\n\t\tcmp.redraw(cmp.pathcldata);\n\t    }\n\t}\n    });\n\n\n    var clusterPanelGearMenu = Ext.create('Ext.menu.Menu', {\n        id: 'clusterGearMenu',\n        style: {\n            overflow: 'visible'     // For the Combo popup\n        },\n        items: [{\n\t\tfieldLabel: 'Z limit',\n\t\tname: 'zlim',\n\t\txtype: 'numberfield',\n\t        value: -1,\n\t\tdecimalPrecision: 3,\n\t\tminValue: 0.0,\n\t        maxValue: 100,\n\t\twidth: 200,\n\t        disabled: false,\n\t\ttooltip: 'Set the range of overdispersion scores illustrated by colors',\n\t\tlisteners : {\n\t\t   
 change : {buffer: 800, fn:function(f,v) {\n\t\t\tclusterPanel.pathcldata.matrix.zlim=[-1*v,v];\n\t\t\tclusterPanel.redraw()\n\t\t    }}\n\t\t}\n\t}\n\t],\n\tlisteners:{\n\t    'mouseleave': {buffer: 1000, fn:function( menu, e, eOpts){\n\t\tmenu.hide();\n\t    }}\n\t}\n    });\n\n    \n    var detailPanelGearMenu = Ext.create('Ext.menu.Menu', {\n        id: 'detailGearMenu',\n        style: {\n            overflow: 'visible'     // For the Combo popup\n        },\n        items: [{\n\t\tfieldLabel: 'N genes',\n\t\txtype: 'numberfield',\n\t        tooltip: 'Number of genes to show in the Expression Details panel',\n\t\tlabel: 'N genes',\n\t\tvalue: 20,\n\t\tminValue: 1,\n\t\tmaxValue: 1000,\n\t\tdisabled: true,\n\t\tlisteners : {\n\t\t    change : {buffer: 800, fn:function(f,v) {detailPanel.reload()}}\n\t\t}\n\t    },{\n\t\tfieldLabel: 'Trim',\n\t\tname: 'trim',\n\t\txtype: 'numberfield',\n\t\tvalue: hc.trim,\n\t\tdecimalPrecision: 5,\n\t\tminValue: 0.0,\n\t\tmaxValue: 0.5,\n\t\twidth: 200,\n\t\tdisabled: true,\n\t\ttooltip: 'Winsorization trim fraction',\n\t\tlisteners : {\n\t\t    change : {buffer: 800, fn:function(f,v) {hc.trim=v; detailPanel.reload()}}\n\t\t}\n            }, '-',\n\t    {\n                text: 'High/low genes',\n                checked: false,\n\t\ttooltip: 'Whether to include genes from both sides of the PC loading (true) or just high magnitude  (false)',\n\t\tdisabled: true,\n\t\tlisteners : {\n\t\t    checkchange : function() {detailPanel.reload()}\n\t\t}\n\t    }],\n\tlisteners:{\n\t    'mouseleave': {buffer: 1000, fn:function( menu, e, eOpts){\n\t\tmenu.hide();\n\t    }}\n }\n    });\n\n    var ngenesSlider = Ext.create('Ext.slider.Single', {\n\tlabel: 'N genes',\n\ttip: 'number of genes to show',\n\ttipText: function(thumb){ return Ext.String.format('show {0} genes', thumb.value); },\n\twidth: 100,\n\tvalue: 20,\n\tincrement: 1,\n\tminValue: 0,\n\tmaxValue: 500,\n\n    });\n\n    var viewport = 
Ext.create('Ext.Viewport', {\n        layout: {\n            type: 'border',\n            padding: 5\n        },\n        defaults: {\n            split: true\n        },\n        items: [{ region: 'center',\n\t\t  layout: 'border',\n\t\t  items: [{ region: 'north',\n\t\t\t    layout: 'fit',\n\t\t\t    id: 'clusterPanel',\n\t\t\t    title: 'Pathway Overdispersion',\n\t\t\t    tools: [\n\t\t\t\t{ type:'gear',\n\t\t\t\t  tooltip: 'Settings',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      clusterPanelGearMenu.showBy(t);\n\t\t\t\t  }\n\t\t\t\t},\n\t\t\t\t{ type:'help',\n\t\t\t\t  tooltip: 'Tutorial',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      tutorialWindow.show(); \n\t\t\t\t  }\n\t\t\t\t},\n\t\t\t\t{ type:'save',\n\t\t\t\t  tooltip: 'Save image',\n\t\t\t\t  handler: function(e,el,o,t) {\n\t\t\t\t      if($('#pathcl').length==0) { return; }\n\t\t\t\t      var changingImage = Ext.create('Ext.Img', {\n\t\t\t\t\t  src: $('#pathcl')[0].toDataURL(\"image/png\",1.0)\n\t\t\t\t      });\n\t\t\t\t      win = new Ext.Window({\n\t\t\t\t\t  title: 'exported image: use right click to save the image',\n\t\t\t\t\t  layout: 'fit',\n\t\t\t\t\t  autoScroll: true,\n\t\t\t\t\t  modal: true,\n\t\t\t\t\t  closeAction: 'hide',\n\t\t\t\t\t  items:[changingImage]\n\t\t\t\t      });\n\t\t\t\t      win.show();\n\t\t\t\t  }\n\t\t\t\t}\n\t\t\t    ],\n\t\t\t    minHeight: 200,\n\t\t\t    height: 300,\n\t\t\t    bodyPadding: 0,\n\t\t\t    split: true,\n\t\t\t    items:[clusterPanel]\n\t\t\t},{ region: 'center',\n\t\t\t    layout: 'fit',\n\t\t\t    id: 'expressionDetailsPane',\n\t\t\t    minHeight: 100,\n\t\t\t    collapsible: false,\n//\t\t\t    headerPosition: 'bottom',\n\t\t\t    title: 'Expression Details',\n\t\t\t    tools: [\n\t\t\t\t{ type:'search',\n\t\t\t\t  tooltip: 'Search for genes matching the current consensus pattern',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      detailPanel.searchSimilar();\n\t\t\t\t  }\n\t\t\t\t},{ type:'collapse',\n\t\t\t\t  
tooltip: 'Run GO enrichment analysis on the current gene set',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      if(detailPanel.hasOwnProperty('genecldata')) {\n\t\t\t\t\t  // write out current gene set\n\t\t\t\t\t  geinfostore.getProxy().setExtraParam(\"genes\",JSON.stringify(detailPanel.genecldata.matrix.rows))\n\t\t\t\t\t  geinfostore.load();\n\t\t\t\t\t  infotab.child(\"#enrichmenttab\").tab.show();\n\t\t\t\t\t  // show the info tab\n\t\t\t\t\t  infotab.setActiveTab(3);\n\t\t\t\t      }\n\t\t\t\t  }\n\t\t\t\t},{ type:'gear',\n\t\t\t\t  tooltip: 'Settings',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      detailPanelGearMenu.showBy(t);\n\t\t\t\t  }\n\t\t\t\t},\n\t\t\t\t{ type:'save',\n\t\t\t\t  tooltip: 'Save image',\n\t\t\t\t  handler: function(e,el,o,t) {\n\t\t\t\t      if($('#genecl').length==0) { return; }\n\t\t\t\t      var changingImage = Ext.create('Ext.Img', {\n\t\t\t\t\t  src: $('#genecl')[0].toDataURL(\"image/png\",1.0)\n\t\t\t\t      });\n\t\t\t\t      win = new Ext.Window({\n\t\t\t\t\t  title: 'exported image: use right click to save the image',\n\t\t\t\t\t  layout: 'fit',\n\t\t\t\t\t  autoScroll: true,\n\t\t\t\t\t  modal: true,\n\t\t\t\t\t  closeAction: 'hide',\n\t\t\t\t\t  items:[changingImage]\n\t\t\t\t      });\n\t\t\t\t      win.show();\n\t\t\t\t  }\n\t\t\t\t}\n\t\t\t    ],\n\t\t\t    header: true,\n\t\t\t    items:[detailPanel],\n\t\t\t    autoScroll: true,\n\t\t\t    autoShow: true,\n\t\t\t    /*listeners: {\n\t\t\t\tafterrender: function(panel) {\n\t\t\t\t    console.log(\"boo\");\n\t\t\t\t    var header=panel.getHeader();\n\t\t\t\t    header.insert(1,[ngenesSlider]);\n\t\t\t\t}\n\t\t\t    }*/\n\t\t\t}]\n\t\t},{ region: 'east',\n\t\t    collapsible: true,\n\t\t    title: 'Info',\n\t\t    split: true,\n\t\t    layout: 'fit',\n\t\t    width: '30%',\n\t\t    minWidth: 100,\n\t\t    minHeight: 140,\n\t\t    bodyPadding: 0,\n\t\t    items:[infotab]\n\t\t}]\n\t});\n\n\n    if(Ext.util.Cookies.get(\"hidetutorial\")==null) 
{\n\ttutorialWindow.show(); \n    }\n\n\n});\n\n"
  },
  {
    "path": "web/pathcl_canvas_1.1.js",
    "content": "Ext.require(['*']);\n\nd3.selection.prototype.moveToFront = function() {\n  return this.each(function(){\n  this.parentNode.appendChild(this);\n  });\n};\n\nExt.onReady(function() {\n   \n\n    var cw;\n    var currentPathCl=-1; // currently selected pathway cluster\n\n    Ext.tip.QuickTipManager.init();    \n\n    //Ext.state.Manager.setProvider(Ext.create('Ext.state.CookieProvider'));\n\n    var clusterToolbar = Ext.create('Ext.toolbar.Toolbar', {\n\t    items: [ {\n\t\t    // xtype: 'button', // default for Toolbars\n\t\t    text: 'Show'\n\t\t},'->',{\n\t\t    xtype    : 'textfield',\n\t\t    icon: 'preview.png',\n\t\t    cls: 'x-btn-text-icon',\n\t\t    fieldLabel: 'number of genes',\n\t\t    labelStyle: 'white-space: nowrap;',\n\t\t    name     : 'nGenes',\n\t\t    emptyText: 'number of genes to show'\n\t\t}]\n\t});\n\n    var tutorialWindow = new Ext.Window({\n\tid:'tutorial-window', \n\ttitle: 'Video Tutorial',\n\tlayout:'fit',  \n\tactiveItem: 0,  \n\n\tdefaults: {border:false}, \n\tbbar: Ext.create('Ext.toolbar.Toolbar', {\n\t    padding: 5,\n\t    items   : [{\n\t\txtype: 'checkbox',\n\t\tboxLabel: 'do not automatically show this tutorial video on startup when I visit next time',\n\t\tchecked: Ext.util.Cookies.get(\"hidetutorial\")!=null,\n\t\tname: 'dontshowtutorial',\n\t\tlisteners: {\n\t\t    change: function(field, value) {\n\t\t\tif(value) {\n\t\t\t    var now = new Date();\n\t\t\t    var expiry = new Date(now.getTime() + 365 * 24 * 60 * 60 * 1000);\n\t\t\t    Ext.util.Cookies.set(\"hidetutorial\",true,expiry)\n\t\t\t} else {\n\t\t\t    console.log(\"clearing hidetutorial\");\n\t\t\t}\n\t\t    }\n\t\t}\n\t    },'->',{\n\t\txtype: 'button',\n\t\ttext: 'Close',\n\t\thandler: function() {\n\t\t    tutorialWindow.hide();\n\t\t}\n\t    }\n           ]\n\t}),\n\titems : [{\n\t\t    id: \"video\",\n\t\t    html: '<iframe width=\"720\" height=\"400\" src=\"//www.youtube.com/embed/N2Ritx5yqrc?rel=0\" frameborder=\"0\" 
allowfullscreen></iframe>'\n\t\t}]\t\n\t});\n\n    \n/* 2D embedding panel */\n    var embeddingPanel = Ext.create('Ext.panel.Panel', {\n\tlayout: 'fit',\n\tbodyPadding:0,\n\tautoScroll: false,\n\tdraw: function(cols) {\n\t    if(!clusterPanel.pathcldata.hasOwnProperty('embedding')) {\n\t\treturn;\n\t    }\n\t    if(cols===undefined) {\n\t\t// use last colcol row for colors\n\t\tcols=[];\n\t\tif(clusterPanel.pathcldata.hasOwnProperty('colcols')) {\n\t\t    var i=clusterPanel.pathcldata.colcols.dim[0]-1;\n\t\t    for(var j=0; j<clusterPanel.pathcldata.colcols.dim[1]; j++) { \n\t\t\tcols.push(clusterPanel.pathcldata.colcols.data[i*clusterPanel.pathcldata.colcols.dim[1]+j])\n\t\t    }\n\t\t} else {\n\t\t    // color everything gray\n\t\t    for(var j=0;j<clusterPanel.pathcldata.embedding.dim[0];j++) {\n\t\t\tcols.push(\"#808080\")\n\t\t    }\n\t\t}\n\t    }\n\t    \n\t    embeddingPanel.cols=cols;\n\t    embeddingPanel.redraw();\n\t},\n\trecolor: function(cols) {\n\t    if(embeddingPanel.hasOwnProperty('cols')) {\n\t\t// recolor\n\t\tembeddingPanel.cols=cols;\n\t\td3.select(\"#embedding\").selectAll(\"circle\").attr(\"fill\",function(d,i) {\n\t\t    return(embeddingPanel.cols[d3.select(this).attr(\"celli\")-1])\n\t\t})\n\t    } else { // just redraw everything\n\t\tembeddingPanel.draw(cols);\n\t    }\n\t},\n\tredraw: function() {\n\t    if(!clusterPanel.hasOwnProperty('pathcldata') || !clusterPanel.pathcldata.hasOwnProperty('embedding') || !embeddingPanel.hasOwnProperty('cols')) {\n\t\treturn;\n\t    }\n\n\t    var padding=12;\n\t    var el = d3.select(embeddingPanel.getLayout().getElementTarget().dom)\n\t    var s=embeddingPanel.getSize();\n\t    d3.select(\"#embedding\").remove();\n\t    var svg = el.append(\"svg\").attr(\"id\",\"embedding\").attr(\"width\",s.width+\"px\").attr(\"height\",s.height+\"px\").attr('xmlns','http://www.w3.org/2000/svg');\n\n\t    var 
xScale=d3.scale.linear().domain(clusterPanel.pathcldata.embedding.xrange).range([padding,s.width-2*padding]);\n\t    var yScale=d3.scale.linear().domain(clusterPanel.pathcldata.embedding.yrange).range([padding,s.height-2*padding]);\n\n\t    //var yAxis=d3.svg.axis().scale(yScale).orient(\"left\");\n\t    //var xAxis=d3.svg.axis().scale(xScale).orient(\"bottom\");\n\t    //svg.append(\"g\").attr(\"class\",\"axis\").attr(\"transform\", \"translate(0,\" + (s.height - 2*padding) + \")\").call(xAxis);\n\t    //svg.append(\"g\").attr(\"class\",\"axis\").attr(\"transform\", \"translate(\" + 2*padding + \",0)\").call(yAxis);\n\t    //svg.append(\"g\").attr(\"class\",\"axis\").call(xAxis);\n\t    embeddingPanel.embarray = $.map(clusterPanel.pathcldata.embedding.data, function(value, index) {\n\t\tvalue.push(index); return [value];\n\t    });\n\t    svg.selectAll(\"circle\")\n\t\t.data(embeddingPanel.embarray)\n\t\t.enter()\n\t\t.append(\"circle\")\n\t\t.attr(\"id\",function(d) {return(\"cell\"+(d[2]-1))})\n\t\t.attr(\"celli\",function(d) {return(d[2])})\n\t\t.attr(\"cx\",function(d) { return(xScale(d[0])) })\n\t\t.attr(\"cy\",function(d) { return(yScale(d[1])) })\n\t\t.attr(\"fill\",function(d) {return(embeddingPanel.cols[d[2]-1])})\n\t\t.attr(\"r\",5).on(\"mouseover\",function() {\n\t\t    d3.selectAll(\"#embedding circle.selected\").classed(\"selected\",false);\n\t\t    var rx=d3.select(this).classed(\"selected\",true).attr(\"celli\")-1;\n\t\t    var v=clusterPanel.s;\n\t\t    clusterPanel.evctx.clearRect(0,0,v.width,v.height);\n\t\t    clusterPanel.evctx.beginPath();\n\t\t    var mx=hc.hmleft()+(rx+0.5)*hc.hmwidth(v.width)/clusterPanel.pathcldata.matrix.dim[1];\n\t\t    clusterPanel.evctx.moveTo(mx,hc.cctop(clusterPanel.pathcldata))\n\t\t    clusterPanel.evctx.lineTo(mx,v.height-hc.margins.bottom)\n\t\t    clusterPanel.evctx.stroke();\n\t\t    if(detailPanel.hasOwnProperty('evctx')) 
{\n\t\t\tv=detailPanel.getSize();\n\t\t\tdetailPanel.evctx.clearRect(0,0,v.width,v.height);\n\t\t\tdetailPanel.evctx.beginPath();\n\t\t\tdetailPanel.evctx.moveTo(mx,hc.cctop(detailPanel.genecldata))\n\t\t\tdetailPanel.evctx.lineTo(mx,v.height-hc.margins.bottom)\n\t\t\tdetailPanel.evctx.stroke();\n\t\t    }\n\t\t    \n\t\t}).append(\"svg:title\").text(function(d){ return(d[3]) });\n\t    \n\t    svg.on('mouseout', function() {\n\t\tvar v=clusterPanel.s;\n\t\tclusterPanel.evctx.clearRect(0,0,v.width,v.height);\n\t\tv=detailPanel.getSize();\n\t\tif(detailPanel.hasOwnProperty('evctx')) {\n\t\t    detailPanel.evctx.clearRect(0,0,v.width,v.height);\n\t\t}\n\t    });\n\t\t\n\t},\n\tlisteners: {\n\t    resize: function(cmp,width,height,oldWidth,oldHeight,opts) {\n\t\tcmp.redraw();\n\t    }\n\t}\n    });\n    \n\n    /* DETAILED CLUSTERING VIEW */\n    var detailMode=1; // 1: gene 2: pathway\n    var detailItemList={}; // pathway or gene list\n    var detailNGenes=20; // max genes to show \n\n    var detailPanel = Ext.create('Ext.panel.Panel', {\n\tlayout: 'fit',\n\tbodyPadding: 5,\n\tautoScroll: false,\n\tshowGenes: function(ids) {\n\t    if(ids === undefined || ids.length==0) return;\n\t    detailPanel.setLoading(true);\n\t    Ext.Ajax.request({\n\t\turl: 'genecl.json',\n\t\tmethod: 'POST',          \n\t\twaitTitle: 'Connecting',\n\t\twaitMsg: 'Sending data...',                                     \n\t\tparams: {\n\t\t    \"genes\" : encodeURIComponent(JSON.stringify(ids))\n\t\t},\n\t\tscope:this,\n\t\tfailure: function(r,o){console.log('failure:'); console.log(r);},\n\t\tsuccess: function(response) {\n\t\t    var data = Ext.JSON.decode(response.responseText)\n\t\t    detailPanel.ids=ids;\n\t\t    if(data.matrix.dim[0]==1) data.matrix.rows=[data.matrix.rows]\n\t\t    detailPanel.genecldata=data;\n\t\t    detailPanel.mode=1;\n\t\t    Ext.getCmp('expressionDetailsPane').setTitle(\"Expression Details: selected genes\");\n\t\t    
detailPanel.redraw(detailPanel.genecldata)\n\t\t    detailPanelGearMenu.getComponent(0).disable()\n\t\t    detailPanelGearMenu.getComponent(1).enable()\n\t\t    detailPanelGearMenu.getComponent(3).disable()\n\t\t    detailPanel.setLoading(false);\n\t\t}\n\t    });\n\t},\n\tshowPathways: function(ids) {\n\t    if(ids === undefined || ids.length==0) return;\n\t    detailPanel.setLoading(true);\n\t    var ngenes=detailPanelGearMenu.getComponent(0).getValue();\n\t    var twosided=detailPanelGearMenu.getComponent(3).checked;\n\t    Ext.Ajax.request({\n\t\turl: 'pathwaygenes.json',\n\t\tmethod: 'POST',          \n\t\twaitTitle: 'Connecting',\n\t\twaitMsg: 'Sending data...',                                     \n\t\tparams: {\n\t\t    \"ngenes\" : ngenes,\n\t\t    \"twosided\" : twosided,\n\t\t    \"genes\" : encodeURIComponent(JSON.stringify(ids)),\n\t\t    \"trim\" : hc.trim\n\t\t},\n\t\tscope:this,\n\t\tfailure: function(r,o){console.log('failure:'); console.log(r);},\n\t\tsuccess: function(response) {\n\t\t    var data = Ext.JSON.decode(response.responseText)\n\t\t    detailPanel.ids=ids;\n\t\t    if(data.matrix.dim[0]==1) data.matrix.rows=[data.matrix.rows]\n\t\t    detailPanel.genecldata=data;\n\t\t    detailPanel.mode=2; \n\t\t    Ext.getCmp('expressionDetailsPane').setTitle(\"Expression Details: top genes in specified pathways\");\n\t\t    detailPanel.redraw(detailPanel.genecldata)\n\t\t    detailPanelGearMenu.getComponent(0).enable()\n\t\t    detailPanelGearMenu.getComponent(1).enable()\n\t\t    detailPanelGearMenu.getComponent(3).enable()\n\t\t    detailPanel.setLoading(false);\n\t\t}\n\t    });\n\t    \n\t},\n\tsearchSimilar: function(pattern) { // request genes most closely matching current data.colcol\n\t    if(!detailPanel.hasOwnProperty('genecldata')) return;\n\t    detailPanel.setLoading(true);\n\t    if(pattern === undefined) {\n\t\t// determine the colcol or single gene\n\t\tif(detailPanel.genecldata.hasOwnProperty('colcols')) {\n\t\t    
pattern=detailPanel.genecldata.colcols.data;\n\t\t} else {\n\t\t    pattern=detailPanel.genecldata.matrix.data.slice(0,detailPanel.genecldata.matrix.dim[1]);\n\t\t}\n\t    }\n\t    var ngenes=detailPanelGearMenu.getComponent(0).getValue();\n\t    var twosided=detailPanelGearMenu.getComponent(3).checked;\n\t    Ext.Ajax.request({\n\t\turl: 'patterngenes.json',\n\t\tmethod: 'POST',          \n\t\twaitTitle: 'Connecting',\n\t\twaitMsg: 'Sending data...',                                     \n\t\tparams: {\n\t\t    \"ngenes\" : ngenes,\n\t\t    \"twosided\" : twosided,\n\t\t    \"pattern\" : encodeURIComponent(JSON.stringify(pattern)),\n\t\t    \"trim\" : hc.trim\n\t\t},\n\t\tscope:this,\n\t\tfailure: function(r,o){console.log('failure:'); console.log(r);},\n\t\tsuccess: function(response) {\n\t\t    var data = Ext.JSON.decode(response.responseText)\n\t\t    detailPanel.pattern=pattern;\n\t\t    if(data.matrix.dim[0]==1) data.matrix.rows=[data.matrix.rows]\n\t\t    detailPanel.genecldata=data;\n\t\t    detailPanel.mode=3;\n\t\t    Ext.getCmp('expressionDetailsPane').setTitle(\"Expression Details: genes matching specified pattern\");\n\t\t    detailPanel.redraw(detailPanel.genecldata)\n\t\t    detailPanelGearMenu.getComponent(0).enable()\n\t\t    detailPanelGearMenu.getComponent(1).enable()\n\t\t    detailPanelGearMenu.getComponent(3).enable()\n\t\t    detailPanel.setLoading(false);\n\t\t}\n\t    });\n\t},\n\treload: function() { // goes back to the server to redraw the same ids\n\t    if(!detailPanel.hasOwnProperty('mode')) return;\n\t    switch(detailPanel.mode) {\n\t    case 1: detailPanel.showGenes(detailPanel.ids); break;\n\t    case 2: detailPanel.showPathways(detailPanel.ids);break;\n\t    case 3: detailPanel.searchSimilar(detailPanel.pattern);break;\n\t    default: console.log(\"reload requested with an undefined detailPanel mode\");\n\t    }\n\t},\n\tredraw: function(data) { // redraws the panels without going back to the server\n\t    if(data === undefined) 
{\n\t\tif(detailPanel.hasOwnProperty('genecldata')) { data=detailPanel.genecldata; } else {  return; }\n\t    }\n\t    \n\t    \n\t    $('#genecl').remove();\n\t    delete detailPanel.evctx;\n\t    $('#geneclev').remove();\n\t    $('#gclCD').remove();\n\t    var s=detailPanel.getSize();\n\t    // adjust height to match maxRowHeight (=15) if needed\n\t    if((s.height-hc.hmtop(data)-hc.margins.bottom)/data.matrix.dim[0] > 15) {\n\t\ts.height=15*data.matrix.dim[0]+hc.margins.bottom+hc.hmtop(data);\n\t    }\n\t    detailPanel.s=s;\n\t    \n\t    detailPanel.body.update('<div id=\"gclCD\" style=\"padding:0 0 0 0; position:relative\"><canvas id=\"genecl\" width='+s.width+' height='+s.height+' style=\"background-color:transparent; position:absolute; left: 0; top: 0; z-index: 0;\"></canvas> <canvas id=\"geneclev\" width='+s.width+' height='+s.height+' style=\"background-color:transparent; position:absolute; left: 0; top: 0; z-index: 1;\"></canvas></div>')\n\t    \n\t    var gctx= $('#genecl')[0].getContext('2d');\n\t    if(data.hasOwnProperty('colcols')) {\n\t\t//colcols\n\t\tdrawHeatmap(gctx,data.colcols,hc.hmleft(),hc.cctop(data),hc.hmwidth(s.width), hc.ccheight(data));\n\t\t//rowcols\n\t\tdrawHeatmap(gctx,data.rowcols,hc.margins.left,hc.hmtop(data),hc.rowcolWidth,hc.hmheight(data,s.height),false)\n\t    }\n\n\t    //main heatmap\n\t    drawHeatmap(gctx,data.matrix,hc.hmleft(),hc.hmtop(data),hc.hmwidth(s.width),hc.hmheight(data,s.height),true)\n\t    // event handling\n\t    var evc=document.getElementById(\"geneclev\");\n\t    var evctx= evc.getContext('2d');\n\t    detailPanel.evctx=evctx;\n\t    try {\n\t\tevctx.setLineDash([5]);\n\t    } catch (err) {}\n\t    evctx.fillStyle='black'; \n\t    evctx.font=\"bold 15px Arial\";\n\t    \n\t    var evtRect = evc.getBoundingClientRect();\n\n\n\t    evc.addEventListener('click', function(evt) {\n\t\tif(!clusterPanel.hasOwnProperty(\"pathcldata\") || !clusterPanel.pathcldata.hasOwnProperty('embedding')) { return; }\n\t\tvar 
mx=evt.clientX-evtRect.left; var my=evt.clientY-evtRect.top;\n\t\tif(my<=s.height-hc.margins.bottom && mx<=s.width-hc.margins.right && mx>=hc.hmleft() && my>=hc.cctop(data)) { \n\t\t    if(my>=hc.hmtop(data)) {\n\t\t\tvar ry=Math.floor((my-hc.hmtop(data))/hc.hmheight(data,s.height)*data.matrix.dim[0]);\n\t\t\t// update embedding\n\t\t\tembeddingPanel.recolor(heatmapRowColors(data.matrix,ry));\n\t\t\tExt.getCmp(\"embeddingDiv\").setTitle(\"2D Embedding: gene \"+data.matrix.rows[ry])\n\t\t    } else {\n\t\t\tvar ry=Math.floor((my-hc.cctop(data))/hc.ccheight(data)*data.colcols.dim[0]);\n\t\t\tembeddingPanel.recolor(heatmapRowColors(data.colcols,ry));\n\t\t\tExt.getCmp(\"embeddingDiv\").setTitle(\"2D Embedding: consensus pattern\")\n\t\t    }\n\t\t}\n\t    }, false);\n\t    \n\t    evc.addEventListener('mousemove', function(evt) {\n\t\tvar mx=evt.clientX-evtRect.left; var my=evt.clientY-evtRect.top;\n\n\t\tevctx.clearRect(0,0,s.width,s.height);\n\t\tvar v=clusterPanel.s;\n\t\tclusterPanel.evctx.clearRect(0,0,v.width,v.height);\n\t\tif(my<=s.height-hc.margins.bottom && mx<=s.width-hc.margins.right && mx>=hc.hmleft() && my>=hc.cctop(data)) { \n\t\t    //mx=hc.hmleft()+(rx+0.5)*hc.hmwidth(s.width)/data.matrix.dim[1];\n\t\t    evctx.beginPath(); \n\t\t    evctx.moveTo(mx,hc.cctop(data)); evctx.lineTo(mx,s.height-hc.margins.bottom);\n\n\t\t    var rx=Math.floor((mx-hc.hmleft())/hc.hmwidth(s.width)*data.matrix.dim[1]);\n\t\t    \n\t\t    // update the line in the cluster panel\n\t\t    clusterPanel.evctx.beginPath();\n\t\t    clusterPanel.evctx.moveTo(mx,hc.cctop(clusterPanel.pathcldata))\n\t\t    clusterPanel.evctx.lineTo(mx,v.height-hc.margins.bottom)\n\t\t    clusterPanel.evctx.stroke();\n\n\t\t    if(mx>s.width/2) { \n\t\t\tevctx.textAlign=\"end\"; mx-=5; \n\t\t    } else {\n\t\t\tevctx.textAlign=\"start\"; mx+=5;\n\t\t    }\n\t\t    evctx.textBaseline=\"bottom\";\n\t\t    evctx.fillText(\"cell: \"+data.matrix.cols[rx],mx,my-3); \n\t\t    \n\n\n\t\t    
d3.selectAll(\"#embedding circle.selected\").classed(\"selected\",false);\n\t\t    d3.select(\"#cell\"+rx).classed(\"selected\",true).moveToFront();\n\n\t\t    if(my>=hc.hmtop(data)) {\n\t\t\tvar ry=Math.floor((my-hc.hmtop(data))/hc.hmheight(data,s.height)*data.matrix.dim[0]);\n\t\t\t//my=hc.hmtop(data)+(ry+0.5)*hc.hmheight(data,s.height)/data.matrix.dim[0];\n\t\t\tevctx.moveTo(hc.margins.left,my); evctx.lineTo(s.width-hc.margins.right,my);\n\n\t\t\tevctx.textBaseline=\"top\";\n\t\t\tevctx.fillText(\"gene: \"+data.matrix.rows[ry],mx,my+3); \t\n\t\t    } else {\n\t\t\tevctx.moveTo(hc.hmleft(),my); evctx.lineTo(s.width-hc.margins.right,my);\n\t\t    }\n\t\t    evctx.stroke();\n\n\t\t}\n\t    }, false);\n\n\t    evc.addEventListener('mouseout', function(evt) {\n\t\tevctx.clearRect(0,0,s.width,s.height);\n\t\tvar v=clusterPanel.s;\n\t\tclusterPanel.evctx.clearRect(0,0,v.width,v.height);\n\t\td3.selectAll(\"#embedding circle.selected\").classed(\"selected\",false);\n\t    });\n\n\t},\n\tlisteners: {\n\t    resize: function(cmp,width,height,oldWidth,oldHeight,opts) {\n\t\tcmp.redraw();\n\t    }\n\t}\n\n    })\n\n\n    // clear filter filed trigger button\n    Ext.define('Ext.ux.CustomTrigger', {\n\textend: 'Ext.form.field.Trigger',\n\talias: 'widget.customtrigger',\n\tinitComponent: function () {\n            var me = this;\n            me.triggerCls = 'x-form-clear-trigger';\n            me.callParent(arguments);\n\t},\n\t// override onTriggerClick\n\tonTriggerClick: function() {\n\t    if(this.getValue()!='') {\n\t\tthis.setRawValue('');\n\t\tthis.fireEvent('change',this,'');\n\t    }\n\t}\n    });\n\n\n    /* PATHWAY CLUSTER INFO */\n\n    Ext.define('clinfo',{\n        extend: 'Ext.data.Model',\n        fields: [\n\t    'name', 'id', {name: 'od',type: 'float'},{name: 'npc', type: 'integer'},{name: 'sign', type: 'integer'},{name: 'initsel', type: 'integer'}\n        ],\n\tidProperty: 'id'\n    });\n\n    var clinfostore = Ext.create('Ext.data.Store', {\n        id: 
'clinfostore',\n        model: 'clinfo',\n        remoteSort: true,\n        proxy: {\n            type: 'jsonp',\n            url: 'clinfo.json',\n\t    extraParams: {\n\t\tpathcl: currentPathCl\n\t    },\n\t    reader: {\n\t\troot: 'genes',\n                totalProperty: 'totalCount'\n            },\n\t    simpleSortMode: true,\n        },\n        autoLoad: false,\n\tpageSize: 50,\n        remoteFilter: true,\n\tlisteners: {\n\t    load: function(r) {\n\t\t// use the supplied 'initsel' to set the initial selection\n\t\tclSelectModel.suspendEvent('selectionchange')\n\t\tfor(var i=0;i<r.data.items.length;i++) {\n\t\t    if(r.data.items[i].data.initsel==1) {\n\t\t\tclSelectModel.select(i,true);\n\t\t    }\n\t\t}\n\t\tclSelectModel.resumeEvent('selectionchange');\n\t\tclSelectModel.fireEvent('selectionchange',clSelectModel,clSelectModel.getSelection());\n\t    }\n\t}\n    });\n\n    var clSelectModel = Ext.create('Ext.selection.CheckboxModel', {\n        listeners: {\n            selectionchange: function(sm, selections) {\n\t\tvar ids = $.map(selections,function(val,i) {\n\t\t    return(val.data.id)\n\t\t})\n\t\tif(ids.length>0) detailPanel.showPathways(ids);\n            }\n        }\n    });\n    \n    var clInfoGrid = Ext.create('Ext.grid.Panel', {\n        store: clinfostore,\n\tid: \"clinfo\",\n\tselModel: clSelectModel,\n\theight:'100%',\n\tcolumnLines:true,\n\temptyText: 'No Matching Pathways',\n        tbar: Ext.create('Ext.PagingToolbar', {\n            store: clinfostore,\n            displayInfo: false,\n            //displayMsg: 'Displaying genes {0} - {1} of {2}',\n            emptyMsg: \"No pathways to display\",\n\t    items:[\n                {\n\t\t    flex:1,\n\t\t    width: 500,\n\t\t    minWidth: 50,\n\t\t    xtype: 'customtrigger',\n\t\t    emptyText: 'filter by name...',\n\t\t    listeners: {\n\t\t\tchange: {buffer: 200, fn: function(field, value) {\n\t\t\t    if (value.length>0) 
{\n\t\t\t\tclinfostore.clearFilter(true);\n\t\t\t\tclinfostore.filter({property: 'name', value: value});\n\t\t\t    } else {\n\t\t\t\tclinfostore.clearFilter(false);\n\t\t\t    }\n\t\t\t}}\n\t\t    }\n\t\t}],\n\t    listeners: {\n\t\tafterrender: function() {\n\t\t    this.down('#refresh').hide();\n\t\t}\n\t    }\n\t}),\n        columns: [\n\t\t  //{text: \"name\", flex: 1, dataIndex: 'name', sortable: true},\n\t    {\n                text: 'overdispersion',\n                dataIndex: 'od',\n                flex:1,\n                renderer: function (v, m, r) {\n\t\t    //m.tdAttr='data-qtip=\"'+r.data.name+'\"';\n                    var id = Ext.id();\n                    Ext.defer(function () {\n                        Ext.widget('progressbar', {\n                            renderTo: id,\n\t\t\t    text: r.data.name,\n                            value: v / 1,\n                        });\n                    }, 50);\n\t\t    if(r.data.sign==\"1\") {\n\t\t\treturn Ext.String.format('<div class=\"positive\" id=\"{0}\"></div>', id);\n\t\t    } else {\n\t\t\treturn Ext.String.format('<div class=\"negative\" id=\"{0}\"></div>', id);\n\t\t    }\n                }\n            },\n            {text: \"PC\", width: 30, dataIndex: 'npc', sortable: true},\n            /*{text: \"overdispersion\", width: 100, dataIndex: 'od', sortable: true}*/\n        ]\n    })\n\n\n\n    /* GENE SET ENRICHMENT INFO */\n\n    Ext.define('geinfo',{\n        extend: 'Ext.data.Model',\n        fields: [\n\t    'name', 'id', {name: 'fe',type: 'float'},{name: 'o', type: 'integer'},{name: 'u', type: 'integer'},{name: 'Z', type: 'float'},{name: 'Za', type: 'float'}\n        ],\n\tidProperty: 'id'\n    });\n\n    var geinfostore = Ext.create('Ext.data.Store', {\n        id: 'geinfostore',\n        model: 'geinfo',\n        remoteSort: true,\n        proxy: {\n            type: 'ajax',\n            url: 'testenr.json',\n\t    actionMethods: {create: 'POST', read: 'POST', update: 'POST', destroy: 
'POST'},\n\t    reader: {\n\t\troot: 'genes',\n                totalProperty: 'totalCount'\n            },\n\t    simpleSortMode: true,\n        },\n        autoLoad: false,\n\tpageSize: 50,\n        remoteFilter: true,\n    });\n\n    var geSelectModel = Ext.create('Ext.selection.CheckboxModel', {\n        listeners: {\n            selectionchange: function(sm, selections) {\n\t\tvar ids = $.map(selections,function(val,i) {\n\t\t    return(val.data.id)\n\t\t})\n\t\tif(ids.length>0) detailPanel.showPathways(ids);\n            }\n        }\n    });\n    \n    var geInfoGrid = Ext.create('Ext.grid.Panel', {\n        store: geinfostore,\n\tid: \"geinfo\",\n\tselModel: geSelectModel,\n\theight:'100%',\n\tcolumnLines:true,\n\temptyText: 'No Enriched Pathways',\n        tbar: Ext.create('Ext.PagingToolbar', {\n            store: geinfostore,\n            displayInfo: false,\n            //displayMsg: 'Displaying genes {0} - {1} of {2}',\n            emptyMsg: \"No pathways to display\",\n\t    items:[\n                {\n\t\t    flex:1,\n\t\t    width: 500,\n\t\t    minWidth: 50,\n\t\t    xtype: 'customtrigger',\n\t\t    emptyText: 'filter by name...',\n\t\t    listeners: {\n\t\t\tchange: {buffer: 200, fn: function(field, value) {\n\t\t\t    if (value.length>0) {\n\t\t\t\tgeinfostore.clearFilter(true);\n\t\t\t\tgeinfostore.filter({property: 'name', value: value});\n\t\t\t    } else {\n\t\t\t\tgeinfostore.clearFilter(false);\n\t\t\t    }\n\t\t\t}}\n\t\t    }\n\t\t}],\n\t    listeners: {\n\t\tafterrender: function() {\n\t\t    this.down('#refresh').hide();\n\t\t}\n\t    }\n\t}),\n        columns: [\n            {text: \"Pathway\", flex: 1, dataIndex: 'name', sortable: true },\n\t    {text: \"FE\", width: 60, dataIndex: 'fe', sortable: true, tooltip: \"fold enrichment\"},\n            {text: \"Z\", width: 60, dataIndex: 'Z', sortable: true, tooltip: \"enrichment Z-score\"},\n            {text: \"cZ\", width: 60, dataIndex: 'Za', sortable: true, tooltip: \"enrichment 
Z-score, corrected for multiple hypothesis testing\"},\n            {text: \"n\", width: 50, dataIndex: 'n', sortable: true, hidden: true, tooltip: \"number of genes found in this pathway\"},\n\t    {text: \"u\", width: 50, dataIndex: 'u', sortable: true, hidden: true, tooltip: \"total number of genes annotated for this pathway\"}\n        ]\n    })\n\n\n    /* gene info tab */\n    Ext.define('ginfo',{\n        extend: 'Ext.data.Model',\n        fields: [\n\t  'gene', {name: 'var',type: 'float'},{name: 'svar', type: 'float'}\n        ],\n\tidProperty: 'gene'\n    });\n\n    var ginfostore = Ext.create('Ext.data.Store', {\n        id: 'ginfostore',\n        model: 'ginfo',\n        remoteSort: true,\n        proxy: {\n            type: 'jsonp',\n            url: 'genes.json',\n\t    reader: {\n\t\troot: 'genes',\n                totalProperty: 'totalCount'\n            },\n\t    simpleSortMode: true,\n        },\n        sorters: [{\n            property: 'var',\n            direction: 'DESC'\n        }],\n\tpageSize: 100,\n        remoteFilter: true,\n        autoLoad: true\n    });\n\n\n\n    var geneSelModel = Ext.create('Ext.selection.CheckboxModel', {\n        listeners: {\n            selectionchange: function(sm, selections) {\n\t\t    var ids = $.map(selections,function(val,i) {\n\t\t\t    return(val.data.gene)\n\t\t\t})\n\t\t    detailPanel.showGenes(ids);\n\t\t    //grid4.down('#removeButton').setDisabled(selections.length === 0);\n            }\n        }\n    });\n\n    var geneGrid = Ext.create('Ext.grid.Panel', {\n        store: ginfostore,\n\tselModel: geneSelModel,\n        columns: [\n            {text: \"Gene\", flex: 1, dataIndex: 'gene', sortable: true,\n\t     renderer: function(value) {\n\t\t return Ext.String.format('<a href=\"http://www.informatics.jax.org/searchtool/Search.do?query={0}\" target=\"_blank\">{1}</a>',value,value)\n\t     }\n\t    },\n            {text: \"Variance\", width: 100, dataIndex: 'var', sortable: true}\n        
],\n\t//features: [filters],\n\theight:'100%',\n\tcolumnLines:true,\n\temptyText: 'No Matching Genes',\n\t//forceFit: true\n        //renderTo:'example-grid',\n        //width: 800,\n        //height: 300\n\t// paging bar on the bottom\n        tbar: Ext.create('Ext.PagingToolbar', {\n            store: ginfostore,\n            displayInfo: false,\n            //displayMsg: 'Displaying genes {0} - {1} of {2}',\n            emptyMsg: \"No genes to display\",\n\t    items:[\n                {\n\t\t    flex:1,\n\t\t    width: 500,\n\t\t    minWidth: 50,\n\t\t    xtype: 'customtrigger',\n\t\t    emptyText: 'filter by gene name...',\n\t\t    listeners: {\n\t\t\tchange: {buffer: 600, fn: function(field, value) {\n\t\t\t    if (value.length>0) {\n\t\t\t\tginfostore.clearFilter(true);\n\t\t\t\tginfostore.filter({property: 'gene', value: value});\n\t\t\t    } else {\n\t\t\t\tginfostore.clearFilter(false);\n\t\t\t    }\n\t\t\t}}\n\t\t    }\n\t\t}],\n\t    listeners: {\n\t\tafterrender: function() {\n\t\t    this.down('#refresh').hide();\n\t\t}\n\t    }\n\t}),\n\tlisteners: {\n\t    viewready: function() {\n\t\t    // select top 20 genes \n\t\t    this.selModel.suspendEvent('selectionchange')\n\t\t    for(var i=0;i<Math.min(this.store.data.items.length,20);i++) {\n\t\t\tthis.selModel.select(i,true);\n\t\t    }\n\t\t    this.selModel.resumeEvent('selectionchange');\n\t\t    this.selModel.fireEvent('selectionchange',this,this.selModel.getSelection());\n\t    }\n\t}\n    });\n\n\n\n    /* gene info tab */\n    Ext.define('pinfo',{\n        extend: 'Ext.data.Model',\n        fields: [\n\t  'id','name', {name: 'Z',type: 'float'},{name: 'aZ',type: 'float'},{name: 'score',type: 'float'},{name: 'n', type: 'integer'},{name: 'npc', type: 'integer'}\n        ],\n\tidProperty: 'id'\n    });\n\n    var pinfostore = Ext.create('Ext.data.Store', {\n        id: 'pinfostore',\n        model: 'pinfo',\n        remoteSort: true,\n        proxy: {\n            type: 'jsonp',\n            url: 
'pathways.json',\n\t    reader: {\n\t\troot: 'genes',\n                totalProperty: 'totalCount'\n            },\n\t    simpleSortMode: true,\n        },\n\tpageSize: 100,\n        remoteFilter: true,\n        autoLoad: true\n    });\n\n    var pathwaySelModel = Ext.create('Ext.selection.CheckboxModel', {\n        listeners: {\n            selectionchange: function(sm, selections) {\n\t\tvar ids = $.map(selections,function(val,i) {\n\t\t    return(val.data.id)\n\t\t})\n\t\tdetailPanel.showPathways(ids);\n            }\n        }\n    });\n\n    var pathwayGrid = Ext.create('Ext.grid.Panel', {\n        store: pinfostore,\n\tselModel: pathwaySelModel,\n        columns: [\n            {text: \"Pathway\", flex: 1, dataIndex: 'name', sortable: true, tooltip: \"pathway / gene set name\" },\n            {text: \"Z\", width: 60, dataIndex: 'Z', sortable: true, hidden: true, tooltip: \"overdispersion Z score\"},\n            {text: \"cZ\", width: 60, dataIndex: 'aZ', sortable: true, tooltip: \"overdispersion Z score, adjusted for multiple hypothesis\"},\n\t    {text: \"co.Z\", width: 60, dataIndex: 'sh.Z', sortable: true, hidden: true, tooltip: \"pathway coherence Z-score\"},\n            {text: \"co.cZ\", width: 60, dataIndex: 'sh.aZ', sortable: true, hidden: true, tooltip: \"pathway coherence Z-score, corrected for multiple hypothesis testing\"},\n            {text: \"n\", width: 60, dataIndex: 'n', sortable: true, hidden: true, tooltip: \"number of genes in the pathway\"},\n\t    {text: \"nPC\", width: 60, dataIndex: 'npc', sortable: true, hidden: true, tooltip: \"principal component number\"},\n\t    {text: \"score\", width: 60, dataIndex: 'score', sortable: true, tooltip: \"observed/expected overdispersion\"}\n        ],\n\t//features: [filters],\n\temptyText: 'No Matching Pathways',\n        height: '100%',\n\tcolumnLines:true,\n\t// paging bar on the bottom\n        tbar: Ext.create('Ext.PagingToolbar', {\n            store: pinfostore,\n            displayInfo: 
false,\n            //displayMsg: 'Displaying genes {0} - {1} of {2}',\n            emptyMsg: \"No pathways to display\",\n\t    items:[\n                {\n\t\t    flex:1,\n\t\t    width: 500,\n\t\t    minWidth: 50,\n\t\t    xtype: 'customtrigger',\n\t\t    emptyText: 'filter by pathway name...',\n\t\t    listeners: {\n\t\t\tchange: {buffer: 400, fn: function(field, value) {\n\t\t\t    if (value.length>0) {\n\t\t\t\tpinfostore.clearFilter(true);\n\t\t\t\tpinfostore.filter({property: 'name', value: value});\n\t\t\t    } else {\n\t\t\t\tpinfostore.clearFilter(false);\n\t\t\t    }\n\t\t\t}}\n\t\t    }\n\t\t}],\n\t    listeners: {\n\t\tafterrender: function() {\n\t\t    this.down('#refresh').hide();\n\t\t}\n\t    }\n\t})\n    });\n\n\n\n    var infotab = Ext.create('Ext.tab.Panel', {\n\t    //tabPosition: 'right',\n\t    defaults: {\n\t\tbodyPadding: 0,\n\t\tlayout: 'fit',\n\t\ticonCls: 'tab-icon'\n\t    },\n\t    items: [\n\t\t{ title: 'Pathways', items:[pathwayGrid] },\n\t\t{ title: 'Genes', items:[geneGrid] },\n\t\t{ title: 'Cluster', items:[clInfoGrid], itemId: 'clustertab', hidden:true },\n\t\t{ title: 'Enrichment', items:[geInfoGrid], itemId: 'enrichmenttab', hidden:true }\n\t    ]\n\t});\n\n    \n    // draw dendrogram in a given context\n    // ctx - canvas 2d context\n    // d - dendrogram structure (t(merge), order, height)\n    // x,y,width,height - placement\n    function drawDendrogram(ctx, d, x, y, width, height) {\n\tvar nmerges=d.merge.length;\n\tvar xstep=width/d.order.length;\n\tvar lp,lpy,rp,rpy,jx,jy;\n\tvar maxHeight=Math.max.apply(null, d.height);\n\tvar heightScale=height/maxHeight;\n\tctx.beginPath();\n\n\tfor(var i=0, mergex=[]; i<nmerges; i++) {\n\t    jy=y+(maxHeight-d.height[i])*heightScale;\n\t    lp=d.merge[i]; // left position (starts as an index)\n\t    if(lp<0) { // leaf\n\t\tlp=d.order[-1*lp-1]; \n\t\tlp=x+(lp-0.5)*xstep;\n\t\tlpy=y+height;\n\t    } else { // inner node\n\t\tlpy=y+(maxHeight-d.height[lp-1])*heightScale; 
lp=mergex[lp-1]; \n\t    }\n\t    i++; rp=d.merge[i]; // right position\n\t    if(rp<0) { // leaf\n\t\trp=d.order[-1*rp-1]; \n\t\trp=x+(rp-0.5)*xstep;\n\t\trpy=y+height;\n\t    } else { // inner node\n\t\trpy=y+(maxHeight-d.height[rp-1])*heightScale; rp=mergex[rp-1]; \n\t    }\n\t    jx=(lp+rp)/2;  jy=y+(maxHeight-d.height[(i-1)/2])*heightScale;\n\t    mergex.push(jx);\n\t    ctx.moveTo(lp,lpy); ctx.lineTo(lp,jy); ctx.lineTo(rp,jy); ctx.lineTo(rp,rpy);\n\t    \n\t}\n\tctx.lineWidth=1.5; ctx.lineJoin='miter'; ctx.strokeStyle='black'; ctx.stroke();\n    }\n\n    // get heatmap row (in colors)\n    function heatmapRowColors(d,i) {\n\tvar cols=[];\n\tif(d.hasOwnProperty('colors')) {\n\t    // establish mapping\n\t    if(d.hasOwnProperty('zlim')) {\n\t\tzlim=d.zlim;\n\t    } else {\n\t\tzlim.push(-1*Math.max.apply(null,d.data.map(Math.abs)));\n\t\tzlim.push(-1*zlim[0]);\n\t    }\n\t    zlim.push(d.colors.length/(zlim[1]-zlim[0])); // zlim step\n\t    var val;\n\t    for(var j=0; j<d.dim[1];j++) { // columns \n\t\tval=(d.data[i*d.dim[1]+j] - zlim[0])*zlim[2];\n\t\tif(val<0) { val=0; } else if(val>=d.colors.length) { val=d.colors.length-1; }\n\t\tval=d.colors[Math.floor(val)];\n\t\tcols.push(val)\n\t    }\n\t} else {\n\t    // interpret the values as colors\n\t    for(var j=0; j<d.dim[1];j++) { // columns \n\t\tcols.push(d.data[i*d.dim[1]+j])\n\t    }\n\t}\n\treturn(cols);\n    }\n\n     // draw heatmap\n    function drawHeatmap(ctx,d,x,y,width,height,rowNames,maxRowHeight,maxFontSize,minFontSize) {\n\trowNames = typeof rowNames !== 'undefined' ? rowNames : false;\n\tmaxFontSize = typeof maxFontSize !== 'undefined' ? maxFontSize : 100;\n\tminFontSize = typeof minFontSize !== 'undefined' ? minFontSize : 6;\n\tmaxRowHeight = typeof maxRowHeight !== 'undefined' ? 
maxRowHeight : 100;\n\tif(height>d.dim[0]*maxRowHeight) { height=d.dim[0]*maxRowHeight;}\n\tvar rowHeight=height/d.dim[0]; var colWidth=width/d.dim[1];\n\tvar mC=d.hasOwnProperty('colors'); // perform color mapping on the fly\n\tif(rowNames && !d.hasOwnProperty('rows')) { rowNames=false; }\n\tif(rowNames) { \n\t    var fontSize=(rowHeight*0.95); \n\t    if(fontSize>maxFontSize) { fontSize=maxFontSize; };\n\t    if(fontSize>=minFontSize) { \n\t\tctx.font=fontSize+\"px Arial\"; ctx.textAlign='left'; ctx.textBaseline=\"middle\";\n\t    } else { \n\t\trowNames=false;\n\t    }\n\t}\n\tvar zlim=[];\n\tif(mC) {\n\t    if(d.hasOwnProperty('zlim')) {\n\t\tzlim=d.zlim;\n\t    } else {\n\t\tzlim.push(-1*Math.max.apply(null,d.data.map(Math.abs)));\n\t\tzlim.push(-1*zlim[0]);\n\t    }\n\t    zlim.push(d.colors.length/(zlim[1]-zlim[0])); // zlim step\n\t}\n\tvar val=0;\n\tfor(var i=0; i<d.dim[0]; i++) { // rows\n\t    for(var j=0; j<d.dim[1]; j++) { // columns\n\t\tif(mC) { // colors were specified\n\t\t    val=(d.data[i*d.dim[1]+j] - zlim[0])*zlim[2];\n\t\t    if(val<0) { val=0; } else if(val>=d.colors.length) { val=d.colors.length-1; }\n\t\t    val=d.colors[Math.floor(val)];\n\t\t} else { // interpret values as colors directly\n\t\t    val=d.data[i*d.dim[1]+j]\n\t\t}\n\t\tctx.fillStyle=val; ctx.fillRect(x+j*colWidth,y+i*rowHeight,colWidth,rowHeight);\n\n\t    }\n\t    if(rowNames) { ctx.fillStyle='black'; ctx.fillText(d.rows[i],x+width+3,y+(i+0.5)*rowHeight); }\n\t}\n\tctx.lineWidth=1; ctx.strokeStyle='black'; ctx.strokeRect(x,y,width,height);\n\t//if(zlim!==undefined) {return(zlim[1])};\n    }\n\n    function updatePathclInfo(pathcl) {\n\tif(pathcl == currentPathCl) return;\n\tclinfostore.getProxy().setExtraParam(\"pathcl\",pathcl)\n\tclinfostore.load();\n\tinfotab.child(\"#clustertab\").tab.show();\n\tinfotab.setActiveTab(2);\n\tinfotab.getActiveTab().setTitle(\"Aspect \"+pathcl)\n\tcurrentPathCl=pathcl;\n    }\n    \n    /* heatmap config */\n    var hc = {\n\tspacing: 
5,\n\tdendSpacing: 1,\n\tgeneUnitHeight: 15,\n\tmargins: {top:2,right:80,bottom:15,left:1},\n\tcolcolUnitHeight: 10,\n\trowcolWidth: 10,\n\tcolDendHeight: 50,\n\ttrim: 0,\n\t// heatmap left position\n\thmleft: function() { return(this.margins.left+this.rowcolWidth+this.spacing)},\n\t// colcol top position\n\tcctop: function(data) { return(data.hasOwnProperty('coldend')? this.margins.top+this.colDendHeight+this.dendSpacing : this.margins.top) },\n\t// colcol height\n\tccheight: function(data) { return(this.colcolUnitHeight*data.colcols.dim[0]) },\n\t// heatmap top position\n\thmtop: function(data) { \n\t    if(data.hasOwnProperty('colcols')) {\n\t\treturn(data.hasOwnProperty('coldend')? this.margins.top+this.colDendHeight+this.spacing+this.dendSpacing + this.ccheight(data) : this.margins.top+this.spacing+this.dendSpacing + this.ccheight(data))\n\t    } else {\n\t\treturn(data.hasOwnProperty('coldend')? this.margins.top+this.colDendHeight+this.spacing : this.margins.top)\n\t    }\n\t},\n\t// heatmap hight\n\thmheight: function(data,totalHeight) {\n\t    return((totalHeight-this.margins.bottom) - this.hmtop(data));\n\t},\n\t// heatmap width\n\thmwidth: function(totalWidth) { return(totalWidth-this.margins.left-this.rowcolWidth-this.spacing-this.margins.right)},\n\trowlableft: function(totalWidth) {\n\t    return(this.hmleft()+this.hmwidth(totalWidth)+this.spacing);\n\t},\n\trowlabelsize: function(data,totalHeight) {\n\t    return(Math.round(this.hmheight(data,totalHeight)/data.matrix.dim[0]*0.97*72/96))\n\t    //return(18);\n\t}\n    };\n\n    var clusterPanel = Ext.create('Ext.panel.Panel', {\n\tlayout: 'fit',\n\tbodyPadding: 5,\n\t//ohtml: \"Pathway Clustering\",\n\treload: function() {\n\t    clusterPanel.setLoading(true);\n\t    Ext.Ajax.request({\n\t\turl: 'pathcl.json',\n\t\tmethod: 'POST',          \n\t\twaitTitle: 'Connecting',\n\t\twaitMsg: 'Sending data...',                                     \n\t\tscope:this,\n\t\tfailure: 
function(r,o){console.log('failure:'); console.log(r);},\n\t\tsuccess: function(response) {\n\t\t    var data = Ext.JSON.decode(response.responseText)\n\t\t    if(data.matrix.dim[0]==1) data.matrix.rows=[data.matrix.rows]\n\t\t    if(data.colcols.dim[0]==1) data.colcols.rows=[data.colcols.rows]\n\t\t    clusterPanel.pathcldata=data;\n\t\t    clusterPanelGearMenu.getComponent(0).suspendEvents();\n\t\t    clusterPanelGearMenu.getComponent(0).setValue(data.matrix.zlim[1]);\n\t\t    //clusterPanelGearMenu.getComponent(0).setMaxValue(data.matrix.range[1]);\n\t\t    clusterPanelGearMenu.getComponent(0).resumeEvents()\n\t\t    if(data.hasOwnProperty('trim')) {\n\t\t\thc.trim=data.trim;\n\t\t\tdetailPanelGearMenu.getComponent(1).suspendEvents();\n\t\t\tdetailPanelGearMenu.getComponent(1).setValue(data.trim);\n\t\t\tdetailPanelGearMenu.getComponent(1).resumeEvents();\n\t\t    }\n\t\t    clusterPanel.redraw(clusterPanel.pathcldata);\n\t\t    if(data.hasOwnProperty('embedding')) {\n\t\t\tExt.getCmp(\"embeddingDiv\").show();\n\t\t\tembeddingPanel.draw();\n\t\t    }\n\t\t    clusterPanel.setLoading(false);\n\t\t}\n\t    })\n\t},\n\tredraw: function(data) {\n\t    if(data === undefined) {\n\t\tif(clusterPanel.hasOwnProperty('pathcldata')) { data=clusterPanel.pathcldata; } else { clusterPanel.reload(); return; }\n\t    }\n\t    \n\t    $('#pathcl').remove();\n\t    $('#pathclev').remove();\n\t    delete clusterPanel.evctx;\n\t    $('#pclCD').remove();\n\t    //clusterPanel.update(\"\");\n\t    var s=clusterPanel.getSize();\n\t    clusterPanel.s=s;\n\t    clusterPanel.body.update('<div id=\"pclCD\" style=\"padding:0 0 0 0; position:relative\"><canvas id=\"pathcl\" width='+s.width+' height='+s.height+' style=\"background-color:transparent; position:absolute; left: 0; top: 0; z-index: 0;\"></canvas> <canvas id=\"pathclev\" width='+s.width+' height='+s.height+' style=\"background-color:transparent; position:absolute; left: 0; top: 0; z-index: 1;\"></canvas></div>')\n\t    \n\t    \n  
          var ctx= $('#pathcl')[0].getContext('2d');\n\t    drawDendrogram(ctx,data.coldend,hc.hmleft(),hc.margins.top,hc.hmwidth(s.width),hc.colDendHeight);\n\t    \n\t    //colcols\n\t    drawHeatmap(ctx,data.colcols,hc.hmleft(),hc.cctop(data),hc.hmwidth(s.width), hc.ccheight(data));\n\t    \n\t    //rowcols\n\t    drawHeatmap(ctx,data.rowcols,hc.margins.left,hc.hmtop(data),hc.rowcolWidth,hc.hmheight(data,s.height))\n\n\t    //main heatmap\n\t    drawHeatmap(ctx,data.matrix,hc.hmleft(),hc.hmtop(data),hc.hmwidth(s.width),hc.hmheight(data,s.height),rowNames=true)\n\t    \n\t    // event handling\n\t    var evc=document.getElementById(\"pathclev\");\n\t    var evctx= evc.getContext('2d');\n\t    clusterPanel.evctx=evctx;\n\n\t    //evctx.strokeRect(0,0,s.width-10,s.height-10);\n\t    try {\n\t\tevctx.setLineDash([5]);\n\t    } catch (err) {}\n\t    evctx.fillStyle='black'; \n\t    evctx.font=\"bold 15px Arial\";\n\n\t    var evtRect = evc.getBoundingClientRect();\n\t    \n\t    evc.addEventListener('click', function(evt) {\n\t\tvar mx=evt.clientX-evtRect.left; var my=evt.clientY-evtRect.top;\n\t\tif(my<=s.height-hc.margins.bottom && mx<=s.width-hc.margins.right && mx>=hc.hmleft() && my>=hc.cctop(data)) { \n\t\t//if(my<=s.height-hc.margins.bottom && mx<=s.width-hc.margins.right && mx>=hc.hmleft()) { \n\t\t    if(my>=hc.hmtop(data)) {\n\t\t\tvar ry=Math.floor((my-hc.hmtop(data))/hc.hmheight(data,s.height)*data.matrix.dim[0]);\n\t\t\tevctx.strokeStyle='red'; evctx.lineWidth=2;\n\t\t\ttry { evctx.setLineDash([0]); } catch(err) {}\n\t\t\t//evctx.strokeRect(hc.hmleft(),hc.hmtop(data)+ry*hc.hmheight(data,s.height)/data.matrix.dim[0],hc.hmwidth(s.width),hc.hmheight(data,s.height)/data.matrix.dim[0]); \n\t\t\tevctx.strokeRect(0,hc.hmtop(data)+ry*hc.hmheight(data,s.height)/data.matrix.dim[0],s.width,hc.hmheight(data,s.height)/data.matrix.dim[0]); \n\t\t\tevctx.strokeStyle='black'; evctx.lineWidth=1;\n\t\t\ttry { evctx.setLineDash([5]); } catch(err) 
{}\n\t\t\tupdatePathclInfo(data.matrix.rows[ry])\n\n\t\t\t// update embedding\n\t\t\t\n\t\t\tif(data.hasOwnProperty('embedding')) {\n\t\t\t    embeddingPanel.recolor(heatmapRowColors(data.matrix,ry));\n\t\t\t    Ext.getCmp(\"embeddingDiv\").setTitle(\"2D Embedding: aspect \"+data.matrix.rows[ry]);\n\t\t\t}\n\t\t    } else {\n\t\t\tif(data.hasOwnProperty('embedding')) {\n\t\t\t    var ry=Math.floor((my-hc.cctop(data))/hc.ccheight(data)*data.colcols.dim[0]);\n\t\t\t    embeddingPanel.recolor(heatmapRowColors(data.colcols,ry));\n\t\t\t    Ext.getCmp(\"embeddingDiv\").setTitle(\"2D Embedding: metadata \"+data.colcols.rows[ry])\n\t\t\t}\n\t\t\t\n\t\t    }\n\t\t}\n\t    }, false);\n\n\n\t    evc.addEventListener('mousemove', function(evt) {\n\t\tvar mx=evt.clientX-evtRect.left; var my=evt.clientY-evtRect.top;\n\n\t\tevctx.clearRect(0,0,s.width,s.height);\n\t\tif(detailPanel.hasOwnProperty('evctx')) {\n\t\t    var v=detailPanel.s;\n\t\t    detailPanel.evctx.clearRect(0,0,v.width,v.height);\n\t\t}\n\t\tif(my<=s.height-hc.margins.bottom && mx<=s.width-hc.margins.right && mx>=hc.hmleft() && my>=hc.cctop(data)) { \n\t\t    //mx=hc.hmleft()+(rx+0.5)*hc.hmwidth(s.width)/data.matrix.dim[1];\n\t\t    evctx.beginPath(); \n\t\t    evctx.moveTo(mx,hc.cctop(data)); evctx.lineTo(mx,s.height-hc.margins.bottom);\n\t\t    // update the line in the gene panel\n\t\t    if(detailPanel.hasOwnProperty('evctx')) {\n\t\t\tdetailPanel.evctx.beginPath();\n\t\t\tdetailPanel.evctx.moveTo(mx,hc.cctop(detailPanel.genecldata))\n\t\t\tdetailPanel.evctx.lineTo(mx,v.height-hc.margins.bottom)\n\t\t\tdetailPanel.evctx.stroke();\n\t\t    }\n\t\t    var rx=Math.floor((mx-hc.hmleft())/hc.hmwidth(s.width)*data.matrix.dim[1]);\n\t\t    if(mx>s.width/2) { \n\t\t\tevctx.textAlign=\"end\"; mx-=5; \n\t\t    } else {\n\t\t\tevctx.textAlign=\"start\"; mx+=5;\n\t\t    }\n\t\t    evctx.textBaseline=\"bottom\";\n\t\t    evctx.fillText(\"cell: \"+data.matrix.cols[rx],mx,my-3); \n\n\t\t    
if(data.hasOwnProperty('embedding')) {\n\t\t\td3.selectAll(\"#embedding circle.selected\").classed(\"selected\",false);\n\t\t\td3.select(\"#cell\"+rx).classed(\"selected\",true).moveToFront();\n\t\t    }\n\t\t    \n\t\t    if(my>=hc.hmtop(data)) {\n\t\t\tvar ry=Math.floor((my-hc.hmtop(data))/hc.hmheight(data,s.height)*data.matrix.dim[0]);\n\t\t\t//my=hc.hmtop(data)+(ry+0.5)*hc.hmheight(data,s.height)/data.matrix.dim[0];\n\t\t\tevctx.moveTo(hc.margins.left,my); evctx.lineTo(s.width-hc.margins.right,my);\n\t\t\t\n\t\t\t//evctx.fillText(\"cell: \"+data.matrix.cols[rx],mx+5,my-5); \n\t\t\tevctx.textBaseline=\"top\";\n\t\t\tevctx.fillText(\"aspect: \"+data.matrix.rows[ry],mx,my+3);\n\n\t\t    } else {\n\t\t\tevctx.moveTo(hc.hmleft(),my); evctx.lineTo(s.width-hc.margins.right,my);\n\t\t\tvar ry=Math.floor((my-hc.cctop(data))/hc.ccheight(data)*data.colcols.dim[0]);\n\t\t\tvar val=data.colcols.rows[ry]; \n\t\t\tif(val!==undefined) {\n\t\t\t    evctx.textBaseline=\"top\";\n\t\t\t    evctx.fillText(\"metadata: \"+val,mx,my+3);\n\t\t\t}\n\t\t    }\n\t\t    evctx.stroke();\n\t\t}\n\t    }, false);\n\t    \n\n\t    evc.addEventListener('mouseout', function(evt) {\n\t\tevctx.clearRect(0,0,s.width,s.height);\n\t\tvar v=detailPanel.getSize();\n\t\tif(detailPanel.hasOwnProperty('evctx')) {\n\t\t    detailPanel.evctx.clearRect(0,0,v.width,v.height);\n\t\t}\n\t\td3.selectAll(\"#embedding circle.selected\").classed(\"selected\",false);\n\t    });\n\t    \n\t},\n\tlisteners: {\n\t    resize: function(cmp,width,height,oldWidth,oldHeight,opts) {\n\t\tcmp.redraw(cmp.pathcldata);\n\t    }\n\t}\n    });\n\n\n    var clusterPanelGearMenu = Ext.create('Ext.menu.Menu', {\n        id: 'clusterGearMenu',\n        style: {\n            overflow: 'visible'     // For the Combo popup\n        },\n        items: [{\n\t\tfieldLabel: 'Z limit',\n\t\tname: 'zlim',\n\t\txtype: 'numberfield',\n\t        value: -1,\n\t\tdecimalPrecision: 3,\n\t\tminValue: 0.0,\n\t        maxValue: 100,\n\t\twidth: 
200,\n\t        disabled: false,\n\t\ttooltip: 'Set the range of overdispersion scores illustrated by colors',\n\t\tlisteners : {\n\t\t    change : {buffer: 800, fn:function(f,v) {\n\t\t\tclusterPanel.pathcldata.matrix.zlim=[-1*v,v];\n\t\t\tclusterPanel.redraw()\n\t\t    }}\n\t\t}\n\t}\n\t],\n\tlisteners:{\n\t    'mouseleave': {buffer: 1000, fn:function( menu, e, eOpts){\n\t\tmenu.hide();\n\t    }}\n\t}\n    });\n\n    \n    var detailPanelGearMenu = Ext.create('Ext.menu.Menu', {\n        id: 'detailGearMenu',\n        style: {\n            overflow: 'visible'     // For the Combo popup\n        },\n        items: [{\n\t\tfieldLabel: 'N genes',\n\t\txtype: 'numberfield',\n\t        tooltip: 'Number of genes to show in the Expression Details panel',\n\t\tlabel: 'N genes',\n\t\tvalue: 20,\n\t\tminValue: 1,\n\t\tmaxValue: 1000,\n\t\tdisabled: true,\n\t\tlisteners : {\n\t\t    change : {buffer: 800, fn:function(f,v) {detailPanel.reload()}}\n\t\t}\n\t    },{\n\t\tfieldLabel: 'Trim',\n\t\tname: 'trim',\n\t\txtype: 'numberfield',\n\t\tvalue: hc.trim,\n\t\tdecimalPrecision: 5,\n\t\tminValue: 0.0,\n\t\tmaxValue: 0.5,\n\t\twidth: 200,\n\t\tmaxValue: 100,\n\t\tdisabled: true,\n\t\ttooltip: 'Winsorization trim fraction',\n\t\tlisteners : {\n\t\t    change : {buffer: 800, fn:function(f,v) {hc.trim=v; detailPanel.reload()}}\n\t\t}\n            }, '-',\n\t    {\n                text: 'High/low genes',\n                checked: false,\n\t\ttooltip: 'Whether to include genes from both sides of the PC loading (true) or just high magnitude  (false)',\n\t\tdisabled: true,\n\t\tlisteners : {\n\t\t    checkchange : function() {detailPanel.reload()}\n\t\t}\n\t    }],\n\tlisteners:{\n\t    'mouseleave': {buffer: 1000, fn:function( menu, e, eOpts){\n\t\tmenu.hide();\n\t    }}\n }\n    });\n\n    var ngenesSlider = Ext.create('Ext.slider.Single', {\n\tlabel: 'N genes',\n\ttip: 'number of genes to show',\n\ttipText: function(thumb){ return Ext.String.format('show {0} genes', thumb.value); 
},\n\twidth: 100,\n\tvalue: 20,\n\tincrement: 1,\n\tminValue: 0,\n\tmaxValue: 500,\n\n    });\n\n    var viewport = Ext.create('Ext.Viewport', {\n        layout: {\n            type: 'border',\n            padding: 5\n        },\n        defaults: {\n            split: true\n        },\n        items: [{ region: 'center',\n\t\t  layout: 'border',\n\t\t  items: [{ region: 'north',\n\t\t\t    layout: 'fit',\n\t\t\t    id: 'clusterPanel',\n\t\t\t    title: 'Pathway Overdispersion',\n\t\t\t    tools: [\n\t\t\t\t{ type:'gear',\n\t\t\t\t  tooltip: 'Settings',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      clusterPanelGearMenu.showBy(t);\n\t\t\t\t  }\n\t\t\t\t},\n\t\t\t\t{ type:'help',\n\t\t\t\t  tooltip: 'Tutorial',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      tutorialWindow.show(); \n\t\t\t\t  }\n\t\t\t\t},\n\t\t\t\t{ type:'save',\n\t\t\t\t  tooltip: 'Save image',\n\t\t\t\t  handler: function(e,el,o,t) {\n\t\t\t\t      if($('#pathcl').length==0) { return; }\n\t\t\t\t      var changingImage = Ext.create('Ext.Img', {\n\t\t\t\t\t  src: $('#pathcl')[0].toDataURL(\"image/png\",1.0)\n\t\t\t\t      });\n\t\t\t\t      win = new Ext.Window({\n\t\t\t\t\t  title: 'exported image: use right click to save the image',\n\t\t\t\t\t  layout: 'fit',\n\t\t\t\t\t  autoScroll: true,\n\t\t\t\t\t  modal: true,\n\t\t\t\t\t  closeAction: 'hide',\n\t\t\t\t\t  items:[changingImage]\n\t\t\t\t      });\n\t\t\t\t      win.show();\n\t\t\t\t  }\n\t\t\t\t}\n\t\t\t    ],\n\t\t\t    minHeight: 200,\n\t\t\t    height: 300,\n\t\t\t    bodyPadding: 0,\n\t\t\t    split: true,\n\t\t\t    items:[clusterPanel]\n\t\t\t},{ region: 'center',\n\t\t\t    layout: 'fit',\n\t\t\t    id: 'expressionDetailsPane',\n\t\t\t    minHeight: 100,\n\t\t\t    collapsible: false,\n//\t\t\t    headerPosition: 'bottom',\n\t\t\t    title: 'Expression Details',\n\t\t\t    tools: [\n\t\t\t\t{ type:'search',\n\t\t\t\t  tooltip: 'Search for genes matching the current consensus pattern',\n\t\t\t\t  handler: 
function(e, el,o,t) {\n\t\t\t\t      detailPanel.searchSimilar();\n\t\t\t\t  }\n\t\t\t\t},{ type:'collapse',\n\t\t\t\t  tooltip: 'Run GO enrichment analysis on the current gene set',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      if(detailPanel.hasOwnProperty('genecldata')) {\n\t\t\t\t\t  // write out current gene set\n\t\t\t\t\t  geinfostore.getProxy().setExtraParam(\"genes\",JSON.stringify(detailPanel.genecldata.matrix.rows))\n\t\t\t\t\t  geinfostore.load();\n\t\t\t\t\t  infotab.child(\"#enrichmenttab\").tab.show();\n\t\t\t\t\t  // show the info tab\n\t\t\t\t\t  infotab.setActiveTab(3);\n\t\t\t\t      }\n\t\t\t\t  }\n\t\t\t\t},{ type:'gear',\n\t\t\t\t  tooltip: 'Settings',\n\t\t\t\t  handler: function(e, el,o,t) {\n\t\t\t\t      detailPanelGearMenu.showBy(t);\n\t\t\t\t  }\n\t\t\t\t},\n\t\t\t\t{ type:'save',\n\t\t\t\t  tooltip: 'Save image',\n\t\t\t\t  handler: function(e,el,o,t) {\n\t\t\t\t      if($('#genecl').length==0) { return; }\n\t\t\t\t      var changingImage = Ext.create('Ext.Img', {\n\t\t\t\t\t  src: $('#genecl')[0].toDataURL(\"image/png\",1.0)\n\t\t\t\t      });\n\t\t\t\t      win = new Ext.Window({\n\t\t\t\t\t  title: 'exported image: use right click to save the image',\n\t\t\t\t\t  layout: 'fit',\n\t\t\t\t\t  autoScroll: true,\n\t\t\t\t\t  modal: true,\n\t\t\t\t\t  closeAction: 'hide',\n\t\t\t\t\t  items:[changingImage]\n\t\t\t\t      });\n\t\t\t\t      win.show();\n\t\t\t\t  }\n\t\t\t\t}\n\t\t\t    ],\n\t\t\t    header: true,\n\t\t\t    items:[detailPanel],\n\t\t\t    autoScroll: true,\n\t\t\t    autoShow: true,\n\t\t\t    /*listeners: {\n\t\t\t\tafterrender: function(panel) {\n\t\t\t\t    console.log(\"boo\");\n\t\t\t\t    var header=panel.getHeader();\n\t\t\t\t    header.insert(1,[ngenesSlider]);\n\t\t\t\t}\n\t\t\t    }*/\n\t\t\t}]\n\t\t},{ region: 'east',\n\t\t    collapsible: true,\n\t\t    split: true,\n\t\t    layout: 'border',\n\t\t    width: '30%',\n\t\t    title:'Info',\n\t\t    bodyPadding: 0,\n\t\t    
items:[{\n\t\t\tregion:'center',\n\t\t\tlayout:'fit',\n\t\t\tminWidth: 100,\n\t\t\tminHeight: 140,\n\t\t\tbodyPadding: 0,\n\t\t\titems:[infotab]\n\t\t    },{\n\t\t\ttitle:'2D Embedding',\n\t\t\tid:'embeddingDiv',\n\t\t\tregion:'south',\n\t\t\tlayout: 'fit',\n\t\t\tminWidth: 100,\n\t\t\tminHeight: 100,\n\t\t\theight:'40%',\n\t\t\thidden:true,\n\t\t\tcollapsible: true,\n\t\t\tsplit:true,\n\t\t\tbodyPadding: 0,\n\t\t\ttools:[{ type:'save',\n\t\t\t\t tooltip: 'Save image',\n\t\t\t\t handler: function(e,el,o,t) {\n\t\t\t\t     var svge=d3.select(\"#embedding\");\n\t\t\t\t     if(!svge.empty()) {\n\t\t\t\t\t var html=svge.attr(\"version\", 1.1)\n\t\t\t\t\t     .attr(\"xmlns\", \"http://www.w3.org/2000/svg\")\n\t\t\t\t\t     .node().parentNode.innerHTML;\n\t\t\t\t\t var imgsrc = 'data:image/svg+xml;base64,'+ btoa(html);\n\t\t\t\t\t win = new Ext.Window({\n\t\t\t\t\t     title: 'exported image: use the link below save the image',\n\t\t\t\t\t     layout: 'fit',\n\t\t\t\t\t     html: '<a href='+imgsrc+'>link</a>'\n\t\t\t\t\t });\n\t\t\t\t\t win.show();\n\t\t\t\t\t \n\t\t\t\t     }\n\t\t\t\t }\n\t\t\t       }],\n\t\t\titems:[embeddingPanel]\n\t\t    }]\n\t\t}]\n\t});\n\n\n    if(Ext.util.Cookies.get(\"hidetutorial\")==null) {\n\ttutorialWindow.show(); \n    }\n\n\n});\n// google analytics code\n  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){\n  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),\n  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)\n  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');\n\n  ga('create', 'UA-33018606-2', 'auto');\n  ga('send', 'pageview');\n\n"
  }
]