[
  {
    "path": ".gitignore",
    "content": "Genrich\n"
  },
  {
    "path": "Genrich.c",
    "content": "/*\n  John M. Gaspar (jsh58@wildcats.unh.edu)\n  June 2018\n\n  Finding sites of enrichment from genome-wide assays.\n\n  Version 0.6.1\n*/\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <stdint.h>\n#include <stdbool.h>\n#include <string.h>\n#include <getopt.h>\n#include <math.h>\n#include <float.h>\n#include <limits.h>\n#include <zlib.h>\n#include \"Genrich.h\"\n\n/* void printVersion()\n * Print version and copyright.\n */\nvoid printVersion(void) {\n  fprintf(stderr, \"Genrich, version %s\\n\", VERSION);\n  fprintf(stderr, \"Copyright (C) 2018 John M. Gaspar (jsh58@wildcats.unh.edu)\\n\");\n  exit(EXIT_FAILURE);\n}\n\n/* void usage()\n * Prints usage information.\n */\nvoid usage(void) {\n  fprintf(stderr, \"Usage: ./Genrich  -%c <file>  -%c <file>\", INFILE, OUTFILE);\n  fprintf(stderr, \"  [optional arguments]\\n\");\n  fprintf(stderr, \"Required arguments:\\n\");\n  fprintf(stderr, \"  -%c  <file>       Input SAM/BAM file(s) for experimental sample(s)\\n\", INFILE);\n  fprintf(stderr, \"  -%c  <file>       Output peak file (in ENCODE narrowPeak format)\\n\", OUTFILE);\n  fprintf(stderr, \"Optional I/O arguments:\\n\");\n  fprintf(stderr, \"  -%c  <file>       Input SAM/BAM file(s) for control sample(s)\\n\", CTRLFILE);\n  fprintf(stderr, \"  -%c  <file>       Output bedgraph-ish file for p/q values\\n\", LOGFILE);\n  fprintf(stderr, \"  -%c  <file>       Output bedgraph-ish file for pileups and p-values\\n\", PILEFILE);\n  fprintf(stderr, \"  -%c  <file>       Output BED file for reads/fragments/intervals\\n\", BEDFILE);\n  fprintf(stderr, \"  -%c  <file>       Output file for PCR duplicates (only with -%c)\\n\", DUPSFILE, DUPSOPT);\n  fprintf(stderr, \"Filtering options:\\n\");\n  fprintf(stderr, \"  -%c               Remove PCR duplicates\\n\", DUPSOPT);\n  fprintf(stderr, \"  -%c  <arg>        Comma-separated list of chromosomes to exclude\\n\", XCHROM);\n  fprintf(stderr, \"  -%c  <file>       Input BED file(s) of genomic regions to 
exclude\\n\", XFILE);\n  fprintf(stderr, \"  -%c  <int>        Minimum MAPQ to keep an alignment (def. 0)\\n\", MINMAPQ);\n  fprintf(stderr, \"  -%c  <float>      Keep sec alns with AS >= bestAS - <float> (def. 0)\\n\", ASDIFF);\n  fprintf(stderr, \"  -%c               Keep unpaired alignments (def. false)\\n\", UNPAIROPT);\n  fprintf(stderr, \"  -%c  <int>        Keep unpaired alns, lengths changed to <int>\\n\", EXTENDOPT);\n  fprintf(stderr, \"  -%c               Keep unpaired alns, lengths changed to paired avg\\n\", AVGEXTOPT);\n  fprintf(stderr, \"Options for ATAC-seq:\\n\");\n  fprintf(stderr, \"  -%c               Use ATAC-seq mode (def. false)\\n\", ATACOPT);\n  fprintf(stderr, \"  -%c  <int>        Expand cut sites to <int> bp (def. %d)\\n\", ATACLEN, DEFATAC);\n  fprintf(stderr, \"  -%c               Skip Tn5 adjustments of cut sites (def. false)\\n\", DNASEOPT);\n  fprintf(stderr, \"Options for peak-calling:\\n\");\n  fprintf(stderr, \"  -%c  <float>      Maximum p-value (def. %.2f)\\n\", PVALUE, DEFPVAL);\n  fprintf(stderr, \"  -%c  <float>      Maximum q-value (FDR-adjusted p-value; def. 1)\\n\", QVALUE);\n  fprintf(stderr, \"  -%c  <float>      Minimum AUC for a peak (def. %.1f)\\n\", MINAUC, DEFAUC);\n  fprintf(stderr, \"  -%c  <int>        Minimum length of a peak (def. %d)\\n\", MINLEN, DEFMINLEN);\n  fprintf(stderr, \"  -%c  <int>        Maximum distance between signif. sites (def. 
%d)\\n\", MAXGAP, DEFMAXGAP);\n  fprintf(stderr, \"Other options:\\n\");\n  fprintf(stderr, \"  -%c               Skip peak-calling\\n\", NOPEAKS);\n  fprintf(stderr, \"  -%c               Call peaks directly from a log file (-%c)\\n\", PEAKSONLY, LOGFILE);\n  fprintf(stderr, \"  -%c               Option to gzip-compress output(s)\\n\", GZOPT);\n  fprintf(stderr, \"  -%c               Option to print status updates/counts to stderr\\n\", VERBOSE);\n  exit(EXIT_FAILURE);\n}\n\n/*** Utilites ***/\n\n/* int error()\n * Prints an error message.\n */\nint error(const char* msg, enum errCode err) {\n  fprintf(stderr, \"Error! %s%s\\n\", msg, errMsg[err]);\n  return EXIT_FAILURE;\n}\n\n/* void* memalloc()\n * Allocates a heap block.\n */\nvoid* memalloc(size_t size) {\n  void* ans = malloc(size);\n  if (ans == NULL)\n    exit(error(\"\", ERRMEM));\n  return ans;\n}\n\n/* void* memrealloc()\n * Changes the size of a heap block.\n */\nvoid* memrealloc(void* ptr, size_t size) {\n  void* ans = realloc(ptr, size);\n  if (ans == NULL)\n    exit(error(\"\", ERRMEM));\n  return ans;\n}\n\n/* float getFloat(char*)\n * Converts the given char* to a float.\n */\nfloat getFloat(char* in) {\n  char* endptr;\n  float ans = strtof(in, &endptr);\n  if (*endptr != '\\0')\n    exit(error(in, ERRFLOAT));\n  return ans;\n}\n\n/* int getInt(char*)\n * Converts the given char* to an int.\n */\nint getInt(char* in) {\n  char* endptr;\n  int ans = (int) strtol(in, &endptr, 10);\n  if (*endptr != '\\0')\n    exit(error(in, ERRINT));\n  return ans;\n}\n\n/* uint64_t getLong(char*)\n * Converts the given char* to an uint64_t.\n */\nuint64_t getLong(char* in) {\n  char* endptr;\n  uint64_t ans = (uint64_t) strtol(in, &endptr, 10);\n  if (*endptr != '\\0')\n    exit(error(in, ERRINT));\n  return ans;\n}\n\n/* char* getLine()\n * Reads the next line from a file.\n */\nchar* getLine(char* line, int size, File in, bool gz) {\n  if (gz)\n    return gzgets(in.gzf, line, size);\n  else\n    return 
fgets(line, size, in.f);\n}\n\n/*** Quicksort (of p-values, for q-value calculation) ***/\n// adapted from https://www.geeksforgeeks.org/quick-sort/\n\n/* void swapFloat(): Swap two float values (pileup->cov)\n * void swapInt():   Swap two int values (pileup->end)\n * int partition():  Place last elt into correct spot\n * void quickSort(): Control quickSort process recursively\n */\nvoid swapFloat(float* a, float* b) {\n  float t = *a;\n  *a = *b;\n  *b = t;\n}\nvoid swapInt(uint64_t* a, uint64_t* b) {\n  uint64_t t = *a;\n  *a = *b;\n  *b = t;\n}\nint64_t partition(float* pVal, uint64_t* pEnd,\n    int64_t low, int64_t high) {\n  float pivot = pVal[high];  // pivot value: last elt\n  int64_t idx = low - 1;\n\n  for (int64_t j = low; j < high; j++) {\n    if (pVal[j] < pivot) {\n      idx++;\n      swapFloat(pVal + idx, pVal + j);\n      swapInt(pEnd + idx, pEnd + j);  // swap int values too\n    }\n  }\n  idx++;\n  swapFloat(pVal + idx, pVal + high);\n  swapInt(pEnd + idx, pEnd + high);\n  return idx;\n}\nvoid quickSort(float* pVal, uint64_t* pEnd,\n    int64_t low, int64_t high) {\n  if (low < high) {\n    int64_t idx = partition(pVal, pEnd, low, high);\n    quickSort(pVal, pEnd, low, idx - 1);\n    quickSort(pVal, pEnd, idx + 1, high);\n  }\n}\n\n/*** Calculate q-values ***/\n\n/* float lookup()\n * Return the pre-computed q-value for a given p-value,\n *   using parallel arrays (pVal and qVal).\n */\nfloat lookup(float* pVal, uint64_t low, uint64_t high,\n    float* qVal, float p) {\n  if (low == high)\n    return qVal[low];\n  uint64_t idx = (low + high) / 2;\n  if (pVal[idx] == p)\n    return qVal[idx];\n  if (pVal[idx] > p)\n    return lookup(pVal, low, idx - 1, qVal, p);\n  return lookup(pVal, idx + 1, high, qVal, p);\n}\n\n/* void saveQval()\n * Calculate and save q-values, given the pre-compiled\n *   arrays of p-values (pVal) and lengths (pEnd).\n */\nvoid saveQval(Chrom* chrom, int chromLen, int n,\n    uint64_t genomeLen, float* pVal, uint64_t* pEnd,\n 
   int64_t pLen, bool verbose) {\n\n  // sort pileup by p-values\n  quickSort(pVal, pEnd, 0, pLen - 1);\n\n  // calculate q-values for each p-value: -log(q) = -log(p*N/k)\n  uint64_t k = 1;  // 1 + number of bases with higher -log(p)\n  float logN = -log10f(genomeLen);\n  float* qVal = (float*) memalloc((pLen + 1) * sizeof(float));\n  qVal[pLen] = FLT_MAX;\n  for (int64_t i = pLen - 1; i > -1; i--) {\n    // ensure monotonicity\n    qVal[i] = MAX( MIN( pVal[i] + logN + log10f(k),\n      qVal[i + 1]), 0.0f);\n    k += pEnd[i];\n  }\n\n  // save pileups of q-values for each chrom\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (chr->skip || chr->pval[n] == NULL)\n      continue;\n    for (uint32_t j = 0; j < chr->pvalLen[n]; j++)\n      if (chr->pval[n]->cov[j] == SKIP)\n        chr->qval->cov[j] = SKIP; // skipped region\n      else\n        chr->qval->cov[j] = lookup(pVal, 0, pLen,\n          qVal, chr->pval[n]->cov[j]);\n  }\n\n  // check if all q-values are 1\n  if (verbose && qVal[pLen-1] == 0.0f)\n    fprintf(stderr, \"Warning! 
All q-values are 1\\n\");\n\n  // free memory\n  free(qVal);\n}\n\n/*** Save p-values in hashtable ***/\n\n/* uint32_t jenkins_one_at_a_time_hash()\n * Adapted from http://www.burtleburtle.net/bob/hash/doobs.html\n *   Modified to take a float (p-value) as input.\n *   Returns index into hashtable.\n */\nuint32_t jenkins_one_at_a_time_hash(float f) {\n  uint32_t hash = 0;\n  unsigned char* p = (unsigned char*) &f;\n  for (int i = 0; i < sizeof(float); i++) {\n    hash += p[i];\n    hash += hash << 10;\n    hash ^= hash >> 6;\n  }\n  hash += hash << 3;\n  hash ^= hash >> 11;\n  hash += hash << 15;\n  return hash % HASH_SIZE;\n}\n\n/* int recordPval()\n * Save length of given p-value into hashtable.\n *   Return 1 if new entry made, else 0.\n */\nint recordPval(Hash** table, float p, uint32_t length) {\n\n  // check hashtable for matching p-value\n  uint32_t idx = jenkins_one_at_a_time_hash(p);\n  for (Hash* h = table[idx]; h != NULL; h = h->next)\n    if (p == h->val) {\n      // match: add length to bucket\n      h->len += length;\n      return 0;\n    }\n\n  // no match: add info into bucket\n  Hash* newVal = (Hash*) memalloc(sizeof(Hash));\n  newVal->val = p;\n  newVal->len = length;\n  newVal->next = table[idx];\n  table[idx] = newVal;\n  return 1;\n}\n\n/* Hash** hashPval()\n * Collect p-values in a hashtable.\n */\nHash** hashPval(Chrom* chrom, int chromLen, int n,\n    int64_t* pLen) {\n\n  // create hashtable for conversion of p-values to q-values\n  Hash** table = (Hash**) memalloc(HASH_SIZE * sizeof(Hash*));\n  for (int i = 0; i < HASH_SIZE; i++)\n    table[i] = NULL;\n\n  // loop through chroms\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (chr->skip || chr->pval[n] == NULL)\n      continue;\n\n    // populate hashtable\n    Pileup* p = chr->pval[n]; // use the last p-value array\n    uint32_t start = 0;\n    for (uint32_t m = 0; m < chr->pvalLen[n]; m++) {\n      // record p-value and length in hashtable\n      if 
(p->cov[m] != SKIP)\n        *pLen += recordPval(table, p->cov[m],\n          p->end[m] - start);\n      start = p->end[m];\n    }\n  }\n\n  return table;\n}\n\n/* float* collectPval()\n * Collect arrays of p-values and genome lengths from\n *   hashtable (to be used in q-value calculations).\n */\nfloat* collectPval(Hash** table, uint64_t** pEnd,\n    int64_t pLen, uint64_t* checkLen) {\n  float* pVal = (float*) memalloc(pLen * sizeof(float));\n  int64_t idx = 0;\n  for (int i = 0; i < HASH_SIZE; i++)\n    for (Hash* h = table[i]; h != NULL; h = h->next) {\n      pVal[idx] = h->val;\n      (*pEnd)[idx] = h->len;\n      *checkLen += h->len;\n      idx++;\n    }\n  if (idx != pLen)\n    exit(error(errMsg[ERRPVAL], ERRISSUE));\n  return pVal;\n}\n\n/* void computeQval()\n * Control q-value calculations.\n */\nvoid computeQval(Chrom* chrom, int chromLen,\n    uint64_t genomeLen, bool genomeOpt, int n,\n    bool verbose) {\n\n  // create \"pileup\" arrays for q-values\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (chr->skip || chr->pval[n] == NULL)\n      continue;\n    uint32_t num = chr->pvalLen[n]; // use last p-value array length\n    chr->qval = (Pileup*) memalloc(sizeof(Pileup));\n    chr->qval->end = (uint32_t*) memalloc(num * sizeof(uint32_t));\n    chr->qval->cov = (float*) memalloc(num * sizeof(float));\n  }\n\n  // save all p-values (genome-wide) to hashtable\n  int64_t pLen = 0;\n  Hash** table = hashPval(chrom, chromLen, n, &pLen);\n\n  // collect p-values from hashtable\n  uint64_t* pEnd = memalloc(pLen * sizeof(uint64_t));\n  uint64_t checkLen = 0;  // should match genomeLen\n  float* pVal = collectPval(table, &pEnd, pLen, &checkLen);\n\n  // check that collected p-value lengths match genomeLen\n  if (genomeOpt && checkLen != genomeLen) {\n    char msg[MAX_ALNS];\n    sprintf(msg, \"Genome length (%ld) does not match p-value length (%ld)\",\n      genomeLen, checkLen);\n    exit(error(msg, ERRISSUE));\n  }\n\n  // convert 
p-values to q-values\n  saveQval(chrom, chromLen, n, genomeLen, pVal, pEnd,\n    pLen, verbose);\n\n  // free memory\n  free(pEnd);\n  free(pVal);\n  for (int i = 0; i < HASH_SIZE; i++) {\n    Hash* tmp;\n    Hash* h = table[i];\n    while (h != NULL) {\n      tmp = h->next;\n      free(h);\n      h = tmp;\n    }\n  }\n  free(table);\n}\n\n/*** Calculate p-value for Chi-squared test ***/\n// adapted from R-3.5.0 source code, as noted below\n\n// from dpq.h in R-3.5.0:\n#define R_Log1_Exp(x)  ((x) > -M_LN2 ? log(-expm1(x)) : log1p(-exp(x)))\n\n/* double bd0()\n * Adapted from bd0.c in R-3.5.0.\n */\ndouble bd0(double x, double np) {\n  double ej, s, s1, v;\n  if (fabs(x-np) < 0.1*(x+np)) {\n    v = (x-np)/(x+np);\n    s = (x-np)*v;\n    if (fabs(s) < DBL_MIN)\n      return s;\n    ej = 2*x*v;\n    v = v*v;\n    for (int j = 1; j < 1000; j++) {\n      ej *= v;\n      s1 = s+ej/((j<<1)+1);\n      if (s1 == s)\n        return s1;\n      s = s1;\n    }\n  }\n  return x * log(x / np) + np - x;\n}\n\n/* double stirlerr()\n * Adapted from stirlerr.c in R-3.5.0.\n *   Argument 'n' is an integer in [1, 199].\n */\ndouble stirlerr(double n) {\n  double S0 = 1.0 / 12;\n  double S1 = 1.0 / 360;\n  double S2 = 1.0 / 1260;\n  double S3 = 1.0 / 1680;\n  double S4 = 1.0 / 1188;\n  double sferr[16] = {\n    0.0,\n    0.0810614667953272582196702,\n    0.0413406959554092940938221,\n    0.02767792568499833914878929,\n    0.02079067210376509311152277,\n    0.01664469118982119216319487,\n    0.01387612882307074799874573,\n    0.01189670994589177009505572,\n    0.010411265261972096497478567,\n    0.009255462182712732917728637,\n    0.008330563433362871256469318,\n    0.007573675487951840794972024,\n    0.006942840107209529865664152,\n    0.006408994188004207068439631,\n    0.005951370112758847735624416,\n    0.005554733551962801371038690\n  };\n\n  double nn = n * n;\n  if (n > 80.0)\n    return (S0-(S1-S2/nn)/nn)/n;\n  if (n > 35.0)\n    return (S0-(S1-(S2-S3/nn)/nn)/nn)/n;\n  if (n > 
15.0)\n    return (S0-(S1-(S2-(S3-S4/nn)/nn)/nn)/nn)/n;\n  return sferr[(int) n];\n}\n\n/* double dpois()\n * Adapted from dpois.c in R-3.5.0 (cf. dpois_raw()).\n */\ndouble dpois(double x, double lambda) {\n  return -0.5 * log(2.0 * M_PI * x) - stirlerr(x)\n    - bd0(x, lambda);\n}\n\n/* double pd_upper_series()\n * Adapted from pgamma.c in R-3.5.0.\n */\ndouble pd_upper_series(double x, double alph) {\n  double term = x / alph;\n  double sum = term;\n  do {\n    alph++;\n    term *= x / alph;\n    sum += term;\n  } while (term > sum * DBL_EPSILON);\n  return log(sum);\n}\n\n/* double pd_lower_series()\n * Adapted from pgamma.c in R-3.5.0.\n */\ndouble pd_lower_series(double lambda, double y) {\n  double term = 1, sum = 0;\n  while (y >= 1 && term > sum * DBL_EPSILON) {\n    term *= y / lambda;\n    sum += term;\n    y--;\n  }\n  return log1p(sum);\n}\n\n/* double pgamma_smallx()\n * Adapted from pgamma.c in R-3.5.0.\n */\ndouble pgamma_smallx(double x, double alph) {\n  double sum = 0.0;\n  double c = alph;\n  double n = 0.0;\n  double term;\n  do {\n    n++;\n    c *= -x / n;\n    term = c / (alph + n);\n    sum += term;\n  } while (fabs(term) > DBL_EPSILON * fabs(sum));\n  double lf2 = alph * log(x) - lgamma(alph + 1);\n  return R_Log1_Exp(log1p(sum) + lf2);\n}\n\n/* double pgamma()\n * Adapted from pgamma.c in R-3.5.0 (cf. 
pgamma_raw()).\n *   Argument 'alph' is an integer in [2, 200].\n */\ndouble pgamma(double x, double alph) {\n\n  if (x < 1)\n    // small values of x\n    return pgamma_smallx(x, alph);\n\n  else if (x <= alph - 1) {\n    // larger alph than x\n    double sum = pd_upper_series(x, alph);\n    double d = dpois(alph - 1, x);\n    return R_Log1_Exp(sum + d);\n  }\n\n  // x > alph - 1\n  double sum = pd_lower_series(x, alph - 1);\n  double d = dpois(alph - 1, x);\n  return sum + d;\n}\n\n/* double pchisq()\n * Calculate a p-value for a chi-squared distribution\n *   with observation 'x' and 'df' degrees of freedom.\n *   'df' must be an even integer in [4, 400].\n * Adapted from pchisq.c and pgamma.c in R-3.5.0,\n *   with lower_tail=FALSE and log_p=TRUE.\n * Return value is -log10(p).\n */\ndouble pchisq(double x, int df) {\n  if (df < 4 || df > 400 || df / 2.0 != (int) (df / 2.0))\n    exit(error(errMsg[ERRDF], ERRISSUE));\n  return -pgamma(x / 2.0, df / 2.0) / M_LN10;\n}\n\n/*** Combine p-values from multiple replicates ***/\n\n/* float multPval()\n * Combine multiple p-values into a single net p-value\n *   using Fisher's method.\n */\nfloat multPval(Pileup** pval, int n, uint32_t idx[]) {\n  double sum = 0.0;\n  int df = 0;\n  for (int j = 0; j < n; j++)\n    if (pval[j] != NULL && pval[j]->cov[idx[j]] != SKIP) {\n      sum += pval[j]->cov[idx[j]];\n      df += 2;\n    }\n  if (df == 0)\n    return SKIP;\n  if (df == 2 || ! sum)\n    return (float) sum;\n\n  // calculate p-value using chi-squared dist.\n  double p = pchisq(2.0 * sum / M_LOG10E, df);\n  return p > FLT_MAX ? 
FLT_MAX : (float) p;\n}\n\n/* uint32_t countIntervals2()\n * Count the number of pileup intervals to create\n *   for the combined p-values.\n */\nuint32_t countIntervals2(Chrom* c, int n) {\n  uint32_t num = 1;\n  uint32_t idx[n];  // indexes into each pval array\n  for (int j = 0; j < n; j++)\n    idx[j] = 0;\n  for (uint32_t k = 1; k < c->len; k++) {\n    bool add = false;\n    for (int j = 0; j < n; j++)\n      if (c->pval[j] != NULL\n          && c->pval[j]->end[idx[j]] == k) {\n        if (! add) {\n          num++;\n          add = true;\n        }\n        idx[j]++;\n      }\n  }\n  return num;\n}\n\n/* void combinePval()\n * Combine p-values for multiple replicates.\n */\nvoid combinePval(Chrom* chrom, int chromLen, int n) {\n\n  // combine p-value \"pileups\" for each chrom\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (chr->skip)\n      continue;\n\n    // make sure at least one pval array exists\n    int j;\n    for (j = 0; j < n; j++)\n      if (chr->pval[j] != NULL)\n        break;\n    if (j == n) {\n      // none exists: append another NULL\n      chr->pval = (Pileup**) memrealloc(chr->pval,\n        (n + 1) * sizeof(Pileup*));\n      chr->pval[n] = NULL;\n      continue;\n    }\n\n    // create additional 'pileup' array for combined p-values\n    uint32_t num = countIntervals2(chr, n);\n    chr->pval = (Pileup**) memrealloc(chr->pval,\n      (n + 1) * sizeof(Pileup*));\n    chr->pval[n] = (Pileup*) memalloc(sizeof(Pileup));\n    chr->pval[n]->end = (uint32_t*) memalloc(num * sizeof(uint32_t));\n    chr->pval[n]->cov = (float*) memalloc(num * sizeof(float));\n    chr->pvalLen = (uint32_t*) memrealloc(chr->pvalLen,\n      (n + 1) * sizeof(uint32_t));\n    chr->pvalLen[n] = num;\n    chr->sample++;\n\n    // save combined p-values\n    uint32_t idx[n + 1];  // indexes into each pval array\n    for (int j = 0; j <= n; j++)\n      idx[j] = 0;\n    for (uint32_t k = 1; k <= chr->len; k++) {\n      bool add = false;\n      
for (int j = 0; j < n; j++)\n        if (chr->pval[j] != NULL\n            && chr->pval[j]->end[idx[j]] == k) {\n          if (! add) {\n            chr->pval[n]->end[idx[n]] = k;\n            chr->pval[n]->cov[idx[n]]\n              = multPval(chr->pval, n, idx);\n            idx[n]++;\n            add = true;\n          }\n          idx[j]++;\n        }\n\n    }\n\n  }\n}\n\n/*** Log pileups and stats ***/\n\n/* void printLogHeader()\n * Print header of logfile.\n */\nvoid printLogHeader(File log, bool gzOut, int n,\n    bool qvalOpt, bool sigOpt) {\n  if (n) {\n    // multiple samples: logfile has multiple p-values, no pileups\n    if (gzOut) {\n      gzprintf(log.gzf, \"chr\\tstart\\tend\");\n      for (int i = 0; i < n; i++)\n        gzprintf(log.gzf, \"\\t-log(p)_%d\", i);\n      gzprintf(log.gzf, \"\\t-log(p)_comb\");\n      if (qvalOpt)\n        gzprintf(log.gzf, \"\\t-log(q)\");\n      if (sigOpt)\n        gzprintf(log.gzf, \"\\tsignif\");\n      gzprintf(log.gzf, \"\\n\");\n    } else {\n      fprintf(log.f, \"chr\\tstart\\tend\");\n      for (int i = 0; i < n; i++)\n        fprintf(log.f, \"\\t-log(p)_%d\", i);\n      fprintf(log.f, \"\\t-log(p)_comb\");\n      if (qvalOpt)\n        fprintf(log.f, \"\\t-log(q)\");\n      if (sigOpt)\n        fprintf(log.f, \"\\tsignif\");\n      fprintf(log.f, \"\\n\");\n    }\n  } else {\n    // single sample: logfile has pileups and p-/q-values\n    if (gzOut) {\n      gzprintf(log.gzf, \"chr\\tstart\\tend\\texperimental\\tcontrol\\t-log(p)\");\n      if (qvalOpt)\n        gzprintf(log.gzf, \"\\t-log(q)\");\n      if (sigOpt)\n        gzprintf(log.gzf, \"\\tsignif\");\n      gzprintf(log.gzf, \"\\n\");\n    } else {\n      fprintf(log.f, \"chr\\tstart\\tend\\texperimental\\tcontrol\\t-log(p)\");\n      if (qvalOpt)\n        fprintf(log.f, \"\\t-log(q)\");\n      if (sigOpt)\n        fprintf(log.f, \"\\tsignif\");\n      fprintf(log.f, \"\\n\");\n    }\n  }\n}\n\n/* void printIntervalN()\n * Print bedgraph(ish) interval 
for multiple replicates.\n *   Values: -log(p) for each replicate, combined -log(p),\n *   -log(q), and significance ('*') for each.\n */\nvoid printIntervalN(File out, bool gzOut, char* name,\n    uint32_t start, uint32_t end, Pileup** p, int n,\n    uint32_t idx[], float pval, bool qvalOpt,\n    float qval, bool sig) {\n  if (gzOut) {\n    gzprintf(out.gzf, \"%s\\t%d\\t%d\", name, start, end);\n    for (int i = 0; i < n; i++)\n      if (p[i] == NULL || p[i]->cov[idx[i]] == SKIP)\n        gzprintf(out.gzf, \"\\t%s\", NA);\n      else\n        gzprintf(out.gzf, \"\\t%f\", p[i]->cov[idx[i]]);\n    if (pval == SKIP) {\n      gzprintf(out.gzf, \"\\t%s\", NA);\n      if (qvalOpt)\n        gzprintf(out.gzf, \"\\t%s\", NA);\n    } else {\n      gzprintf(out.gzf, \"\\t%f\", pval);\n      if (qvalOpt)\n        gzprintf(out.gzf, \"\\t%f\", qval);\n    }\n    gzprintf(out.gzf, \"%s\\n\", sig ? \"\\t*\" : \"\");\n  } else {\n    fprintf(out.f, \"%s\\t%d\\t%d\", name, start, end);\n    for (int i = 0; i < n; i++)\n      if (p[i] == NULL || p[i]->cov[idx[i]] == SKIP)\n        fprintf(out.f, \"\\t%s\", NA);\n      else\n        fprintf(out.f, \"\\t%f\", p[i]->cov[idx[i]]);\n    if (pval == SKIP) {\n      fprintf(out.f, \"\\t%s\", NA);\n      if (qvalOpt)\n        fprintf(out.f, \"\\t%s\", NA);\n    } else {\n      fprintf(out.f, \"\\t%f\", pval);\n      if (qvalOpt)\n        fprintf(out.f, \"\\t%f\", qval);\n    }\n    fprintf(out.f, \"%s\\n\", sig ? 
\"\\t*\" : \"\");\n  }\n}\n\n/* void printInterval()\n * Print bedgraph(ish) interval for a single replicate.\n *   Values: pileups (experimental and control), -log(p),\n *   -log(q), and significance ('*') for each.\n */\nvoid printInterval(File out, bool gzOut, char* name,\n    uint32_t start, uint32_t end, float exptVal,\n    float ctrlVal, float pval, bool qvalOpt, float qval,\n    bool sig) {\n  if (gzOut) {\n    if (ctrlVal == SKIP) {\n      gzprintf(out.gzf, \"%s\\t%d\\t%d\\t%f\\t%f\\t%s\",\n        name, start, end, exptVal, 0.0f, NA);\n      if (qvalOpt)\n        gzprintf(out.gzf, \"\\t%s\", NA);\n      gzprintf(out.gzf, \"\\n\");\n    } else {\n      gzprintf(out.gzf, \"%s\\t%d\\t%d\\t%f\\t%f\\t%f\",\n        name, start, end, exptVal, ctrlVal, pval);\n      if (qvalOpt)\n        gzprintf(out.gzf, \"\\t%f\", qval);\n      gzprintf(out.gzf, \"%s\\n\", sig ? \"\\t*\" : \"\");\n    }\n  } else {\n    if (ctrlVal == SKIP) {\n      fprintf(out.f, \"%s\\t%d\\t%d\\t%f\\t%f\\t%s\",\n        name, start, end, exptVal, 0.0f, NA);\n      if (qvalOpt)\n        fprintf(out.f, \"\\t%s\", NA);\n      fprintf(out.f, \"\\n\");\n    } else {\n      fprintf(out.f, \"%s\\t%d\\t%d\\t%f\\t%f\\t%f\",\n        name, start, end, exptVal, ctrlVal, pval);\n      if (qvalOpt)\n        fprintf(out.f, \"\\t%f\", qval);\n      fprintf(out.f, \"%s\\n\", sig ? \"\\t*\" : \"\");\n    }\n  }\n}\n\n/* void printLog()\n * Control printing of stats for an interval.\n */\nvoid printLog(File log, bool gzOut, Chrom* chr,\n    uint32_t start, int n, uint32_t m, uint32_t j,\n    uint32_t k, uint32_t idx[], bool qvalOpt,\n    bool sig) {\n  if (! n) {\n    // single replicate\n    printInterval(log, gzOut, chr->name,\n      start, chr->pval[n]->end[m],\n      chr->expt->cov[j], chr->ctrl->cov[k],\n      chr->pval[n]->cov[m], qvalOpt,\n      qvalOpt ? 
chr->qval->cov[m] : SKIP, sig);\n  } else {\n    // multiple replicates\n    printIntervalN(log, gzOut, chr->name,\n      start, chr->pval[n]->end[m], chr->pval, n, idx,\n      chr->pval[n]->cov[m], qvalOpt,\n      qvalOpt ? chr->qval->cov[m] : SKIP, sig);\n    // update indexes into pval arrays\n    for (int r = 0; r < n; r++)\n      if (chr->pval[r] != NULL\n          && chr->pval[r]->end[idx[r]] == chr->pval[n]->end[m])\n        idx[r]++;\n  }\n}\n\n/* void logIntervals()\n * Instead of calling peaks, just print log of pileups,\n *   and p- and q-values for each interval.\n */\nvoid logIntervals(File log, bool gzOut, Chrom* chrom,\n    int chromLen, int n, bool qvalOpt) {\n  // print header\n  printLogHeader(log, gzOut, n, qvalOpt, false);\n\n  // loop through chroms\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (chr->skip || (qvalOpt && chr->qval == NULL)\n        || (! qvalOpt && chr->pval[n] == NULL) )\n      continue;\n\n    // create indexes into arrays (expt/ctrl pileup [if n == 0]\n    //   and p-value arrays [if n > 0])\n    uint32_t j = 0, k = 0;  // indexes into chr->expt, chr->ctrl\n    uint32_t idx[n];        // indexes into each pval array\n    for (int r = 0; r < n; r++)\n      idx[r] = 0;\n\n    // loop through intervals (defined by chr->pval[n])\n    uint32_t start = 0;    // start of interval\n    for (uint32_t m = 0; m < chr->pvalLen[n]; m++) {\n\n      // print stats for interval\n      printLog(log, gzOut, chr, start, n, m, j, k,\n        idx, qvalOpt, false);\n\n      // update chr->expt and chr->ctrl indexes\n      if (! 
n) {\n        if (chr->ctrl->end[k] < chr->expt->end[j])\n          k++;\n        else {\n          if (chr->ctrl->end[k] == chr->expt->end[j])\n            k++;\n          j++;\n        }\n      }\n\n      start = chr->pval[n]->end[m];\n    }\n  }\n}\n\n/*** Call peaks ***/\n\n/* void printPeak()\n * Print peaks in ENCODE narrowPeak format.\n */\nvoid printPeak(File out, bool gzOut, char* name,\n    int64_t start, int64_t end, int count, float signal,\n    float pval, float qval, uint32_t pos) {\n  if (gzOut) {\n    gzprintf(out.gzf, \"%s\\t%ld\\t%ld\\tpeak_%d\\t%d\\t.\\t%f\\t%f\",\n      name, start, end, count,\n      MIN((unsigned int) (1000.0f * signal / (end - start)\n        + 0.5f), 1000),\n      signal, pval);\n    if (qval == SKIP)\n      gzprintf(out.gzf, \"\\t-1\\t%d\\n\", pos);\n    else\n      gzprintf(out.gzf, \"\\t%f\\t%d\\n\", qval, pos);\n  } else {\n    fprintf(out.f, \"%s\\t%ld\\t%ld\\tpeak_%d\\t%d\\t.\\t%f\\t%f\",\n      name, start, end, count,\n      MIN((unsigned int) (1000.0f * signal / (end - start)\n        + 0.5f), 1000),\n      signal, pval);\n    if (qval == SKIP)\n      fprintf(out.f, \"\\t-1\\t%d\\n\", pos);\n    else\n      fprintf(out.f, \"\\t%f\\t%d\\n\", qval, pos);\n  }\n}\n\n/* void checkPeak()\n * Check if potential peak is valid (coordinates,\n *   minAUC, and minLen parameters). 
If valid, print\n *   results via printPeak().\n */\nvoid checkPeak(File out, bool gzOut, char* name,\n    int64_t start, int64_t end, int* count, float auc,\n    float pval, float qval, uint32_t pos, float minAUC,\n    int minLen, uint64_t* peakBP) {\n  if (start != -1 && auc >= minAUC\n      && end - start >= minLen) {\n    printPeak(out, gzOut, name, start, end, *count,\n      auc, pval, qval, pos);\n    (*peakBP) += end - start;\n    (*count)++;\n  }\n}\n\n/* void resetVars()\n * Reset peak variables to null values.\n */\nvoid resetVars(int64_t* peakStart, float* summitVal,\n    uint32_t* summitLen, float* auc) {\n  *peakStart = -1;\n  *summitVal = -1.0f;\n  *summitLen = 0;\n  *auc = 0.0f;\n}\n\n/* void updatePeak()\n * Update peak variables for current interval.\n */\nvoid updatePeak(int64_t* peakStart, int64_t* peakEnd,\n    uint32_t start, uint32_t end, float* auc, float pqval,\n    float minPQval, float pval, float qval,\n    float* summitVal, float* summitPval, float* summitQval,\n    uint32_t* summitPos, uint32_t* summitLen) {\n  // update peak AUC, coordinates\n  uint32_t len = end - start;\n  *auc += len * (pqval - minPQval); // sum AUC\n  if (*peakStart == -1)\n    *peakStart = start; // start new potential peak\n  *peakEnd = end;       // end of potential peak (for now)\n\n  // check if interval is summit for this peak\n  if (pqval > *summitVal) {\n    *summitVal = pqval;\n    *summitPval = pval;\n    *summitQval = qval;\n    *summitPos = (end + start)/2 - *peakStart; // midpoint of interval\n    *summitLen = len;\n  } else if (pqval == *summitVal) {\n    // update summitPos only if interval is longer\n    if (len > *summitLen) {\n      *summitPos = (end + start)/2 - *peakStart; // midpoint of interval\n      *summitLen = len;\n      // assume summitPval, summitQval remain the same\n    }\n  }\n}\n\n/* int callPeaks()\n * Call peaks, using minAUC, maxGap, and minLen parameters.\n *   Produce output on the fly. 
Log pileups, p- and\n *   q-values for each interval. Return number of peaks.\n */\nint callPeaks(File out, File log, bool logOpt, bool gzOut,\n    Chrom* chrom, int chromLen, int n, float minPQval,\n    bool qvalOpt, float minAUC, int minLen, int maxGap,\n    uint64_t* peakBP) {\n\n  if (logOpt)\n    printLogHeader(log, gzOut, n, qvalOpt, true);\n\n  // loop through chroms\n  int count = 0;      // count of peaks\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (chr->skip || (qvalOpt && chr->qval == NULL)\n        || (! qvalOpt && chr->pval[n] == NULL) )\n      continue;\n\n    // create indexes into arrays for logging purposes\n    //   (expt/ctrl pileup [if n == 0] and p-value arrays [if n > 0])\n    uint32_t j = 0, k = 0;  // indexes into chr->expt, chr->ctrl\n    uint32_t idx[n];        // indexes into each pval array\n    for (int r = 0; r < n; r++)\n      idx[r] = 0;\n\n    // initialize peak variables\n    float auc = 0.0f;                     // area under the curve (signif.)\n    int64_t peakStart = -1, peakEnd = -1; // ends of potential peak\n    float summitVal = -1.0f;              // summit p/q value\n    uint32_t summitPos = 0;               // distance from peakStart to summit\n    uint32_t summitLen = 0;               // length of summit interval\n    float summitPval = -1.0f, summitQval = -1.0f; // summit p- and q-values\n\n    // loop through intervals (defined by chr->pval[n])\n    uint32_t start = 0;    // start of interval\n    for (uint32_t m = 0; m < chr->pvalLen[n]; m++) {\n\n      bool sig = false;\n      float pqval = qvalOpt ? chr->qval->cov[m]\n        : chr->pval[n]->cov[m];\n      if (pqval > minPQval) {\n\n        // interval reaches significance\n        sig = true;\n        updatePeak(&peakStart, &peakEnd, start,\n          chr->pval[n]->end[m], &auc, pqval,\n          minPQval, chr->pval[n]->cov[m],\n          qvalOpt ? 
chr->qval->cov[m] : SKIP,\n          &summitVal, &summitPval, &summitQval,\n          &summitPos, &summitLen);\n\n      } else {\n\n        // interval does not reach significance --\n        //   check if interval is to be skipped\n        //   OR distance is beyond maxGap from peak\n        if (pqval == SKIP\n            || chr->pval[n]->end[m] - peakEnd > maxGap) {\n          // check if previous peak is valid\n          checkPeak(out, gzOut, chr->name, peakStart,\n            peakEnd, &count, auc, summitPval, summitQval,\n            summitPos, minAUC, minLen, peakBP);\n\n          // reset peak variables\n          resetVars(&peakStart, &summitVal, &summitLen, &auc);\n        }\n      }\n\n      // print stats for interval\n      if (logOpt)\n        printLog(log, gzOut, chr, start, n, m, j, k,\n          idx, qvalOpt, sig);\n\n      // update chr->expt and chr->ctrl indexes\n      if (! n) {\n        if (chr->ctrl->end[k] < chr->expt->end[j])\n          k++;\n        else {\n          if (chr->ctrl->end[k] == chr->expt->end[j])\n            k++;\n          j++;\n        }\n      }\n\n      start = chr->pval[n]->end[m];\n    }\n\n    // determine if last peak is valid\n    checkPeak(out, gzOut, chr->name, peakStart, peakEnd,\n      &count, auc, summitPval, summitQval, summitPos,\n      minAUC, minLen, peakBP);\n  }\n\n  return count;\n}\n\n/* void findPeaks()\n * Control process of finding peaks:\n *   calculating p- and q-values, calling peaks,\n *   and printing output.\n */\nvoid findPeaks(File out, File log, bool logOpt, bool gzOut,\n    Chrom* chrom, int chromLen, int* sample,\n    float minPQval, bool qvalOpt, int minLen, int maxGap,\n    float minAUC, bool peaksOpt, uint64_t genomeLen,\n    bool verbose) {\n\n  // calculate combined p-values for multiple replicates\n  if (*sample > 1) {\n    combinePval(chrom, chromLen, *sample);\n    (*sample)++;\n  }\n\n  // calculate genome length (only chroms that are not\n  //   skipped and have had p-values 
calculated)\n  bool genomeOpt = false;\n  if (! genomeLen) {\n    genomeOpt = true;\n    for (int i = 0; i < chromLen; i++) {\n      Chrom* chr = chrom + i;\n      if (! chr->skip && chr->pval[*sample - 1] != NULL) {\n        genomeLen += chr->len;\n        for (int j = 0; j < chr->bedLen; j += 2)\n          genomeLen -= chr->bed[j+1] - chr->bed[j];\n      }\n    }\n  }\n\n  if (verbose) {\n    if (peaksOpt) {\n      fprintf(stderr, \"Peak-calling parameters:\\n\");\n      fprintf(stderr, \"  Genome length: %ldbp\\n\", genomeLen);\n      fprintf(stderr, \"  Significance threshold: -log(%c) > %.3f\\n\",\n        qvalOpt ? 'q' : 'p', minPQval);\n      fprintf(stderr, \"  Min. AUC: %.3f\\n\", minAUC);\n      if (minLen)\n        fprintf(stderr, \"  Min. peak length: %dbp\\n\", minLen);\n      fprintf(stderr, \"  Max. gap between sites: %dbp\\n\", maxGap);\n    } else {\n      fprintf(stderr, \"- peak-calling skipped -\\n\");\n      fprintf(stderr, \"  Genome length: %ldbp\\n\", genomeLen);\n    }\n  }\n\n  // compute q-values\n  if (qvalOpt)\n    computeQval(chrom, chromLen, genomeLen, genomeOpt,\n      *sample - 1, verbose);\n\n  // call peaks\n  if (peaksOpt) {\n    uint64_t peakBP = 0;\n    int count = callPeaks(out, log, logOpt, gzOut, chrom,\n      chromLen, *sample - 1, minPQval, qvalOpt, minAUC,\n      minLen, maxGap, &peakBP);\n    if (verbose)\n      fprintf(stderr, \"Peaks identified: %d (%ldbp)\\n\",\n        count, peakBP);\n  } else if (logOpt)\n    // not calling peaks -- just log pileups and stats\n    logIntervals(log, gzOut, chrom, chromLen, *sample - 1,\n      qvalOpt);\n}\n\n/*** Call peaks directly from a log file ***/\n\n/* void saveXBed()\n * Save BED regions to be excluded for a chrom.\n */\nvoid saveXBed(char* name, uint32_t len, int* bedLen,\n    uint32_t** bed, int xBedLen, Bed* xBed,\n    bool verbose) {\n  // check each xBed interval for match to chrom (name)\n  for (int i = 0; i < xBedLen; i++) {\n    Bed* b = xBed + i;\n    if 
(!strcmp(name, b->name)) {\n\n      // check if interval is located off end of chrom\n      if (b->pos[0] >= len) {\n        if (verbose) {\n          fprintf(stderr, \"Warning! BED interval (%s, %d - %d) ignored\\n\",\n            b->name, b->pos[0], b->pos[1]);\n          fprintf(stderr, \"  - located off end of reference %s (length %d)\\n\",\n            name, len);\n        }\n        continue;\n      }\n\n      // insert interval into array, sorted by start pos\n      int j;\n      for (j = 0; j < *bedLen; j += 2)\n        if (b->pos[0] <= (*bed)[j])\n          break;\n      *bedLen += 2;\n      *bed = (uint32_t*) memrealloc(*bed,\n        *bedLen * sizeof(uint32_t));\n      // shift intervals forward\n      for (int k = *bedLen - 1; k > j + 1; k--)\n        (*bed)[k] = (*bed)[k-2];\n      (*bed)[j] = b->pos[0];\n      (*bed)[j+1] = b->pos[1];\n    }\n  }\n\n  // merge overlapping intervals\n  int i = 0;\n  while (i < *bedLen) {\n\n    // check for interval past end of chrom\n    if ((*bed)[i+1] > len) {\n      if (verbose) {\n        fprintf(stderr, \"Warning! 
BED interval (%s, %d - %d) extends \",\n          name, (*bed)[i], (*bed)[i+1]);\n        fprintf(stderr, \"past end of ref.\\n  - edited to (%s, %d - %d)\\n\",\n          name, (*bed)[i], len);\n      }\n      (*bed)[i+1] = len;\n    }\n\n    // check for overlap with previous\n    if (i && (*bed)[i] <= (*bed)[i-1]) {\n      if ((*bed)[i+1] > (*bed)[i-1])\n        (*bed)[i-1] = (*bed)[i+1];\n      // shift coordinates backward\n      for (int j = i; j < *bedLen - 2; j++)\n        (*bed)[j] = (*bed)[j + 2];\n      *bedLen -= 2;\n    } else\n      i += 2;\n\n  }\n}\n\n/* bool checkChrom()\n * Return true if given char* matches a chromosome\n *   to be skipped (-e).\n */\nbool checkChrom(char* chr, int xcount, char** xchrList) {\n  for (int i = 0; i < xcount; i++)\n    if (!strcmp(xchrList[i], chr))\n      return true;\n  return false;\n}\n\n/* int getIdx()\n * Find fields of header line of bedgraph-ish log file\n *   that match \"-log([pq])\".\n *   Return the index of the \"-log(p)\" field and save the\n *   index of the \"-log(q)\" field to *idxQ. If multiple\n *   fields match, the last one is used.\n */\nint getIdx(File in, bool gz, char* line, bool qvalOpt,\n    int* idxQ) {\n  if (getLine(line, MAX_SIZE, in, gz) == NULL)\n    exit(error(\"<header>\", ERRLOGIDX));\n  char matchP[8] = \"-log(p)\";\n  char matchQ[8] = \"-log(q)\";\n  int idxP = -1;\n  int i = 0;\n  char* field = strtok(line, TABN);\n  while (field != NULL) {\n    if (!strncmp(field, matchP, 7))\n      idxP = i;\n    else if (!strncmp(field, matchQ, 7))\n      *idxQ = i;\n    field = strtok(NULL, TABN);\n    i++;\n  }\n  if (idxP == -1)\n    exit(error(matchP, ERRLOGIDX));\n  if (qvalOpt && *idxQ == -1)\n    exit(error(matchQ, ERRLOGIDX));\n  return idxP;\n}\n\n/* void loadBDG()\n * Load fields (chr, start, end, and [pq]-value) from\n *   a bedgraph-ish record (-f log file).\n */\nvoid loadBDG(char* line, char** chr, uint32_t* start,\n    uint32_t* end, char** pStat, int idxP, char** qStat,\n    int idxQ, bool qvalOpt) {\n  int idx = qvalOpt ? 
idxQ : idxP;\n  char* field = strtok(line, TABN);\n  for (int i = 0; i <= idx; i++) {\n    if (field == NULL)\n      exit(error(\"\", ERRLOG));\n    switch (i) {\n      case CHR: *chr = field; break;\n      case START: *start = getInt(field); break;\n      case END: *end = getInt(field); break;\n      default:\n        if (i == idxP)\n          *pStat = field;\n        else if (i == idxQ)\n          *qStat = field;\n    }\n    field = strtok(NULL, TABN);\n  }\n}\n\n/* void callPeaksLog()\n * Call peaks directly from a bedgraph-ish log file.\n */\nvoid callPeaksLog(File in, bool gz, File out, bool gzOut,\n    char* line, int xcount, char** xchrList, int xBedLen,\n    Bed* xBed, float minPQval, bool qvalOpt, int minLen,\n    int maxGap, float minAUC, uint64_t genomeLen,\n    bool verbose) {\n  // determine index of fields ([pq]-value) to analyze\n  int idxQ = -1;\n  int idxP = getIdx(in, gz, line, qvalOpt, &idxQ);\n\n  // initialize variables for genomic regions to be excluded\n  uint32_t* bed = NULL;         // array of BED coordinates\n  int bedLen = 0;               // length of bed array\n  int bedIdx = 0;               // index into bed array\n  uint32_t bedPos = UINT32_MAX; // next coordinate\n  bool save = true;             // status of current interval\n  bool warn = false;            // warn of new skipped regions?\n\n  // initialize counting variables\n  int count = 0;\n  uint64_t peakBP = 0;\n  bool genomeOpt = genomeLen == 0;\n\n  // initialize peak variables\n  float auc = 0.0f;                     // area under the curve (signif.)\n  int64_t peakStart = -1, peakEnd = -1; // ends of potential peak\n  float summitVal = -1.0f;              // summit p/q value\n  uint32_t summitPos = 0;               // distance from peakStart to summit\n  uint32_t summitLen = 0;               // length of summit interval\n  float summitPval = -1.0f, summitQval = -1.0f; // summit p- and q-values\n\n  // parse bedgraph records\n  char prev[MAX_ALNS];              // previous 
chrom name\n  prev[0] = '\\0';\n  bool skip = false;                // skip current chrom?\n  char* chr, *stat, *pStat, *qStat; // chrom and stats/NA of current interval\n  uint32_t start, end;              // coordinates of current interval\n  float pqval;                      // [pq]-value of current interval\n  while (getLine(line, MAX_SIZE, in, gz) != NULL) {\n\n    // load fields from bedgraph record\n    loadBDG(line, &chr, &start, &end, &pStat, idxP,\n      &qStat, idxQ, qvalOpt);\n\n    // new chromosome\n    if (strcmp(prev, chr)) {\n      // check if previous chrom's last peak is valid\n      checkPeak(out, gzOut, prev, peakStart, peakEnd,\n        &count, auc, summitPval, summitQval, summitPos,\n        minAUC, minLen, &peakBP);\n\n      // reset peak variables\n      resetVars(&peakStart, &summitVal, &summitLen, &auc);\n\n      // check if new chrom should be skipped\n      skip = checkChrom(chr, xcount, xchrList);\n      if (verbose && skip) {\n        fprintf(stderr, \"Warning! Skipping chromosome %s --\\n  \", chr);\n        fprintf(stderr, \"Reads aligning to it were used in the background\");\n        fprintf(stderr, \" pileup calculation,\\n  and its length was included\");\n        fprintf(stderr, \" in the genome length %scalculation\\n\",\n          qvalOpt ? \"(and q-value) \" : \"\");\n      }\n\n      // check for regions to be skipped\n      bedLen = 0;\n      if (! skip) {\n        // load regions from xBed\n        // (note: cannot check validity of coordinates)\n        saveXBed(chr, UINT32_MAX, &bedLen, &bed, xBedLen,\n          xBed, verbose);\n\n        // load next coordinate\n        bedIdx = 0;\n        bedPos = bedIdx < bedLen ? bed[bedIdx]\n          : UINT32_MAX;\n        save = true;\n      }\n\n      strcpy(prev, chr);  // update current chrom\n    }\n    if (skip)\n      continue; // current chrom to be skipped\n\n    // check stat for 'NA' (skipped region)\n    stat = qvalOpt ? 
qStat : pStat;\n    if (!strcmp(stat, NA)) {\n      // check if previous peak is valid\n      checkPeak(out, gzOut, chr, peakStart, peakEnd,\n        &count, auc, summitPval, summitQval, summitPos,\n        minAUC, minLen, &peakBP);\n\n      // reset peak variables\n      resetVars(&peakStart, &summitVal, &summitLen, &auc);\n      continue;\n    }\n    pqval = getFloat(stat);\n\n    // check if current interval starts at a position\n    //   to be skipped (new -E)\n    if (bedPos == start) {\n      if (save) {\n        // check if previous peak is valid\n        checkPeak(out, gzOut, chr, peakStart, peakEnd,\n          &count, auc, summitPval, summitQval, summitPos,\n          minAUC, minLen, &peakBP);\n        // reset peak variables\n        resetVars(&peakStart, &summitVal, &summitLen, &auc);\n      }\n\n      // load next coordinate\n      save = ! save;\n      bedIdx++;\n      bedPos = bedIdx < bedLen ? bed[bedIdx] : UINT32_MAX;\n    }\n\n    // check if current interval contains the next position\n    //   to be skipped (new -E)\n    uint32_t subStart = start;  // start of current subinterval\n    while (bedPos > start && bedPos < end) {\n\n      if (save) {\n        // interval *not* to be skipped:\n        //   update peak if necessary, then print if valid\n        if (pqval > minPQval) {\n          // interval reaches significance\n          updatePeak(&peakStart, &peakEnd, subStart,\n            bedPos, &auc, pqval, minPQval,\n            qvalOpt ? getFloat(pStat) : pqval,\n            qvalOpt ? 
pqval : SKIP,\n            &summitVal, &summitPval, &summitQval,\n            &summitPos, &summitLen);\n        }\n        // check if peak is valid\n        checkPeak(out, gzOut, chr, peakStart, peakEnd,\n          &count, auc, summitPval, summitQval, summitPos,\n          minAUC, minLen, &peakBP);\n        // reset peak variables\n        resetVars(&peakStart, &summitVal, &summitLen, &auc);\n        if (genomeOpt)\n          genomeLen += bedPos - subStart; // update genome length\n      } else\n        warn = true;\n\n      // load next coordinate\n      subStart = bedPos;\n      save = ! save;\n      bedIdx++;\n      bedPos = bedIdx < bedLen ? bed[bedIdx] : UINT32_MAX;\n    }\n    if (! save) {\n      warn = true;\n      continue;\n    }\n    start = subStart; // reset start coordinate\n\n    // check [pq]-value for significance\n    if (genomeOpt)\n      genomeLen += end - start; // update genome length\n    if (pqval > minPQval) {\n\n      // interval reaches significance\n      updatePeak(&peakStart, &peakEnd, start, end,\n        &auc, pqval, minPQval,\n        qvalOpt ? getFloat(pStat) : pqval,\n        qvalOpt ? pqval : SKIP,\n        &summitVal, &summitPval, &summitQval,\n        &summitPos, &summitLen);\n\n    } else {\n\n      // interval does not reach significance --\n      //   check if distance is beyond maxGap from peak\n      if (end - peakEnd > maxGap) {\n        // check if previous peak is valid\n        checkPeak(out, gzOut, chr, peakStart, peakEnd,\n          &count, auc, summitPval, summitQval, summitPos,\n          minAUC, minLen, &peakBP);\n\n        // reset peak variables\n        resetVars(&peakStart, &summitVal, &summitLen, &auc);\n      }\n    }\n\n  }\n\n  // check if last peak is valid\n  checkPeak(out, gzOut, chr, peakStart, peakEnd,\n    &count, auc, summitPval, summitQval, summitPos,\n    minAUC, minLen, &peakBP);\n\n  if (verbose) {\n    if (warn) {\n      fprintf(stderr, \"Warning! 
Skipping given BED regions --\\n  \");\n      fprintf(stderr, \"Reads aligning to them were used in the background\");\n      fprintf(stderr, \" pileup calculation,\\n  and the lengths were included\");\n      fprintf(stderr, \" in the genome length %scalculation\\n\",\n        qvalOpt ? \"(and q-value) \" : \"\");\n    }\n    fprintf(stderr, \"Peak-calling parameters:\\n\");\n    fprintf(stderr, \"  Genome length: %ldbp\\n\", genomeLen);\n    fprintf(stderr, \"  Significance threshold: -log(%c) > %.3f\\n\",\n      qvalOpt ? 'q' : 'p', minPQval);\n    fprintf(stderr, \"  Min. AUC: %.3f\\n\", minAUC);\n    if (minLen)\n      fprintf(stderr, \"  Min. peak length: %dbp\\n\", minLen);\n    fprintf(stderr, \"  Max. gap between sites: %dbp\\n\", maxGap);\n    fprintf(stderr, \"Peaks identified: %d (%ldbp)\\n\",\n      count, peakBP);\n  }\n\n  free(bed);\n}\n\n/*** Calculate a p-value (log-normal distribution) ***/\n// adapted from R-3.5.0 source code, as noted below\n\n/* double do_del()\n * Adapted from pnorm.c in R-3.5.0\n *   (cf. do_del() and swap_tail).\n */\ndouble do_del(double y, double temp, bool ret) {\n  double xsq = trunc(y * 16) / 16;\n  double del = (y - xsq) * (y + xsq);\n  if (ret)\n    return log1p(-exp((-xsq * xsq - del) / 2.0) * temp);\n  return (-xsq * xsq - del) / 2.0 + log(temp);\n}\n\n/* double pnorm()\n * Adapted from pnorm.c in R-3.5.0\n *   (cf. 
pnorm_both() with i_tail=1, log_p=TRUE).\n */\ndouble pnorm(double x) {\n  double a[5] = {\n    2.2352520354606839287,\n    161.02823106855587881,\n    1067.6894854603709582,\n    18154.981253343561249,\n    0.065682337918207449113\n  };\n  double b[4] = {\n    47.20258190468824187,\n    976.09855173777669322,\n    10260.932208618978205,\n    45507.789335026729956\n  };\n  double c[9] = {\n    0.39894151208813466764,\n    8.8831497943883759412,\n    93.506656132177855979,\n    597.27027639480026226,\n    2494.5375852903726711,\n    6848.1904505362823326,\n    11602.651437647350124,\n    9842.7148383839780218,\n    1.0765576773720192317e-8\n  };\n  double d[8] = {\n    22.266688044328115691,\n    235.38790178262499861,\n    1519.377599407554805,\n    6485.558298266760755,\n    18615.571640885098091,\n    34900.952721145977266,\n    38912.003286093271411,\n    19685.429676859990727\n  };\n  double p[6] = {\n    0.21589853405795699,\n    0.1274011611602473639,\n    0.022235277870649807,\n    0.001421619193227893466,\n    2.9112874951168792e-5,\n    0.02307344176494017303\n  };\n  double q[5] = {\n    1.28426009614491121,\n    0.468238212480865118,\n    0.0659881378689285515,\n    0.00378239633202758244,\n    7.29751555083966205e-5\n  };\n\n  double xden, xnum, xsq, temp;\n  double y = fabs(x);\n  if (y <= 0.67448975) {\n\n    // small values of fabs(x)\n    if (y > DBL_EPSILON * 0.5) {\n      xsq = x * x;\n      xnum = a[4] * xsq;\n      xden = xsq;\n      for (int i = 0; i < 3; i++) {\n        xnum = (xnum + a[i]) * xsq;\n        xden = (xden + b[i]) * xsq;\n      }\n      temp = x * (xnum + a[3]) / (xden + b[3]);\n    } else\n      temp = x * a[3] / b[3];\n    return log(0.5 - temp);\n\n  } else if (y <= sqrt(32.0)) {\n\n    // slightly larger values of fabs(x)\n    xnum = c[8] * y;\n    xden = y;\n    for (int i = 0; i < 7; i++) {\n      xnum = (xnum + c[i]) * y;\n      xden = (xden + d[i]) * y;\n    }\n    temp = (xnum + c[7]) / (xden + d[7]);\n    return 
do_del(y, temp, x <= 0.0);\n\n  } else if (y < 1e170) {\n\n    // even larger values of fabs(x)\n    xsq = 1.0 / (x * x);\n    xnum = p[5] * xsq;\n    xden = xsq;\n    for (int i = 0; i < 4; i++) {\n      xnum = (xnum + p[i]) * xsq;\n      xden = (xden + q[i]) * xsq;\n    }\n    temp = xsq * (xnum + p[4]) / (xden + q[4]);\n    temp = (1/sqrt(2*M_PI) - temp) / y;\n    return do_del(x, temp, x <= 0.0);\n  }\n\n  // default\n  return -0.0;\n}\n\n/* double plnorm()\n * Calculate a p-value for a log-normal distribution\n *   with observation 'x' and parameters 'meanlog' and\n *   'sdlog'.\n * Adapted from plnorm.c and pnorm.c in R-3.5.0,\n *   with lower_tail=FALSE and log_p=TRUE.\n * Return value is -log10(p).\n */\ndouble plnorm(double x, double meanlog, double sdlog) {\n  if (sdlog == 0.0)\n    return x < meanlog ? 0.0 : FLT_MAX;\n  return -pnorm((log(x) - meanlog) / sdlog) / M_LN10;\n}\n\n/* float calcPval()\n * Calculate -log10(p) using a log-normal distribution\n *   with mu=ctrlVal, sd={mu>7 ? 10*log10(mu) : 1.2*mu},\n *   and observation exptVal.\n */\nfloat calcPval(float exptVal, float ctrlVal) {\n  if (ctrlVal == SKIP)\n    return SKIP; // in a skipped region\n  if (ctrlVal == 0.0f)\n    return exptVal == 0.0f ? 0.0f : FLT_MAX;\n  if (exptVal == 0.0f)\n    return 0.0f;\n\n  // calculate meanlog and sdlog for plnorm()\n  double meanlog, sdlog;\n  double mu = ctrlVal;\n  if (mu > 7.0) {\n    double sd = 10.0 * log10(mu);\n    mu *= mu;\n    sd *= sd;\n    meanlog = log(mu / sqrt(sd + mu));\n    sdlog = sqrt(log1p(sd / mu));\n  } else {\n    meanlog = log(mu) - LOGSQRT;\n    sdlog = SQRTLOG;\n  }\n\n  // calculate pval by plnorm()\n  double pval = plnorm(exptVal, meanlog, sdlog);\n  return pval > FLT_MAX ? 
FLT_MAX : (float) pval;\n}\n\n/*** Create and save p-values genome-wide ***/\n\n/* uint32_t countIntervals()\n * Count the number of pileup intervals to create\n *   for a composite.\n */\nuint32_t countIntervals(Chrom* chr) {\n  uint32_t num = 0;\n  uint32_t k = 0;\n  for (uint32_t j = 0; j < chr->exptLen; j++) {\n    while (k < chr->ctrlLen\n        && chr->ctrl->end[k] < chr->expt->end[j]) {\n      num++;\n      k++;\n    }\n    if (chr->ctrl->end[k] == chr->expt->end[j])\n      k++;\n    num++;\n  }\n  return num;\n}\n\n/* void printPileHeader()\n * Print header of the bedgraph-ish pileup log file.\n */\nvoid printPileHeader(File pile, char* exptName,\n    char* ctrlName, bool gzOut) {\n  if (gzOut) {\n    gzprintf(pile.gzf, \"# experimental file: %s; control file: %s\\n\",\n      exptName, ctrlName && strcmp(ctrlName, \"null\") ? ctrlName : NA);\n    gzprintf(pile.gzf, \"chr\\tstart\\tend\\texperimental\\tcontrol\\t-log(p)\\n\");\n  } else {\n    fprintf(pile.f, \"# experimental file: %s; control file: %s\\n\",\n      exptName, ctrlName && strcmp(ctrlName, \"null\") ? 
ctrlName : NA);\n    fprintf(pile.f, \"chr\\tstart\\tend\\texperimental\\tcontrol\\t-log(p)\\n\");\n  }\n}\n\n/* void printPile()\n * Print bedgraph-ish interval of experimental/control\n *   pileup values and p-value.\n */\nvoid printPile(File pile, char* name, uint32_t start,\n    uint32_t end, float expt, float ctrl, float pval,\n    bool gzOut) {\n  if (gzOut) {\n    if (ctrl == SKIP)\n      gzprintf(pile.gzf, \"%s\\t%d\\t%d\\t%f\\t%f\\t%s\\n\",\n        name, start, end, expt, 0.0f, NA);\n    else\n      gzprintf(pile.gzf, \"%s\\t%d\\t%d\\t%f\\t%f\\t%f\\n\",\n        name, start, end, expt, ctrl, pval);\n  } else {\n    if (ctrl == SKIP)\n      fprintf(pile.f, \"%s\\t%d\\t%d\\t%f\\t%f\\t%s\\n\",\n        name, start, end, expt, 0.0f, NA);\n    else\n      fprintf(pile.f, \"%s\\t%d\\t%d\\t%f\\t%f\\t%f\\n\",\n        name, start, end, expt, ctrl, pval);\n  }\n}\n\n/* void savePval()\n * Create and save p-values as pileups for each Chrom*.\n */\nvoid savePval(Chrom* chrom, int chromLen, int n,\n    File pile, bool pileOpt, char* exptName,\n    char* ctrlName, bool gzOut) {\n\n  // print log header\n  if (pileOpt)\n    printPileHeader(pile, exptName, ctrlName, gzOut);\n\n  // create pileups for each chrom\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (chr->skip)\n      continue;\n\n    // fill in missing pval arrays from previous samples\n    if (chr->sample < n) {\n      chr->pval = (Pileup**) memrealloc(chr->pval,\n        n * sizeof(Pileup*));\n      for (int j = n - 1; j >= chr->sample; j--)\n        chr->pval[j] = NULL;\n      chr->sample = n;\n    }\n\n    // p-values not to be saved: append a NULL\n    if (! 
chr->save) {\n      chr->pval = (Pileup**) memrealloc(chr->pval,\n        (n + 1) * sizeof(Pileup*));\n      chr->pval[n] = NULL;\n      chr->sample++;\n      continue;\n    }\n\n    // create 'pileup' arrays for p-values\n    uint32_t num = countIntervals(chr);\n    chr->pval = (Pileup**) memrealloc(chr->pval,\n      (n + 1) * sizeof(Pileup*));\n    chr->pval[n] = (Pileup*) memalloc(sizeof(Pileup));\n    chr->pval[n]->end = (uint32_t*) memalloc(num * sizeof(uint32_t));\n    chr->pval[n]->cov = (float*) memalloc(num * sizeof(float));\n    chr->pvalLen = (uint32_t*) memrealloc(chr->pvalLen,\n      (n + 1) * sizeof(uint32_t));\n    chr->pvalLen[n] = num;\n    chr->sample++;\n\n    // save p-values to arrays\n    Pileup* p = chr->pval[n];\n    uint32_t start = 0;    // start of interval\n    uint32_t j = 0, k = 0;\n    for (uint32_t m = 0; m < num; m++) {\n      if (chr->ctrl->end[k] < chr->expt->end[j]) {\n        p->end[m] = chr->ctrl->end[k];\n        p->cov[m] = calcPval(chr->expt->cov[j],\n          chr->ctrl->cov[k]);\n        if (pileOpt)\n          printPile(pile, chr->name, start, p->end[m],\n            chr->expt->cov[j], chr->ctrl->cov[k],\n            p->cov[m], gzOut);\n        k++;\n      } else {\n        p->end[m] = chr->expt->end[j];\n        p->cov[m] = calcPval(chr->expt->cov[j],\n          chr->ctrl->cov[k]);\n        if (pileOpt)\n          printPile(pile, chr->name, start, p->end[m],\n            chr->expt->cov[j], chr->ctrl->cov[k],\n            p->cov[m], gzOut);\n        if (chr->ctrl->end[k] == chr->expt->end[j])\n          k++;\n        j++;\n      }\n      start = p->end[m];\n    }\n\n  }\n}\n\n/*** Save experimental/control pileup values ***/\n\n/* void saveConst()\n * Save a given value as pileup for a full chromosome.\n */\nvoid saveConst(Pileup* p, uint32_t* size, uint32_t* mem,\n    uint32_t len, float val) {\n  if (! 
*mem) {\n    p->end = (uint32_t*) memalloc(sizeof(uint32_t));\n    p->cov = (float*) memalloc(sizeof(float));\n    *mem = 1;\n  }\n  p->end[0] = len;\n  p->cov[0] = val;\n  *size = 1;\n}\n\n/* float calcLambda()\n * Calculate a background lambda value: sum of fragment\n *   lengths divided by total genome length.\n */\nfloat calcLambda(Chrom* chrom, int chromLen,\n    double fragLen, uint64_t genomeLen) {\n  if (! genomeLen) {\n    for (int i = 0; i < chromLen; i++) {\n      Chrom* chr = chrom + i;\n      if (! chr->skip && chr->save) {\n        genomeLen += chr->len;\n        for (int j = 0; j < chr->bedLen; j += 2)\n          genomeLen -= chr->bed[j+1] - chr->bed[j];\n      }\n    }\n    if (! genomeLen)\n      exit(error(\"\", ERRGEN));\n  }\n  return fragLen / genomeLen;\n}\n\n/* void saveLambda()\n * For a ctrl pileup, save constant value of lambda,\n *   or -1 (SKIP) for BED intervals to be skipped.\n */\nvoid saveLambda(Chrom* chr, float lambda) {\n  if (chr->bedLen == 0) {\n    // no BED intervals: save constant of lambda\n    saveConst(chr->ctrl, &chr->ctrlLen, &chr->ctrlMem,\n      chr->len, lambda);\n    return;\n  }\n\n  // with BED intervals\n  int num = chr->bedLen + 1;  // number of array intervals\n  int idx = 0;    // index into chr->bed array\n  bool save = true;\n  if (chr->bed[0] == 0) {\n    num--;\n    idx++;\n    save = false; // 1st interval skipped\n  }\n  if (chr->bed[chr->bedLen - 1] == chr->len)\n    num--;\n\n  // expand pileup arrays (if necessary)\n  if (num > chr->ctrlMem) {\n    chr->ctrl->end = (uint32_t*) memrealloc(chr->ctrl->end,\n      num * sizeof(uint32_t));\n    chr->ctrl->cov = (float*) memrealloc(chr->ctrl->cov,\n      num * sizeof(float));\n    chr->ctrlMem = num;\n  }\n  chr->ctrlLen = num;\n\n  // populate chr->ctrl arrays: alternate lambda and -1\n  for (int j = 0; j < num - 1; j++) {\n    chr->ctrl->end[j] = chr->bed[idx];\n    chr->ctrl->cov[j] = save ? lambda : SKIP;\n    save = ! 
save;\n    idx++;\n  }\n  chr->ctrl->end[num-1] = chr->len;\n  chr->ctrl->cov[num-1] = save ? lambda : SKIP;\n}\n\n/* void savePileupNoCtrl()\n * When no control is available, save the control\n *   pileup as the background lambda value.\n */\nvoid savePileupNoCtrl(Chrom* chrom, int chromLen,\n    double fragLen, uint64_t genomeLen, bool verbose) {\n  float lambda = calcLambda(chrom, chromLen, fragLen,\n    genomeLen);\n  if (verbose)\n    fprintf(stderr, \"  Background pileup value: %f\\n\",\n      lambda);\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (chr->skip || ! chr->save)\n      continue;\n    saveLambda(chr, lambda);\n  }\n}\n\n/* float getVal()\n * Reconstruct a float value from the given\n *   int and 8-bit encoded fractional part.\n */\nfloat getVal(int32_t cov, uint8_t frac) {\n  return (float) cov\n    + ((frac & 0x7) / 8.0f)\n    + (((frac >> 3) & 0x3) / 6.0f)\n    + (((frac >> 5) & 0x7) / 10.0f);\n}\n\n/* float updateVal()\n * Update pileup value (int cov and 8-bit encoded\n *   fractional part) by adding the given int (dCov)\n *   and fractional part (dFrac) from the 'diff' arrays.\n *   Return reconstructed value via getVal().\n */\nfloat updateVal(int16_t dCov, uint8_t dFrac, int32_t* cov,\n    uint8_t* frac) {\n\n  // add ints\n  *cov += dCov;\n  if (! 
dFrac) {\n    if (*cov < 0)\n      exit(error(errMsg[ERRPILE], ERRISSUE));\n    return getVal(*cov, *frac);\n  }\n\n  // sum eighths (and collect halves)\n  int half = 0;\n  if (dFrac & 0x7) {\n    int sum8 = (dFrac & 0x7) + (*frac & 0x7);\n    while (sum8 > 3) {\n      half++;\n      sum8 -= 4;\n    }\n    *frac = (*frac & 0xF8) | sum8;\n  } else if (*frac & 0x4) {\n    half++;\n    *frac &= 0xFB;\n  }\n\n  // sum sixths\n  if (dFrac & 0x18) {\n    int sum6 = ((dFrac >> 3) & 0x3) + ((*frac >> 3) & 0x3);\n    if (sum6 > 2) {\n      half++;\n      sum6 -= 3;\n    }\n    *frac = (*frac & 0xE7) | (sum6 << 3);\n  }\n\n  // sum tenths\n  if (dFrac & 0xE0) {\n    int sum10 = ((dFrac >> 5) & 0x7) + ((*frac >> 5) & 0x7);\n    if (sum10 > 4) {\n      half++;\n      sum10 -= 5;\n    }\n    *frac = (*frac & 0x1F) | (sum10 << 5);\n  }\n\n  // combine halves\n  while (half > 1) {\n    (*cov)++;\n    half -= 2;\n  }\n  if (half)\n    *frac |= 0x4;\n\n  // check for negative pileup\n  if (*cov < 0)\n    exit(error(errMsg[ERRPILE], ERRISSUE));\n\n  return getVal(*cov, *frac);\n}\n\n/* float calcFactor()\n * Calculate the scaling factor of experimental fragment\n *   lengths to control. Also, set ctrlLen for each\n *   Chrom* (to be corrected in savePileupCtrl()).\n */\nfloat calcFactor(Chrom* chrom, int chromLen,\n    double fragLen) {\n\n  // sum weighted lengths of control fragments\n  double ctrlFrag = 0.0;\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (chr->skip || ! 
chr->save || chr->diff == NULL)\n      continue;\n\n    // initialize BED interval variables\n    int bedIdx = 0;         // index into chr->bed array\n    uint32_t bedPos = bedIdx < chr->bedLen ?\n      chr->bed[bedIdx] : chr->len + 1;  // position of BED interval\n    bool save = true;\n    if (bedPos == 0) {\n      // first BED interval starts at 0\n      save = false;\n      bedIdx++;\n      bedPos = bedIdx < chr->bedLen ?\n        chr->bed[bedIdx] : chr->len + 1;\n    }\n\n    // initialize variables\n    uint32_t num = 1;       // number of intervals to create for pileup\n    Diff* d = chr->diff;\n    int32_t cov = 0;        // current pileup value\n    uint8_t frac = 0;       // current pileup value (fraction part)\n    float val = updateVal(d->cov[0], d->frac[0], &cov, &frac);\n\n    // calculate fragment lengths along the chrom\n    uint32_t start = 0;     // beginning coordinate of interval\n    uint32_t j;\n    for (j = 1; j < chr->len; j++) {\n\n      if (j == bedPos || (save && (d->cov[j] || d->frac[j]))) {\n        if (save)\n          // save frag. length weighted by val\n          ctrlFrag += (j - start) * val;\n        num++;\n        start = j;\n      }\n\n      // update pileup value\n      if (d->cov[j] || d->frac[j])\n        val = updateVal(d->cov[j], d->frac[j], &cov, &frac);\n\n      // update 'save' status (from BED intervals)\n      if (j == bedPos) {\n        save = ! save;\n        bedIdx++;\n        bedPos = bedIdx < chr->bedLen ?\n          chr->bed[bedIdx] : chr->len + 1;\n      }\n    }\n\n    // save final interval\n    if (save)\n      ctrlFrag += (j - start) * val;\n    chr->ctrlLen = num; // save number of intervals\n  }\n\n  // return ratio of experimental frags to ctrl frags\n  if (! 
ctrlFrag)\n    return 1.0f;\n  return fragLen / ctrlFrag;\n}\n\n/* void savePileupCtrl()\n * Save pileup values for a control sample from\n *   'diff' arrays and background lambda value.\n */\nvoid savePileupCtrl(Chrom* chrom, int chromLen,\n    double fragLen, uint64_t genomeLen, bool verbose) {\n\n  // calculate background lambda value\n  float lambda = calcLambda(chrom, chromLen, fragLen, genomeLen);\n  if (verbose)\n    fprintf(stderr, \"  Background pileup value: %f\\n\", lambda);\n\n  // calculate scale factor (experimental / control)\n  float factor = calcFactor(chrom, chromLen, fragLen);\n  if (verbose) {\n    fprintf(stderr, \"  Scaling factor for control pileup: %f\\n\", factor);\n    if (factor > 5.0f)\n      fprintf(stderr, \"  ** Warning! Large scaling may mask true signal **\\n\");\n  }\n\n  // create pileup for each chrom\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (chr->skip || ! chr->save)\n      continue;\n\n    // if no read coverage, save constant pileup of lambda\n    if (chr->diff == NULL) {\n      saveLambda(chr, lambda);\n      continue;\n    }\n\n    // expand pileup arrays (if necessary)\n    //   (chr->ctrlLen already set in calcFactor())\n    if (chr->ctrlLen > chr->ctrlMem) {\n      chr->ctrl->end = (uint32_t*) memrealloc(chr->ctrl->end,\n        chr->ctrlLen * sizeof(uint32_t));\n      chr->ctrl->cov = (float*) memrealloc(chr->ctrl->cov,\n        chr->ctrlLen * sizeof(float));\n      chr->ctrlMem = chr->ctrlLen;\n    }\n\n    // initialize BED interval variables\n    int bedIdx = 0;         // index into chr->bed array\n    uint32_t bedPos = bedIdx < chr->bedLen ?\n      chr->bed[bedIdx] : chr->len + 1;  // position of BED interval\n    bool save = true;\n    if (bedPos == 0) {\n      // first BED interval starts at 0 (should be skipped)\n      save = false;\n      bedIdx++;\n      bedPos = bedIdx < chr->bedLen ?\n        chr->bed[bedIdx] : chr->len + 1;\n    }\n\n    // initialize pileup values\n    
Diff* d = chr->diff;\n    int32_t cov = 0;      // current pileup value\n    uint8_t frac = 0;     // current pileup value (fraction part)\n    float val = factor * updateVal(d->cov[0], d->frac[0],\n      &cov, &frac);\n    float net = MAX(val, lambda);\n\n    // save pileup values along the chrom\n    uint32_t pos = 0;     // position in pileup arrays\n    uint32_t j;\n    for (j = 1; j < chr->len; j++) {\n\n      // update pileup value\n      if (d->cov[j] || d->frac[j])\n        val = factor * updateVal(d->cov[j], d->frac[j],\n          &cov, &frac);\n\n      // determine if interval should be saved\n      if (j == bedPos || (save && net != MAX(val, lambda))) {\n        chr->ctrl->end[pos] = j;\n        chr->ctrl->cov[pos] = save ? net : SKIP;\n        pos++;\n      }\n      net = MAX(val, lambda);\n\n      // update 'save' status (from BED intervals)\n      if (j == bedPos) {\n        save = ! save;\n        bedIdx++;\n        bedPos = bedIdx < chr->bedLen ?\n          chr->bed[bedIdx] : chr->len + 1;\n      }\n\n    }\n\n    // save final interval\n    chr->ctrl->end[pos] = j;\n    chr->ctrl->cov[pos] = save ? 
net : SKIP;\n\n    // update array length\n    if (pos >= chr->ctrlLen) {\n      char msg[MAX_ALNS];\n      sprintf(msg, \"%s (%s)\", errMsg[ERRARRC], chr->name);\n      exit(error(msg, ERRISSUE));\n    }\n    chr->ctrlLen = pos + 1;\n\n    // check for val error (should end at 0)\n    val = updateVal(d->cov[j], d->frac[j], &cov, &frac);\n    if (val) {\n      char msg[MAX_ALNS];\n      sprintf(msg, \"Control pileup for ref %s finishes at %f (not 0.0)\",\n        chr->name, val);\n      exit(error(msg, ERRISSUE));\n    }\n  }\n\n}\n\n/* double savePileupExpt()\n * Save pileup values for an experimental sample from\n *   'diff' arrays.\n *   Return total length of all fragments (weighted).\n */\ndouble savePileupExpt(Chrom* chrom, int chromLen) {\n\n  // create pileup for each chrom\n  double fragLen = 0.0;  // weighted fragment length\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (chr->skip || ! chr->save)\n      continue;\n\n    // if no read coverage, save constant pileup of 0\n    if (chr->diff == NULL) {\n      saveConst(chr->expt, &chr->exptLen, &chr->exptMem,\n        chr->len, 0.0f);\n      continue;\n    }\n\n    // determine number of pileup intervals\n    int bedIdx = 0;     // index into chr->bed array\n    uint32_t bedPos = bedIdx < chr->bedLen ?\n      chr->bed[bedIdx] : chr->len + 1;  // position of BED interval\n    bool save = true;\n    if (bedPos == 0) {\n      // first BED interval starts at 0 (should be skipped)\n      save = false;\n      bedIdx++;\n      bedPos = bedIdx < chr->bedLen ?\n        chr->bed[bedIdx] : chr->len + 1;\n    }\n    Diff* d = chr->diff;\n    uint32_t num = 1;   // number of intervals\n    for (uint32_t j = 1; j < chr->len; j++)\n      if (j == bedPos) {\n        num++;\n        save = ! 
save;\n        bedIdx++;\n        bedPos = bedIdx < chr->bedLen ?\n          chr->bed[bedIdx] : chr->len + 1;\n      } else if (save && (d->cov[j] || d->frac[j]))\n        num++;\n\n    // expand pileup arrays (if necessary)\n    if (num > chr->exptMem) {\n      chr->expt->end = (uint32_t*) memrealloc(chr->expt->end,\n        num * sizeof(uint32_t));\n      chr->expt->cov = (float*) memrealloc(chr->expt->cov,\n        num * sizeof(float));\n      chr->exptMem = num;\n    }\n    chr->exptLen = num;\n\n    // reset BED interval values\n    bedIdx = 0; // index into chr->bed array\n    bedPos = bedIdx < chr->bedLen ?\n      chr->bed[bedIdx] : chr->len + 1;\n    save = true;\n    if (bedPos == 0) {\n      save = false;\n      bedIdx++;\n      bedPos = bedIdx < chr->bedLen ?\n        chr->bed[bedIdx] : chr->len + 1;\n    }\n\n    // initialize pileup values\n    int32_t cov = 0;      // current pileup value\n    uint8_t frac = 0;     // current pileup value (fraction part)\n    float val = updateVal(d->cov[0], d->frac[0], &cov, &frac);\n\n    // save pileup values along the chrom\n    uint32_t start = 0;   // beginning coordinate of interval\n    uint32_t pos = 0;     // position in pileup arrays\n    uint32_t j;\n    for (j = 1; j < chr->len; j++) {\n\n      if (j == bedPos || (save && (d->cov[j] || d->frac[j]))) {\n        // save end of interval and pileup value\n        chr->expt->end[pos] = j;\n        if (save) {\n          chr->expt->cov[pos] = val;\n          fragLen += (j - start) * val; // frag. length weighted by val\n        } else\n          chr->expt->cov[pos] = 0.0f;\n        pos++;\n        start = j;\n      }\n\n      // update pileup value\n      if (d->cov[j] || d->frac[j])\n        val = updateVal(d->cov[j], d->frac[j], &cov, &frac);\n\n      // update 'save' status (from BED intervals)\n      if (j == bedPos) {\n        save = ! 
save;\n        bedIdx++;\n        bedPos = bedIdx < chr->bedLen ?\n          chr->bed[bedIdx] : chr->len + 1;\n      }\n\n    }\n\n    // save final interval\n    chr->expt->end[pos] = j;\n    if (save) {\n      chr->expt->cov[pos] = val;\n      fragLen += (j - start) * val;\n    } else\n      chr->expt->cov[pos] = 0.0f;\n\n    // verify array length\n    if (pos + 1 != chr->exptLen) {\n      char msg[MAX_ALNS];\n      sprintf(msg, \"%s (%s)\", errMsg[ERRARR], chr->name);\n      exit(error(msg, ERRISSUE));\n    }\n\n    // check for val error (should end at 0)\n    val = updateVal(d->cov[j], d->frac[j], &cov, &frac);\n    if (val) {\n      char msg[MAX_ALNS];\n      sprintf(msg, \"Experimental pileup for ref %s finishes at %f (not 0.0)\",\n        chr->name, val);\n      exit(error(msg, ERRISSUE));\n    }\n  }\n\n  if (fragLen == 0.0)\n    exit(error(\"\", ERREXPT));\n  return fragLen;\n}\n\n/*** Fractional alignment accounting ***/\n\n/* void addFrac()\n * Add a fractional count (1/count) to *frac,\n *   which has this 8-bit encoding:\n *       000     00      000\n *     tenths  sixths  eighths\n *   Carry over is applied to *cov.\n *\n * This function (as well as subFrac(), below) is\n *   written at the bit level to optimize efficiency,\n *   and hence is nearly impossible to debug/maintain.\n *   Apologies in advance to those attempting to do so.\n */\nvoid addFrac(int16_t* cov, uint8_t* frac, uint8_t count) {\n  switch (count) {\n    case 8:\n      if ((*frac & 0x7) == 0x7) {\n        (*cov)++;\n        *frac &= 0xF8;\n      } else\n        (*frac)++;\n      break;\n    case 4:\n      if ((*frac & 0x6) == 0x6) {\n        (*cov)++;\n        *frac &= 0xF9;\n      } else\n        *frac += 0x2;\n      break;\n    case 2:\n      if (*frac & 0x4) {\n        (*cov)++;\n        *frac &= 0xFB;\n      } else\n        *frac |= 0x4;\n      break;\n    case 6:\n      if (*frac & 0x10) {\n        if (*frac & 0x4) {\n          (*cov)++;\n          *frac &= 0xEB;\n        } 
else {\n          *frac |= 0x4;\n          *frac &= 0xEF;\n        }\n      } else\n        *frac += 0x8;\n      break;\n    case 3:\n      if (*frac & 0x8) {\n        if (*frac & 0x4) {\n          (*cov)++;\n          *frac &= 0xF3;\n        } else {\n          *frac |= 0x4;\n          *frac &= 0xF7;\n        }\n      } else if (*frac & 0x10) {\n        if (*frac & 0x4) {\n          (*cov)++;\n          *frac &= 0xEB;\n        } else {\n          *frac |= 0x4;\n          *frac &= 0xEF;\n        }\n        *frac |= 0x8;\n      } else\n        *frac |= 0x10;\n      break;\n    case 10:\n      if (*frac & 0x80) {\n        if (*frac & 0x4) {\n          (*cov)++;\n          *frac &= 0x7B;\n        } else {\n          *frac |= 0x4;\n          *frac &= 0x7F;\n        }\n      } else\n        *frac += 0x20;\n      break;\n    case 5:\n      if (*frac & 0x80) {\n        if (*frac & 0x4) {\n          (*cov)++;\n          *frac &= 0x7B;\n        } else {\n          *frac |= 0x4;\n          *frac &= 0x7F;\n        }\n        *frac += 0x20;\n      } else if ((*frac & 0x60) == 0x60) {\n        if (*frac & 0x4) {\n          (*cov)++;\n          *frac &= 0x9B;\n        } else {\n          *frac |= 0x4;\n          *frac &= 0x9F;\n        }\n      } else\n        *frac += 0x40;\n      break;\n    default: ;\n      char msg[MAX_ALNS];\n      sprintf(msg, \"%s (%d)\", errMsg[ERRALNS], count);\n      exit(error(msg, ERRISSUE));\n  }\n}\n\n/* void subFrac()\n * Subtract a fractional count to *frac.\n *   See addFrac() above for a description of\n *   the 8-bit encoding.\n */\nvoid subFrac(int16_t* cov, uint8_t* frac, uint8_t count) {\n  switch (count) {\n    case 8:\n      if (*frac & 0x7)\n        (*frac)--;\n      else {\n        (*cov)--;\n        *frac |= 0x7;\n      }\n      break;\n    case 4:\n      if (*frac & 0x6)\n        *frac -= 0x2;\n      else {\n        (*cov)--;\n        *frac |= 0x6;\n      }\n      break;\n    case 2:\n      if (*frac & 0x4)\n        *frac &= 0xFB;\n  
    else {\n        (*cov)--;\n        *frac |= 0x4;\n      }\n      break;\n    case 6:\n      if (*frac & 0x18)\n        *frac -= 0x8;\n      else if (*frac & 0x4) {\n        *frac |= 0x10;\n        *frac &= 0xFB;\n      } else {\n        (*cov)--;\n        *frac |= 0x14;\n      }\n      break;\n    case 3:\n      if (*frac & 0x10)\n        *frac &= 0xEF;\n      else if (*frac & 0x4) {\n        *frac &= 0xFB;\n        *frac += 0x8;\n      } else {\n        (*cov)--;\n        *frac += 0xC;\n      }\n      break;\n    case 10:\n      if (*frac & 0xE0)\n        *frac -= 0x20;\n      else if (*frac & 0x4) {\n        *frac &= 0xFB;\n        *frac |= 0x80;\n      } else {\n        (*cov)--;\n        *frac |= 0x84;\n      }\n      break;\n    case 5:\n      if (*frac & 0xC0)\n        *frac -= 0x40;\n      else if (*frac & 0x4) {\n        *frac &= 0xFB;\n        *frac += 0x60;\n      } else {\n        (*cov)--;\n        *frac |= 0x4;\n        *frac += 0x60;\n      }\n      break;\n    default: ;\n      char msg[MAX_ALNS];\n      sprintf(msg, \"%s (%d)\", errMsg[ERRALNS], count);\n      exit(error(msg, ERRISSUE));\n  }\n}\n\n/*** Convert alignments to intervals ***/\n\n/* void printBED()\n * Print a BED interval for a read/fragment.\n *   Append the aln count, 'C'ontrol/'E'xperimental,\n *   and sample number to the read name (4th column).\n */\nvoid printBED(File bed, bool gzOut, char* chr,\n    int64_t start, int64_t end, char* qname,\n    uint8_t count, bool ctrl, int sample) {\n  if (gzOut)\n    gzprintf(bed.gzf, \"%s\\t%ld\\t%ld\\t%s_%d_%c_%d\\n\",\n      chr, start, end, qname, count, ctrl ? 'C' : 'E',\n      sample);\n  else\n    fprintf(bed.f, \"%s\\t%ld\\t%ld\\t%s_%d_%c_%d\\n\",\n      chr, start, end, qname, count, ctrl ? 'C' : 'E',\n      sample);\n}\n\n/* uint32_t saveInterval()\n * Check the validity of the start/end coordinates\n *   of a read/fragment. 
Save the ends to the 'diff'\n *   arrays of the given Chrom*.\n *   Return the fragment length.\n */\nuint32_t saveInterval(Chrom* c, int64_t start, int64_t end,\n    char* qname, uint8_t count, File bed, bool bedOpt,\n    bool gzOut, bool ctrl, int sample, uint64_t* errCount,\n    bool verbose) {\n\n  // check validity of positions\n  if (start < 0) {\n    if (verbose) {\n      if (*errCount < MAX_ALNS)\n        fprintf(stderr, \"Warning! Read %s prevented from extending below 0 on %s\\n\",\n          qname, c->name);\n      (*errCount)++;\n    }\n    start = 0;\n  }\n  if (start >= c->len) {\n    char msg[MAX_ALNS];\n    sprintf(msg, \"Read %s, ref. %s\", qname, c->name);\n    exit(error(msg, ERRPOS));\n  }\n  if (end > c->len) {\n    if (verbose) {\n      if (*errCount < MAX_ALNS)\n        fprintf(stderr, \"Warning! Read %s prevented from extending past %d on %s\\n\",\n          qname, c->len, c->name);\n      (*errCount)++;\n    }\n    end = c->len;\n  }\n\n  // create 'diff' arrays if necessary\n  if (c->diff == NULL) {\n    c->diff = (Diff*) memalloc(sizeof(Diff));\n    c->diff->frac = (uint8_t*) memalloc((1 + c->len) * sizeof(uint8_t));\n    c->diff->cov = (int16_t*) memalloc((1 + c->len) * sizeof(int16_t));\n    for (int i = 0; i < 1 + c->len; i++) {\n      c->diff->frac[i] = 0;\n      c->diff->cov[i] = 0;\n    }\n  }\n\n  // check for overflow/underflow (c->diff->cov is int16_t)\n  if (c->diff->cov[start] == INT16_MAX) {\n    if (verbose) {\n      fprintf(stderr, \"Warning! Read %s, alignment at (%s, %ld-%ld)\",\n        qname, c->name, start, end);\n      fprintf(stderr, \" skipped due to overflow\\n\");\n    }\n    return 0;\n  }\n  if (c->diff->cov[end] == INT16_MIN) {\n    if (verbose) {\n      fprintf(stderr, \"Warning! 
Read %s, alignment at (%s, %ld-%ld)\",\n        qname, c->name, start, end);\n      fprintf(stderr, \" skipped due to underflow\\n\");\n    }\n    return 0;\n  }\n\n  // add counts to diff array(s)\n  if (count == 1) {\n    c->diff->cov[start]++;\n    c->diff->cov[end]--;\n  } else {\n    // add fractional count\n    addFrac(&c->diff->cov[start], &c->diff->frac[start], count);\n    subFrac(&c->diff->cov[end], &c->diff->frac[end], count);\n  }\n\n  // print BED interval\n  if (bedOpt)\n    printBED(bed, gzOut, c->name, start, end, qname,\n      count, ctrl, sample);\n\n  return end - start;\n}\n\n/* int calcAvgLen()\n * Calculate the average fragment length.\n *   Return 0 if no fragments.\n */\nint calcAvgLen(double totalLen, uint64_t pairedPr,\n    bool verbose) {\n  if (! pairedPr) {\n    if (verbose) {\n      fprintf(stderr, \"Warning! No paired alignments to calculate avg frag \");\n      fprintf(stderr, \"length --\\n  Printing unpaired alignments \\\"as is\\\"\\n\");\n    }\n    return 0;\n  }\n  return (int) (totalLen / pairedPr + 0.5);\n}\n\n/* void processAvgExt()\n * Save complete intervals for unpaired alignments\n *   with \"extend to average length\" option, after\n *   calculating average length from paired alns.\n */\nvoid processAvgExt(Aln** unpair, int unpairIdx,\n    int unpairLen, double totalLen, uint64_t pairedPr,\n    File bed, bool bedOpt, bool gzOut, bool ctrl,\n    int sample, uint64_t* errCount, bool verbose) {\n\n  // determine average fragment length\n  int avgLen = calcAvgLen(totalLen, pairedPr, verbose);\n\n  // process each alignment\n  for (int i = 0; i <= unpairIdx; i++) {\n\n    int end = (i == unpairIdx ? unpairLen : MAX_SIZE);\n    for (int j = 0; j < end; j++) {\n\n      Aln* a = unpair[i] + j;\n      if (! 
avgLen)\n        saveInterval(a->chrom, a->pos[0], a->pos[1], a->name,\n          a->count, bed, bedOpt, gzOut, ctrl, sample,\n          errCount, verbose);\n      else if (a->strand)\n        saveInterval(a->chrom, a->pos[0], a->pos[0] + avgLen,\n          a->name, a->count, bed, bedOpt, gzOut, ctrl,\n          sample, errCount, verbose);\n      else\n        saveInterval(a->chrom, (signed) (a->pos[1] - avgLen),\n          a->pos[1], a->name, a->count, bed, bedOpt, gzOut,\n          ctrl, sample, errCount, verbose);\n\n      // free memory\n      free(a->name);\n    }\n\n  }\n}\n\n/* void saveAvgExt()\n * Save info for an unpaired alignment to an array,\n *   for \"extend to average length\" option. Alignments\n *   will be processed later by processAvgExt().\n */\nvoid saveAvgExt(char* qname, Aln* b, uint8_t count,\n    Aln*** unpair, int* unpairIdx, int* unpairLen,\n    int* unpairMem) {\n\n  // alloc memory if necessary\n  if (*unpairLen == 0 && *unpairIdx == *unpairMem) {\n    *unpair = (Aln**) memrealloc(*unpair,\n      (*unpairMem + 1) * sizeof(Aln*));\n    (*unpair)[*unpairMem] = (Aln*) memalloc(MAX_SIZE\n      * sizeof(Aln));\n    (*unpairMem)++;\n  }\n\n  // copy alignment info\n  Aln* a = (*unpair)[*unpairIdx] + *unpairLen;\n  a->name = (char*) memalloc(1 + strlen(qname));\n  strcpy(a->name, qname);\n  a->chrom = b->chrom;\n  a->strand = b->strand;\n  a->pos[0] = b->pos[0];\n  a->pos[1] = b->pos[1];\n  a->count = count;\n\n  (*unpairLen)++;\n  if (*unpairLen == MAX_SIZE) {\n    *unpairLen = 0;\n    (*unpairIdx)++;\n  }\n}\n\n/* void saveUnpair()\n * Control processing of unpaired alignments\n *   (either keeping them as is, or extending\n *   to a given length).\n */\nvoid saveUnpair(char* qname, Aln* a, uint8_t count,\n    bool extendOpt, int extend, bool atacOpt,\n    int atacLen5, int atacLen3, bool atacAdj,\n    File bed, bool bedOpt, bool gzOut, bool ctrl,\n    int sample, uint64_t* errCount, bool verbose) {\n  if (extendOpt) {\n    if (a->strand)\n 
     saveInterval(a->chrom, a->pos[0], a->pos[0] + extend,\n        qname, count, bed, bedOpt, gzOut, ctrl, sample,\n        errCount, verbose);\n    else\n      saveInterval(a->chrom, (signed) (a->pos[1] - extend),\n        a->pos[1], qname, count, bed, bedOpt, gzOut, ctrl,\n        sample, errCount, verbose);\n  } else if (atacOpt) {\n    if (a->strand) {\n      if (atacAdj)\n        a->pos[0] += ATACADJF;\n      saveInterval(a->chrom, (signed) (a->pos[0] - atacLen5),\n        a->pos[0] + atacLen3, qname, count, bed, bedOpt,\n        gzOut, ctrl, sample, errCount, verbose);\n    } else {\n      if (atacAdj)\n        a->pos[1] += ATACADJR;\n      saveInterval(a->chrom, (signed) (a->pos[1] - atacLen3),\n        a->pos[1] + atacLen5, qname, count, bed, bedOpt,\n        gzOut, ctrl, sample, errCount, verbose);\n    }\n  } else\n    saveInterval(a->chrom, a->pos[0], a->pos[1], qname,\n      count, bed, bedOpt, gzOut, ctrl, sample, errCount,\n      verbose);\n}\n\n/* uint32_t saveFragAtac()\n * In ATAC-seq mode, save intervals for each end of a full\n *   fragment. 
If they overlap, just save one big interval.\n *   Return total length.\n */\nuint32_t saveFragAtac(Chrom* c, uint32_t start,\n    uint32_t end, int atacLen5, int atacLen3, bool atacAdj,\n    char* qname, uint8_t count, File bed, bool bedOpt,\n    bool gzOut, bool ctrl, int sample, uint64_t* errCount,\n    bool verbose) {\n  if (atacAdj) {\n    start += ATACADJF;\n    end += ATACADJR;\n  }\n  if (start + atacLen3 >= (signed) (end - atacLen3))\n    // expanded intervals overlap: just save one\n    return saveInterval(c, (signed) (start - atacLen5),\n      end + atacLen5, qname, count, bed, bedOpt, gzOut,\n      ctrl, sample, errCount, verbose);\n  // save two intervals\n  return saveInterval(c, (signed) (start - atacLen5),\n      start + atacLen3, qname, count, bed, bedOpt,\n      gzOut, ctrl, sample, errCount, verbose)\n    + saveInterval(c, (signed) (end - atacLen3),\n      end + atacLen5, qname, count, bed, bedOpt,\n      gzOut, ctrl, sample, errCount, verbose);\n}\n\n/* uint32_t saveFragment()\n * Save full fragment for a proper pair. Return length.\n */\nuint32_t saveFragment(char* qname, Aln* a, uint8_t count,\n    bool atacOpt, int atacLen5, int atacLen3, bool atacAdj,\n    File bed, bool bedOpt, bool gzOut, bool ctrl,\n    int sample, uint64_t* errCount, bool verbose) {\n  // ensure start < end\n  uint32_t start, end;\n  if (a->pos[0] > a->pos[1]) {\n    start = a->pos[1];\n    end = a->pos[0];\n  } else {\n    start = a->pos[0];\n    end = a->pos[1];\n  }\n  if (atacOpt)\n    return saveFragAtac(a->chrom, start, end, atacLen5,\n      atacLen3, atacAdj, qname, count, bed, bedOpt,\n      gzOut, ctrl, sample, errCount, verbose);\n  return saveInterval(a->chrom, start, end, qname,\n    count, bed, bedOpt, gzOut, ctrl, sample, errCount,\n    verbose);\n}\n\n/*** Save alignments for evaluation of PCR duplicates ***/\n\n/* Read* createRead()\n * Expand read arrays if necessary and update\n *   indexes. 
Return pointer to next Read*.\n */\nRead* createRead(Read*** read, int* readIdx,\n    int* readLen, int* readMem) {\n  // alloc memory if necessary\n  if (*readLen == 0 && *readIdx == *readMem) {\n    // check if max. number of reads exceeded\n    if ((*readMem + 1) * MAX_SIZE > UINT32_MAX) {\n      char msg[MAX_ALNS];\n      sprintf(msg, \"Exceeded max. number of reads (%u)\",\n        UINT32_MAX);\n      exit(error(msg, ERRISSUE));\n    }\n\n    *read = (Read**) memrealloc(*read,\n      (*readMem + 1) * sizeof(Read*));\n    (*read)[*readMem] = (Read*) memalloc(MAX_SIZE\n      * sizeof(Read));\n    (*readMem)++;\n  }\n  Read* r = (*read)[*readIdx] + *readLen;\n\n  // update indexes\n  (*readLen)++;\n  if (*readLen == MAX_SIZE) {\n    *readLen = 0;\n    (*readIdx)++;\n  }\n  return r;\n}\n\n/* void copyAlns()\n * Copy alignment info for a set of singleton\n *   alignments.\n */\nvoid copyAlns(Aln* aln, int alnLen, float score,\n    float asDiff, bool first, Aln** dest,\n    uint8_t* destLen) {\n  // adjust AS tolerance for secondary alns\n  if (score != NOSCORE)\n    score -= asDiff;\n\n  // determine number of valid single alignments\n  uint8_t count = 0;\n  for (int i = 0; i < alnLen; i++) {\n    Aln* a = aln + i;\n    if (! a->paired && a->first == first\n        && a->score >= score)\n      count++;\n  }\n  *dest = (Aln*) memalloc(count * sizeof(Aln));\n  *destLen = count;\n\n  // copy alignment info for valid alignments\n  uint8_t j = 0;  // index into r->aln\n  for (int i = 0; i < alnLen; i++) {\n    Aln* a = aln + i;\n    if (! 
a->paired && a->first == first\n        && a->score >= score) {\n      Aln* b = *dest + j;\n      b->paired = a->paired;\n      b->first = a->first;\n      b->strand = a->strand;\n      b->score = a->score;\n      b->chrom = a->chrom;\n      b->pos[0] = a->pos[0];\n      b->pos[1] = a->pos[1];\n      j++;\n    }\n  }\n\n}\n\n/* void saveAlnsSingle()\n * Save a set of singleton alignments.\n */\nvoid saveAlnsSingle(char* qname, Aln* aln, int alnLen,\n    float score, float asDiff, bool first, Read* r,\n    uint16_t qual) {\n  // populate Read* struct\n  r->name = (char*) memalloc(1 + strlen(qname));\n  strcpy(r->name, qname);\n  r->qual = qual;\n  r->first = first;\n  r->score = score;\n\n  // copy alignments to struct\n  copyAlns(aln, alnLen, score, asDiff, first, &r->aln,\n    &r->alnLen);\n}\n\n/* void saveAlnsDiscord()\n * Save a set of discordant alignments.\n */\nvoid saveAlnsDiscord(char* qname, Aln* aln, int alnLen,\n    float scoreR1, float scoreR2, float asDiff,\n    Read* r, uint16_t qualR1, uint16_t qualR2) {\n  // save R1 alignments\n  saveAlnsSingle(qname, aln, alnLen, scoreR1, asDiff,\n    true, r, qualR1);\n  // save R2 alignments\n  copyAlns(aln, alnLen, scoreR2, asDiff, false, &r->alnR2,\n    &r->alnLenR2);\n  r->qual = MIN(qualR1 + qualR2, UINT16_MAX);\n  r->scoreR2 = scoreR2;\n}\n\n/* void saveAlnsPair()\n * Save a set of properly paired alignments.\n */\nvoid saveAlnsPair(char* qname, Aln* aln, int alnLen,\n    float score, float asDiff, Read* r, uint16_t qualR1,\n    uint16_t qualR2) {\n  // populate Read* struct\n  r->name = (char*) memalloc(1 + strlen(qname));\n  strcpy(r->name, qname);\n  r->qual = MIN(qualR1 + qualR2, UINT16_MAX);\n  r->score = score;\n\n  // adjust AS tolerance for secondary alns\n  if (score != NOSCORE)\n    score -= asDiff;\n\n  // determine number of valid paired alignments\n  uint8_t count = 0;\n  for (int i = 0; i < alnLen; i++) {\n    Aln* a = aln + i;\n    if (a->paired && a->full && a->score >= score)\n      
count++;\n  }\n  r->aln = (Aln*) memalloc(count * sizeof(Aln));\n  r->alnLen = count;\n\n  // copy alignment info for valid alignments\n  uint8_t j = 0;  // index into r->aln\n  for (int i = 0; i < alnLen; i++) {\n    Aln* a = aln + i;\n    if (a->paired && a->full && a->score >= score) {\n      Aln* b = r->aln + j;\n      b->paired = a->paired;\n      b->full = a->full;\n      b->score = a->score;\n      b->chrom = a->chrom;\n      // ensure positions are ordered\n      if (a->pos[0] > a->pos[1]) {\n        b->pos[0] = a->pos[1];\n        b->pos[1] = a->pos[0];\n      } else {\n        b->pos[0] = a->pos[0];\n        b->pos[1] = a->pos[1];\n      }\n      j++;\n    }\n  }\n\n}\n\n/* void saveAlns()\n * Control saving of alignments. Use createRead()\n *   to make a Read*, and pass that to saveAlnsPair(),\n *   saveAlnsDiscord(), or saveAlnsSingle().\n */\nvoid saveAlns(char* qname, Aln* aln, int alnLen, bool pair,\n    bool singleOpt, bool singleR1, bool singleR2,\n    float scorePr, float scoreR1, float scoreR2,\n    float asDiff, Read*** readPr, int* readIdxPr,\n    int* readLenPr, int* readMemPr, Read*** readDc,\n    int* readIdxDc, int* readLenDc, int* readMemDc,\n    Read*** readSn, int* readIdxSn, int* readLenSn,\n    int* readMemSn, uint16_t qualR1, uint16_t qualR2) {\n  if (pair) {\n    // properly paired alignment(s)\n    Read* r = createRead(readPr, readIdxPr, readLenPr,\n      readMemPr);\n    saveAlnsPair(qname, aln, alnLen, scorePr, asDiff, r,\n      qualR1, qualR2);\n  } else if (singleOpt) {\n    if (singleR1 && singleR2) {\n      // both reads aligned (discordant)\n      Read* r = createRead(readDc, readIdxDc, readLenDc,\n        readMemDc);\n      saveAlnsDiscord(qname, aln, alnLen, scoreR1,\n        scoreR2, asDiff, r, qualR1, qualR2);\n    } else if (singleR1) {\n      // only R1 read aligned\n      Read* r = createRead(readSn, readIdxSn, readLenSn,\n        readMemSn);\n      saveAlnsSingle(qname, aln, alnLen, scoreR1, asDiff,\n        true, r, 
qualR1);\n    } else if (singleR2) {\n      // only R2 read aligned\n      Read* r = createRead(readSn, readIdxSn, readLenSn,\n        readMemSn);\n      saveAlnsSingle(qname, aln, alnLen, scoreR2, asDiff,\n        false, r, qualR2);\n    }\n  }\n}\n\n/*** Process a set of alignments ***/\n\n/* void subsampleSingle()\n * For sets of unpaired alns at an invalid count (>10, 9, 7),\n *   find a more stringent score.\n */\nvoid subsampleSingle(Aln* aln, int alnLen, bool first,\n    uint8_t* count, float* score) {\n\n  // insertion sort of aln scores\n  float arr[*count];  // sorted array of aln scores\n  int k = 0;          // count of valid alns analyzed\n  for (int i = 0; i < alnLen; i++) {\n    Aln* a = aln + i;\n    if (! a->paired && a->first == first\n        && a->score >= *score && a->chrom->save\n        && ! a->chrom->skip) {\n\n      // insert a->score into array\n      int j;\n      for (j = 0; j < k; j++)\n        if (a->score > arr[j])\n          break;\n      for (int m = k; m > j; m--)\n        arr[m] = arr[m - 1];\n      arr[j] = a->score;\n      k++;\n\n    }\n  }\n\n  *count = *count > 10 ? 10 : *count - 1; // update count of alns to keep\n  *score = arr[*count - 1];               // save new min. 
score\n}\n\n/* int processSingle()\n * Process a set of unpaired alignments, weighted to\n *   1/n (number of valid alignments).\n *   Return 1 if valid alignments found, else 0.\n */\nint processSingle(char* qname, Aln* aln, int alnLen,\n    bool extendOpt, int extend, bool avgExtOpt,\n    Aln*** unpair, int* unpairIdx, int* unpairLen,\n    int* unpairMem, float score, float asDiff,\n    bool first, bool atacOpt, int atacLen5,\n    int atacLen3, bool atacAdj, File bed, bool bedOpt,\n    bool gzOut, bool ctrl, int sample, uint64_t* errCount,\n    bool verbose) {\n\n  // adjust AS tolerance for secondary alns\n  if (score != NOSCORE)\n    score -= asDiff;\n\n  // determine number of valid unpaired alignments\n  //   (within score threshold and not to skipped chrom)\n  uint8_t count = 0;\n  for (int i = 0; i < alnLen; i++) {\n    Aln* a = aln + i;\n    if (! a->paired && a->first == first\n        && a->score >= score && a->chrom->save\n        && ! a->chrom->skip)\n      count++;\n  }\n  if (! count)\n    return 0;\n\n  // adjust score so that num alns is OK (1/2/3/4/5/6/8/10)\n  if (count > 10 || count == 7 || count == 9)\n    subsampleSingle(aln, alnLen, first, &count, &score);\n\n  // find unpaired alns to save\n  uint8_t saved = 0;\n  for (int i = 0; i < alnLen; i++) {\n    Aln* a = aln + i;\n    if (! a->paired && a->first == first\n        && a->score >= score && a->chrom->save\n        && ! 
a->chrom->skip) {\n\n      if (avgExtOpt)\n        // for average-extension option, save alignment\n        //   for later processing by processAvgExt()\n        saveAvgExt(qname, a, count, unpair,\n          unpairIdx, unpairLen, unpairMem);\n\n      else\n        // for other options, save interval\n        saveUnpair(qname, a, count, extendOpt, extend,\n          atacOpt, atacLen5, atacLen3, atacAdj, bed,\n          bedOpt, gzOut, ctrl, sample, errCount, verbose);\n\n      if (++saved == count)\n        break;  // in case of AS ties\n    }\n  }\n\n  // check for error saving alignments\n  if (saved != count) {\n    char msg[MAX_ALNS];\n    sprintf(msg, \"Saved %d alignments for read %s; should have been %d\",\n      saved, qname, count);\n    exit(error(msg, ERRISSUE));\n  }\n\n  return 1;\n}\n\n/* void subsamplePair()\n * For sets of paired alns at an invalid count (>10, 9, 7),\n *   find a more stringent score.\n */\nvoid subsamplePair(Aln* aln, int alnLen, uint8_t* count,\n    float* score) {\n\n  // insertion sort of aln scores\n  float arr[*count];  // sorted array of aln scores\n  int k = 0;          // count of valid alns analyzed\n  for (int i = 0; i < alnLen; i++) {\n    Aln* a = aln + i;\n    if (a->paired && a->full && a->score >= *score\n        && a->chrom->save && ! a->chrom->skip) {\n\n      // insert a->score into array\n      int j;\n      for (j = 0; j < k; j++)\n        if (a->score > arr[j])\n          break;\n      for (int m = k; m > j; m--)\n        arr[m] = arr[m - 1];\n      arr[j] = a->score;\n      k++;\n\n    }\n  }\n\n  *count = *count > 10 ? 10 : *count - 1; // update count of alns to keep\n  *score = arr[*count - 1];               // save new min. 
score\n}\n\n/* int processPair()\n * Process a set of paired alignments, weighted to\n *   1/n (number of valid alignments).\n *   Return 1 if valid alignments found, else 0.\n */\nint processPair(char* qname, Aln* aln, int alnLen,\n    double* totalLen, float score, float asDiff,\n    bool atacOpt, int atacLen5, int atacLen3, bool atacAdj,\n    File bed, bool bedOpt, bool gzOut, bool ctrl,\n    int sample, uint64_t* errCount, bool verbose) {\n\n  // adjust AS tolerance for secondary alns\n  if (score != NOSCORE)\n    score -= asDiff;\n\n  // determine number of valid paired alignments\n  //   (within score threshold and not to skipped chrom)\n  uint8_t count = 0;\n  for (int i = 0; i < alnLen; i++) {\n    Aln* a = aln + i;\n    if (a->paired && a->full && a->score >= score\n        && a->chrom->save && ! a->chrom->skip)\n      count++;\n  }\n  if (! count)\n    return 0;\n\n  // adjust score so that num alns is OK (1/2/3/4/5/6/8/10)\n  if (count > 10 || count == 7 || count == 9)\n    subsamplePair(aln, alnLen, &count, &score);\n\n  // find full fragments to save\n  uint64_t fragLen = 0;     // local sum of fragment lengths\n  uint8_t saved = 0;\n  for (int i = 0; i < alnLen; i++) {\n    Aln* a = aln + i;\n    if (a->paired && a->full && a->score >= score\n        && a->chrom->save && ! 
a->chrom->skip) {\n\n      // save full fragment\n      fragLen += saveFragment(qname, a, count,\n        atacOpt, atacLen5, atacLen3, atacAdj, bed, bedOpt,\n        gzOut, ctrl, sample, errCount, verbose);\n\n      if (++saved == count)\n        break;  // in case of AS ties\n    }\n  }\n\n  // check for error saving alignments\n  if (saved != count) {\n    char msg[MAX_ALNS];\n    sprintf(msg, \"Saved %d alignments for read %s; should have been %d\",\n      saved, qname, count);\n    exit(error(msg, ERRISSUE));\n  }\n\n  *totalLen += (double) fragLen / count;\n  return 1;\n}\n\n/* void processAlns()\n * Control processing of a set of alignments.\n *   Determine if the set has complete paired\n *   alignments or not, and what the best alignment\n *   scores are.\n * If duplicate removal is required, save alignments via\n *   saveAlns(), else pass results to processPair()\n *   or processSingle() directly.\n */\nvoid processAlns(char* qname, Aln* aln, int alnLen,\n    double* totalLen, uint64_t* pairedPr,\n    uint64_t* singlePr, uint64_t* orphan, bool singleOpt,\n    bool extendOpt, int extend, bool avgExtOpt,\n    Aln*** unpair, int* unpairIdx, int* unpairLen,\n    int* unpairMem, float asDiff, bool atacOpt,\n    int atacLen5, int atacLen3, bool atacAdj, File bed,\n    bool bedOpt, bool gzOut, bool ctrl, int sample,\n    bool dupsOpt, Read*** readPr, int* readIdxPr,\n    int* readLenPr, int* readMemPr, Read*** readDc,\n    int* readIdxDc, int* readLenDc, int* readMemDc,\n    Read*** readSn, int* readIdxSn, int* readLenSn,\n    int* readMemSn, uint16_t qualR1, uint16_t qualR2,\n    uint64_t* errCount, bool verbose) {\n\n  // determine if paired alns are valid, and best score\n  float scorePr = NOSCORE, scoreR1 = NOSCORE,\n    scoreR2 = NOSCORE;\n  bool pair = false, singleR1 = false, singleR2 = false;\n  for (int i = 0; i < alnLen; i++) {\n    Aln* a = aln + i;\n    if (a->paired) {\n      if (a->full) {\n        // valid paired aln\n        if (! 
pair || scorePr < a->score)\n          scorePr = a->score; // best score so far\n        pair = true;\n      } else\n        (*orphan)++;  // incomplete paired alignment\n    } else if (singleOpt && ! pair) {\n      // update best scores of unpaired alns\n      if (a->first && scoreR1 <= a->score) {\n        scoreR1 = a->score;\n        singleR1 = true;\n      } else if (! a->first && scoreR2 <= a->score) {\n        scoreR2 = a->score;\n        singleR2 = true;\n      }\n    }\n  }\n\n  if (dupsOpt)\n    // save alignments for later evaluation of duplicates\n    saveAlns(qname, aln, alnLen, pair, singleOpt, singleR1,\n      singleR2, scorePr, scoreR1, scoreR2, asDiff,\n      readPr, readIdxPr, readLenPr, readMemPr,\n      readDc, readIdxDc, readLenDc, readMemDc,\n      readSn, readIdxSn, readLenSn, readMemSn,\n      qualR1, qualR2);\n\n  else {\n    // process alns directly\n    if (pair)\n      // process paired alignments\n      *pairedPr += processPair(qname, aln, alnLen,\n        totalLen, scorePr, asDiff, atacOpt, atacLen5,\n        atacLen3, atacAdj, bed, bedOpt, gzOut, ctrl,\n        sample, errCount, verbose);\n    else if (singleOpt) {\n      // process unpaired alignments (separately for R1, R2)\n      if (singleR1)\n        *singlePr += processSingle(qname, aln, alnLen,\n          extendOpt, extend, avgExtOpt,\n          unpair, unpairIdx, unpairLen, unpairMem,\n          scoreR1, asDiff, true,\n          atacOpt, atacLen5, atacLen3, atacAdj,\n          bed, bedOpt, gzOut, ctrl, sample, errCount,\n          verbose);\n      if (singleR2)\n        *singlePr += processSingle(qname, aln, alnLen,\n          extendOpt, extend, avgExtOpt,\n          unpair, unpairIdx, unpairLen, unpairMem,\n          scoreR2, asDiff, false,\n          atacOpt, atacLen5, atacLen3, atacAdj,\n          bed, bedOpt, gzOut, ctrl, sample, errCount,\n          verbose);\n    }\n  }\n}\n\n/*** Efficient read sorting ***/\n\n/* uint32_t johnPartition()\n * Partition the reads of a 
section of the qual/order\n *   arrays based on one value into 3 bins (greater,\n *   equal, and lower).\n */\nuint32_t johnPartition(uint16_t* qual, uint32_t* order,\n    uint32_t low, uint32_t high, uint16_t* qual0,\n    uint16_t* qual1, uint16_t* qual2, uint32_t* order0,\n    uint32_t* order1, uint32_t* order2,\n    uint32_t* idxHigh) {\n\n  // separate qual values into temp arrays --\n  //   qual0 for higher values, qual1 equal, qual2 lower\n  uint16_t pivot = qual[high - 1];  // pivot value: last elt\n  uint32_t idx0 = 0, idx1 = 0, idx2 = 0; // indexes into temp arrays\n  for (uint32_t j = low; j < high; j++) {\n    if (qual[j] > pivot) {\n      qual0[idx0] = qual[j];\n      order0[idx0] = order[j];\n      idx0++;\n    } else if (qual[j] == pivot) {\n      qual1[idx1] = qual[j];\n      order1[idx1] = order[j];\n      idx1++;\n    } else {\n      qual2[idx2] = qual[j];\n      order2[idx2] = order[j];\n      idx2++;\n    }\n  }\n\n  if (! idx0 && ! idx2)\n    return 0; // all equal values, no need to shuffle\n\n  // recombine temp arrays back into qual/order\n  uint32_t i = 0;\n  bool val0 = (bool) idx0, val1 = true;\n  for (uint32_t j = low; j < high; j++) {\n    if (val0) {\n      qual[j] = qual0[i];\n      order[j] = order0[i];\n      if (++i == idx0) {\n        val0 = false;\n        i = 0;\n      }\n    } else if (val1) {\n      qual[j] = qual1[i];\n      order[j] = order1[i];\n      if (++i == idx1) {\n        val1 = false;\n        i = 0;\n      }\n    } else {\n      qual[j] = qual2[i];\n      order[j] = order2[i];\n      i++;\n    }\n  }\n\n  // return low/high indexes\n  *idxHigh = low + idx0 + idx1;\n  return low + idx0;\n}\n\n/* void johnSort()\n * Variation of quicksort that is stable and optimized\n *   for repeated qual values.\n */\nvoid johnSort(uint16_t* qual, uint32_t* order,\n    uint32_t low, uint32_t high, uint16_t* qual0,\n    uint16_t* qual1, uint16_t* qual2, uint32_t* order0,\n    uint32_t* order1, uint32_t* order2) {\n  if (low + 1 < 
high) {\n    uint32_t idx1 = 0; // new low index for upper recursive call\n    uint32_t idx = johnPartition(qual, order, low, high,\n      qual0, qual1, qual2, order0, order1, order2, &idx1);\n    if (idx)\n      // lower recursive call\n      johnSort(qual, order, low, idx, qual0, qual1,\n        qual2, order0, order1, order2);\n    if (idx1)\n      // upper recursive call\n      johnSort(qual, order, idx1, high, qual0, qual1,\n        qual2, order0, order1, order2);\n  }\n}\n\n/* void sortReads()\n * Determine sort order of a Read** array, based on\n *   sums of quality scores.\n * Sorting performed by johnSort(), an efficient,\n *   *stable* quicksort that is optimized for these\n *   arrays in which values are frequently repeated.\n */\nvoid sortReads(Read** arr, uint32_t count, uint32_t* order,\n    uint16_t* qual, uint32_t* order0, uint32_t* order1,\n    uint32_t* order2, uint16_t* qual0, uint16_t* qual1,\n    uint16_t* qual2) {\n\n  // initialize order and qual arrays\n  for (uint32_t i = 0; i < count; i++) {\n    order[i] = i;\n    qual[i] = (arr[i / MAX_SIZE] + i % MAX_SIZE)->qual;\n  }\n\n  // initialize johnSort()\n  johnSort(qual, order, 0, count, qual0, qual1, qual2,\n    order0, order1, order2);\n}\n\n/*** PCR duplicate removal ***/\n\n/* uint32_t calcHashSize()\n * Calculate a size for a hashtable:\n *   a power of 2 >= the given count * 4/3.\n * The given count is the number of reads, which\n *   will not necessarily be the same as the number\n *   of alignments. 
Some reads will have multiple\n *   alignments, but they will be counterbalanced\n *   by PCR duplicates (which will be discarded).\n */\nuint32_t calcHashSize(uint32_t count) {\n  uint32_t size = 2;\n  uint32_t val = MIN(UINT32_MAX, 4 * count / 3);\n  for (int i = 1; i < 31; i++) {\n    if (size >= val)\n      return size;\n    size *= 2;\n  }\n  // max size 2^31\n  return size;\n}\n\n/* uint32_t jenkins_hash_aln()\n * Adapted from http://www.burtleburtle.net/bob/hash/doobs.html\n *   Modified to take an alignment as input, hashed\n *   differently depending on alignment type.\n *   Returns index into hashtable.\n */\nuint32_t jenkins_hash_aln(Chrom* chrom, Chrom* chrom1,\n    uint32_t pos, uint32_t pos1, bool strand, bool strand1,\n    int alignType, uint32_t hashSize) {\n  uint32_t hash = 0;\n  unsigned char* p;\n\n  // hash Chrom*\n  int end = (alignType == DISCORD ? 2 : 1);\n  for (int j = 0; j < end; j++) {\n    p = (unsigned char*) (j ? chrom1 : chrom);\n    for (int i = 0; i < sizeof(Chrom*); i++) {\n      hash += p[i];\n      hash += hash << 10;\n      hash ^= hash >> 6;\n    }\n  }\n\n  // hash pos\n  end = (alignType == SINGLE ? 1 : 2);\n  for (int j = 0; j < end; j++) {\n    p = (unsigned char*) (j ? &pos1 : &pos);\n    for (int i = 0; i < sizeof(uint32_t); i++) {\n      hash += p[i];\n      hash += hash << 10;\n      hash ^= hash >> 6;\n    }\n  }\n\n  // hash strand\n  end = alignType;  // convenient!\n  for (int j = 0; j < end; j++) {\n    p = (unsigned char*) (j ? 
&strand1 : &strand);\n    for (int i = 0; i < sizeof(bool); i++) {\n      hash += p[i];\n      hash += hash << 10;\n      hash ^= hash >> 6;\n    }\n  }\n\n  hash += hash << 3;\n  hash ^= hash >> 11;\n  hash += hash << 15;\n  return hash % hashSize;\n}\n\n/* void addToHash()\n * Add a new node (HashAln) to hashtable, inserted\n *   at given idx (already calculated).\n */\nvoid addToHash(Chrom* chrom, Chrom* chrom1, uint32_t pos,\n    uint32_t pos1, bool strand, bool strand1,\n    HashAln** table, uint32_t idx, char* name) {\n  HashAln* h = (HashAln*) memalloc(sizeof(HashAln));\n  h->chrom = chrom;\n  h->chrom1 = chrom1;\n  h->pos = pos;\n  h->pos1 = pos1;\n  h->strand = strand;\n  h->strand1 = strand1;\n  h->name = NULL;\n  if (name) {\n    h->name = (char*) memalloc(1 + strlen(name));\n    strcpy(h->name, name);\n  }\n  h->next = table[idx];\n  table[idx] = h;\n}\n\n/* HashAln* checkHash()\n * Check hashtable for a match to an alignment.\n *   The alignment attributes are defined by the\n *   alignment type.\n */\nHashAln* checkHash(Chrom* chrom, Chrom* chrom1,\n    uint32_t pos, uint32_t pos1, bool strand, bool strand1,\n    int alignType, HashAln** table, uint32_t idx) {\n  for (HashAln* h = table[idx]; h != NULL; h = h->next) {\n    // check for match, based on alignment type\n    switch (alignType) {\n      case PAIRED:\n        if (chrom == h->chrom && pos == h->pos\n            && pos1 == h->pos1)\n          return h;\n        break;\n      case SINGLE:\n        if (chrom == h->chrom && pos == h->pos\n            && strand == h->strand)\n          return h;\n        break;\n      case DISCORD:\n        if (chrom == h->chrom && chrom1 == h->chrom1\n            && pos == h->pos && pos1 == h->pos1\n            && strand == h->strand && strand1 == h->strand1)\n          return h;\n        break;\n      default:\n        exit(error(\"\", ERRALNTYPE));\n    }\n  }\n  return NULL;\n}\n\n/* void checkAndAdd()\n * Check a singleton alignment for a match to the\n *   
hashtable; if there is none, add it to the table.\n */\nvoid checkAndAdd(HashAln** tableSn, uint32_t hashSizeSn,\n    Chrom* chrom, uint32_t pos, bool strand, char* name) {\n  uint32_t idx = jenkins_hash_aln(chrom, NULL, pos, 0,\n    strand, 0, SINGLE, hashSizeSn);\n  if (! checkHash(chrom, NULL, pos, 0, strand, 0, SINGLE,\n      tableSn, idx))\n    addToHash(chrom, NULL, pos, 0, strand, 0, tableSn,\n      idx, name);\n}\n\n/* void logDup()\n * Print log information about a read identified as a\n *   duplicate, based on alignment type.\n */\nvoid logDup(File dups, bool gzOut, char* name,\n    Chrom* chrom, Chrom* chrom1, uint32_t pos,\n    uint32_t pos1, bool strand, bool strand1,\n    char* match, int alignType) {\n  switch (alignType) {\n    case PAIRED:\n      if (gzOut)\n        gzprintf(dups.gzf, \"%s\\t%s:%d-%d\\t%s\\tpaired\\n\",\n          name, chrom->name, pos, pos1, match);\n      else\n        fprintf(dups.f, \"%s\\t%s:%d-%d\\t%s\\tpaired\\n\",\n          name, chrom->name, pos, pos1, match);\n      break;\n    case SINGLE:\n      if (gzOut)\n        gzprintf(dups.gzf, \"%s\\t%s:%d,%c\\t%s\\tsingle\\n\",\n          name, chrom->name, pos, strand ? '+' : '-',\n          match);\n      else\n        fprintf(dups.f, \"%s\\t%s:%d,%c\\t%s\\tsingle\\n\",\n          name, chrom->name, pos, strand ? '+' : '-',\n          match);\n      break;\n    case DISCORD:\n      if (gzOut)\n        gzprintf(dups.gzf, \"%s\\t%s:%d,%c;%s:%d,%c\\t%s\\tdiscordant\\n\",\n          name, chrom->name, pos, strand ? '+' : '-',\n          chrom1->name, pos1, strand1 ? '+' : '-', match);\n      else\n        fprintf(dups.f, \"%s\\t%s:%d,%c;%s:%d,%c\\t%s\\tdiscordant\\n\",\n          name, chrom->name, pos, strand ? '+' : '-',\n          chrom1->name, pos1, strand1 ? 
'+' : '-', match);\n      break;\n    default:\n      exit(error(\"\", ERRALNTYPE));\n  }\n}\n\n/* void addHashPr()\n * Add all paired alignments for a Read* to the hashtable.\n */\nvoid addHashPr(Read* r, HashAln** table,\n    uint32_t hashSize, bool dupsVerb,\n    HashAln** tableSn, uint32_t hashSizeSn) {\n  for (int k = 0; k < r->alnLen; k++) {\n    Aln* a = r->aln + k;\n    uint32_t idx = jenkins_hash_aln(a->chrom, NULL,\n      a->pos[0], a->pos[1], 0, 0, PAIRED, hashSize);\n    addToHash(a->chrom, NULL, a->pos[0], a->pos[1],\n      0, 0, table, idx, dupsVerb ? r->name : NULL);\n\n    // also add both alignments as singletons to hashtable\n    if (tableSn != NULL && hashSizeSn) {\n      checkAndAdd(tableSn, hashSizeSn, a->chrom, a->pos[0],\n        true, dupsVerb ? r->name : NULL);\n      checkAndAdd(tableSn, hashSizeSn, a->chrom, a->pos[1],\n        false, dupsVerb ? r->name : NULL);\n    }\n  }\n}\n\n/* bool checkHashPr()\n * Check a set of paired alignments for a match in the\n *   hashtable. 
Return true if *any* match.\n */\nbool checkHashPr(Read* r, HashAln** table,\n    uint32_t hashSize, File dups, bool dupsVerb,\n    bool gzOut) {\n  for (int k = 0; k < r->alnLen; k++) {\n    Aln* a = r->aln + k;\n    uint32_t idx = jenkins_hash_aln(a->chrom, NULL,\n      a->pos[0], a->pos[1], 0, 0, PAIRED, hashSize);\n    HashAln* h = checkHash(a->chrom, NULL, a->pos[0],\n      a->pos[1], 0, 0, PAIRED, table, idx);\n    if (h) {\n      if (dupsVerb)\n        logDup(dups, gzOut, r->name, a->chrom, NULL,\n          a->pos[0], a->pos[1], 0, 0, h->name, PAIRED);\n      return true;\n    }\n  }\n  return false;\n}\n\n/* void findDupsPr()\n * Find PCR duplicates among properly paired\n *   alignment sets.\n */\nvoid findDupsPr(Read** readPr, int readIdxPr,\n    int readLenPr, uint64_t* countPr, uint64_t* dupsPr,\n    uint64_t* pairedPr, double* totalLen, float asDiff,\n    bool atacOpt, int atacLen5, int atacLen3, bool atacAdj,\n    File bed, bool bedOpt, bool gzOut, bool ctrl,\n    int sample, HashAln*** table, uint32_t* tableMem,\n    HashAln** tableSn, uint32_t hashSizeSn, File dups,\n    bool dupsVerb, uint32_t* order, uint32_t* order0,\n    uint32_t* order1, uint32_t* order2, uint16_t* qual,\n    uint16_t* qual0, uint16_t* qual1, uint16_t* qual2,\n    uint64_t* errCount, bool verbose) {\n\n  // initialize hashtable\n  uint32_t count = readIdxPr * MAX_SIZE + readLenPr;\n  uint32_t hashSize = calcHashSize(count);\n  if (hashSize > *tableMem) {\n    *table = (HashAln**) memrealloc(*table,\n      hashSize * sizeof(HashAln*));\n    *tableMem = hashSize;\n  } else\n    hashSize = *tableMem;\n  for (uint32_t i = 0; i < hashSize; i++)\n    (*table)[i] = NULL;\n\n  // get sort order of reads by qual score sum\n  sortReads(readPr, count, order, qual, order0, order1,\n    order2, qual0, qual1, qual2);\n\n  // loop through paired reads\n  for (uint32_t i = 0; i < count; i++) {\n    Read* r = readPr[order[i] / MAX_SIZE]\n      + order[i] % MAX_SIZE;\n\n    // check hashtable 
for matches\n    if (checkHashPr(r, *table, hashSize, dups,\n        dupsVerb, gzOut))\n      (*dupsPr)++;\n    else {\n      // add alignments to hashtable(s)\n      addHashPr(r, *table, hashSize, dupsVerb,\n        tableSn, hashSizeSn);\n      // process alignments too\n      *pairedPr += processPair(r->name, r->aln,\n        r->alnLen, totalLen, r->score, asDiff,\n        atacOpt, atacLen5, atacLen3, atacAdj,\n        bed, bedOpt, gzOut, ctrl, sample, errCount,\n        verbose);\n    }\n\n    // free Read\n    free(r->name);\n    free(r->aln);\n\n    (*countPr)++;\n  }\n\n  // free nodes of hashtable\n  for (uint32_t i = 0; i < hashSize; i++) {\n    HashAln* tmp;\n    HashAln* h = (*table)[i];\n    while (h != NULL) {\n      free(h->name);\n      tmp = h->next;\n      free(h);\n      h = tmp;\n    }\n  }\n}\n\n/* void addHashDc()\n * Add all combinations of discordant alignments for a\n *   Read* to the hashtable.\n */\nvoid addHashDc(Read* r, HashAln** table,\n    uint32_t hashSize, bool dupsVerb,\n    HashAln** tableSn, uint32_t hashSizeSn) {\n  for (int k = 0; k < r->alnLen; k++) {\n    Aln* a = r->aln + k;\n    uint32_t pos = (a->strand ? a->pos[0] : a->pos[1]);\n    for (int j = 0; j < r->alnLenR2; j++) {\n      Aln* b = r->alnR2 + j;\n      uint32_t pos1 = (b->strand ? b->pos[0] : b->pos[1]);\n      uint32_t idx = jenkins_hash_aln(a->chrom, b->chrom,\n        pos, pos1, a->strand, b->strand, DISCORD, hashSize);\n      addToHash(a->chrom, b->chrom, pos, pos1, a->strand,\n        b->strand, table, idx, dupsVerb ? r->name : NULL);\n\n      // also add both alignments as singletons to hashtable\n      if (tableSn != NULL && hashSizeSn) {\n        if (! j)\n          checkAndAdd(tableSn, hashSizeSn, a->chrom, pos,\n            a->strand, dupsVerb ? r->name : NULL);\n        if (! k)\n          checkAndAdd(tableSn, hashSizeSn, b->chrom, pos1,\n            b->strand, dupsVerb ? 
r->name : NULL);\n      }\n    }\n  }\n}\n\n/* bool checkHashDc()\n * Check each combination of discordant alignments for a\n *   match in the hashtable. Return true if *any* match.\n */\nbool checkHashDc(Read* r, HashAln** table,\n    uint32_t hashSize, File dups, bool dupsVerb,\n    bool gzOut) {\n  for (int k = 0; k < r->alnLen; k++) {\n    Aln* a = r->aln + k;\n    uint32_t pos = (a->strand ? a->pos[0] : a->pos[1]);\n    for (int j = 0; j < r->alnLenR2; j++) {\n      Aln* b = r->alnR2 + j;\n      uint32_t pos1 = (b->strand ? b->pos[0] : b->pos[1]);\n      uint32_t idx = jenkins_hash_aln(a->chrom, b->chrom,\n        pos, pos1, a->strand, b->strand, DISCORD, hashSize);\n      HashAln* h = checkHash(a->chrom, b->chrom, pos, pos1,\n        a->strand, b->strand, DISCORD, table, idx);\n      if (h) {\n        if (dupsVerb)\n          logDup(dups, gzOut, r->name, a->chrom, b->chrom,\n            pos, pos1, a->strand, b->strand, h->name,\n            DISCORD);\n        return true;\n      }\n      // check the reverse also\n      idx = jenkins_hash_aln(b->chrom, a->chrom,\n        pos1, pos, b->strand, a->strand, DISCORD, hashSize);\n      h = checkHash(b->chrom, a->chrom, pos1, pos,\n        b->strand, a->strand, DISCORD, table, idx);\n      if (h) {\n        if (dupsVerb)\n          logDup(dups, gzOut, r->name, b->chrom, a->chrom,\n            pos1, pos, b->strand, a->strand, h->name,\n            DISCORD);\n        return true;\n      }\n    }\n  }\n  return false;\n}\n\n/* void findDupsDc()\n * Find PCR duplicates among discordant\n *   alignment sets.\n */\nvoid findDupsDc(Read** readDc, int readIdxDc,\n    int readLenDc, uint64_t* countDc, uint64_t* dupsDc,\n    uint64_t* singlePr, bool extendOpt, int extend,\n    float asDiff, bool atacOpt, int atacLen5, int atacLen3,\n    bool atacAdj, File bed, bool bedOpt, bool gzOut,\n    bool ctrl, int sample, HashAln*** table,\n    uint32_t* tableMem, HashAln** tableSn,\n    uint32_t hashSizeSn, File dups, bool dupsVerb,\n 
   uint32_t* order, uint32_t* order0, uint32_t* order1,\n    uint32_t* order2, uint16_t* qual, uint16_t* qual0,\n    uint16_t* qual1, uint16_t* qual2, uint64_t* errCount,\n    bool verbose) {\n\n  // initialize hashtable\n  uint32_t count = readIdxDc * MAX_SIZE + readLenDc;\n  uint32_t hashSize = calcHashSize(count);\n  if (hashSize > *tableMem) {\n    *table = (HashAln**) memrealloc(*table,\n      hashSize * sizeof(HashAln*));\n    *tableMem = hashSize;\n  } else\n    hashSize = *tableMem;\n  for (uint32_t i = 0; i < hashSize; i++)\n    (*table)[i] = NULL;\n\n  // sort reads by qual score sum\n  sortReads(readDc, count, order, qual, order0, order1,\n    order2, qual0, qual1, qual2);\n\n  // loop through discordant reads\n  for (uint32_t i = 0; i < count; i++) {\n    Read* r = readDc[order[i] / MAX_SIZE]\n      + order[i] % MAX_SIZE;\n\n    // check hashtable for matches\n    if (checkHashDc(r, *table, hashSize, dups,\n        dupsVerb, gzOut))\n      (*dupsDc)++;\n    else {\n      // add alignments to hashtable(s)\n      addHashDc(r, *table, hashSize, dupsVerb,\n        tableSn, hashSizeSn);\n      // process alignments too (as singletons)\n      *singlePr += processSingle(r->name, r->aln,\n        r->alnLen, extendOpt, extend, false,\n        NULL, NULL, NULL, NULL,\n        r->score, asDiff, true,\n        atacOpt, atacLen5, atacLen3, atacAdj,\n        bed, bedOpt, gzOut, ctrl, sample, errCount,\n        verbose);\n      *singlePr += processSingle(r->name, r->alnR2,\n        r->alnLenR2, extendOpt, extend, false,\n        NULL, NULL, NULL, NULL,\n        r->scoreR2, asDiff, false,\n        atacOpt, atacLen5, atacLen3, atacAdj,\n        bed, bedOpt, gzOut, ctrl, sample, errCount,\n        verbose);\n    }\n\n    // free Read\n    free(r->name);\n    free(r->aln);\n    free(r->alnR2);\n\n    (*countDc)++;\n  }\n\n  // free nodes of hashtable\n  for (uint32_t i = 0; i < hashSize; i++) {\n    HashAln* tmp;\n    HashAln* h = (*table)[i];\n    while (h != NULL) {\n   
   free(h->name);\n      tmp = h->next;\n      free(h);\n      h = tmp;\n    }\n  }\n}\n\n/* void addHashSn()\n * Add all singleton alignments for a Read* to the\n *   hashtable.\n */\nvoid addHashSn(Read* r, HashAln** table,\n    uint32_t hashSize, bool dupsVerb) {\n  for (int k = 0; k < r->alnLen; k++) {\n    Aln* a = r->aln + k;\n    uint32_t pos = (a->strand ? a->pos[0] : a->pos[1]);\n    uint32_t idx = jenkins_hash_aln(a->chrom, NULL,\n      pos, 0, a->strand, 0, SINGLE, hashSize);\n    addToHash(a->chrom, NULL, pos, 0, a->strand, 0,\n      table, idx, dupsVerb ? r->name : NULL);\n  }\n}\n\n/* bool checkHashSn()\n * Check a set of singleton alignments for a match in the\n *   hashtable. Return true if *any* match.\n */\nbool checkHashSn(Read* r, HashAln** table,\n    uint32_t hashSize, File dups, bool dupsVerb,\n    bool gzOut) {\n  for (int k = 0; k < r->alnLen; k++) {\n    Aln* a = r->aln + k;\n    uint32_t pos = (a->strand ? a->pos[0] : a->pos[1]);\n    uint32_t idx = jenkins_hash_aln(a->chrom, NULL,\n      pos, 0, a->strand, 0, SINGLE, hashSize);\n    HashAln* h = checkHash(a->chrom, NULL, pos, 0,\n      a->strand, 0, SINGLE, table, idx);\n    if (h) {\n      if (dupsVerb)\n        logDup(dups, gzOut, r->name, a->chrom, NULL,\n          pos, 0, a->strand, 0, h->name, SINGLE);\n      return true;\n    }\n  }\n  return false;\n}\n\n/* void findDupsSn()\n * Find PCR duplicates among singleton alignment sets.\n *   Note: the hashtable was already created and\n *   populated by paired and discordant alns.\n */\nvoid findDupsSn(Read** readSn, int readIdxSn,\n    int readLenSn, uint64_t* countSn, uint64_t* dupsSn,\n    uint64_t* singlePr, bool extendOpt, int extend,\n    float asDiff, bool atacOpt, int atacLen5, int atacLen3,\n    bool atacAdj, File bed, bool bedOpt, bool gzOut,\n    bool ctrl, int sample, HashAln** table,\n    uint32_t hashSize, File dups, bool dupsVerb,\n    uint32_t* order, uint32_t* order0, uint32_t* order1,\n    uint32_t* order2, uint16_t* 
qual, uint16_t* qual0,\n    uint16_t* qual1, uint16_t* qual2, uint64_t* errCount,\n    bool verbose) {\n\n  // sort reads by qual score sum\n  uint32_t count = readIdxSn * MAX_SIZE + readLenSn;\n  sortReads(readSn, count, order, qual, order0, order1,\n    order2, qual0, qual1, qual2);\n\n  // loop through singleton reads\n  for (uint32_t i = 0; i < count; i++) {\n    Read* r = readSn[order[i] / MAX_SIZE]\n      + order[i] % MAX_SIZE;\n\n    // check hashtable for matches\n    if (checkHashSn(r, table, hashSize, dups,\n        dupsVerb, gzOut))\n      (*dupsSn)++;\n    else {\n      // add alignments to hashtable\n      addHashSn(r, table, hashSize, dupsVerb);\n      // process alignments too\n      *singlePr += processSingle(r->name, r->aln,\n        r->alnLen, extendOpt, extend, false,\n        NULL, NULL, NULL, NULL,\n        r->score, asDiff, r->first,\n        atacOpt, atacLen5, atacLen3, atacAdj,\n        bed, bedOpt, gzOut, ctrl, sample, errCount,\n        verbose);\n    }\n\n    // free Read\n    free(r->name);\n    free(r->aln);\n\n    (*countSn)++;\n  }\n\n  // free nodes of hashtable\n  for (uint32_t i = 0; i < hashSize; i++) {\n    HashAln* tmp;\n    HashAln* h = table[i];\n    while (h != NULL) {\n      free(h->name);\n      tmp = h->next;\n      free(h);\n      h = tmp;\n    }\n  }\n}\n\n/* void findDups()\n * Control elucidation of PCR duplicates. 
Process reads\n *   that are determined not to be duplicates.\n */\nvoid findDups(Read** readPr, int readIdxPr, int readLenPr,\n    Read** readDc, int readIdxDc, int readLenDc,\n    Read** readSn, int readIdxSn, int readLenSn,\n    HashAln*** table, uint32_t* tableMem,\n    HashAln*** tableSn, uint32_t* tableSnMem,\n    uint32_t** order, uint32_t** order0, uint32_t** order1,\n    uint32_t** order2, uint16_t** qual, uint16_t** qual0,\n    uint16_t** qual1, uint16_t** qual2, uint32_t* arrMem,\n    uint64_t* countPr, uint64_t* dupsPr, uint64_t* countDc,\n    uint64_t* dupsDc, uint64_t* countSn, uint64_t* dupsSn,\n    bool singleOpt, uint64_t* pairedPr, uint64_t* singlePr,\n    double* totalLen, bool extendOpt, int extend,\n    bool avgExtOpt, float asDiff, bool atacOpt,\n    int atacLen5, int atacLen3, bool atacAdj, File bed,\n    bool bedOpt, bool gzOut, File dups, bool dupsVerb,\n    bool ctrl, int sample, uint64_t* errCount, bool verbose) {\n\n  // initialize hash table for singletons\n  uint32_t hashSizeSn = 0;\n  if (singleOpt && (readIdxSn || readLenSn)) {\n    // calculate hashtable size\n    uint32_t sum = MIN(UINT32_MAX,\n      2 * (readIdxPr * MAX_SIZE + readLenPr)\n      + 2 * (readIdxDc * MAX_SIZE + readLenDc)\n      + readIdxSn * MAX_SIZE + readLenSn);\n    hashSizeSn = calcHashSize(sum);\n\n    // create/expand hashtable\n    if (hashSizeSn > *tableSnMem) {\n      *tableSn = (HashAln**) memrealloc(*tableSn,\n        hashSizeSn * sizeof(HashAln*));\n      *tableSnMem = hashSizeSn;\n    } else\n      hashSizeSn = *tableSnMem;\n    for (uint32_t i = 0; i < hashSizeSn; i++)\n      (*tableSn)[i] = NULL;\n  }\n\n  // initialize arrays for efficient sorting\n  uint32_t count = MIN(UINT32_MAX,\n    MAX(readIdxPr * MAX_SIZE + readLenPr,\n    MAX(readIdxDc * MAX_SIZE + readLenDc,\n    readIdxSn * MAX_SIZE + readLenSn)));\n  if (count > *arrMem) {\n    *order = (uint32_t*) memrealloc(*order, count * sizeof(uint32_t));\n    *order0 = (uint32_t*) memrealloc(*order0, 
count * sizeof(uint32_t));\n    *order1 = (uint32_t*) memrealloc(*order1, count * sizeof(uint32_t));\n    *order2 = (uint32_t*) memrealloc(*order2, count * sizeof(uint32_t));\n    *qual = (uint16_t*) memrealloc(*qual, count * sizeof(uint16_t));\n    *qual0 = (uint16_t*) memrealloc(*qual0, count * sizeof(uint16_t));\n    *qual1 = (uint16_t*) memrealloc(*qual1, count * sizeof(uint16_t));\n    *qual2 = (uint16_t*) memrealloc(*qual2, count * sizeof(uint16_t));\n    *arrMem = count;\n  }\n\n  // evaluate and process paired alignments\n  if (readIdxPr || readLenPr)\n    findDupsPr(readPr, readIdxPr, readLenPr, countPr,\n      dupsPr, pairedPr, totalLen, asDiff, atacOpt,\n      atacLen5, atacLen3, atacAdj, bed, bedOpt, gzOut,\n      ctrl, sample, table, tableMem, *tableSn, hashSizeSn,\n      dups, dupsVerb, *order, *order0, *order1, *order2,\n      *qual, *qual0, *qual1, *qual2, errCount, verbose);\n\n  if (singleOpt) {\n    // with avgExtOpt, calculate average fragment length\n    //   and save it as 'extend' with extendOpt=true\n    if (avgExtOpt) {\n      extend = calcAvgLen(*totalLen, *pairedPr, verbose);\n      if (extend)\n        extendOpt = true;\n    }\n\n    // evaluate and process discordant alignments\n    if (readIdxDc || readLenDc)\n      findDupsDc(readDc, readIdxDc, readLenDc, countDc,\n        dupsDc, singlePr, extendOpt, extend, asDiff,\n        atacOpt, atacLen5, atacLen3, atacAdj, bed, bedOpt,\n        gzOut, ctrl, sample, table, tableMem, *tableSn,\n        hashSizeSn, dups, dupsVerb, *order, *order0,\n        *order1, *order2, *qual, *qual0, *qual1, *qual2,\n        errCount, verbose);\n\n    // evaluate and process singleton alignments\n    if (readIdxSn || readLenSn)\n      findDupsSn(readSn, readIdxSn, readLenSn, countSn,\n        dupsSn, singlePr, extendOpt, extend, asDiff,\n        atacOpt, atacLen5, atacLen3, atacAdj, bed, bedOpt,\n        gzOut, ctrl, sample, *tableSn, hashSizeSn, dups,\n        dupsVerb, *order, *order0, *order1, *order2, 
*qual,\n        *qual0, *qual1, *qual2, errCount, verbose);\n  }\n\n}\n\n/*** Save alignment information ***/\n\n/* void updatePairedAln()\n * Complete a properly paired alignment.\n */\nvoid updatePairedAln(Aln* a, uint16_t flag,\n    uint32_t pos, int length, float score) {\n  if (flag & 0x40)\n    a->pos[0] = flag & 0x10 ? pos + length : pos;\n  else\n    a->pos[1] = flag & 0x10 ? pos + length : pos;\n  if (score == NOSCORE)\n    a->score = NOSCORE;\n  else if (a->score != NOSCORE)\n    a->score += score;\n  a->full = true;\n}\n\n/* bool savePairedAln()\n * Start a properly paired alignment. Return false\n *   if max. number has been reached.\n */\nbool savePairedAln(Aln** aln, int* alnLen,\n    uint16_t flag, Chrom* chrom, uint32_t pos,\n    int length, uint32_t pnext, float score) {\n\n  // check for excessive alignments\n  if (*alnLen == MAX_ALNS)\n    return false;\n\n  // save aln info\n  Aln* a = *aln + *alnLen;\n  a->chrom = chrom;\n  a->score = score;\n  a->primary = (bool) (!(flag & 0x100));\n  a->full = false;\n  a->paired = true;\n\n  // save positions for this aln (corrected if rev-comp),\n  //   and for its pair (pnext, uncorrected)\n  if (flag & 0x40) {\n    a->pos[0] = flag & 0x10 ? pos + length : pos;\n    a->pos[1] = pnext;\n    a->first = true;\n  } else {\n    a->pos[0] = pnext;\n    a->pos[1] = flag & 0x10 ? pos + length : pos;\n    a->first = false;\n  }\n\n  (*alnLen)++;\n  return true;\n}\n\n/* bool saveSingleAln()\n * Save the information for an unpaired alignment.\n *   Return false if max. 
number has been reached.\n */\nbool saveSingleAln(Aln** aln, int* alnLen,\n    uint16_t flag, Chrom* chrom, uint32_t pos,\n    int length, float score) {\n\n  // check for excessive alignments\n  if (*alnLen == MAX_ALNS)\n    return false;\n\n  // save aln info\n  Aln* a = *aln + *alnLen;\n  a->chrom = chrom;\n  a->score = score;\n  a->primary = (bool) (!(flag & 0x100));\n  a->paired = false;\n  a->strand = (bool) (!(flag & 0x10));\n  a->first = (bool) (flag & 0x40);\n  a->pos[0] = pos;\n  a->pos[1] = pos + length;\n  (*alnLen)++;\n  return true;\n}\n\n/* uint16_t sumQual()\n * Sum an array/string of quality scores.\n */\nuint16_t sumQual(char* qual, int len, int offset) {\n  if (qual[0] == 0xFF)  // BAM 'null' value\n    return 0;\n  int sum = 0;\n  for (int i = 0; i < len; i++)\n    sum += qual[i] - offset;\n  return sum > UINT16_MAX ? UINT16_MAX : (uint16_t) sum;\n}\n\n/* bool parseAlign()\n * Parse a SAM/BAM alignment record. Save alignment\n *   info to Aln* array. Return true unless the max.\n *   number of alignments has been reached.\n */\nbool parseAlign(Aln** aln, int* alnLen, uint16_t flag,\n    Chrom* chrom, uint32_t pos, int length, uint32_t pnext,\n    uint64_t* paired, uint64_t* single, uint64_t* secPair,\n    uint64_t* secSingle, uint64_t* skipped, bool singleOpt,\n    float score, bool dupsOpt, char* qual, int qualLen,\n    int offset, uint16_t* qualR1, uint16_t* qualR2) {\n\n  // check for linear template or missing index\n  if (flag & 0x1) {\n    if ((flag & 0xC0) == 0xC0)\n      exit(error(\"\", ERRLINEAR));\n    if (!(flag & 0xC0))\n      exit(error(\"\", ERRINDEX));\n  }\n\n  // save sum of quality scores (only if removing dups)\n  if (dupsOpt) {\n    if (flag & 0x40) {\n      if (! *qualR1 && strcmp(qual, \"*\"))\n        *qualR1 = sumQual(qual, qualLen, offset);\n    } else {\n      if (! 
*qualR2 && strcmp(qual, \"*\"))\n        *qualR2 = sumQual(qual, qualLen, offset);\n    }\n  }\n\n  // paired alignment: save alignment information\n  if ((flag & 0x3) == 0x3) {\n\n    // update counts\n    if (chrom->skip || ! chrom->save)\n      (*skipped)++;\n    else {\n      (*paired)++;\n      if (flag & 0x100)\n        (*secPair)++;\n    }\n\n    // search for matching paired alignment (already analyzed)\n    for (int i = 0; i < *alnLen; i++) {\n      Aln* a = *aln + i;\n      if ( a->paired && ! a->full && a->chrom == chrom\n          && (flag & 0x40 ? (! a->first && a->pos[0] == pos)\n            : (a->first && a->pos[1] == pos) )\n          && (flag & 0x100 ? ! a->primary : a->primary) ) {\n        // complete paired alignment\n        updatePairedAln(a, flag, pos, length, score);\n        return true;\n      }\n    }\n\n    // not found: start new paired alignment\n    return savePairedAln(aln, alnLen, flag, chrom,\n      pos, length, pnext, score);\n  }\n\n  // unpaired alignment\n  if (chrom->skip || ! chrom->save)\n    (*skipped)++;\n  else {\n    (*single)++;\n    if (flag & 0x100)\n      (*secSingle)++;\n  }\n\n  // save alignment info\n  if (singleOpt)\n    return saveSingleAln(aln, alnLen, flag, chrom,\n      pos, length, score);\n\n  return true;\n}\n\n/*** Save SAM/BAM header info ***/\n\n/* int saveChrom()\n * If chromosome (reference sequence) has not been\n *   saved yet, save it to the array. Return the index.\n */\nint saveChrom(char* name, uint32_t len, int* chromLen,\n    Chrom** chrom, int xcount, char** xchrList,\n    int xBedLen, Bed* xBed, bool ctrl, bool verbose) {\n\n  // determine if chrom has been saved already\n  for (int i = 0; i < *chromLen; i++) {\n    Chrom* c = *chrom + i;\n    if (!strcmp(c->name, name)) {\n      if (c->len != len)\n        exit(error(c->name, ERRCHRLEN));\n      if (! 
ctrl)\n        c->save = true;\n      return i;\n    }\n  }\n\n  // save to list\n  *chrom = (Chrom*) memrealloc(*chrom,\n    (*chromLen + 1) * sizeof(Chrom));\n  Chrom* c = *chrom + *chromLen;\n  c->name = (char*) memalloc(1 + strlen(name));\n  strcpy(c->name, name);\n  c->len = len;\n  c->skip = checkChrom(c->name, xcount, xchrList);\n  c->save = ! ctrl; // do not save if ref in ctrl sample only\n  c->diff = NULL;\n  c->expt = (Pileup*) memalloc(sizeof(Pileup));\n  c->expt->end = NULL;\n  c->expt->cov = NULL;\n  c->exptLen = 0;\n  c->exptMem = 0;\n  c->ctrl = (Pileup*) memalloc(sizeof(Pileup));\n  c->ctrl->end = NULL;\n  c->ctrl->cov = NULL;\n  c->ctrlLen = 0;\n  c->ctrlMem = 0;\n  c->pval = NULL;\n  c->pvalLen = NULL;\n  c->sample = 0;\n  c->qval = NULL;\n\n  // determine if there are regions to be skipped\n  c->bed = NULL;\n  c->bedLen = 0;\n  if (! c->skip)\n    saveXBed(c->name, c->len, &c->bedLen, &c->bed,\n      xBedLen, xBed, verbose);\n\n  (*chromLen)++;\n  return *chromLen - 1;\n}\n\n/* void loadChrom()\n * Save chromosome length info from a SAM header line.\n */\nvoid loadChrom(char* line, int* chromLen, Chrom** chrom,\n    int xcount, char** xchrList, int xBedLen, Bed* xBed,\n    bool ctrl, bool verbose) {\n  // parse SAM header line for chrom info\n  char* name = NULL, *len = NULL;\n  char* field = strtok(NULL, TAB);\n  while (field != NULL) {\n    if (!strncmp(field, \"SN:\", 3))\n      name = field + 3;\n    else if (!strncmp(field, \"LN:\", 3))\n      len = field + 3;\n    field = strtok(NULL, TAB);\n  }\n  if (name == NULL || len == NULL)\n    return;\n\n  // remove trailing '\\n'\n  int i;\n  for (i = 0; name[i] != '\\n' && name[i] != '\\0'; i++) ;\n  name[i] = '\\0';\n  for (i = 0; len[i] != '\\n' && len[i] != '\\0'; i++) ;\n  len[i] = '\\0';\n\n  // save chrom info to array (*chrom)\n  saveChrom(name, (uint32_t) getInt(len), chromLen, chrom,\n    xcount, xchrList, xBedLen, xBed, ctrl, verbose);\n}\n\n/* void checkHeader()\n * Check SAM header 
line for useful information:\n *   sort order or chromosome lengths.\n */\nvoid checkHeader(char* line, int* chromLen, Chrom** chrom,\n    int xcount, char** xchrList, int xBedLen, Bed* xBed,\n    bool ctrl, bool sortOpt, bool verbose) {\n\n  // load tag from SAM header line\n  char* tag = strtok(line, TAB);\n  if (tag == NULL)\n    return;\n\n  if (! strcmp(tag, \"@HD\")) {\n    // first header line: check sort order\n    char* order = NULL;\n    char* field = strtok(NULL, TAB);\n    while (field != NULL) {\n      if (!strncmp(field, \"SO:\", 3))\n        order = field + 3;\n      field = strtok(NULL, TAB);\n    }\n    if (order != NULL) {\n      // removing trailing '\\n'\n      int i;\n      for (i = 0; order[i] != '\\n' && order[i] != '\\0'; i++) ;\n      order[i] = '\\0';\n    }\n\n    // sort order must be queryname\n    if (sortOpt && (order == NULL\n        || strcmp(order, \"queryname\")))\n      exit(error(\"\", ERRSORT));\n\n  } else if (! strcmp(tag, \"@SQ\"))\n    // load chrom lengths from header line\n    loadChrom(line, chromLen, chrom, xcount, xchrList,\n      xBedLen, xBed, ctrl, verbose);\n\n}\n\n/*** SAM parsing ***/\n\n/* bool loadFields()\n * Load alignment info from a SAM record.\n *   Return false on failure.\n */\nbool loadFields(uint16_t* flag, char** rname, uint32_t* pos,\n    uint8_t* mapq, char** cigar, char** rnext, uint32_t* pnext,\n    int32_t* tlen, char** seq, char** qual, char** extra) {\n  *extra = NULL;  // reset 'extra' fields\n  int i = 2;\n  char* field = strtok(NULL, TAB);\n  while (field != NULL) {\n    switch (i) {\n      case FLAG: *flag = getInt(field); break;\n      case RNAME: *rname = field; break;\n      case POS: *pos = getInt(field) - 1; break;  // convert to 0-based\n      case MAPQ: *mapq = getInt(field); break;\n      case CIGAR: *cigar = field; break;\n      case RNEXT: *rnext = field; break;\n      case PNEXT: *pnext = getInt(field) - 1; break;  // convert to 0-based\n      case TLEN: *tlen = getInt(field); 
break;\n      case SEQ: *seq = field; break;\n      case QUAL: *qual = field; break;\n      default: return false;\n    }\n    if (++i > 11) {\n      *extra = strtok(NULL, \"\\n\");\n      break;\n    }\n    field = strtok(NULL, TAB);\n  }\n  return i > 11;\n}\n\n/* float getScore()\n * Search SAM optional fields for an alignment score.\n *   Return NOSCORE if not found.\n */\nfloat getScore(char* extra) {\n  if (extra == NULL)\n    return NOSCORE;\n  char* end;\n  char* field = strtok_r(extra, TAB, &end);\n  while (field != NULL) {\n    char* tag = strtok(field, COL);\n    if (!strcmp(tag, SCORE)) {\n      for (int i = 0; i < 2; i++) {\n        tag = strtok(NULL, COL);\n        if (tag == NULL)\n          return NOSCORE;\n        if (i)\n          return getFloat(tag);\n      }\n    }\n    field = strtok_r(NULL, TAB, &end);\n  }\n  return NOSCORE;\n}\n\n/* int parseCigar()\n * Calculate length of sequence and offset\n *   from a CIGAR string.\n */\nint parseCigar(char* cigar, int* offset) {\n  int length = 0; // length of sequence\n  int pos = 0;    // position of current op in cigar\n  char op;\n  int len = strlen(cigar);\n  for (int i = 0; i < len; i++) {\n    if (cigar[i] < 0x30 || cigar[i] > 0x39) {\n      op = cigar[i];\n      cigar[i] = '\\0';\n      int opLen = getInt(cigar + pos); // length of current op\n      switch (op) {\n        case 'M':\n        case '=':\n        case 'X':\n          length += opLen;\n          break;\n        case 'I':\n        case 'S':\n          length += opLen;\n          *offset -= opLen;\n          break;\n        case 'D':\n          *offset += opLen;\n          break;\n        case 'N':\n        case 'H':\n        case 'P':\n          break;\n        default: ;\n          char msg[4] = \"' '\";\n          msg[1] = op;\n          exit(error(msg, ERRCIGAR));\n      }\n      pos = i + 1;\n    }\n  }\n  return length;\n}\n\n/* int calcDist()\n * Return distance to 3' end of sequence\n *   (length + offset based on CIGAR [if 
avl]).\n */\nint calcDist(char* qname, char* seq, char* cigar) {\n  int length = strcmp(seq, \"*\") ? strlen(seq) : 0;\n  int offset = 0;\n  if (strcmp(cigar, \"*\")) {\n    int len = parseCigar(cigar, &offset);\n    if (! length)\n      length = len;\n    else if (length != len)\n      exit(error(qname, ERRMISM));\n  } else if (! length)\n    exit(error(qname, ERRINFO));\n  return length + offset;\n}\n\n/* uint64_t readSAM()\n * Parse the alignments in a SAM file.\n */\nuint64_t readSAM(File in, bool gz, char* line, Aln** aln,\n    char* readName, double* totalLen, uint64_t* unmapped,\n    uint64_t* paired, uint64_t* single, uint64_t* pairedPr,\n    uint64_t* singlePr, uint64_t* supp, uint64_t* skipped,\n    uint64_t* lowMapQ, int minMapQ, int xcount,\n    char** xchrList, int xBedLen, Bed* xBed,\n    uint64_t* secPair, uint64_t* secSingle,\n    uint64_t* orphan, int* chromLen, Chrom** chrom,\n    bool singleOpt, bool extendOpt, int extend,\n    bool avgExtOpt, Aln*** unpair, int* unpairMem,\n    float asDiff, bool atacOpt, int atacLen5, int atacLen3,\n    bool atacAdj, File bed, bool bedOpt, bool gzOut,\n    bool ctrl, int sample, bool dupsOpt, File dups,\n    bool dupsVerb, Read*** readPr, int* readMemPr,\n    Read*** readDc, int* readMemDc, Read*** readSn,\n    int* readMemSn, HashAln*** table, uint32_t* tableMem,\n    HashAln*** tableSn, uint32_t* tableSnMem,\n    uint32_t** order, uint32_t** order0, uint32_t** order1,\n    uint32_t** order2, uint16_t** qualA, uint16_t** qual0,\n    uint16_t** qual1, uint16_t** qual2, uint32_t* arrMem,\n    uint64_t* countPr, uint64_t* dupsPr, uint64_t* countDc,\n    uint64_t* dupsDc, uint64_t* countSn, uint64_t* dupsSn,\n    uint64_t* errCount, bool sortOpt, bool verbose) {\n\n  // SAM fields to save\n  char* qname, *rname, *cigar, *rnext, *seq, *qual, *extra;\n  uint16_t flag;\n  uint32_t pos, pnext;\n  int32_t tlen;\n  uint8_t mapq;\n\n  int alnLen = 0;     // number of alignments for this read\n  int unpairIdx = 0;  // \\ 
indexes into unpaired array(s)\n  int unpairLen = 0;  // /   (with avgExtOpt)\n  int readIdxPr = 0;  // \\ indexes into read array readPr\n  int readLenPr = 0;  // /   (with dupsOpt)\n  int readIdxDc = 0;  // \\ indexes into read array readDc\n  int readLenDc = 0;  // /   (with dupsOpt)\n  int readIdxSn = 0;  // \\ indexes into read array readSn\n  int readLenSn = 0;  // /   (with dupsOpt)\n  uint16_t qualR1 = 0, qualR2 = 0; // sums of quality scores\n  bool pastHeader = false;    // to check for misplaced header lines\n  uint64_t count = 0;\n  while (getLine(line, MAX_SIZE, in, gz) != NULL) {\n\n    if (line[0] == '@') {\n      if (pastHeader)\n        exit(error(line, ERRHEAD));\n      checkHeader(line, chromLen, chrom, xcount, xchrList,\n        xBedLen, xBed, ctrl, sortOpt, verbose);\n      continue;\n    }\n    pastHeader = true;\n\n    // parse SAM record\n    qname = strtok(line, TAB);\n    if (qname == NULL)\n      exit(error(line, ERRSAM));\n    if (! loadFields(&flag, &rname, &pos, &mapq, &cigar,\n        &rnext, &pnext, &tlen, &seq, &qual, &extra))\n      exit(error(qname, ERRSAM));\n\n    count++;\n    if (flag & 0x4) {\n      // skip unmapped\n      (*unmapped)++;\n      continue;\n    }\n    if (! strcmp(qname, \"*\") || ! strcmp(rname, \"*\")\n        || pos < 0)\n      // insufficient alignment info\n      exit(error(qname, ERRSAM));\n    if (flag & 0xE00) {\n      // skip supplementary/PCR dups/low quality\n      (*supp)++;\n      continue;\n    }\n    // find matching Chrom (reference sequence)\n    Chrom* ref = NULL;\n    for (int i = 0; i < *chromLen; i++)\n      if (! 
strcmp((*chrom + i)->name, rname)) {\n        ref = *chrom + i;\n        break;\n      }\n    if (ref == NULL)\n      // cannot find reference sequence\n      exit(error(rname, ERRCHROM));\n    if (mapq < minMapQ) {\n      // skip low MAPQ alignments\n      (*lowMapQ)++;\n      continue;\n    }\n\n    // process previous set of alns, if starting a new set\n    if (readName[0] == '\\0' || strcmp(qname, readName)) {\n      if (readName[0] != '\\0')\n        processAlns(readName, *aln, alnLen, totalLen,\n          pairedPr, singlePr, orphan, singleOpt,\n          extendOpt, extend, avgExtOpt, unpair,\n          &unpairIdx, &unpairLen, unpairMem, asDiff,\n          atacOpt, atacLen5, atacLen3, atacAdj,\n          bed, bedOpt, gzOut, ctrl, sample, dupsOpt,\n          readPr, &readIdxPr, &readLenPr, readMemPr,\n          readDc, &readIdxDc, &readLenDc, readMemDc,\n          readSn, &readIdxSn, &readLenSn, readMemSn,\n          qualR1, qualR2, errCount, verbose);\n      alnLen = 0;\n      qualR1 = qualR2 = 0;\n      strncpy(readName, qname, MAX_ALNS);\n    }\n\n    // save alignment information\n    int length = calcDist(qname, seq, cigar); // distance to 3' end\n    float score = getScore(extra);\n    if (! parseAlign(aln, &alnLen, flag, ref, pos, length,\n        pnext, paired, single, secPair, secSingle, skipped,\n        singleOpt, score, dupsOpt, qual, strlen(qual),\n        SAMQUAL, &qualR1, &qualR2) && verbose)\n      fprintf(stderr, \"Warning! 
Read %s has more than %d alignments\\n\",\n        qname, MAX_ALNS);\n    // NOTE: the following SAM fields are ignored:\n    //   rnext, tlen\n  }\n\n  // process last set of alns\n  if (readName[0] != '\\0')\n    processAlns(readName, *aln, alnLen, totalLen,\n      pairedPr, singlePr, orphan, singleOpt,\n      extendOpt, extend, avgExtOpt, unpair,\n      &unpairIdx, &unpairLen, unpairMem, asDiff,\n      atacOpt, atacLen5, atacLen3, atacAdj,\n      bed, bedOpt, gzOut, ctrl, sample, dupsOpt,\n      readPr, &readIdxPr, &readLenPr, readMemPr,\n      readDc, &readIdxDc, &readLenDc, readMemDc,\n      readSn, &readIdxSn, &readLenSn, readMemSn,\n      qualR1, qualR2, errCount, verbose);\n\n  if (dupsOpt)\n    // remove duplicates and process all alignments\n    findDups(*readPr, readIdxPr, readLenPr, *readDc,\n      readIdxDc, readLenDc, *readSn, readIdxSn, readLenSn,\n      table, tableMem, tableSn, tableSnMem, order, order0,\n      order1, order2, qualA, qual0, qual1, qual2, arrMem,\n      countPr, dupsPr, countDc, dupsDc, countSn, dupsSn,\n      singleOpt, pairedPr, singlePr, totalLen, extendOpt,\n      extend, avgExtOpt, asDiff, atacOpt, atacLen5,\n      atacLen3, atacAdj, bed, bedOpt, gzOut, dups,\n      dupsVerb, ctrl, sample, errCount, verbose);\n\n  else if (avgExtOpt)\n    // process single alignments w/ avgExtOpt\n    processAvgExt(*unpair, unpairIdx, unpairLen,\n      *totalLen, *pairedPr, bed, bedOpt, gzOut,\n      ctrl, sample, errCount, verbose);\n\n  return count;\n}\n\n/*** BAM parsing ***/\n\n/* int32_t readInt32()\n * Read an int32_t in little-endian format from a\n *   gzip-compressed file. 
On failure, return error\n *   or EOF.\n */\nint32_t readInt32(gzFile in, bool end) {\n  int32_t ans = 0;\n  for (int i = 0; i < sizeof(int32_t); i++) {\n    int m = gzgetc(in);\n    if (m == -1) {\n      if (end)\n        exit(error(\"\", ERRBAM));\n      else\n        return EOF;\n    }\n    ans = ans | ((m & 0xFF) << (i*8));\n  }\n  return ans;\n}\n\n/* int32_t loadInt32()\n * Load an int32_t in little-endian format from a\n *   char** block. Increment *block on the fly.\n */\nint32_t loadInt32(char** block) {\n  int32_t ans = 0;\n  for (int i = 0; i < sizeof(int32_t); i++) {\n    ans = ans | ((**block & 0xFF) << (i*8));\n    (*block)++;\n  }\n  return ans;\n}\n\n/* void loadBAMfields()\n * Load fields from a BAM record. See SAM spec section 4.2\n *   for details.\n */\nvoid loadBAMfields(char** block, int32_t* refID, int32_t* pos,\n    uint8_t* mapq, uint16_t* n_cigar_op, uint16_t* flag,\n    int32_t* l_seq, int32_t* next_refID, int32_t* next_pos,\n    int32_t* tlen, char** read_name, uint32_t** cigar,\n    uint8_t** seq, char** qual, char** extra) {\n  *refID = loadInt32(block);\n  *pos = loadInt32(block);\n  uint32_t bin_mq_nl = loadInt32(block);\n  int8_t l_read_name = bin_mq_nl & 0xFF;\n  *mapq = (bin_mq_nl >> 8) & 0xFF;\n  uint32_t flag_nc = loadInt32(block);\n  *n_cigar_op = flag_nc & 0xFFFF;\n  *flag = (flag_nc >> 16) & 0xFFFF;\n  *l_seq = loadInt32(block);\n  *next_refID = loadInt32(block);\n  *next_pos = loadInt32(block);\n  *tlen = loadInt32(block);\n  *read_name = *block;\n  (*block) += l_read_name;\n  *cigar = (uint32_t*) *block;\n  (*block) += *n_cigar_op * sizeof(uint32_t);\n  *seq = (uint8_t*) *block;\n  (*block) += (*l_seq + 1)/2 * sizeof(uint8_t);\n  *qual = *block;\n  (*block) += *l_seq;\n  *extra = *block;\n}\n\n/* int calcDistBAM()\n * Return distance to 3' end of sequence\n *   (length + offset based on BAM CIGAR).\n */\nint calcDistBAM(int32_t l_seq, uint16_t n_cigar_op,\n    uint32_t* cigar) {\n  int length = l_seq;\n  for (int i = 0; i < 
n_cigar_op; i++) {\n    uint8_t op = cigar[i] & 0xF;\n    uint32_t op_len = cigar[i] >> 4;\n    if (op == 1 || op == 4)  // 'I' or 'S'\n      length -= op_len;\n    else if (op == 2)        // 'D'\n      length += op_len;\n  }\n  return length;\n}\n\n/* int arrayLen()\n * Calculate length of BAM auxiliary field of\n *   array type ('B').\n */\nint arrayLen(char* extra) {\n  int size = 0;\n  switch (extra[0]) {\n    case 'c': size = sizeof(int8_t); break;\n    case 'C': size = sizeof(uint8_t); break;\n    case 's': size = sizeof(int16_t); break;\n    case 'S': size = sizeof(uint16_t); break;\n    case 'i': size = sizeof(int32_t); break;\n    case 'I': size = sizeof(uint32_t); break;\n    case 'f': size = sizeof(float); break;\n    default: ;\n      char msg[4] = \"' '\";\n      msg[1] = extra[0];\n      exit(error(msg, ERRTYPE));\n  }\n  extra++;\n  int32_t count = loadInt32(&extra);\n  return 1 + sizeof(int32_t) + size * count;\n}\n\n/* int64_t loadInt()\n * Load an arbitrarily-sized int in little-endian format\n *   from a char* block. Return an int64_t that should\n *   be cast by the caller.\n */\nint64_t loadInt(char* block, size_t size) {\n  int64_t ans = 0;\n  for (int i = 0; i < size; i++)\n    ans = ans | ((block[i] & 0xFF) << (i*8));\n  return ans;\n}\n\n/* float getBAMscore()\n * Search BAM auxiliary fields for an alignment score.\n *   Return NOSCORE if not found.\n */\nfloat getBAMscore(char* extra, int len) {\n  if (extra == NULL)\n    return NOSCORE;\n\n  // check auxiliary field\n  char tag[3], val;\n  tag[2] = '\\0';\n  int i = 0;\n  while (i < len - 4) {\n\n    // load tag and value\n    for (int j = 0; j < 3; j++) {\n      if (j < 2)\n        tag[j] = extra[i];\n      else\n        val = extra[i];\n      i++;\n    }\n\n    if (! 
strcmp(tag, SCORE)) {\n\n      // return alignment score (cast to float)\n      extra += i;\n      switch (val) {\n        case 'c':\n          return (float) (int8_t) loadInt(extra, sizeof(int8_t));\n        case 'C':\n          return (float) (uint8_t) loadInt(extra, sizeof(uint8_t));\n        case 's':\n          return (float) (int16_t) loadInt(extra, sizeof(int16_t));\n        case 'S':\n          return (float) (uint16_t) loadInt(extra, sizeof(uint16_t));\n        case 'i':\n          return (float) (int32_t) loadInt(extra, sizeof(int32_t));\n        case 'I':\n          return (float) (uint32_t) loadInt(extra, sizeof(uint32_t));\n        default: ;\n          char msg[4] = \"' '\";\n          msg[1] = val;\n          exit(error(msg, ERRTYPE));\n      }\n\n    } else {\n\n      // skip to next auxiliary field\n      switch (val) {\n        case 'A': i++; break;\n        case 'c': i += sizeof(int8_t); break;\n        case 'C': i += sizeof(uint8_t); break;\n        case 's': i += sizeof(int16_t); break;\n        case 'S': i += sizeof(uint16_t); break;\n        case 'i': i += sizeof(int32_t); break;\n        case 'I': i += sizeof(uint32_t); break;\n        case 'f': i += sizeof(float); break;\n        case 'Z': for (; extra[i] != '\\0'; i++) ; i++; break;\n        case 'H': for (; extra[i] != '\\0'; i += 2) ; i++; break;\n        case 'B': i += arrayLen(extra + i); break;\n        default: ;\n          char msg[4] = \"' '\";\n          msg[1] = val;\n          exit(error(msg, ERRTYPE));\n      }\n    }\n\n    // check if field has gone past end of block\n    if (i > len)\n      exit(error(\"\", ERRAUX));\n  }\n\n  return NOSCORE;\n}\n\n/* uint64_t parseBAM()\n * Parse the alignments in a BAM file.\n */\nuint64_t parseBAM(gzFile in, char* line, Aln** aln,\n    char* readName, int chromLen, Chrom* chrom, int n_ref,\n    int idx[], double* totalLen, uint64_t* unmapped,\n    uint64_t* paired, uint64_t* single, uint64_t* pairedPr,\n    uint64_t* singlePr, uint64_t* 
supp, uint64_t* skipped,\n    uint64_t* lowMapQ, int minMapQ, uint64_t* secPair,\n    uint64_t* secSingle, uint64_t* orphan, bool singleOpt,\n    bool extendOpt, int extend, bool avgExtOpt,\n    Aln*** unpair, int* unpairMem, float asDiff,\n    bool atacOpt, int atacLen5, int atacLen3, bool atacAdj,\n    File bed, bool bedOpt, bool gzOut, bool ctrl,\n    int sample, bool dupsOpt, File dups, bool dupsVerb,\n    Read*** readPr, int* readMemPr, Read*** readDc,\n    int* readMemDc, Read*** readSn, int* readMemSn,\n    HashAln*** table, uint32_t* tableMem,\n    HashAln*** tableSn, uint32_t* tableSnMem,\n    uint32_t** order, uint32_t** order0, uint32_t** order1,\n    uint32_t** order2, uint16_t** qualA, uint16_t** qual0,\n    uint16_t** qual1, uint16_t** qual2, uint32_t* arrMem,\n    uint64_t* countPr, uint64_t* dupsPr, uint64_t* countDc,\n    uint64_t* dupsDc, uint64_t* countSn, uint64_t* dupsSn,\n    uint64_t* errCount, bool verbose) {\n\n  // BAM fields to save\n  int32_t refID, pos, l_seq, next_refID, next_pos, tlen;\n  uint16_t n_cigar_op, flag;\n  uint8_t mapq;\n  uint32_t* cigar;\n  uint8_t* seq;\n  char* read_name, *qual, *extra;\n\n  int alnLen = 0;     // number of alignments for this read\n  int unpairIdx = 0;  // \\ indexes into unpaired array(s)\n  int unpairLen = 0;  // /   (with avgExtOpt)\n  int readIdxPr = 0;  // \\ indexes into read array readPr\n  int readLenPr = 0;  // /   (with dupsOpt)\n  int readIdxDc = 0;  // \\ indexes into read array readDc\n  int readLenDc = 0;  // /   (with dupsOpt)\n  int readIdxSn = 0;  // \\ indexes into read array readSn\n  int readLenSn = 0;  // /   (with dupsOpt)\n  uint16_t qualR1 = 0, qualR2 = 0; // sums of quality scores\n  uint64_t count = 0;\n  int32_t block_size;\n  while ((block_size = readInt32(in, false)) != EOF) {\n\n    if (block_size < 6 * sizeof(int32_t) + 2 * sizeof(uint32_t))\n      exit(error(\"\", ERRBAM));  // min. 
guaranteed block size\n\n    // copy alignment record\n    char* block = line;\n    for (int i = 0; i < block_size; i++) {\n      int m = gzgetc(in);\n      if (m == -1)\n        exit(error(\"\", ERRBAM));\n      block[i] = m;\n    }\n\n    // save BAM fields\n    loadBAMfields(&block, &refID, &pos, &mapq, &n_cigar_op,\n      &flag, &l_seq, &next_refID, &next_pos, &tlen,\n      &read_name, &cigar, &seq, &qual, &extra);\n    if (block > line + block_size)\n      exit(error(\"\", ERRBAM));  // read off the end of the block\n\n    count++;\n    if (flag & 0x4) {\n      // skip unmapped\n      (*unmapped)++;\n      continue;\n    }\n    if (! strcmp(read_name, \"*\")\n        || refID < 0 || refID >= n_ref\n        || idx[refID] < 0 || idx[refID] >= chromLen\n        || pos < 0)\n      // insufficient alignment info\n      exit(error(read_name, ERRSAM));\n    if (flag & 0xE00) {\n      // skip supplementary/PCR dups/low quality\n      (*supp)++;\n      continue;\n    }\n    if (mapq < minMapQ) {\n      // skip low MAPQ alignments\n      (*lowMapQ)++;\n      continue;\n    }\n\n    // process previous set of alns\n    if (readName[0] == '\\0' || strcmp(read_name, readName)) {\n      if (readName[0] != '\\0')\n        processAlns(readName, *aln, alnLen, totalLen,\n          pairedPr, singlePr, orphan, singleOpt,\n          extendOpt, extend, avgExtOpt, unpair,\n          &unpairIdx, &unpairLen, unpairMem, asDiff,\n          atacOpt, atacLen5, atacLen3, atacAdj,\n          bed, bedOpt, gzOut, ctrl, sample, dupsOpt,\n          readPr, &readIdxPr, &readLenPr, readMemPr,\n          readDc, &readIdxDc, &readLenDc, readMemDc,\n          readSn, &readIdxSn, &readLenSn, readMemSn,\n          qualR1, qualR2, errCount, verbose);\n      alnLen = 0;\n      qualR1 = qualR2 = 0;\n      strncpy(readName, read_name, MAX_ALNS);\n    }\n\n    // save alignment information\n    int length = calcDistBAM(l_seq, n_cigar_op, cigar); // distance to 3' end\n    float score = getBAMscore(extra, 
block_size\n      - (int) (extra - line));\n    if (! parseAlign(aln, &alnLen, flag, chrom + idx[refID],\n        pos, length, next_pos, paired, single, secPair,\n        secSingle, skipped, singleOpt, score, dupsOpt,\n        qual, l_seq, 0, &qualR1, &qualR2) && verbose)\n      fprintf(stderr, \"Warning! Read %s has more than %d alignments\\n\",\n        read_name, MAX_ALNS);\n    // NOTE: the following BAM fields are ignored:\n    //   next_refID, tlen, seq\n  }\n\n  // process last set of alns\n  if (readName[0] != '\\0')\n    processAlns(readName, *aln, alnLen, totalLen,\n      pairedPr, singlePr, orphan, singleOpt,\n      extendOpt, extend, avgExtOpt, unpair,\n      &unpairIdx, &unpairLen, unpairMem, asDiff,\n      atacOpt, atacLen5, atacLen3, atacAdj,\n      bed, bedOpt, gzOut, ctrl, sample, dupsOpt,\n      readPr, &readIdxPr, &readLenPr, readMemPr,\n      readDc, &readIdxDc, &readLenDc, readMemDc,\n      readSn, &readIdxSn, &readLenSn, readMemSn,\n      qualR1, qualR2, errCount, verbose);\n\n  if (dupsOpt)\n    // remove duplicates and process all alignments\n    findDups(*readPr, readIdxPr, readLenPr, *readDc,\n      readIdxDc, readLenDc, *readSn, readIdxSn, readLenSn,\n      table, tableMem, tableSn, tableSnMem, order, order0,\n      order1, order2, qualA, qual0, qual1, qual2, arrMem,\n      countPr, dupsPr, countDc, dupsDc, countSn, dupsSn,\n      singleOpt, pairedPr, singlePr, totalLen, extendOpt,\n      extend, avgExtOpt, asDiff, atacOpt, atacLen5,\n      atacLen3, atacAdj, bed, bedOpt, gzOut, dups,\n      dupsVerb, ctrl, sample, errCount, verbose);\n\n  else if (avgExtOpt)\n    // process single alignments w/ avgExtOpt\n    processAvgExt(*unpair, unpairIdx, unpairLen,\n      *totalLen, *pairedPr, bed, bedOpt, gzOut,\n      ctrl, sample, errCount, verbose);\n\n  return count;\n}\n\n/* uint64_t readBAM()\n * Parse the header from a BAM file, then\n *   call parseBAM().\n */\nuint64_t readBAM(gzFile in, char* line, Aln** aln,\n    char* readName, double* 
totalLen, uint64_t* unmapped,\n    uint64_t* paired, uint64_t* single, uint64_t* pairedPr,\n    uint64_t* singlePr, uint64_t* supp, uint64_t* skipped,\n    uint64_t* lowMapQ, int minMapQ, int xcount,\n    char** xchrList, int xBedLen, Bed* xBed,\n    uint64_t* secPair, uint64_t* secSingle,\n    uint64_t* orphan, int* chromLen, Chrom** chrom,\n    bool singleOpt, bool extendOpt, int extend,\n    bool avgExtOpt, Aln*** unpair, int* unpairMem,\n    float asDiff, bool atacOpt, int atacLen5, int atacLen3,\n    bool atacAdj, File bed, bool bedOpt, bool gzOut,\n    bool ctrl, int sample, bool dupsOpt, File dups,\n    bool dupsVerb, Read*** readPr, int* readMemPr,\n    Read*** readDc, int* readMemDc, Read*** readSn,\n    int* readMemSn, HashAln*** table, uint32_t* tableMem,\n    HashAln*** tableSn, uint32_t* tableSnMem,\n    uint32_t** order, uint32_t** order0, uint32_t** order1,\n    uint32_t** order2, uint16_t** qual, uint16_t** qual0,\n    uint16_t** qual1, uint16_t** qual2, uint32_t* arrMem,\n    uint64_t* countPr, uint64_t* dupsPr, uint64_t* countDc,\n    uint64_t* dupsDc, uint64_t* countSn, uint64_t* dupsSn,\n    uint64_t* errCount, bool sortOpt, bool verbose) {\n\n  // load first line from header\n  int32_t l_text = readInt32(in, true);\n  int i;\n  for (i = 0; i < l_text; i++) {\n    int m = gzgetc(in);\n    if (m == -1)\n      exit(error(\"\", ERRBAM));\n    unsigned char n = m;\n    if (n == '\\n' || n == '\\0')\n      break;\n    line[i] = n;\n  }\n  line[i] = '\\0';\n  // check sort order\n  char* tag = strtok(line, TAB);\n  if (tag == NULL || strcmp(tag, \"@HD\"))\n    exit(error(\"\", ERRBAM));\n  char* sortOrder = NULL;\n  char* field = strtok(NULL, TAB);\n  while (field != NULL) {\n    if (!strncmp(field, \"SO:\", 3))\n      sortOrder = field + 3;\n    field = strtok(NULL, TAB);\n  }\n  if (sortOpt && (sortOrder == NULL\n      || strcmp(sortOrder, \"queryname\")))\n    exit(error(\"\", ERRSORT));\n  if (gzseek(in, l_text - i - 1, SEEK_CUR) == -1)\n    
exit(error(\"\", ERRBAM));\n\n  // save chromosome lengths\n  int32_t n_ref = readInt32(in, true); // number of ref sequences\n  int idx[n_ref];  // index of reference sequences into *chrom array\n  for (int i = 0; i < n_ref; i++) {\n    int32_t len = readInt32(in, true);\n    if (len < 1 || len > MAX_SIZE)\n      exit(error(\"\", ERRBAM));\n    for (int j = 0; j < len; j++) {\n      int m = gzgetc(in);\n      if (m == -1)\n        exit(error(\"\", ERRBAM));\n      line[j] = m;\n    }\n    if (line[len-1] != '\\0')\n      exit(error(\"\", ERRBAM));\n    idx[i] = saveChrom(line, (uint32_t) readInt32(in, true),\n      chromLen, chrom, xcount, xchrList, xBedLen, xBed,\n      ctrl, verbose);\n  }\n\n  return parseBAM(in, line, aln, readName, *chromLen,\n    *chrom, n_ref, idx, totalLen, unmapped, paired, single,\n    pairedPr, singlePr, supp, skipped, lowMapQ, minMapQ,\n    secPair, secSingle, orphan, singleOpt, extendOpt,\n    extend, avgExtOpt, unpair, unpairMem, asDiff, atacOpt,\n    atacLen5, atacLen3, atacAdj, bed, bedOpt, gzOut, ctrl,\n    sample, dupsOpt, dups, dupsVerb, readPr, readMemPr,\n    readDc, readMemDc, readSn, readMemSn, table, tableMem,\n    tableSn, tableSnMem, order, order0, order1, order2,\n    qual, qual0, qual1, qual2, arrMem, countPr, dupsPr,\n    countDc, dupsDc, countSn, dupsSn, errCount, verbose);\n}\n\n/*** File I/O ***/\n\n/* void openWrite()\n * Open a file for writing (stdout if file is '-'),\n *   adjusting filenames/extensions as needed.\n */\nvoid openWrite(char* outFile, File* out, bool gz) {\n  if (outFile[0] == '-' && strlen(outFile) > 1)\n    exit(error(outFile, ERRNAME));\n  if (gz) {\n    if (!strcmp(outFile + strlen(outFile) - strlen(GZEXT), GZEXT)\n        || !strcmp(outFile, \"/dev/null\"))\n      out->gzf = gzopen(outFile, \"w\");\n    else if (!strcmp(outFile, \"-\"))\n      out->gzf = gzdopen(fileno(stdout), \"wb\");\n    else {\n      // add \".gz\" to outFile\n      char* outFile2 = memalloc(strlen(outFile)\n        + 
strlen(GZEXT) + 1);\n      strcpy(outFile2, outFile);\n      strcat(outFile2, GZEXT);\n      out->gzf = gzopen(outFile2, \"w\");\n      free(outFile2);\n    }\n    if (out->gzf == NULL)\n      exit(error(outFile, ERROPENW));\n  } else {\n    out->f = (strcmp(outFile, \"-\") ?\n      fopen(outFile, \"w\") : stdout);\n    if (out->f == NULL)\n      exit(error(outFile, ERROPENW));\n  }\n}\n\n/* bool checkBAM()\n * Determine if file is BAM formatted.\n */\nbool checkBAM(File in, bool gz) {\n  if (! gz)\n    return false;\n  char magic[5] = \"BAM\\1\";  // BAM magic string\n  for (int i = 0; i < 4; i++) {\n    char m = gzgetc(in.gzf);\n    if (m == -1)\n      exit(error(\"\", ERROPEN));\n    if (m != magic[i]) {\n      // push back chars before returning false\n      if (gzungetc(m, in.gzf) == -1)\n        exit(error(\"\", ERRUNGET));\n      for (int j = i - 1; j > -1; j--)\n        if (gzungetc(magic[j], in.gzf) == -1)\n          exit(error(\"\", ERRUNGET));\n      return false;\n    }\n  }\n  return true;\n}\n\n/* bool openRead()\n * Open a file for reading (stdin if file is '-').\n *   Return true if gzip compressed.\n */\nbool openRead(char* inFile, File* in) {\n\n  // open file or stdin\n  bool stdinBool = ! strcmp(inFile, \"-\");\n  FILE* dummy = (stdinBool ? stdin : fopen(inFile, \"r\"));\n  if (dummy == NULL)\n    exit(error(inFile, ERROPEN));\n\n  // check for gzip compression: magic number 0x1F, 0x8B\n  bool gzip = true;\n  int save = 0;  // first char to pushback (for stdin)\n  int i, j;\n  for (i = 0; i < 2; i++) {\n    j = fgetc(dummy);\n    if (j == EOF)\n      exit(error(inFile, ERROPEN));\n    if ( (i && (unsigned char) j != 0x8B)\n        || (! i && (unsigned char) j != 0x1F) ) {\n      gzip = false;\n      break;\n    }\n    if (! 
i)\n      save = j;\n  }\n\n  // for stdin, push back chars\n  if (stdinBool) {\n    if (gzip)\n      exit(error(\"\", ERRGZIP));\n    if (ungetc(j, dummy) == EOF)\n      exit(error(\"\", ERRUNGET));\n    if (i && ungetc(save, dummy) == EOF)\n      exit(error(\"\", ERRUNGET));\n  }\n\n  // open file\n  if (gzip) {\n    if (fclose(dummy))\n      exit(error(\"<dummy>\", ERRCLOSE));\n    in->gzf = gzopen(inFile, \"r\");\n    if (in->gzf == NULL)\n      exit(error(inFile, ERROPEN));\n  } else {\n    if (! stdinBool)\n      rewind(dummy);\n    in->f = dummy;\n  }\n\n  return gzip;\n}\n\n/* int loadBED()\n * Load genomic regions to exclude from BED file(s).\n *   Return number saved.\n */\nint loadBED(char* xFile, char* line, Bed** xBed) {\n\n  // loop through BED files\n  int count = 0;  // count of intervals saved\n  char* end;\n  char* filename = strtok_r(xFile, COM, &end);\n  while (filename) {\n\n    // open BED file\n    File in;\n    bool gz = openRead(filename, &in);\n\n    // load BED records\n    while (getLine(line, MAX_SIZE, in, gz) != NULL) {\n\n      // parse BED record\n      char* name = strtok(line, TAB);\n      if (name == NULL)\n        exit(error(line, ERRBED));\n      int pos[2];\n      for (int i = 0; i < 2; i++) {\n        char* val = strtok(NULL, i ? TABN : TAB);\n        if (val == NULL)\n          exit(error(line, ERRBED));\n        pos[i] = getInt(val);\n      }\n      if (pos[1] <= pos[0] || pos[0] < 0 || pos[1] < 0) {\n        char msg[MAX_ALNS];\n        sprintf(msg, \"%s, %d - %d\", name, pos[0], pos[1]);\n        exit(error(msg, ERRBED));\n      }\n\n      // save info to xBed array\n      *xBed = (Bed*) memrealloc(*xBed, (count + 1) * sizeof(Bed));\n      Bed* b = *xBed + count;\n      b->name = memalloc(1 + strlen(name));\n      strcpy(b->name, name);\n      b->pos[0] = pos[0];\n      b->pos[1] = pos[1];\n      count++;\n    }\n\n    // close file\n    if ( (gz && gzclose(in.gzf) != Z_OK)\n        || (! 
gz && fclose(in.f)) )\n      exit(error(filename, ERRCLOSE));\n\n    filename = strtok_r(NULL, COM, &end);\n  }\n\n  return count;\n}\n\n/* void findPeaksOnly()\n * Control peak-calling directly from logfile (-f).\n */\nvoid findPeaksOnly(char* logFile, char* outFile,\n    bool gzOut, int xcount, char** xchrList, char* xFile,\n    float pqvalue, bool qvalOpt, int minLen, int maxGap,\n    float minAUC, uint64_t genomeLen, bool verbose) {\n\n  // save genomic regions to exclude\n  char* line = (char*) memalloc(MAX_SIZE);\n  Bed* xBed = NULL;\n  int xBedLen = 0;\n  if (xFile != NULL)\n    xBedLen = loadBED(xFile, line, &xBed);\n\n  // open files\n  File in, out;\n  bool gz = openRead(logFile, &in);\n  openWrite(outFile, &out, gzOut);\n  if (verbose)\n    fprintf(stderr, \"Peak-calling from log file: %s\\n\",\n      logFile);\n\n  // call peaks\n  callPeaksLog(in, gz, out, gzOut, line, xcount, xchrList,\n    xBedLen, xBed, pqvalue, qvalOpt, minLen, maxGap,\n    minAUC, genomeLen, verbose);\n\n  // free memory\n  if (xcount) {\n    for (int i = 0; i < xcount; i++)\n      free(xchrList[i]);\n    free(xchrList);\n  }\n  if (xBedLen) {\n    for (int i = 0; i < xBedLen; i++)\n      free(xBed[i].name);\n    free(xBed);\n  }\n  free(line);\n\n  // close files\n  if ( ( gz && gzclose(in.gzf) != Z_OK )\n      || ( ! gz && fclose(in.f) ) )\n    exit(error(logFile, ERRCLOSE));\n  if ( ( gzOut && gzclose(out.gzf) != Z_OK )\n      || ( ! 
gzOut && fclose(out.f) ) )\n    exit(error(outFile, ERRCLOSE));\n}\n\n/*** Main ***/\n\n/* void logCounts()\n * Log alignment counts to stderr.\n */\nvoid logCounts(uint64_t count, uint64_t unmapped,\n    uint64_t supp, uint64_t skipped, Chrom* chrom,\n    int chromLen, int minMapQ, uint64_t lowMapQ,\n    uint64_t paired, uint64_t secPair, uint64_t orphan,\n    uint64_t single, uint64_t secSingle, uint64_t singlePr,\n    uint64_t pairedPr, double totalLen, bool singleOpt,\n    bool extendOpt, int extend, bool avgExtOpt, bool bam,\n    bool atacOpt, int atacLen, bool dupsOpt,\n    uint64_t countPr, uint64_t dupsPr, uint64_t countDc,\n    uint64_t dupsDc, uint64_t countSn, uint64_t dupsSn,\n    uint64_t errCount) {\n  if (errCount > MAX_ALNS)\n    fprintf(stderr, \"(another %ld warning messages suppressed)\\n\",\n      errCount - MAX_ALNS);\n  double avgLen = pairedPr ? totalLen / pairedPr : 0.0;\n  fprintf(stderr, \"  %s records analyzed: %11ld\\n\",\n    bam ? \"BAM\" : \"SAM\", count);\n  if (unmapped)\n    fprintf(stderr, \"    Unmapped:           %11ld\\n\", unmapped);\n  if (supp)\n    fprintf(stderr, \"    Supp./dups/lowQual: %11ld\\n\", supp);\n  if (skipped) {\n    fprintf(stderr, \"    To skipped refs:    %11ld\\n\", skipped);\n    fprintf(stderr, \"      (\");\n    bool first = true;\n    for (int i = 0; i < chromLen; i++) {\n      Chrom* c = chrom + i;\n      if (c->skip || ! c->save) {\n        fprintf(stderr, \"%s%s\", first ? \"\" : \",\", c->name);\n        first = false;\n      }\n    }\n    fprintf(stderr, \")\\n\");\n  }\n  if (lowMapQ)\n    fprintf(stderr, \"    MAPQ < %-2d:          %11ld\\n\", minMapQ, lowMapQ);\n  fprintf(stderr, \"    Paired alignments:  %11ld\\n\", paired);\n  if (secPair)\n    fprintf(stderr, \"      secondary alns:   %11ld\\n\", secPair);\n  if (orphan)\n    fprintf(stderr, \"      \\\"orphan\\\" alns:    %11ld\\t** Warning! 
**\\n\",\n      orphan);\n  fprintf(stderr, \"    Unpaired alignments:%11ld\\n\", single);\n  if (secSingle)\n    fprintf(stderr, \"      secondary alns:   %11ld\\n\", secSingle);\n  if (dupsOpt) {\n    fprintf(stderr, \"  PCR duplicates --\\n\");\n    fprintf(stderr, \"    Paired aln sets:    %11ld\\n\", countPr);\n    fprintf(stderr, \"      duplicates:       %11ld (%.1f%%)\\n\",\n      dupsPr, countPr ? 100.0f * dupsPr / countPr : 0.0f);\n    if (singleOpt) {\n      fprintf(stderr, \"    Discordant aln sets:%11ld\\n\", countDc);\n      fprintf(stderr, \"      duplicates:       %11ld (%.1f%%)\\n\",\n        dupsDc, countDc ? 100.0f * dupsDc / countDc : 0.0f);\n      fprintf(stderr, \"    Singleton aln sets: %11ld\\n\", countSn);\n      fprintf(stderr, \"      duplicates:       %11ld (%.1f%%)\\n\",\n        dupsSn, countSn ? 100.0f * dupsSn / countSn : 0.0f);\n    }\n  }\n  fprintf(stderr, \"  Fragments analyzed:   %11ld\\n\", singlePr + pairedPr);\n  fprintf(stderr, \"    Full fragments:     %11ld\\n\", pairedPr);\n  if (pairedPr && ! atacOpt)\n    fprintf(stderr, \"      (avg. 
length: %.1fbp)\\n\", avgLen);\n  if (singleOpt) {\n    fprintf(stderr, \"    Half fragments:     %11ld\\n\", singlePr);\n    if (singlePr) {\n      fprintf(stderr, \"      (from unpaired alns\");\n      if (extendOpt)\n        fprintf(stderr, \", extended to %dbp\", extend);\n      else if (avgExtOpt && pairedPr)\n        fprintf(stderr, \", extended to %dbp\", (int) (avgLen + 0.5));\n      fprintf(stderr, \")\\n\");\n    }\n  }\n  if (atacOpt) {\n    fprintf(stderr, \"    ATAC-seq cut sites: %11ld\\n\",\n      2 * pairedPr + singlePr);\n    fprintf(stderr, \"      (expanded to length %dbp)\\n\", atacLen);\n  }\n}\n\n/* void runProgram()\n * Controls the opening/closing of files, and parsing\n *   of input files by readSAM() or readBAM().\n *   Pileup values are computed by savePileupExpt() or\n *   savePileupCtrl(), and p-values for each experimental/\n *   control pair are calculated by savePval().\n *   Results for all replicates are passed to findPeaks().\n * If calling peaks only (from logfile), pass control\n *   directly to findPeaksOnly().\n */\nvoid runProgram(char* inFile, char* ctrlFile, char* outFile,\n    char* logFile, char* pileFile, char* bedFile,\n    bool gzOut, bool singleOpt, bool extendOpt, int extend,\n    bool avgExtOpt, int minMapQ, int xcount,\n    char** xchrList, char* xFile, float pqvalue,\n    bool qvalOpt, int minLen, int maxGap, float minAUC,\n    float asDiff, bool atacOpt, int atacLen5, int atacLen3,\n    bool atacAdj, bool dupsOpt, char* dupsFile,\n    bool peaksOpt, bool peaksOnly, bool sortOpt,\n    uint64_t genomeLen, bool verbose) {\n\n  // option to call peaks only, from already produced log file\n  if (peaksOnly) {\n    findPeaksOnly(logFile, outFile, gzOut, xcount,\n      xchrList, xFile, pqvalue, qvalOpt, minLen, maxGap,\n      minAUC, genomeLen, verbose);\n    return;\n  }\n\n  // open optional output files\n  File bed, pile, dups;\n  if (bedFile != NULL)\n    openWrite(bedFile, &bed, gzOut);\n  if (pileFile != NULL)\n    
openWrite(pileFile, &pile, gzOut);\n  bool dupsVerb = false;\n  if (dupsOpt && dupsFile != NULL) {\n    openWrite(dupsFile, &dups, gzOut);\n    dupsVerb = true;\n  }\n\n  // initialize variables\n  char* line = (char*) memalloc(MAX_SIZE);\n  int chromLen = 0;     // number of reference sequences\n  Chrom* chrom = NULL;  // array of reference sequences\n  Aln* aln = (Aln*) memalloc(MAX_ALNS * sizeof(Aln)); // array of saved alns\n  int unpairMem = 0;    // number of unpaired alns (for avg-ext option)\n  Aln** unpair = NULL;  // array of unpaired alns (for avg-ext option)\n  char* readName = memalloc(MAX_ALNS + 1);  // name of read being analyzed\n  readName[0] = readName[MAX_ALNS] = '\\0';\n  int sample = 0;       // number of sample pairs analyzed\n\n  // variables for duplicate removal option\n  Read** readPr = NULL;     // array of reads with properly paired aln(s)\n  int readMemPr = 0;        // index into readPr array\n  Read** readDc = NULL;     // array of reads with discordant aln(s)\n  int readMemDc = 0;        // index into readDc array\n  Read** readSn = NULL;     // array of reads with singleton aln(s)\n  int readMemSn = 0;        // index into readSn array\n  HashAln** table = NULL;   // hashtable for paired/discordant alns (recycled)\n  uint32_t tableMem = 0;    // size of above hashtable\n  HashAln** tableSn = NULL; // hashtable for singletons alns\n  uint32_t tableSnMem = 0;  // size of above hashtable\n  uint32_t* order = NULL;   // array for order of reads to be processed\n  uint32_t* order0 = NULL;  // |\n  uint32_t* order1 = NULL;  // | temp arrays for order\n  uint32_t* order2 = NULL;  // |\n  uint16_t* qual = NULL;    // array of quality score sums (sorting key)\n  uint16_t* qual0 = NULL;   // |\n  uint16_t* qual1 = NULL;   // | temp arrays for qual\n  uint16_t* qual2 = NULL;   // |\n  uint32_t arrMem = 0;      // length of above order/qual arrays\n\n  // save genomic regions to exclude\n  Bed* xBed = NULL;\n  int xBedLen = 0;\n  if (xFile != 
NULL)\n    xBedLen = loadBED(xFile, line, &xBed);\n\n  // loop through input files (experimental and control)\n  char* end1, *end2;\n  char* exptName = strtok_r(inFile, COM, &end1);\n  char* ctrlName = ctrlFile == NULL ? NULL\n    : strtok_r(ctrlFile, COM, &end2);\n  while (exptName) {\n\n    // reset 'save' bools of each Chrom\n    for (int j = 0; j < chromLen; j++)\n      (chrom + j)->save = false;\n\n    // process matching experimental/control files\n    double fragLen = 0.0; // total weighted length of all experimental fragments\n    for (int i = 0; i < 2; i++) {\n\n      // get expt/ctrl filename\n      char* filename = exptName;\n      if (i) {\n        filename = ctrlName;\n        if (ctrlName != NULL && !strcmp(ctrlName, \"null\"))\n          filename = NULL;\n        if (filename == NULL) {\n          if (verbose)\n            fprintf(stderr, \"- control file #%d not provided -\\n\",\n              sample);\n          savePileupNoCtrl(chrom, chromLen, fragLen,\n            genomeLen, verbose);\n          break;\n        }\n      }\n\n      // open input file\n      File in;\n      bool gz = openRead(filename, &in);\n      bool bam = checkBAM(in, gz);\n      if (verbose)\n        fprintf(stderr, \"Processing %s file #%d: %s\\n\",\n          i ? \"control\" : \"experimental\", sample, filename);\n      if (dupsVerb) {\n        if (gzOut)\n          gzprintf(dups.gzf, \"# %s file #%d: %s\\n\",\n            i ? \"control\" : \"experimental\", sample, filename);\n        else\n          fprintf(dups.f, \"# %s file #%d: %s\\n\",\n            i ? 
\"control\" : \"experimental\", sample, filename);\n      }\n\n      // reset 'diff' array for each Chrom\n      for (int j = 0; j < chromLen; j++) {\n        Chrom* c = chrom + j;\n        if (c->diff != NULL)\n          for (int k = 0; k < 1 + c->len; k++) {\n            c->diff->frac[k] = 0;\n            c->diff->cov[k] = 0;\n          }\n      }\n\n      // load and process alignments\n      uint64_t unmapped = 0, paired = 0, single = 0,\n        orphan = 0, pairedPr = 0, singlePr = 0, supp = 0,\n        skipped = 0, lowMapQ = 0, secPair = 0,\n        secSingle = 0, countPr = 0, dupsPr = 0,\n        countDc = 0, dupsDc = 0, countSn = 0, dupsSn = 0,\n        errCount = 0;  // counting variables\n      double totalLen = 0.0;  // total weighted length of paired fragments\n      uint64_t count;\n      if (bam)\n        count = readBAM(in.gzf, line, &aln, readName,\n          &totalLen, &unmapped, &paired, &single,\n          &pairedPr, &singlePr, &supp, &skipped, &lowMapQ,\n          minMapQ, xcount, xchrList, xBedLen, xBed,\n          &secPair, &secSingle, &orphan, &chromLen, &chrom,\n          singleOpt, extendOpt, extend, avgExtOpt, &unpair,\n          &unpairMem, asDiff, atacOpt, atacLen5, atacLen3,\n          atacAdj, bed, bedFile != NULL, gzOut, i, sample,\n          dupsOpt, dups, dupsVerb, &readPr, &readMemPr,\n          &readDc, &readMemDc, &readSn, &readMemSn, &table,\n          &tableMem, &tableSn, &tableSnMem, &order,\n          &order0, &order1, &order2, &qual, &qual0, &qual1,\n          &qual2, &arrMem, &countPr, &dupsPr, &countDc,\n          &dupsDc, &countSn, &dupsSn, &errCount, sortOpt,\n          verbose);\n      else\n        count = readSAM(in, gz, line, &aln, readName,\n          &totalLen, &unmapped, &paired, &single,\n          &pairedPr, &singlePr, &supp, &skipped, &lowMapQ,\n          minMapQ, xcount, xchrList, xBedLen, xBed,\n          &secPair, &secSingle, &orphan, &chromLen, &chrom,\n          singleOpt, extendOpt, extend, avgExtOpt, 
&unpair,\n          &unpairMem, asDiff, atacOpt, atacLen5, atacLen3,\n          atacAdj, bed, bedFile != NULL, gzOut, i, sample,\n          dupsOpt, dups, dupsVerb, &readPr, &readMemPr,\n          &readDc, &readMemDc, &readSn, &readMemSn, &table,\n          &tableMem, &tableSn, &tableSnMem, &order,\n          &order0, &order1, &order2, &qual, &qual0, &qual1,\n          &qual2, &arrMem, &countPr, &dupsPr, &countDc,\n          &dupsDc, &countSn, &dupsSn, &errCount, sortOpt,\n          verbose);\n\n      // log counts\n      if (verbose)\n        logCounts(count, unmapped, supp, skipped, chrom,\n          chromLen, minMapQ, lowMapQ, paired, secPair,\n          orphan, single, secSingle, singlePr, pairedPr,\n          totalLen, singleOpt, extendOpt, extend,\n          avgExtOpt, bam, atacOpt, atacLen5 + atacLen3,\n          dupsOpt, countPr, dupsPr, countDc, dupsDc,\n          countSn, dupsSn, errCount);\n\n      // save pileup values\n      if (i)\n        savePileupCtrl(chrom, chromLen, fragLen, genomeLen,\n          verbose);\n      else\n        fragLen = savePileupExpt(chrom, chromLen);\n\n      // close input file\n      if ( (gz && gzclose(in.gzf) != Z_OK)\n          || (! gz && fclose(in.f)) )\n        exit(error(filename, ERRCLOSE));\n    }\n\n    // calculate p-values\n    savePval(chrom, chromLen, sample, pile,\n      pileFile != NULL, exptName, ctrlName, gzOut);\n\n    exptName = strtok_r(NULL, COM, &end1);\n    ctrlName = ctrlFile == NULL ? 
NULL\n      : strtok_r(NULL, COM, &end2);\n    sample++;\n  }\n\n  // free 'diff' arrays\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (chr->diff) {\n      free(chr->diff->frac);\n      free(chr->diff->cov);\n      free(chr->diff);\n    }\n  }\n\n  // open output files\n  File out, log;\n  if (peaksOpt)\n    openWrite(outFile, &out, gzOut);\n  if (logFile != NULL)\n    openWrite(logFile, &log, gzOut);\n\n  // find peaks\n  findPeaks(out, log, logFile != NULL, gzOut, chrom,\n    chromLen, &sample, pqvalue, qvalOpt, minLen,\n    maxGap, minAUC, peaksOpt, genomeLen, verbose);\n\n  // free memory\n  if (xcount) {\n    for (int i = 0; i < xcount; i++)\n      free(xchrList[i]);\n    free(xchrList);\n  }\n  if (xBedLen) {\n    for (int i = 0; i < xBedLen; i++)\n      free(xBed[i].name);\n    free(xBed);\n  }\n  if (avgExtOpt) {\n    for (int i = 0; i < unpairMem; i++)\n      free(unpair[i]);\n    free(unpair);\n  }\n  if (dupsOpt) {\n    for (int i = 0; i < readMemPr; i++)\n      free(readPr[i]);\n    free(readPr);\n    for (int i = 0; i < readMemDc; i++)\n      free(readDc[i]);\n    free(readDc);\n    for (int i = 0; i < readMemSn; i++)\n      free(readSn[i]);\n    free(readSn);\n    free(table);\n    free(tableSn);\n    free(order);\n    free(order0);\n    free(order1);\n    free(order2);\n    free(qual);\n    free(qual0);\n    free(qual1);\n    free(qual2);\n  }\n  for (int i = 0; i < chromLen; i++) {\n    Chrom* chr = chrom + i;\n    if (! 
chr->skip) {\n      if (chr->bedLen)\n        free(chr->bed);\n      if (qvalOpt && chr->qval) {\n        free(chr->qval->end);\n        free(chr->qval->cov);\n        free(chr->qval);\n      }\n      for (int j = 0; j < sample; j++) {\n        if (chr->pval[j]) {\n          free(chr->pval[j]->end);\n          free(chr->pval[j]->cov);\n          free(chr->pval[j]);\n        }\n      }\n      free(chr->pval);\n      free(chr->pvalLen);\n      free(chr->ctrl->end);\n      free(chr->ctrl->cov);\n      free(chr->expt->end);\n      free(chr->expt->cov);\n    }\n    free(chr->ctrl);\n    free(chr->expt);\n    free(chr->name);\n  }\n  free(chrom);\n  free(aln);\n  free(readName);\n  free(line);\n\n  // close files\n  if (peaksOpt && ( ( gzOut && gzclose(out.gzf) != Z_OK )\n      || ( ! gzOut && fclose(out.f) ) ) )\n    exit(error(outFile, ERRCLOSE));\n  if (logFile != NULL && ( ( ! gzOut && fclose(log.f) )\n      || ( gzOut && gzclose(log.gzf) != Z_OK ) ) )\n    exit(error(logFile, ERRCLOSE));\n  if (pileFile != NULL && ( ( ! gzOut && fclose(pile.f) )\n      || ( gzOut && gzclose(pile.gzf) != Z_OK ) ) )\n    exit(error(pileFile, ERRCLOSE));\n  if (bedFile != NULL && ( ( ! gzOut && fclose(bed.f) )\n      || ( gzOut && gzclose(bed.gzf) != Z_OK ) ) )\n    exit(error(bedFile, ERRCLOSE));\n  if (dupsVerb && ( ( ! gzOut && fclose(dups.f) )\n      || ( gzOut && gzclose(dups.gzf) != Z_OK ) ) )\n    exit(error(dupsFile, ERRCLOSE));\n}\n\n/* int saveXChrom()\n * Save list of chromosomes (ref names) to exclude.\n *   Return count.\n */\nint saveXChrom(char* xchrom, char*** xchrList) {\n  int i = 0;\n  char* chrom = strtok(xchrom, COM);\n  while (chrom != NULL) {\n    *xchrList = (char**) memrealloc(*xchrList,\n      (i + 1) * sizeof(char*));\n    (*xchrList)[i] = (char*) memalloc(1 + strlen(chrom));\n    strcpy((*xchrList)[i], chrom);\n    i++;\n    chrom = strtok(NULL, COM);\n  }\n  return i;\n}\n\n/* void getArgs()\n * Parse the command-line. 
Check for errors.\n */\nvoid getArgs(int argc, char** argv) {\n\n  // default parameters/filenames\n  char* outFile = NULL, *inFile = NULL, *ctrlFile = NULL,\n    *logFile = NULL, *pileFile = NULL, *bedFile = NULL,\n    *xFile = NULL, *dupsFile = NULL;\n  char* xchrom = NULL;\n  uint64_t genomeLen = 0;\n  int extend = 0, minMapQ = 0, minLen = DEFMINLEN,\n    maxGap = DEFMAXGAP, atacLen5 = DEFATAC, atacLen3 = 0;\n  float asDiff = 0.0f, pqvalue = DEFPVAL, minAUC = DEFAUC;\n  bool singleOpt = false, extendOpt = false,\n    avgExtOpt = false, atacOpt = false, atacAdj = true,\n    gzOut = false, qvalOpt = false, dupsOpt = false,\n    peaksOpt = true, peaksOnly = false, sortOpt = true;\n  bool verbose = false;\n\n  // parse argv\n  int c;\n  while ( (c = getopt_long(argc, argv, OPTIONS, long_options, NULL)) != -1 )\n    switch (c) {\n      case INFILE: inFile = optarg; break;\n      case CTRLFILE: ctrlFile = optarg; break;\n      case OUTFILE: outFile = optarg; break;\n      case LOGFILE: logFile = optarg; break;\n      case PILEFILE: pileFile = optarg; break;\n      case BEDFILE: bedFile = optarg; break;\n      case GZOPT: gzOut = true; break;\n      case UNPAIROPT: singleOpt = true; break;\n      case EXTENDOPT: extend = getInt(optarg); extendOpt = true; break;\n      case AVGEXTOPT: avgExtOpt = true; break;\n      case ATACOPT: atacOpt = true; break;\n      case ATACLEN: atacLen5 = getInt(optarg); break;\n      case DNASEOPT: atacAdj = false; break;\n      case XCHROM: xchrom = optarg; break;\n      case XFILE: xFile = optarg; break;\n      case MINMAPQ: minMapQ = getInt(optarg); break;\n      case ASDIFF: asDiff = getFloat(optarg); break;\n      case PVALUE: pqvalue = getFloat(optarg); break;\n      case QVALUE: pqvalue = getFloat(optarg); qvalOpt = true; break;\n      case MINAUC: minAUC = getFloat(optarg); break;\n      case MINLEN: minLen = getInt(optarg); break;\n      case MAXGAP: maxGap = getInt(optarg); break;\n      case DUPSOPT: dupsOpt = true; break;\n      
case DUPSFILE: dupsFile = optarg; break;\n      case NOPEAKS: peaksOpt = false; break;\n      case PEAKSONLY: peaksOnly = true; break;\n      case SORTOPT: sortOpt = false; break;\n      case GENOMELEN: genomeLen = getLong(optarg); break;\n      case VERBOSE: verbose = true; break;\n      case VERSOPT: printVersion(); break;\n      case HELP: usage(); break;\n      default: exit(EXIT_FAILURE);\n    }\n  if (optind < argc)\n    exit(error(argv[optind], ERRPARAM));\n\n  // check for argument errors\n  if ((peaksOpt && outFile == NULL)\n      || (peaksOnly && logFile == NULL)\n      || (!peaksOnly && inFile == NULL)) {\n    error(\"\", ERRFILE);\n    usage();\n  }\n  if (avgExtOpt) {\n    singleOpt = true;\n    extendOpt = false; // avgExtOpt takes precedence\n  }\n  if (extendOpt) {\n    singleOpt = true;\n    if (extend <= 0)\n      exit(error(\"\", ERREXTEND));\n  }\n  if (atacOpt) {\n    avgExtOpt = extendOpt = false;  // no unpaired extensions in ATAC-seq mode\n    if (atacLen5 <= 0)\n      exit(error(\"\", ERRATAC));\n    // split atacLen into atacLen5 and atacLen3\n    atacLen3 = (int) (atacLen5 / 2.0f + 0.5f);  // round up for 3' end\n    atacLen5 /= 2;\n  }\n  if (minLen < 0)\n    exit(error(\"\", ERRMINLEN));\n  if (minAUC < 0.0f)\n    exit(error(\"\", ERRMINAUC));\n  if (asDiff < 0.0f)\n    exit(error(\"\", ERRASDIFF));\n  if (genomeLen < 0)\n    exit(error(\"\", ERRGENLEN));\n\n  // save list of chromosomes to exclude\n  int xcount = 0;\n  char** xchrList = NULL;\n  if (xchrom != NULL)\n    xcount = saveXChrom(xchrom, &xchrList);\n\n  // adjust significance level to -log scale\n  if (pqvalue <= 0.0f || pqvalue > 1.0f)\n    exit(error(\"\", ERRPQVAL));\n  pqvalue = -log10f(pqvalue);\n\n  // send arguments to runProgram()\n  runProgram(inFile, ctrlFile, outFile, logFile, pileFile,\n    bedFile, gzOut, singleOpt, extendOpt, extend,\n    avgExtOpt, minMapQ, xcount, xchrList, xFile, pqvalue,\n    qvalOpt, minLen, maxGap, minAUC, asDiff,\n    atacOpt, atacLen5, 
atacLen3, atacAdj, dupsOpt,\n    dupsFile, peaksOpt, peaksOnly, sortOpt, genomeLen,\n    verbose);\n}\n\n/* int main()\n * Main.\n */\nint main(int argc, char* argv[]) {\n  getArgs(argc, argv);\n  return EXIT_SUCCESS;\n}\n"
  },
  {
    "path": "Genrich.h",
    "content": "/*\n  John M. Gaspar (jsh58@wildcats.unh.edu)\n  June 2018\n\n  Finding sites of enrichment from genome-wide assays.\n\n  Version 0.6.2\n*/\n#define VERSION     \"0.6.2\"\n\n// macros\n#define MAX(a, b) ((a) > (b) ? (a) : (b))\n#define MIN(a, b) ((a) < (b) ? (a) : (b))\n\n// constants\n#define MAX_SIZE    65520   // maximum length of input SAM/BAM alignments\n#define MAX_ALNS    128     // maximum number of alignments per read/pair\n                            //   - also used as max. read name length,\n                            //     and for various dynamic memory allocs\n#define HASH_SIZE   1310417 // size of hashtable for p-values\n#define TAB         \"\\t\"    // separator for SAM/BED fields\n#define TABN        \"\\t\\n\"  // separator for final BED field\n#define COL         \":\"     // separator for SAM optional fields (TAG:TYPE:VALUE)\n#define COM         \", \"    // separator for input file names / ref. names\n#define NA          \"NA\"    // results not available\n#define GZEXT       \".gz\"   // extension for gzip-compressed files\n#define SKIP        -1.0f   // stats for a genomic region to be skipped\n\n// default parameter values\n#define DEFPVAL     0.01f   // default p-value\n#define DEFAUC      200.0f  // area under the curve for peak calling\n#define DEFMAXGAP   100     // maximum gap between significant sites\n#define DEFMINLEN   0       // minimum length of a peak\n#define DEFATAC     100     // interval length for ATAC-seq mode\n#define ATACADJF    5       // adjustment for ATAC interval on fwd strand\n#define ATACADJR    -5      // adjustment for ATAC interval on rev strand\n\n// SAM fields\nenum sam { NIL, QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT,\n  PNEXT, TLEN, SEQ, QUAL };\n#define SAMQUAL     33        // quality score offset\n#define SCORE       \"AS\"      // extra field: alignment score\n#define NOSCORE     -FLT_MAX  // for alignments with no alignment score(s)\n\n// alignment types\nenum alignType { PAIRED, 
SINGLE, DISCORD };\n\n// fields of a bedGraph file\nenum bedGraph { CHR, START, END };\n\n// constants for log-normal p-value calculation\n#define LOGSQRT     0.445999019652555   // log(sqrt(2.44))\n#define SQRTLOG     0.944456478248262   // sqrt(log(2.44))\n\n// command-line options\n#define OPTIONS     \"ht:c:o:f:k:b:zyw:xjd:De:E:m:s:p:q:a:l:g:rR:XPSL:vV\"\n#define HELP        'h'\n#define INFILE      't'\n#define CTRLFILE    'c'\n#define OUTFILE     'o'\n#define LOGFILE     'f'\n#define PILEFILE    'k'\n#define BEDFILE     'b'\n#define GZOPT       'z'\n#define UNPAIROPT   'y'\n#define EXTENDOPT   'w'\n#define AVGEXTOPT   'x'\n#define ATACOPT     'j'\n#define ATACLEN     'd'\n#define DNASEOPT    'D'\n#define XCHROM      'e'\n#define XFILE       'E'\n#define MINMAPQ     'm'\n#define ASDIFF      's'\n#define PVALUE      'p'\n#define QVALUE      'q'\n#define MINAUC      'a'\n#define MINLEN      'l'\n#define MAXGAP      'g'\n#define DUPSOPT     'r'\n#define DUPSFILE    'R'\n#define NOPEAKS     'X'\n#define PEAKSONLY   'P'\n#define SORTOPT     'S'\n#define GENOMELEN   'L'\n#define VERBOSE     'v'\n#define VERSOPT     'V'\n\nstatic struct option long_options[] = {\n  {\"help\", no_argument, NULL, HELP},\n  {\"verbose\", no_argument, NULL, VERBOSE},\n  {\"version\", no_argument, NULL, VERSOPT},\n  {0, 0, 0, 0}\n};\n\n// error messages\nenum errCode { ERRFILE, ERROPEN, ERROPENW, ERRCLOSE,\n  ERRMEM, ERRINT, ERRFLOAT, ERRPARAM, ERREXTEND, ERRATAC,\n  ERRPQVAL, ERRASDIFF, ERRMINAUC, ERRMINLEN, ERRMISM,\n  ERRINFO, ERRSAM, ERRCHROM, ERRHEAD, ERRBAM, ERRGEN,\n  ERREXPT, ERRCHRLEN, ERRCTRL, ERRPOS, ERRSORT, ERRTYPE,\n  ERRAUX, ERRBED, ERRLINEAR, ERRINDEX, ERRLOGIDX, ERRLOG,\n  ERRISSUE, ERRALNS, ERRPILE, ERRPVAL, ERRARR, ERRARRC,\n  ERRDF, ERRALNTYPE, ERRUNGET, ERRGZIP, ERRNAME, ERRCIGAR,\n  ERRGENLEN, DEFERR\n};\nconst char* errMsg[] = { \"Need input/output files\",\n  \": cannot open file for reading\",\n  \": cannot open file for writing\",\n  \": cannot close file\",\n  
\"Cannot allocate memory\",\n  \": cannot convert to int\",\n  \": cannot convert to float\",\n  \": unknown command-line argument\",\n  \"Extension length must be > 0\",\n  \"ATAC-seq interval length must be > 0\",\n  \"p-/q-value must be in (0,1]\",\n  \"Secondary alignment score threshold must be >= 0.0\",\n  \"Minimum AUC must be >= 0.0\",\n  \"Minimum peak length must be >= 0\",\n  \": mismatch between sequence length and CIGAR\",\n  \": no sequence information (SEQ or CIGAR)\",\n  \": poorly formatted SAM/BAM record\",\n  \": cannot find reference sequence name in SAM header\",\n  \": misplaced SAM header line\",\n  \"Cannot parse BAM file\",\n  \"No analyzable genome (length=0)\",\n  \"Experimental sample has no analyzable fragments\",\n  \": reference sequence has different lengths in BAM/SAM files\",\n  \": reference sequence missing from control sample(s)\",\n  \": read aligned beyond reference end\",\n  \"SAM/BAM file not sorted by queryname (samtools sort -n)\",\n  \": unknown value type in BAM auxiliary field\",\n  \"Poorly formatted BAM auxiliary field\",\n  \": poorly formatted BED record\",\n  \"Linear template with >2 reads -- not allowed\",\n  \"Unknown index of paired alignment\",\n  \": cannot find field in header of bedgraph-ish log file\",\n  \"Poorly formatted bedgraph-ish log record\",\n  \"\\n  (internal error: please open an Issue on https://github.com/jsh58/Genrich)\",\n  \"Disallowed number of alignments\",\n  \"Invalid pileup value (< 0)\",\n  \"Failure collecting p-values\",\n  \"Failure creating experimental pileup\",\n  \"Failure creating control pileup\",\n  \"Invalid df in pchisq()\",\n  \"Invalid alignment type\",\n  \"Failure in ungetc() call\",\n  \"Cannot pipe in gzip-compressed file (use zcat instead)\",\n  \": output filename cannot start with '-'\",\n  \": unknown Op in CIGAR\",\n  \"Genome length must be a positive int\",\n  \"Unknown error\"\n};\n\n// generic File type\ntypedef union file {\n  FILE* f;\n  gzFile gzf;\n} 
File;\n\ntypedef struct bed {\n  uint32_t pos[2];\n  char* name;     // chromosome name\n} Bed;\n\ntypedef struct hash {\n  float val;      // p-value\n  uint64_t len;   // length of genome with that p-value\n  struct hash* next;\n} Hash;\n\ntypedef struct pileup {\n  uint32_t* end;  // array of end coordinates\n  float* cov;     // array of pileup values\n} Pileup;\n\ntypedef struct diff {\n  uint8_t* frac;  // fractions of a count (8-bit encoded)\n  int16_t* cov;   // int counts\n} Diff;\n\ntypedef struct chrom {\n  char* name;         // name of chromosome (reference sequence)\n  uint32_t len;       // length of chromosome\n  bool skip;          // chromosome to be skipped?\n  bool save;          // chromosome to be saved? (by sample)\n  uint32_t* bed;      // coordinates (paired) of regions to be excluded\n  int bedLen;         // number of coordinates of regions to be excluded\n  Diff* diff;         // arrays for keeping track of pileup changes\n  Pileup* expt;       // pileup arrays for experimental sample(s)\n  uint32_t exptLen;   // length of pileup arrays for experimental sample(s) (dynamic)\n  uint32_t exptMem;   // length of pileup arrays for experimental sample(s) (in memory)\n  Pileup* ctrl;       // pileup arrays for control sample(s)\n  uint32_t ctrlLen;   // length of pileup arrays for control sample(s) (dynamic)\n  uint32_t ctrlMem;   // length of pileup arrays for control sample(s) (in memory)\n  Pileup** pval;      // \"pileup\" arrays for p-values\n  uint32_t* pvalLen;  // lengths of \"pileup\" arrays for p-values\n  uint8_t sample;     // count of samples with p-value arrays saved\n  Pileup* qval;       // \"pileup\" arrays for q-values\n} Chrom;\n\ntypedef struct aln {\n  uint32_t pos[2];  // positions of the alignment\n  float score;      // alignment score (sum of scores for paired alns)\n  bool primary;     // primary alignment?\n  bool paired;      // properly paired alignment?\n  bool full;        // both parts of paired aln analyzed? 
(only for paired alns)\n  bool first;       // which read of a pair this is (true -> R1; false -> R2)\n  bool strand;      // which strand aln is on (only for unpaired alns)\n  uint8_t count;    // value of aln (only for unpaired alns with avg-ext option)\n  char* name;       // read name (only for unpaired alns with avg-ext option)\n  Chrom* chrom;     // reference sequence\n} Aln;\n\ntypedef struct hashAln {\n  char* name;       // read name\n  Chrom* chrom;     // reference sequence\n  Chrom* chrom1;    // other reference sequence (discordant only)\n  uint32_t pos;     // position of the alignment\n  uint32_t pos1;    // other position of the aln (paired and discordant only)\n  bool strand;      // strand of aln (discordant and singletons only)\n  bool strand1;     // other strand of aln (discordant only)\n  struct hashAln* next;\n} HashAln;\n\ntypedef struct read {\n  char* name;       // read name\n  Aln* aln;         // array of alignments\n  uint8_t alnLen;   // length of alignment array\n  Aln* alnR2;       // array of alignments for R2 (discordant alns only)\n  uint8_t alnLenR2; // length of alnR2 array (discordant alns only)\n  uint16_t qual;    // sum of quality scores\n  bool first;       // true -> R1; false -> R2 (singleton alns only)\n  float score;      // min. alignment score\n  float scoreR2;    // min. alignment score for R2 (discordant alns only)\n} Read;\n"
  },
  {
    "path": "LICENSE",
    "content": "The MIT License\n\nCopyright (C) 2018 John M. Gaspar (jsh58@wildcats.unh.edu)\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in\nall copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\nTHE SOFTWARE.\n"
  },
  {
    "path": "Makefile",
    "content": "Genrich: Genrich.c Genrich.h\n\tgcc -g -Wall -std=gnu99 -O2 -o Genrich Genrich.c -lz -lm\n"
  },
  {
    "path": "README.md",
    "content": "# Genrich: detecting sites of genomic enrichment\n\n## Table of Contents\n* [Introduction](#intro)\n  * [Quick start](#quick)\n  * [Software compilation](#compile)\n  * [Usage message](#usage)\n* [Attributes](#attributes)\n  * [Peak-calling method](#method)\n  * [Alignment parsing](#alignment)\n  * [Multimapping reads](#multimap)\n  * [PCR duplicate removal](#duplicate)\n  * [Genome length calculation](#genomelen)\n  * [Control/background pileup calculation](#pileup)\n  * [*p*-value calculation](#pvalue)\n  * [*q*-value calculation](#qvalue)\n  * [Multiple replicates](#replicate)\n* [I/O files and options](#files)\n  * [Required files](#required)\n  * [Optional files](#optional)\n* [Filtering options](#filter)\n  * [Unpaired alignments](#unpaired)\n* [ATAC-seq mode](#atacseq)\n* [Peak-calling parameters](#peakcalling)\n* [Miscellaneous](#misc)\n  * [Full analysis example](#example)\n  * [Warning messages](#warning)\n  * [Computational requirements](#memory)\n* [Contact](#contact)\n<br><br>\n\n## Introduction<a name=\"intro\"></a>\n\nGenrich is a peak-caller for genomic enrichment assays (e.g. ChIP-seq, ATAC-seq).  It analyzes alignment files generated following the assay and produces a file detailing peaks of significant enrichment.\n\n\n### Quick start<a name=\"quick\"></a>\n\nGiven:\n* `sample.bam` (alignment file, sorted by queryname)\n* `Genrich` (compiled as described [below](#compile))\n\nTo produce a file listing regions of genomic enrichment:\n```\n$ ./Genrich  -t sample.bam  -o sample.narrowPeak  -v\n```\n\n### Software compilation<a name=\"compile\"></a>\n\nThe software can be downloaded from [GitHub](https://github.com/jsh58/Genrich).\n\nA Makefile is provided for compilation with [GCC](https://gcc.gnu.org/releases.html), and [zlib](http://zlib.net) is also required.  The program has been tested after compilation with GCC 5.4.0 and zlib 1.2.8.\n\nTo compile, run `make` in the folder in which the software was downloaded.  
The executable `Genrich` should be produced.\n\n\n### Usage message<a name=\"usage\"></a>\n\n```\nUsage: ./Genrich  -t <file>  -o <file>  [optional arguments]\nRequired arguments:\n  -t  <file>       Input SAM/BAM file(s) for experimental sample(s)\n  -o  <file>       Output peak file (in ENCODE narrowPeak format)\nOptional I/O arguments:\n  -c  <file>       Input SAM/BAM file(s) for control sample(s)\n  -f  <file>       Output bedgraph-ish file for p/q values\n  -k  <file>       Output bedgraph-ish file for pileups and p-values\n  -b  <file>       Output BED file for reads/fragments/intervals\n  -R  <file>       Output file for PCR duplicates (only with -r)\nFiltering options:\n  -r               Remove PCR duplicates\n  -e  <arg>        Comma-separated list of chromosomes to exclude\n  -E  <file>       Input BED file(s) of genomic regions to exclude\n  -m  <int>        Minimum MAPQ to keep an alignment (def. 0)\n  -s  <float>      Keep sec alns with AS >= bestAS - <float> (def. 0)\n  -y               Keep unpaired alignments (def. false)\n  -w  <int>        Keep unpaired alns, lengths changed to <int>\n  -x               Keep unpaired alns, lengths changed to paired avg\nOptions for ATAC-seq:\n  -j               Use ATAC-seq mode (def. false)\n  -d  <int>        Expand cut sites to <int> bp (def. 100)\n  -D               Skip Tn5 adjustments of cut sites (def. false)\nOptions for peak-calling:\n  -p  <float>      Maximum p-value (def. 0.01)\n  -q  <float>      Maximum q-value (FDR-adjusted p-value; def. 1)\n  -a  <float>      Minimum AUC for a peak (def. 200.0)\n  -l  <int>        Minimum length of a peak (def. 0)\n  -g  <int>        Maximum distance between signif. sites (def. 
100)\nOther options:\n  -X               Skip peak-calling\n  -P               Call peaks directly from a log file (-f)\n  -z               Option to gzip-compress output(s)\n  -v               Option to print status updates/counts to stderr\n```\n\n## Attributes<a name=\"attributes\"></a>\n\n### Peak-calling method<a name=\"method\"></a>\n\nHere is an overview of the method used by Genrich to identify peaks (Fig. 1):\n* Parse alignments for the experimental sample and create an experimental \"pileup\" by counting the DNA fragments that cover each position of the genome (additional information about alignment parsing can be found [here](#alignment)).\n* Create a control pileup using the control sample (if available) and background level (additional information about control/background pileup calculation can be found [here](#pileup)).\n* Calculate *p*-values for each genomic position, as described [here](#pvalue).\n* (Optional) Convert *p*-values to *q*-values, as described [here](#qvalue).\n* Calculate the \"area under the curve\" (AUC) for all regions reaching statistical significance (e.g., *q* &lt; 0.05 &rArr; -log(*q*) &gt; 1.301).\n* Combine nearby regions and call peaks whose total AUC is above a threshold (details of peak-calling parameters can be found [here](#peakcalling)).\n\n<figure>\n  <img src=\"figures/figure1.png\" alt=\"Peak-calling by Genrich\" width=\"800\">\n  <figcaption><strong>Figure 1.  Peak-calling by Genrich.</strong>  Information about the sample and the Genrich command can be found <a href=\"https://github.com/jsh58/Genrich#full-analysis-example\">here</a>.  Visualization by <a href=\"http://software.broadinstitute.org/software/igv/\">IGV</a>.</figcaption>\n</figure>\n<br><br>\n\n### Alignment parsing<a name=\"alignment\"></a>\n\nGenrich analyzes paired-end reads aligned to a reference genome.  It correctly infers full fragments as spanning between the 5' ends of two properly paired alignments.  
By default, it does **not** consider unpaired alignments (including those from single-end reads), although there are three options for keeping such alignments, as described [here](#unpaired).\n\nAn alternative analysis mode for ATAC-seq is also provided by Genrich, as described [here](#atacseq).\n\n\n### Multimapping reads<a name=\"multimap\"></a>\n\nGenrich analyzes reads/fragments that map to multiple locations in the genome by adding a fractional count to each location.  This allows for peak detection in regions of the genome that are otherwise inaccessible to the assay.  The input SAM/BAM file(s) must list secondary alignments for multimapping reads/fragments, such as those reported by the short read aligner [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) in [`-k <int>` mode](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#k-mode-search-for-one-or-more-alignments-report-each) or [`-a` mode](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#a-mode-search-for-and-report-all-alignments).  Additional information about the processing of secondary alignments by Genrich can be found in the description of the [`-s` parameter](#sparam).\n\n\n### PCR duplicate removal<a name=\"duplicate\"></a>\n\nGenrich provides an option for removing PCR duplicates (`-r`).  In this process, it analyzes reads/fragments based on their alignments, in three separate groups (proper pairs, discordant pairs, and singletons), and removes those identified as duplicates from further analysis.  One novel feature is that this evaluation takes into account reads/fragments with [multiple alignments](#multimap).  Additional information on the duplicate identification procedure can be found [here](https://github.com/jsh58/Genrich#filtering-options).\n\n\n### Genome length calculation<a name=\"genomelen\"></a>\n\nGenrich computes the genome length as the sum of the lengths of the chromosomes (reference sequences) in the header of the SAM/BAM file.  
The length is reduced if the user specifies chromosomes (`-e`) or genomic regions (`-E`) to be excluded, as described [below](#eparam).  The calculated length is used for computing a [background pileup value](#pileup) and [*q*-values](#qvalue).\n\n\n### Control/background pileup calculation<a name=\"pileup\"></a>\n\nThe background pileup value is calculated by dividing the total sequence information (sum of read/fragment/interval lengths) in the experimental sample by the [calculated genome length](#genomelen).  The net control pileup value at a particular genomic position is the maximum of the background pileup value and the pileup of the control sample at that position (if a control sample is specified).  Note that control pileups are scaled to match the experimental pileup, based on the total sequence information in each.\n\n\n### *p*-value calculation<a name=\"pvalue\"></a>\n\nThe *p*-value for each base of the genome is calculated assuming a null model with a [log-normal distribution](https://en.wikipedia.org/wiki/Log-normal_distribution).  The control/background pileup value is used as the parameter *&mu;*, and the *&sigma;* parameter is 1.2&times;*&mu;* if *&mu;* &le; 7, and 10&times;log<sub>10</sub>(*&mu;*) if *&mu;* &gt; 7.  These formulas, as well as the choice of the log-normal distribution, were determined from a comprehensive review of control samples from various ChIP-seq experiments (human, mouse, and worm) downloaded from [SRA](https://www.ncbi.nlm.nih.gov/sra/).\n* Because the log-normal is a continuous probability distribution, fractional experimental pileup values can be considered.  Such values need to be evaluated by Genrich due to reads/fragments with [multiple alignments](#multimap).\n* Earlier versions of Genrich (prior to v0.5) used the [exponential distribution](https://en.wikipedia.org/wiki/Exponential_distribution#Alternative_parameterization) for the null model.  
Although this distribution has some convenient properties (continuous, one-parameter, -log<sub>10</sub>(*p*) &prop; *x* / *&beta;*), the study described above showed that the exponential distribution was a good fit to the control pileup distribution only *occasionally*.  However, this was still better than the [Poisson distribution](https://en.wikipedia.org/wiki/Poisson_distribution), which is frequently used as a null model in genomics software but was a good fit to the control pileup distribution **never**.\n\n\n### *q*-value calculation<a name=\"qvalue\"></a>\n\nThe *q*-value for each base of the genome is calculated from the *p*-value using the [Benjamini-Hochberg procedure](http://www.math.tau.ac.il/~ybenja/MyPapers/benjamini_hochberg1995.pdf).  The [calculated genome length](#genomelen) is used as the number of hypothesis tests (*m*).\n\n\n### Multiple replicates<a name=\"replicate\"></a>\n\nGenrich calls peaks for multiple replicates collectively.  First, it analyzes the replicates separately, with [*p*-values calculated](#pvalue) for each.  At each genomic position, the multiple replicates' *p*-values are then combined by [Fisher's method](https://en.wikipedia.org/wiki/Fisher's_method#Application_to_independent_test_statistics).  The combined *p*-values are [converted to *q*-values](#qvalue), and peaks are called.  This obviates the need for [IDR](https://www.encodeproject.org/software/idr/) (you're welcome!).\n\n\n## I/O files and options<a name=\"files\"></a>\n\n### Required files<a name=\"required\"></a>\n\n```\n  -t  <file>       Input SAM/BAM file(s) for experimental sample(s)\n```\n* Genrich analyzes alignment files in [SAM/BAM format](https://samtools.github.io/hts-specs/SAMv1.pdf).  
SAM files must have a header.\n* The SAM/BAM files must be sorted by queryname (via `samtools sort -n`).\n* SAM/BAM files for [multiple replicates](#replicate) can be specified, comma-separated (or space-separated, in quotes).\n* Multiple SAM/BAM files for a single replicate should be combined in advance via `samtools merge`.\n* Genrich reads from `stdin` with `-t -`.\n* This file need not be specified when peak-calling directly from a log file previously produced by Genrich ([`-P` option](#pparam)).\n<br><br>\n\n```\n  -o  <file>       Output peak file (in ENCODE narrowPeak format)\n```\n* As indicated, the output file is in [ENCODE narrowPeak format](https://genome.ucsc.edu/FAQ/FAQformat.html#format12).  Here are details of the fields:\n<table>\n  <tr>\n    <td>1. chrom</td>\n    <td>Name of the chromosome</td>\n  </tr>\n  <tr>\n    <td>2. chromStart</td>\n    <td>Starting position of the peak (0-based)</td>\n  </tr>\n  <tr>\n    <td>3. chromEnd</td>\n    <td>Ending position of the peak (not inclusive)</td>\n  </tr>\n  <tr>\n    <td>4. name</td>\n    <td><code>peak_N</code>, where <code>N</code> is the 0-based count</td>\n  </tr>\n  <tr>\n    <td>5. score</td>\n    <td>Average AUC (total AUC / bp) &times; 1000, rounded to the nearest int (max. 1000)</td>\n  </tr>\n  <tr>\n    <td>6. strand</td>\n    <td><code>.</code> (no orientation)</td>\n  </tr>\n  <tr>\n    <td nowrap>7. signalValue</td>\n    <td>Total area under the curve (AUC)</td>\n  </tr>\n  <tr>\n    <td>8. pValue</td>\n    <td>Summit -log<sub>10</sub>(<i>p</i>-value)</td>\n  </tr>\n  <tr>\n    <td>9. qValue</td>\n    <td>Summit -log<sub>10</sub>(<i>q</i>-value), or <code>-1</code> if not available (e.g. without <code>-q</code>)</td>\n  </tr>\n  <tr>\n    <td>10. 
peak</td>\n    <td>Summit position (0-based offset from chromStart): the midpoint of the peak interval with the highest significance (the longest interval in case of ties)</td>\n  </tr>\n</table>\n\n* Here is the portion of the output file corresponding to the peaks called in Figure 1:\n```\nchr1    894446    894988    peak_10    402    .    217.824936    4.344683    1.946031    317\nchr1    895834    896167    peak_11    343    .    114.331093    4.344683    1.946031    90\n```\n* Genrich writes to `stdout` with `-o -`.\n* This file need not be specified when peak-calling is skipped (`-X`).\n\n\n### Optional files<a name=\"optional\"></a>\n\n```\n  -c  <file>       Input SAM/BAM file(s) for control sample(s)\n```\n* Alignment files for control samples (e.g. input DNA) can be specified, although this is not strictly required.\n* SAM/BAM files for [multiple replicates](#replicate) can be listed, comma-separated (or space-separated, in quotes) and in the same order as the experimental files.  Missing control files should be indicated with `null`.\n<br><br>\n\n```\n  -f  <file>       Output bedgraph-ish file for p/q values\n```\n* With a single replicate, this log file lists experimental/control pileup values, *p*- and *q*-values, and significance (`*`) for each interval. Here is the portion of the log file corresponding to the beginning of `peak_10` (Fig. 
1):\n```\nchr1    894435    894436    33.000000    2.477916    3.183460    1.208321\nchr1    894436    894442    34.000000    2.477916    3.231466    1.241843\nchr1    894442    894446    35.000000    2.477916    3.278469    1.274561\nchr1    894446    894447    36.000000    2.477916    3.324516    1.306471    *\nchr1    894447    894450    39.000000    2.477916    3.457329    1.398035    *\nchr1    894450    894451    40.000000    2.477916    3.499948    1.427253    *\nchr1    894451    894460    41.000000    2.477916    3.541798    1.455938    *\n```\n* With multiple replicates, this log file lists *p*-values of each replicate, combined *p*-value, *q*-value, and significance for each interval.\n* Note that this file (as well as the `-k` file, below) is called \"bedgraph-ish\" because it contains multiple `dataValue` fields, which isn't strictly allowed in the [bedGraph format](https://genome.ucsc.edu/goldenpath/help/bedgraph.html).  However, a simple application of `awk` can produce the desired bedgraph files for visualization purposes (see this [awk reference](https://www.tutorialspoint.com/awk/awk_basic_examples.htm) for a guide to printing specific fields of input records).\n* When peak-calling is skipped (`-X`), the significance column is not produced.\n* This file functions as the *input* when peak-calling directly from a log file ([`-P` option](#pparam)).\n<br><br>\n\n```\n  -k  <file>       Output bedgraph-ish file for pileups and p-values\n```\n* For each replicate, sequentially, this file lists a header line (`# experimental file: <name>; control file: <name>`), followed by experimental/control pileups and a *p*-value for each interval. 
This is the way to examine pileup values with multiple replicates, since the `-f` log file does not supply them in that case.\n<br><br>\n\n```\n  -b  <file>       Output BED file for reads/fragments/intervals\n```\n* This is an unsorted [BED file](https://genome.ucsc.edu/FAQ/FAQformat.html#format1) of the reads/fragments/intervals analyzed. The 4th column gives the read name, number of valid alignments, 'E'xperimental or 'C'ontrol, and sample number (0-based), e.g. `SRR5427886.59_2_E_0`.\n<br><br>\n\n```\n  -R  <file>       Output file for PCR duplicates (only with -r)\n```\n* This log file lists the header of each read/fragment classified as a PCR duplicate, followed by the alignment, the header of the read/fragment it matched, and the alignment type. For example:\n```\nSRR5427886.5958     chr4:185201876-185201975            SRR5427886.4688    paired\nSRR5427886.1826     chr12:34372610,+;chr1:91852878,-    SRR5427886.2040    discordant\nSRR5427886.10866    chr14:53438632,+                    SRR5427886.4746    single\n```\n* The duplicates from multiple input files are separated by a comment line listing the next filename, such as `# experimental file #0: SRR5427886.bam`.\n* This file can be used to filter the original SAM/BAM file, using a simple script such as [`getReads.py`](https://github.com/jsh58/rutgers/blob/master/getReads.py), for example.\n<br><br>\n\n\n## Filtering options<a name=\"filter\"></a>\n\n```\n  -r               Remove PCR duplicates\n```\n* With this option, all reads/fragments classified as PCR duplicates are removed from further analysis.  
Reads/fragments are evaluated based on their alignments, in three separate groups:\n  * Proper pairs: A properly paired alignment is classified as a duplicate if the reference name (chromosome) and the 5' positions of the two reads match those of another properly paired alignment.\n  * Discordant pairs (where both R1 and R2 reads align, but not in a proper pair): A discordant alignment is classified as a duplicate if the reference name, 5' position, and strand (orientation) of *both* alignments match those of another discordant alignment.\n  * Singletons (where either R1 or R2 aligns): A singleton alignment is classified as a duplicate if the reference name, 5' position, and strand (orientation) match either end of a properly paired alignment, either end of a discordant pair, or another singleton alignment.\n* Within each of the three groups, Genrich analyzes reads/fragments in order based on the total sums of the quality scores (sums of both R1's and R2's quality scores with paired alignments).  In case of ties, reads are processed in the order they appear in the SAM/BAM.\n* Discordant and singleton duplicates are evaluated only if [unpaired alignments](#unpaired) are to be kept.\n* There is no consideration of read order (R1 vs. R2) by Genrich.  That is, if the 5' coordinates of R1 and R2 of one paired alignment match the coordinates of R2 and R1 of another paired alignment, respectively, it is classified as a duplicate.  The same applies to discordant and singleton duplicates.\n* For reads/fragments with [multiple alignments](#multimap), all secondary alignments within the [`-s` threshold](#sparam) are considered.  If *any* of a read's/fragment's alignments is evaluated as a duplicate, then the whole read/fragment is classified as a duplicate, and *all* of its alignments are discarded.  
Note that the maximum of 10 alignments per read/fragment is not imposed at this stage, and all alignments to skipped chromosomes (`-e`) or genomic regions (`-E`) are still evaluated.\n* For discordant alignments, all combinations of valid R1 and R2 alignments are analyzed.\n<br><br>\n\n\n<a name=\"eparam\"></a>\n```\n  -e  <arg>        Comma-separated list of chromosomes to exclude\n```\n* All alignments to the given list of chromosomes (reference sequences) are excluded from peak-calling.  The alignments' lengths do not factor into the [total sequence information calculation](#pileup), nor into the average fragment length calculation (`-x`), and the alignments are not printed to the `-b` file.\n* For reads/fragments with multiple alignments, the scores of alignments to `-e` chromosomes *are* considered for comparison purposes.\n* The lengths of the `-e` chromosomes are subtracted from the [total genome length](#genomelen) calculated by the program.\n<br><br>\n\n```\n  -E  <file>       Input BED file(s) of genomic regions to exclude\n```\n* All alignments, or portions of alignments, that lie within the given genomic regions are excluded from peak-calling, and no peak may extend into or around an excluded region.  The alignments' lengths (within an excluded region) do not factor into the [total sequence information calculation](#pileup).  However, the full fragment lengths *are* counted for the average fragment length calculation (`-x`), and the full fragments *are* listed in the `-b` file.\n* In the output log files (`-f`, `-k`), excluded regions have experimental/control pileup values of `0.0` and *p*-/*q*-values of `NA`.\n* Multiple BED files can be specified, comma-separated (or space-separated, in quotes).  
Overlapping BED intervals are merged appropriately.\n* The regions' lengths are subtracted from the [total genome length](#genomelen) calculated by the program.\n* Genomic regions to which reads typically do not align uniquely can be specified, but this is not recommended.  Instead, one should consider taking advantage of Genrich's ability to [analyze multimapping reads](#multimap).\n* The accessory script [`findNs.py`](https://github.com/jsh58/Genrich/blob/master/findNs.py) produces a BED file of 'N' homopolymers in a fasta file (e.g. a reference genome).  Its output should be given to Genrich via `-E`.\n<br><br>\n\n```\n  -m  <int>        Minimum MAPQ to keep an alignment (def. 0)\n```\n* All alignments with `MAPQ` less than the given value are eliminated.  This is equivalent to filtering with `samtools view -q <int>`.\n* This option should not be used if the SAM/BAM lists [multiple alignments](#multimap) for some reads/fragments.  Instead, filtering should be accomplished via `-s <float>`, below.\n<br><br>\n\n<a name=\"sparam\"></a>\n```\n  -s  <float>      Keep sec alns with AS >= bestAS - <float> (def. 0)\n```\n* Genrich considers all secondary alignments of [multimapping reads](#multimap), but, by default, it keeps only the alignments whose [scores](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#scores-higher-more-similar) are equal to the best score for the read/fragment.  Setting a value such as `-s 20` causes Genrich also to keep secondary alignments whose scores are within 20 of the best.\n* The SAM/BAM should have alignment scores under the extra field `AS`.  If not, all alignments are considered equivalent.\n* Each of the `N` alignments for a read/fragment is counted as `1/N` for the pileup.\n* To avoid excessive memory usage and the imprecision inherent in floating-point values, a maximum of 10 alignments per read/fragment are analyzed by Genrich.  
Reads/fragments with more than 10 alignments within the `-s` threshold are subsampled based on the best alignment scores; in the case of ties, alignments appearing first in the SAM/BAM are favored.\n* The alignment score for a fragment (pair of reads) is equal to the sum of the reads' individual scores.\n* Properly paired alignments take precedence over unpaired alignments, regardless of the alignment scores.\n* As stated [above](#multimap), [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) reports secondary alignments in [`-k <int>` mode](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#k-mode-search-for-one-or-more-alignments-report-each) or [`-a` mode](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#a-mode-search-for-and-report-all-alignments).  The short read aligner [BWA](http://bio-bwa.sourceforge.net/bwa.shtml) does not report secondary alignments.\n<br><br>\n\n### Unpaired alignments<a name=\"unpaired\"></a>\n\nBy default, Genrich analyzes only properly paired alignments and infers the full fragments as spanning between the 5' ends of the two alignments (Fig. 2).  It does not analyze unpaired alignments unless one of three options is selected:\n```\n  -y               Keep unpaired alignments (def. false)\n  -w  <int>        Keep unpaired alns, lengths changed to <int>\n  -x               Keep unpaired alns, lengths changed to paired avg\n```\n* `-y`: unpaired alignments are kept, just as they appear in the SAM/BAM\n* `-w <int>`: unpaired alignments are kept, with their lengths changed to the given value (from their 5' ends)\n* `-x`: unpaired alignments are kept, with their lengths changed to the average length of fragments inferred from properly paired alignments (excluding those aligning to skipped chromosomes [`-e`])\n\n<figure>\n  <img src=\"figures/figure2.png\" alt=\"Alignment analysis\" width=\"700\">\n  <figcaption><strong>Figure 2.  
Analysis of alignments by Genrich.</strong>  The alignment file <code>example.bam</code> has both properly paired alignments (top left) and unpaired alignments (top right).  By default, Genrich infers the full fragments from the paired alignments and discards the unpaired alignments.  Unpaired alignments can be kept via <code>-y</code>, <code>-w &lt;int&gt;</code>, or <code>-x</code>, as described above.</figcaption>\n</figure>\n<br><br>\n\n## ATAC-seq mode<a name=\"atacseq\"></a>\n\n[ATAC-seq](https://informatics.fas.harvard.edu/atac-seq-guidelines.html#overview) is a method for assessing genomic regions of open chromatin.  Since only the ends of the DNA fragments indicate where the Tn5 transposase enzyme was able to insert into the chromatin, it may not be optimal to interpret alignments as shown above (Fig. 2).  Genrich has an alternative analysis mode for ATAC-seq in which it creates intervals centered on transposase cut sites (Fig. 3).\n\n```\n  -j               Use ATAC-seq mode (def. false)\n  -d  <int>        Expand cut sites to <int> bp (def. 100)\n  -D               Skip Tn5 adjustments of cut sites (def. false)\n```\n\n<figure>\n  <img src=\"figures/figure3.png\" alt=\"ATAC-seq mode\" width=\"700\">\n  <figcaption><strong>Figure 3.  ATAC-seq mode of Genrich.</strong>  Genrich analyzes intervals centered on cut sites (both ends of full fragments, as well as the 5' ends of unpaired alignments if <code>-y</code> is set, adjusted forward by 5bp).  The lengths of the intervals can be changed from the default of <code>-d 100</code>.</figcaption>\n</figure>\n<br><br>\n\nUnpaired alignments can be analyzed with `-y`, though only one interval, centered on the read's 5' end, is inferred.  Both `-w <int>` and `-x` are equivalent to `-y` in ATAC-seq mode.\n\nBy default, Genrich centers the intervals at the ends of the reads/fragments, adjusted *forward* by 5bp to account for the Tn5 transposase occupancy.  
That is, for the 5' ends of fragments (or for reads aligning in a normal orientation), the position is increased by +5, and for the 3' ends of fragments (or for reads aligning in a reverse-complement orientation), the position is adjusted by -5.  To avoid this position adjustment (e.g. for DNase-seq), one can use `-D`.\n\nFor full fragments, when the two cut site intervals overlap, they are merged into a single interval.  To get a BED file of cut sites, one can run `-d 1 -b <file>`.\n\nThe remainder of the peak-calling process (calculating pileups and significance values) is identical to the [default analysis mode](#method).  Note that the interval lengths (*not* the fragment lengths) are used to sum the total sequence information for the calculation of [control/background pileup values](#pileup).\n<br><br>\n\n\n## Peak-calling parameters<a name=\"peakcalling\"></a>\n\n```\n  -p  <float>      Maximum p-value (def. 0.01)\n  -q  <float>      Maximum q-value (FDR-adjusted p-value; def. 1)\n```\n* These parameters establish the statistical threshold below which a base is considered significantly enriched in the experimental sample(s) vs. the control/background.  The significance value is automatically converted to a -log<sub>10</sub> scale by Genrich.\n* If a `-q` threshold is specified, the `-p` threshold is ignored.\n* If a `-q` threshold is not specified, *q*-values are not calculated (reported as `-1`).\n<br><br>\n\n```\n  -a  <float>      Minimum AUC for a peak (def. 200.0)\n```\n* The peak-calling method requires that, for a peak to be called, the total significance of the region must exceed a minimum value. The total significance is calculated as the sum of the -log(*p*) values above the threshold set by `-p` over the length of the region (i.e. the Area Under the -log(*p*) Curve).\n* If a `-q` threshold is specified, the Area Under the -log(*q*) Curve is calculated (see Fig. 1).\n<br><br>\n\n```\n  -l  <int>        Minimum length of a peak (def. 
0)\n```\n* With this option, any potential peak whose length is below the specified value is discarded, regardless of its significance.  The default of 0 means that no peaks are eliminated on this basis.\n<br><br>\n\n```\n  -g  <int>        Maximum distance between signif. sites (def. 100)\n```\n* This parameter sets the maximum distance between sites that achieve significance in order for them to be linked together into the same potential peak.\n<br><br>\n\n\n## Miscellaneous<a name=\"misc\"></a>\n\n```\n  -X               Skip peak-calling\n```\n* This is a convenience option for those who are unsure of the peak-calling parameters but do not want to run the full analysis multiple times.  Genrich interprets the alignment files (including identifying PCR duplicates) and produces intermediate log files, but does not perform the peak-calling step.  The bedgraph-ish log file listing statistics (`-f`) can subsequently be used to call peaks directly (see `-P`, below).\n* With this option, the requirement to specify an output peak file (`-o`) is suspended, and no such file is produced.\n<br><br>\n\n<a name=\"pparam\"></a>\n```\n  -P               Call peaks directly from a log file (-f)\n```\n* This is a convenience option to call peaks from a bedgraph-ish log file (`-f`) produced by a previous Genrich run.  One can modify the [peak-calling parameters](#peakcalling) and explore the results without needing to run the full analysis (and requiring [much less time and memory](#example)).\n* Any excluded chromosomes (`-e`) and genomic regions (`-E`) that were previously specified in the Genrich run that produced the log file are honored.  Additional chromosomes and regions can be specified via the same command-line options, although the results may be slightly different than running the full analysis, due to differences in the calculations of [genome length](#genomelen) and [sequence information](#pileup).  
Note that there is no verification that an excluded region (`-E`) does not extend past the end of a chromosome, unlike when the full analysis is run.\n* With this option, the requirement to specify input SAM/BAM file(s) (`-t`) is suspended, but the input log file (`-f`) *must* be specified.\n* Optional output files (`-k`/`-b`/`-R`) cannot be produced with `-P`.\n<br><br>\n\n```\n  -z               Option to gzip-compress output(s)\n```\n* When selected, all output files are gzip-compressed.\n<br><br>\n\n```\n  -S               Option to skip sort order check\n```\n* When selected, Genrich does not verify that the sort order of the input SAM/BAM file is queryname, even though the file is still parsed under that assumption.  This is a convenience option, to be used only by those who understand the consequences.\n<br><br>\n\n```\n  -L <int>         Set genome length to given value\n```\n* As described [above](#genomelen), Genrich computes the genome length from the header of the SAM/BAM file by default.  With this option, the given value is instead set as the genome length.  This is a convenience option, to be used only by those who understand the consequences.\n<br><br>\n\nOther options:\n```\n  -v/--verbose     Option to print status updates/counts to stderr\n  -h/--help        Print the usage message and exit\n  -V/--version     Print the version and exit\n```\n\n### Full analysis example<a name=\"example\"></a>\n\nA [sequencing run](https://www.ncbi.nlm.nih.gov/sra/SRX2717911[accn]) was downloaded from SRA.  Its reads were adapter-trimmed by [NGmerge](https://github.com/jsh58/NGmerge) and aligned to the human genome (hg19) by [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) with `-k 20`.  The SAM alignments from bowtie2 were piped through [SAMtools](https://www.htslib.org/doc/samtools.html) to sort them by queryname and convert the file to the BAM format.  
The resulting alignment file `SRR5427886.bam` was analyzed by Genrich with the options to remove PCR duplicates (`-r`) and to keep unpaired alignments, extended to the average fragment length (`-x`).  All alignments to chrM and chrY were discarded (`-e chrM,chrY`), as well as alignments to two sets of excluded intervals: regions of 'N' homopolymers in the hg19 genome (produced by [`findNs.py`](https://github.com/jsh58/Genrich/blob/master/findNs.py)) and [high mappability regions](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityRegionsExcludable.bed.gz) (`-E hg19_Ns.bed,wgEncodeDukeMapabilityRegionsExcludable.bed.gz`).  There was no control sample.  Peaks were called using a maximum *q*-value of 0.05 (`-q 0.05`) and a minimum AUC of 20.0 (`-a 20.0`).\n```\n$ ./Genrich  -t SRR5427886.bam  -o SRR5427886.narrowPeak  \\\n  -f SRR5427886.log  -r  -x  -q 0.05  -a 20.0  -v  \\\n  -e chrM,chrY  -E hg19_Ns.bed,wgEncodeDukeMapabilityRegionsExcludable.bed.gz \nProcessing experimental file #0: SRR5427886.bam\n  BAM records analyzed:  146277509\n    Unmapped:              2877550\n    To skipped refs:       2560688\n      (chrM,chrY)\n    Paired alignments:   137462998\n      secondary alns:     67724920\n    Unpaired alignments:   3376273\n      secondary alns:      2232056\n  PCR duplicates --\n    Paired aln sets:      35574220\n      duplicates:          4660723 (13.1%)\n    Discordant aln sets:     64679\n      duplicates:             7736 (12.0%)\n    Singleton aln sets:    1036778\n      duplicates:           475201 (45.8%)\n  Fragments analyzed:     31286234\n    Full fragments:       30616993\n      (avg. length: 226.4bp)\n    Half fragments:         669241\n      (from unpaired alns, extended to 226bp)\n- control file #0 not provided -\n  Background pileup value: 2.477916\nPeak-calling parameters:\n  Genome length: 2826865605bp\n  Significance threshold: -log(q) > 1.301\n  Min. AUC: 20.000\n  Max. 
gap between sites: 100bp\nPeaks identified: 35114 (27918264bp)\n```\nThe total time to execute the above command (analyzing a BAM of 146.3 million alignments and calling peaks) was 10.5min.  It required 17.1GB of memory.\n\nOnce a log file (`SRR5427886.log`) is produced, calling peaks with an alternative set of parameters is accomplished most easily with the [`-P` option](#pparam).  For example, with `-p 0.01 -a 200`:\n```\n$ ./Genrich  -P  -f SRR5427886.log  -o SRR5427886_p01_a200.narrowPeak  -p 0.01  -a 200  -v\nPeak-calling from log file: SRR5427886.log\nPeak-calling parameters:\n  Genome length: 2826865605bp\n  Significance threshold: -log(p) > 2.000\n  Min. AUC: 200.000\n  Max. gap between sites: 100bp\nPeaks identified: 47329 (62194983bp)\n```\nThis required just 36sec and less than 0.1MB of memory.\n\nHowever, if one wanted the alignments to be interpreted differently, such as [ATAC-seq mode (`-j`)](#atacseq), the original command (with `-t SRR5427886.bam`) would need to be rerun.\n\n\n### Warning messages<a name=\"warning\"></a>\n\nIn verbose mode, Genrich may print one or more warnings to `stderr`:\n* `Read N prevented from extending below 0 on <chrom>`: This may occur due to extending unpaired alignments (`-w <int>`, `-x`) or in ATAC-seq mode (`-j`).\n* `Read N prevented from extending past <int> on <chrom>`: This also may occur due to extending unpaired alignments (`-w <int>`, `-x`) or in ATAC-seq mode (`-j`).  A maximum of 128 warning messages of these types (`Read N prevented...`) are printed per SAM/BAM.\n* `Large scaling may mask true signal`: This is printed if the [scaling factor](#pileup) for the control pileup is greater than 5.\n* `BED interval ignored - located off end of reference`: An excluded BED interval (`-E`) whose start coordinate is past the end of the reference sequence is ignored.  One should ensure that the genome version that produced the BED intervals matches that of the SAM/BAM.\n* `BED interval extends past end of ref. 
- edited to <loc>`: An excluded BED interval (`-E`) whose end coordinate is past the end of the reference sequence is adjusted as indicated.  Again, one should ensure that the genome version that produced the BED intervals matches that of the SAM/BAM.\n* `No paired alignments to calculate avg frag length -- Printing unpaired alignments \"as is\"`: When there are *no* properly paired alignments and the [`-x` average extension option](#unpaired) is selected, the unpaired alignments are printed as they appear in the SAM/BAM.\n* `Read N, alignment at <loc> skipped due to overflow`: The maximum difference in pileup values from one genomic position to the next is +32767, and additional read alignments are skipped due to this limitation.  Removing PCR duplicates (`-r`) may help reduce this issue.\n* `Read N, alignment at <loc> skipped due to underflow`: The minimum difference in pileup values from one genomic position to the next is -32768, and additional read alignments are skipped due to this limitation.  Removing PCR duplicates (`-r`) may help reduce this issue.\n* `Read N has more than 128 alignments`: If a read has more than 128 alignments in the SAM/BAM, only the first 128 are considered.  As described [above](#sparam), the best 10 alignments (at most) are ultimately analyzed by Genrich.\n* `All q-values are 1`: When no *p*-values are smaller than the reciprocal of the [calculated genome length](#genomelen), all [*q*-values](#qvalue) are calculated to be 1.  If peaks are expected to be called, one should consider using a *p*-value threshold instead (`-p`).\n* `Skipping chromosome N -- Reads aligning to it were used...`: This occurs when peak-calling directly from a log file (`-P`), and the log file indicates that reads/fragments aligned to a chromosome to be skipped (`-e`).  When Genrich created the log file, the chromosome was not skipped.  
Therefore, the reads/fragments that aligned to it were used in the [background pileup calculation](#pileup), and its length was included in the [genome length calculation](#genomelen).  Those calculations affected the statistics present in the log file, so the called peaks may be slightly different than those called when running the full analysis.\n* `Skipping given BED regions -- Reads aligning to them were used...`: This occurs when peak-calling directly from a log file (`-P`), and the log file indicates that reads/fragments aligned to one or more genomic regions to be skipped (`-E`).  When Genrich created the log file, the regions were not skipped.  Therefore, the reads/fragments that aligned to them were used in the [background pileup calculation](#pileup), and their lengths were included in the [genome length calculation](#genomelen).  Those calculations affected the statistics present in the log file, so the called peaks may be slightly different than those called when running the full analysis.\n* `\"orphan\" alns`: An \"orphan\" alignment is one that the SAM/BAM indicates is properly paired, but its pair could not be found.  This could be due to a poorly formatted SAM/BAM, or possibly (but unlikely) a bug in alignment parsing by Genrich.  This warning appears in the accounting of alignments analyzed.\n\n### Computational requirements<a name=\"memory\"></a>\n\nGenrich runs very quickly but uses a considerable amount of memory.  For starters, it requires 3 bytes for every base-pair of the reference genome, i.e. ~9GB for a human sample.  The number of input files has little effect on memory, but certain analysis options (especially the option to remove PCR duplicates) can greatly increase the memory usage, particularly with large SAM/BAM input files.  See [above](#example) for an example.\n\nGenrich is not multi-threaded.\n<br><br>\n\n\n## Contact<a name=\"contact\"></a>\n\nGenrich\n\nCopyright &copy; 2018  John M. 
Gaspar (jsh58@wildcats.unh.edu)\n\nAny question, concern, or bug report about the program should be posted as an [Issue](https://github.com/jsh58/Genrich/issues) on GitHub.  Before posting, please check previous issues (both Open and Closed) to see if your issue has been addressed already.  Also, please follow these [good GitHub practices](https://hackernoon.com/45-github-issues-dos-and-donts-dfec9ab4b612).\n\n"
  },
  {
    "path": "findNs.py",
    "content": "#!/usr/bin/python\n\n# JMG 10/2018\n\n# Produce BED file of N homopolymers for a\n#   fasta file (e.g. reference genome).\n\nimport sys\nimport gzip\n\ndef openRead(filename):\n  '''\n  Open filename for reading. '-' indicates stdin.\n    '.gz' suffix indicates gzip compression.\n  '''\n  if filename == '-':\n    return sys.stdin\n  try:\n    if filename[-3:] == '.gz':\n      f = gzip.open(filename, 'rt')\n    else:\n      f = open(filename, 'r')\n  except IOError:\n    sys.stderr.write('Error! Cannot open %s for reading\\n' % filename)\n    sys.exit(-1)\n  return f\n\ndef openWrite(filename):\n  '''\n  Open filename for writing. '-' indicates stdout.\n    '.gz' suffix indicates gzip compression.\n  '''\n  if filename == '-':\n    return sys.stdout\n  try:\n    if filename[-3:] == '.gz':\n      f = gzip.open(filename, 'wb')\n    else:\n      f = open(filename, 'w')\n  except IOError:\n    sys.stderr.write('Error! Cannot open %s for writing\\n' % filename)\n    sys.exit(-1)\n  return f\n\ndef printNs(fOut, head, seq, minLen):\n  '''\n  Print BED intervals of Ns.\n  '''\n  count = 0\n  start = -1\n  for i in range(len(seq)):\n    if seq[i].upper() != 'N':\n      if start != -1:\n        if i - start >= minLen:\n          fOut.write('\\t'.join([head, str(start), str(i)]) + '\\n')\n          count += 1\n        start = -1\n\n    else:\n      if start == -1:\n        start = i\n\n  if start != -1:\n    if i - start >= minLen:\n      fOut.write('\\t'.join([head, str(start), str(i)]) + '\\n')\n      count += 1\n\n  return count\n\ndef parseFasta(fIn, fOut, minLen):\n  '''\n  Parse fasta file, write output on the fly.\n  '''\n  count = pureNs = 0\n  head = ''    # header (1st space-delim token)\n  seq = ''     # sequence is pure Ns\n  length = 0   # length of sequence\n\n  # analyze fasta reads\n  for line in fIn:\n    if line[0] == '>':\n\n      # process previous read\n      if head:\n        count += 1\n        pureNs += printNs(fOut, head, seq, 
minLen)\n\n      # start new read\n      head = line.rstrip().split(' ')[0][1:]\n      seq = ''\n      length = 0\n\n    elif head:\n      # save sequence\n      seq += line.rstrip()\n\n  if fIn != sys.stdin:\n    fIn.close()\n\n  # process last read\n  if head:\n    count += 1\n    pureNs += printNs(fOut, head, seq, minLen)\n\n  if fOut != sys.stdout:\n    fOut.close()\n\n  return count, pureNs\n\ndef main():\n  '''Main.'''\n  args = sys.argv[1:]\n  if len(args) < 2:\n    sys.stderr.write('Usage: python findNs.py  <input>')\n    sys.stderr.write('  <output>  [<minLen>]\\n')\n    sys.stderr.write('  <input>     Input fasta file\\n')\n    sys.stderr.write('  <output>    Output BED file of \\'N\\' homopolymers\\n')\n    sys.stderr.write('  <minLen>    Minimum length of Ns (def. 100bp)\\n')\n    sys.exit(-1)\n\n  # get CL args\n  fIn = openRead(args[0])\n  fOut = openWrite(args[1])\n  minLen = 100\n  if len(args) > 2:\n    minLen = int(args[2])\n\n  # parse fasta\n  count, pureNs = parseFasta(fIn, fOut, minLen)\n\n  sys.stderr.write('Total fasta sequences in %s: %d\\n' % (args[0], count))\n  sys.stderr.write('Intervals of Ns (min. %dbp): %d\\n' % (minLen, pureNs))\n\nif __name__ == '__main__':\n  main()\n"
  }
]