[
  {
    "path": "BAFregress.c",
    "content": "/* The MIT License\n\n   Copyright (C) 2024-2025 Giulio Genovese\n\n   Author: Giulio Genovese <giulio.genovese@gmail.com>\n\n   Permission is hereby granted, free of charge, to any person obtaining a copy\n   of this software and associated documentation files (the \"Software\"), to deal\n   in the Software without restriction, including without limitation the rights\n   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n   copies of the Software, and to permit persons to whom the Software is\n   furnished to do so, subject to the following conditions:\n\n   The above copyright notice and this permission notice shall be included in\n   all copies or substantial portions of the Software.\n\n   THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n   THE SOFTWARE.\n\n */\n\n#include <stdio.h>\n#include <unistd.h>\n#include <getopt.h>\n#include <errno.h>\n#include <htslib/vcf.h>\n#include <htslib/synced_bcf_reader.h>\n#include <htslib/vcfutils.h>\n#include <htslib/ksort.h>\n#include \"bcftools.h\"\n\n#define BAFREGRESS_VERSION \"2025-08-19\"\n\n#define GT_NC 0\n#define GT_AA 1\n#define GT_AB 2\n#define GT_BB 3\n\nKSORT_INIT_GENERIC(float)\n\n/******************************************\n * PLUGIN                                 *\n ******************************************/\n\ninline static double sqr(double x) { return x * x; }\n\nconst char *about(void) { return \"Detects and estimates sample contamination using BAF intensity data.\\n\"; }\n\nstatic const char *usage_text(void) {\n    return \"\\n\"\n           
\"About: Detects and estimates sample contamination. (version \" BAFREGRESS_VERSION\n           \" http://github.com/freeseek/gtc2vcf)\\n\"\n           \"[ Jun, G. et al. Detecting and Estimating Contamination of Human DNA Samples in Sequencing\\n\"\n           \"and Array-Based Genotype Data. AJHG 91, 839-848 (2012) http://doi.org/10.1016/j.ajhg.2012.09.004 ]\\n\"\n           \"\\n\"\n           \"Usage: bcftools +BAFregress [options] <in.vcf.gz>\\n\"\n           \"\\n\"\n           \"Plugin options:\\n\"\n           \"        --threshold <float>         minimum allele frequency for BAF regression [0.1]\\n\"\n           \"    -a, --af <file>                 file with allele frequency information\\n\"\n           \"        --tag <string>              allele frequency INFO tag [AC/AN]\\n\"\n           \"        --adjust-BAF                minimum number of genotypes for a cluster to median adjust BAF (-1 for \"\n           \"no adjustment) [5]\\n\"\n           \"        --truncate-BAF              truncates BAF values between 0 and 1 and turns off adjustment to \"\n           \"recover original behavior\\n\"\n           \"        --use-MAF                   uses minor allele frequency rather than A/B allele frequency to \"\n           \"recover original behavior\\n\"\n           \"    -e, --estimates <file>          write BAF regression estimates to a file [standard output]\\n\"\n           \"    -o, --output <file>             write VCF output to a file\\n\"\n           \"    -O, --output-type u|b|v|z[0-9]  u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level \"\n           \"[v]\\n\"\n           \"    -r, --regions <region>          restrict to comma-separated list of regions\\n\"\n           \"    -R, --regions-file <file>       restrict to regions listed in a file\\n\"\n           \"        --regions-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant \"\n           \"overlaps (2) [1]\\n\"\n           \"    -t, 
--targets [^]<region>       similar to -r but streams rather than index-jumps. Exclude regions \"\n           \"with \\\"^\\\" prefix\\n\"\n           \"    -T, --targets-file [^]<file>    similar to -R but streams rather than index-jumps. Exclude regions \"\n           \"with \\\"^\\\" prefix\\n\"\n           \"        --targets-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant \"\n           \"overlaps (2) [0]\\n\"\n           \"        --threads <int>             number of extra output compression threads [0]\\n\"\n           \"    -s, --samples [^]<list>         comma separated list of samples to include (or exclude with \\\"^\\\" \"\n           \"prefix)\\n\"\n           \"    -S, --samples-file [^]<file>    file of samples to include (or exclude with \\\"^\\\" prefix)\\n\"\n           \"        --force-samples             only warn about unknown subset samples\\n\"\n           \"    -W, --write-index[=FMT]         Automatically index the output files [off]\\n\"\n           \"\\n\"\n           \"Example:\\n\"\n           \"    bcftools +BAFregress file.bcf\\n\"\n           \"    bcftools +BAFregress --tag AF file.bcf\\n\"\n           \"    bcftools +BAFregress --af 1kGP_high_coverage_Illumina.sites.bcf file.bcf\\n\"\n           \"    bcftools +BAFregress --af 1kGP_high_coverage_Illumina.sites.bcf --truncate-BAF --use-MAF file.bcf\\n\"\n           \"\\n\";\n}\n\nint run(int argc, char **argv) {\n    float af_threshold = 0.1;\n    char *af_fname = NULL;\n    char *af_tag = NULL;\n    int adj_baf = 5;\n    int truncate_baf = 0;\n    int use_maf = 0;\n    char *estimate_fname = \"-\";\n    char *output_fname = NULL;\n    int output_type = FT_VCF;\n    int clevel = -1;\n    int regions_overlap = 1;\n    int targets_overlap = 0;\n    int n_threads = 0;\n    char *targets_list = NULL;\n    int targets_is_file = 0;\n    char *regions_list = NULL;\n    int regions_is_file = 0;\n    char *sample_names = NULL;\n    int sample_is_file = 0;\n 
   int force_samples = 0;\n    int write_index = 0;\n    char *index_fname;\n    htsFile *out_fh = NULL;\n\n    static struct option loptions[] = {{\"threshold\", required_argument, NULL, 1},\n                                       {\"af\", required_argument, NULL, 'a'},\n                                       {\"tag\", required_argument, NULL, 2},\n                                       {\"adjust-BAF\", required_argument, NULL, 3},\n                                       {\"truncate-BAF\", no_argument, NULL, 4},\n                                       {\"use-MAF\", no_argument, NULL, 5},\n                                       {\"estimates\", required_argument, NULL, 'e'},\n                                       {\"output\", required_argument, NULL, 'o'},\n                                       {\"output-type\", required_argument, NULL, 'O'},\n                                       {\"threads\", required_argument, NULL, 6},\n                                       {\"regions\", required_argument, NULL, 'r'},\n                                       {\"regions-file\", required_argument, NULL, 'R'},\n                                       {\"regions-overlap\", required_argument, NULL, 7},\n                                       {\"targets\", required_argument, NULL, 't'},\n                                       {\"targets-file\", required_argument, NULL, 'T'},\n                                       {\"targets-overlap\", required_argument, NULL, 8},\n                                       {\"samples\", required_argument, NULL, 's'},\n                                       {\"samples-file\", required_argument, NULL, 'S'},\n                                       {\"force-samples\", no_argument, NULL, 9},\n                                       {\"write-index\", optional_argument, NULL, 'W'},\n                                       {0, 0, 0, 0}};\n    int c;\n    char *tmp;\n    while ((c = getopt_long(argc, argv, \"h?a:e:o:O:r:R:t:T:s:S:\", loptions, NULL)) >= 0) {\n    
    switch (c) {\n        case 1:\n            af_threshold = strtof(optarg, &tmp);\n            if (*tmp) error(\"Could not parse: --threshold %s\\n\", optarg);\n            if (af_threshold <= 0.0 || af_threshold >= 1.0) error(\"--threshold must input a value between 0 and 1\\n\");\n            break;\n        case 'a':\n            af_fname = optarg;\n            break;\n        case 2:\n            af_tag = optarg;\n            break;\n        case 3:\n            adj_baf = (int)strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --adjust-BAF %s\\n\", optarg);\n            break;\n        case 4:\n            truncate_baf = 1;\n            break;\n        case 5:\n            use_maf = 1;\n            break;\n        case 'e':\n            estimate_fname = optarg;\n            break;\n        case 'o':\n            output_fname = optarg;\n            break;\n        case 'O':\n            switch (optarg[0]) {\n            case 'b':\n                output_type = FT_BCF_GZ;\n                break;\n            case 'u':\n                output_type = FT_BCF;\n                break;\n            case 'z':\n                output_type = FT_VCF_GZ;\n                break;\n            case 'v':\n                output_type = FT_VCF;\n                break;\n            default: {\n                clevel = strtol(optarg, &tmp, 10);\n                if (*tmp || clevel < 0 || clevel > 9) error(\"The output type \\\"%s\\\" not recognised\\n\", optarg);\n            }\n            };\n            if (optarg[1]) {\n                clevel = strtol(optarg + 1, &tmp, 10);\n                if (*tmp || clevel < 0 || clevel > 9)\n                    error(\"Could not parse argument: --compression-level %s\\n\", optarg + 1);\n            }\n            break;\n        case 6:\n            n_threads = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse argument: --threads %s\\n\", optarg);\n            break;\n        case 'r':\n          
  regions_list = optarg;\n            break;\n        case 'R':\n            regions_list = optarg;\n            regions_is_file = 1;\n            break;\n        case 7:\n            if (!strcasecmp(optarg, \"0\"))\n                regions_overlap = 0;\n            else if (!strcasecmp(optarg, \"1\"))\n                regions_overlap = 1;\n            else if (!strcasecmp(optarg, \"2\"))\n                regions_overlap = 2;\n            else\n                error(\"Could not parse: --regions-overlap %s\\n\", optarg);\n            break;\n        case 't':\n            targets_list = optarg;\n            break;\n        case 'T':\n            targets_list = optarg;\n            targets_is_file = 1;\n            break;\n        case 8:\n            if (!strcasecmp(optarg, \"0\"))\n                targets_overlap = 0;\n            else if (!strcasecmp(optarg, \"1\"))\n                targets_overlap = 1;\n            else if (!strcasecmp(optarg, \"2\"))\n                targets_overlap = 2;\n            else\n                error(\"Could not parse: --targets-overlap %s\\n\", optarg);\n            break;\n        case 's':\n            sample_names = optarg;\n            break;\n        case 'S':\n            sample_names = optarg;\n            sample_is_file = 1;\n            break;\n        case 9:\n            force_samples = 1;\n            break;\n        case 'W':\n            if (!(write_index = write_index_parse(optarg))) error(\"Unsupported index format '%s'\\n\", optarg);\n            break;\n        case 'h':\n        case '?':\n        default:\n            error(\"%s\", usage_text());\n            break;\n        }\n    }\n\n    if (truncate_baf) adj_baf = -1;\n\n    char *input_fname = NULL;\n    if (optind == argc) {\n        if (!isatty(fileno((FILE *)stdin))) {\n            input_fname = \"-\"; // reading from stdin\n        } else {\n            error(\"%s\", usage_text());\n        }\n    } else if (optind + 1 != argc) {\n        error(\"%s\", 
usage_text());\n    } else {\n        input_fname = argv[optind];\n    }\n\n    bcf_srs_t *srs = bcf_sr_init();\n    if (af_fname) {\n        bcf_sr_set_opt(srs, BCF_SR_REQUIRE_IDX);\n        bcf_sr_set_opt(srs, BCF_SR_PAIR_LOGIC, BCF_SR_PAIR_EXACT);\n    }\n\n    if (regions_list) {\n        bcf_sr_set_opt(srs, BCF_SR_REGIONS_OVERLAP, regions_overlap);\n        if (bcf_sr_set_regions(srs, regions_list, regions_is_file) < 0)\n            error(\"Failed to read the regions: %s\\n\", regions_list);\n    }\n    if (targets_list) {\n        bcf_sr_set_opt(srs, BCF_SR_TARGETS_OVERLAP, targets_overlap);\n        if (bcf_sr_set_targets(srs, targets_list, targets_is_file, 0) < 0)\n            error(\"Failed to read the targets: %s\\n\", targets_list);\n    }\n    if (bcf_sr_set_threads(srs, n_threads) < 0) error(\"Failed to create threads\\n\");\n    if (!bcf_sr_add_reader(srs, input_fname))\n        error(\"Failed to open %s: %s\\n\", input_fname, bcf_sr_strerror(srs->errnum));\n    if (af_fname && !bcf_sr_add_reader(srs, af_fname))\n        error(\"Failed to open %s: %s\\n\", af_fname, bcf_sr_strerror(srs->errnum));\n\n    bcf_hdr_t *hdr = bcf_sr_get_header(srs, 0);\n    bcf_hdr_t *af_hdr = af_fname ? 
bcf_sr_get_header(srs, 1) : NULL;\n\n    if (sample_names) {\n        int ret = bcf_hdr_set_samples(hdr, sample_names, sample_is_file);\n        if (ret < 0)\n            error(\"Error parsing the list of samples: %s\\n\", sample_names);\n        else if (force_samples && ret > 0)\n            error(\"Sample name mismatch: sample #%d not found in the header\\n\", ret);\n    }\n\n    // get IDs for all VCF formats\n    int gt_id = bcf_hdr_id2int(hdr, BCF_DT_ID, \"GT\");\n    if (gt_id < 0) error(\"Format GT was not found in the input header\\n\");\n    int baf_id = bcf_hdr_id2int(hdr, BCF_DT_ID, \"BAF\");\n    if (baf_id < 0) error(\"Format BAF was not found in the input header\\n\");\n    int allele_a_id = bcf_hdr_id2int(hdr, BCF_DT_ID, \"ALLELE_A\");\n    if (allele_a_id < 0) error(\"Format ALLELE_A was not found in the input header\\n\");\n    int allele_b_id = bcf_hdr_id2int(hdr, BCF_DT_ID, \"ALLELE_B\");\n    if (allele_b_id < 0) error(\"Format ALLELE_B was not found in the input header\\n\");\n    int af_id = -1;\n    if (af_tag) {\n        af_id = bcf_hdr_id2int(af_hdr ? af_hdr : hdr, BCF_DT_ID, af_tag);\n        if (af_id < 0) error(\"Format %s was not found in the allele frequency header\\n\", af_tag);\n    }\n\n    FILE *est_fh = strcmp(\"-\", estimate_fname) ? 
fopen(estimate_fname, \"w\") : stdout;\n    if (!est_fh) error(\"Error: cannot write to %s\\n\", estimate_fname);\n\n    // output VCF\n    if (output_fname) {\n        char wmode[8];\n        set_wmode(wmode, output_type, output_fname, clevel);\n        out_fh = hts_open(output_fname, wmode);\n        if (out_fh == NULL) error(\"[%s] Error: cannot write to \\\"%s\\\": %s\\n\", __func__, output_fname, strerror(errno));\n        if (n_threads) hts_set_opt(out_fh, HTS_OPT_THREAD_POOL, srs->p);\n        if (bcf_hdr_write(out_fh, hdr) < 0) error(\"Unable to write to output VCF file\\n\");\n        if (init_index2(out_fh, hdr, output_fname, &index_fname, write_index) < 0)\n            error(\"Error: failed to initialise index for %s\\n\", output_fname);\n    }\n\n    int n_smpls = bcf_hdr_nsamples(hdr);\n    if (!af_hdr && !af_tag && n_smpls < 30)\n        fprintf(\n            stderr,\n            \"Input VCF only includes %d samples. We recommend using a separate VCF to infer marker allele frequency\\n\",\n            n_smpls);\n\n    int *arr = NULL;\n    int marr = 0;\n    float *baf_arr = NULL;\n    int nbaf_arr = 0;\n    int8_t *gts = (int8_t *)calloc(n_smpls, sizeof(int8_t));\n    float *tmp_arr = (float *)calloc(n_smpls, sizeof(float));\n    float *sumx2 = (float *)calloc(n_smpls, sizeof(float));\n    float *sumxy = (float *)calloc(n_smpls, sizeof(float));\n    float *sumx = (float *)calloc(n_smpls, sizeof(float));\n    float *sumy = (float *)calloc(n_smpls, sizeof(float));\n    int *n = (int *)calloc(n_smpls, sizeof(int));\n\n    // run through each record present in both VCFs\n    int i, j;\n    while (bcf_sr_next_line(srs)) {\n        bcf1_t *line = bcf_sr_get_line(srs, 0);\n        if (!line) continue;\n        if (out_fh && bcf_write1(out_fh, hdr, line) != 0)\n            error(\"[%s] Error: cannot write to %s\\n\", __func__, output_fname);\n\n        bcf1_t *af_line = af_hdr ? 
bcf_sr_get_line(srs, 1) : line;\n        if (line->n_allele != 2 || !af_line || af_line->n_allele != 2) continue;\n\n        // skip lines where the allele frequency is less than 0.01 (or greater than 0.99)\n        double af;\n        if (af_tag) {\n            bcf_info_t *af_info = bcf_get_info_id(af_line, af_id);\n            af = af_info ? (double)af_info->v1.f : NAN;\n        } else {\n            hts_expand(int, af_line->n_allele, marr, arr);\n            int ret = bcf_calc_ac(af_hdr ? af_hdr : hdr, af_line, arr, BCF_UN_INFO | BCF_UN_FMT);\n            if (ret <= 0) continue;\n            int an = 0;\n            for (i = 0; i < af_line->n_allele; i++) an += arr[i];\n            af = (double)arr[1] / (double)an;\n        }\n        if (isnan(af) || af < af_threshold || af > 1.0 - af_threshold) continue;\n        if (use_maf && af > 0.5) af = 1.0 - af; // uses MAF instead of AF to avoid problems with flipped Illumina probes\n\n        // skip lines where ALLELE_A and ALLELE_B refer to alleles missing from the record (it should not happen)\n        bcf_info_t *allele_a_info = bcf_get_info_id(line, allele_a_id);\n        int8_t allele_a = allele_a_info ? (int8_t)allele_a_info->v1.i : bcf_int8_missing;\n        bcf_info_t *allele_b_info = bcf_get_info_id(line, allele_b_id);\n        int8_t allele_b = allele_b_info ? (int8_t)allele_b_info->v1.i : bcf_int8_missing;\n        if (allele_a < 0 || allele_a >= line->n_allele || allele_b < 0 || allele_b >= line->n_allele) continue;\n        if (allele_b == 0) af = 1.0 - af; // flip the allele frequency if ALLELE_B is the reference\n\n        // skip lines missing genotypes (e.g. 
intensity only sites) or with ploidy other than 2\n        int n_aa = 0, n_ab = 0, n_bb = 0;\n        bcf_fmt_t *gt_fmt = bcf_get_fmt_id(line, gt_id);\n        if (!gt_fmt || gt_fmt->n != 2) continue;\n#define BRANCH(type_t, bcf_type_vector_end)                                                                            \\\n    {                                                                                                                  \\\n        type_t *p = (type_t *)gt_fmt->p;                                                                               \\\n        for (i = 0; i < n_smpls; i++, p += 2) {                                                                        \\\n            gts[i] = GT_NC;                                                                                            \\\n            if (p[0] == bcf_type_vector_end || bcf_gt_is_missing(p[0]) || p[1] == bcf_type_vector_end                  \\\n                || bcf_gt_is_missing(p[1]))                                                                            \\\n                continue;                                                                                              \\\n            type_t allele_0 = bcf_gt_allele(p[0]);                                                                     \\\n            type_t allele_1 = bcf_gt_allele(p[1]);                                                                     \\\n            if (allele_0 == allele_a && allele_1 == allele_a) {                                                        \\\n                gts[i] = GT_AA;                                                                                        \\\n                n_aa++;                                                                                                \\\n            } else if ((allele_0 == allele_a && allele_1 == allele_b)                                                  \\\n                       || (allele_0 == allele_b && allele_1 == allele_a)) { 
                                           \\\n                gts[i] = GT_AB;                                                                                        \\\n                n_ab++;                                                                                                \\\n            } else if (allele_0 == allele_b && allele_1 == allele_b) {                                                 \\\n                gts[i] = GT_BB;                                                                                        \\\n                n_bb++;                                                                                                \\\n            }                                                                                                          \\\n        }                                                                                                              \\\n    }\n        switch (gt_fmt->type) {\n        case BCF_BT_INT8:\n            BRANCH(int8_t, bcf_int8_vector_end);\n            break;\n        case BCF_BT_INT16:\n            BRANCH(int16_t, bcf_int16_vector_end);\n            break;\n        case BCF_BT_INT32:\n            BRANCH(int32_t, bcf_int32_vector_end);\n            break;\n        default:\n            error(\"Unexpected type %d\\n\", gt_fmt->type);\n        }\n#undef BRANCH\n\n        int nbaf = bcf_get_format_float(hdr, line, \"BAF\", &baf_arr, &nbaf_arr);\n        if (nbaf != n_smpls) continue; // wrong number of BAF values\n\n        // adjust BAF\n        float adj_baf_aa = 0.0;\n        float adj_baf_bb = 0.0;\n        if (adj_baf != -1) {\n            j = 0;\n            if (n_aa >= adj_baf) {\n                for (i = 0; i < n_smpls; i++)\n                    if (gts[i] == GT_AA) tmp_arr[j++] = baf_arr[i];\n                adj_baf_aa = ks_ksmall_float((size_t)j, tmp_arr, (size_t)j / 2);\n                if (j % 2 == 0) adj_baf_aa = (adj_baf_aa + tmp_arr[j / 2 - 1]) * 0.5f;\n            }\n            j = 
0;\n            if (n_bb >= adj_baf) {\n                for (i = 0; i < n_smpls; i++)\n                    if (gts[i] == GT_BB) tmp_arr[j++] = baf_arr[i];\n                adj_baf_bb = ks_ksmall_float((size_t)j, tmp_arr, (size_t)j / 2);\n                if (j % 2 == 0) adj_baf_bb = (adj_baf_bb + tmp_arr[j / 2 - 1]) * 0.5f;\n                adj_baf_bb -= 1.0;\n            }\n        } else if (truncate_baf) { // truncates the BAF between 0.0 and 1.0 like Illumina does\n            for (i = 0; i < n_smpls; i++) {\n                if (baf_arr[i] < 0.0)\n                    baf_arr[i] = 0.0;\n                else if (baf_arr[i] > 1.0)\n                    baf_arr[i] = 1.0;\n            }\n        }\n\n        for (i = 0; i < n_smpls; i++) {\n            double baf;\n            if (gts[i] == GT_AA) {\n                baf = (double)(baf_arr[i] - adj_baf_aa);\n                sumx2[i] += sqr(af);\n                sumxy[i] += af * baf;\n                sumx[i] += af;\n                sumy[i] += baf;\n            } else if (gts[i] == GT_BB) {\n                baf = (double)(baf_arr[i] - adj_baf_bb);\n                sumx2[i] += sqr(1.0 - af);\n                sumxy[i] += (1.0 - af) * (1.0 - baf);\n                sumx[i] += 1.0 - af;\n                sumy[i] += 1.0 - baf;\n            } else\n                continue;\n            n[i]++;\n        }\n    }\n\n    fprintf(est_fh, \"sample_id\\tbaf_regress\\tNhom\\n\");\n    for (i = 0; i < n_smpls; i++) {\n        double denom = (double)n[i] * sumx2[i] - sqr(sumx[i]);\n        double m = denom ? (n[i] * sumxy[i] - sumx[i] * sumy[i]) / denom : NAN;\n        // double b = denom ? 
(sumy[i] * sumx2[i] - sumx[i] * sumxy[i]) / denom : NAN;\n        fprintf(est_fh, \"%s\\t%.4f\\t%d\\n\", hdr->samples[i], m, n[i]);\n    }\n\n    if (est_fh != stdout && est_fh != stderr) fclose(est_fh);\n\n    // close output VCF\n    if (output_fname) {\n        if (write_index) {\n            if (bcf_idx_save(out_fh) < 0) {\n                if (hts_close(out_fh) != 0)\n                    error(\"Close failed %s\\n\", strcmp(output_fname, \"-\") ? output_fname : \"stdout\");\n                error(\"Error: cannot write to index %s\\n\", index_fname);\n            }\n            free(index_fname);\n        }\n        hts_close(out_fh);\n    }\n\n    free(arr);\n    free(baf_arr);\n    free(gts);\n    free(tmp_arr);\n    free(sumx2);\n    free(sumxy);\n    free(sumx);\n    free(sumy);\n    free(n);\n    bcf_sr_destroy(srs);\n\n    return 0;\n}\n"
  },
  {
    "path": "HapMap.md",
    "content": "HapMap\n======\n\nA tutorial for how to convert HapMap data from Illumina and Affymetrix arrays to a GRCh38 VCF using gtc2vcf\n\n<!--ts-->\n   * [Download manifest files](#download-manifest-files)\n   * [Download and unpack IDAT and CEL files](#download-and-unpack-idat-and-cel-files)\n   * [Create sample maps](#create-sample-maps)\n   * [Convert IDATs to GTCs](#convert-idats-to-gtcs)\n   * [Convert GTCs to VCF](#convert-gtcs-to-vcf)\n   * [Convert CELs to CHPs](#convert-cels-to-chps)\n   * [Convert CHPs to VCF](#convert-chps-to-vcf)\n<!--te-->\n\nDownload manifest files\n=======================\n\nDownload HumanCNV370v1 manifest and cluster files from [Illumina](http://support.illumina.com/downloads/humancnv370-duo_v10_product_files.html) and [GEO](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6986)\n```\nwget ftp://webdata:webdata@ftp.illumina.com/downloads/ProductFiles/HumanCNV370/HumanCNV370-Duo/humancnv370v1_c.bpm\nwget ftp://webdata2:webdata2@ftp.illumina.com/downloads/ProductFiles/HumanCNV370/HumanCNV370-Duo/HumanCNV370v1_C.egt\nwget http://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL6nnn/GPL6986/suppl/GPL6986_HumanCNV370v1_C.csv.gz\ngunzip GPL6986_HumanCNV370v1_C.csv.gz\n/bin/mv GPL6986_HumanCNV370v1_C.csv HumanCNV370v1_C.csv\n```\n\nDownload HumanOmni2.5-4v1 manifest and cluster files from [Illumina](http://support.illumina.com/downloads/humanomni2-5-quad_product_files.html)\n```\nwget ftp://webdata2:webdata2@ftp.illumina.com/MyIllumina/94afb35e-7c11-45cc-8a65-d868af527c54/HumanOmni2.5-4v1_H.bpm\nwget ftp://webdata2:webdata2@ftp.illumina.com/MyIllumina/f003e017-1761-4348-958f-03997a30cf67/HumanOmni2.5-4v1_H.egt\nwget ftp://webdata2:webdata2@ftp.illumina.com/MyIllumina/d5578cf6-bb3b-4b4b-98d3-21edc5bcbd45/HumanOmni2.5-4v1_H.csv\n```\n\nDownload HumanOmni25M-8v1-1 manifest and cluster files from [Illumina](ftp://webdata2:webdata2@ftp.illumina.com/downloads/productfiles/humanomni25) and 
[GEO](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL20641)\n```\nwget http://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL20nnn/GPL20641/suppl/GPL20641_HumanOmni2.5M-8v1-1_B.bpm.gz\nwget ftp://webdata2:webdata2@ftp.illumina.com/downloads/productfiles/humanomni25/humanomni2-5m-8v1-1_b.egt\nwget http://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL20nnn/GPL20641/suppl/GPL20641_HumanOmni25M-8v1-1_B.csv.gz\ngunzip GPL20641_HumanOmni2.5M-8v1-1_B.bpm.gz\ngunzip GPL20641_HumanOmni25M-8v1-1_B.csv.gz\n/bin/mv GPL20641_HumanOmni2.5M-8v1-1_B.bpm HumanOmni25M-8v1-1_B.bpm\n/bin/mv GPL20641_HumanOmni25M-8v1-1_B.csv HumanOmni25M-8v1-1_B.csv\n```\n\nDownload GenomeWideEx_6 and GenomeWideSNP_6 library and annotation files from [Affymetrix](http://www.affymetrix.com/support/technical/byproduct.affx?product=genomewidesnp_6)\n```\nwget http://tools.thermofisher.com/content/sfs/supportfiles/genomewidesnp6_libraryfile.zip\nwget http://www.affymetrix.com/Auth/analysis/downloads/lf/genotyping/GenomeWideSNP_6/SNP6_supplemental_axiom_analysis_files.zip\nwget http://www.affymetrix.com/Auth/analysis/downloads/na35/genotyping/GenomeWideSNP_6.na35.annot.csv.zip\nunzip -oj genomewidesnp6_libraryfile.zip CD_GenomeWideSNP_6_rev3/Full/GenomeWideSNP_6/LibFiles/GenomeWideSNP_6.{cdf,chr{X,Y}probes,specialSNPs}\nunzip -o SNP6_supplemental_axiom_analysis_files.zip GenomeWideSNP_6.{generic_prior.txt,apt-probeset-genotype.AxiomGT1.xml,AxiomGT1.sketch}\nunzip -o GenomeWideSNP_6.na35.annot.csv.zip GenomeWideSNP_6.na35.annot.csv\n/bin/rm genomewidesnp6_libraryfile.zip SNP6_supplemental_axiom_analysis_files.zip GenomeWideSNP_6.na35.annot.csv.zip\n```\n\nRe-align flanking sequences to GRCh38\n```\nfor chip in HumanCNV370v1_C humanomni25m-8v1-1_b HumanOmni2.5-4v1_H; do\n  bcftools +gtc2vcf --csv $chip.csv --fasta-flank | \\\n    bwa mem -M $HOME/res/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna - | \\\n    samtools view -bS -o $chip.bam\ndone\nbcftools +affy2vcf --csv GenomeWideSNP_6.na35.annot.csv --fasta-flank | 
\\\n  bwa mem -M $HOME/res/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna - | \\\n  samtools view -bS -o GenomeWideSNP_6.na35.annot.bam\n```\n\nDownload and unpack IDAT and CEL files\n======================================\n\n```\nwget http://bioconductor.org/packages/release/data/annotation/src/contrib/hapmap370k_1.0.1.tar.gz\nwget -nH --cut-dirs 2 -r ftp://ftp.ncbi.nlm.nih.gov/hapmap/raw_data/hapmap3_affy6.0/\nwget -nH --cut-dirs 5 -r ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/hd_genotype_chip/\n\nmkdir -p idats\ntar xzvf hapmap370k_1.0.1.tar.gz -C idats hapmap370k/inst/idatFiles\ntar xzvf hd_genotype_chip/broad_intensities/Omni25_idats_gtcs_2141_samples.tgz -C idats\ntar xzvf hd_genotype_chip/sanger_intensities/ALL.wgs.sanger_omni_2_5_8.20130805.snps.genotypes.idats.tar.gz -C idats\n\nmkdir -p cels\nfor tgz in hapmap3_affy6.0/*.tgz; do tar xzvf $tgz -C cels; done\ntar xzvf hd_genotype_chip/coriell_affy6_intensities/Affy60_Coriell_CEL_files.tar.gz -C cels\n\n# one sample is mapped to HG03171 but should be mapped to HG01171, most likely a typo here\n/bin/mv \"cels/affy6/1000 Genomes phase 1 and 2 cel files/NA18489 .CEL\" \"cels/affy6/1000 Genomes phase 1 and 2 cel files/NA18489.CEL\"\n/bin/mv \"cels/affy6/1000 Genomes phase 1 and 2 cel files/HG03616.CEL\" \"cels/affy6/1000 Genomes phase 1 and 2 cel files/HG03616-1.CEL\"\n/bin/mv \"cels/affy6/1000 Genomes phase 1 and 2 cel files/HG03660.CEL\" \"cels/affy6/1000 Genomes phase 1 and 2 cel files/HG03660-1.CEL\"\n/bin/mv \"cels/affy6/1000 Genomes phase 1 and 2 cel files/HG04149.CEL\" \"cels/affy6/1000 Genomes phase 1 and 2 cel files/HG04149-1.CEL\"\n/bin/mv \"cels/affy6/1000 Genomes phase 1 and 2 cel files/HG01171.CEL\" \"cels/affy6/1000 Genomes phase 1 and 2 cel files/HG01171-1.CEL\"\n/bin/mv \"cels/affy6/1000 Genomes phase 3 cel files/HG03616.CEL\" \"cels/affy6/1000 Genomes phase 3 cel files/HG03616-C1.CEL\"\n/bin/mv \"cels/affy6/1000 Genomes phase 3 cel files/HG03660.CEL\" \"cels/affy6/1000 Genomes phase 3 cel 
files/HG03660-C1.CEL\"\n/bin/mv \"cels/affy6/1000 Genomes phase 3 cel files/HG04149.CEL\" \"cels/affy6/1000 Genomes phase 3 cel files/HG04149-C1.CEL\"\n/bin/mv \"cels/affy6/1000 Genomes phase 3 cel files/HG03171.CEL\" \"cels/affy6/1000 Genomes phase 3 cel files/HG01171-C1.CEL\"\n```\n\nCreate sample maps\n==================\n\n```\nawk -F, 'NR>1 {print $5\"\\t\"$1\".HumanCNV370v1\"}' idats/hapmap370k/inst/idatFiles/samples370k.csv > HapMap.HumanCNV370v1.tsv\n\nawk -F, 'NR>15 {print $2\"_\"$3\"\\t\"$6\".HumanOmni2.5-4v1\"}' idats/SampleSheet.csv > HapMap.HumanOmni2.5-4v1.tsv\n\nawk 'NR==FNR {x[$2]=$1} NR>FNR {print $2\"\\t\"x[substr($1,12)]\".HumanOmni25M-8v1-1\"}' \\\n  hd_genotype_chip/sanger_intensities/sanger_omni_chip.20130805.internal_to_coriell_id.map \\\n  idats/omni2.5-8_otgeno_20130805.idats/log.txt > HapMap.HumanOmni25M-8v1-1.tsv\n\n# one sample is mapped to NA19787 but should be mapped to NA19730, most likely a sample swap\n# samples mapped to NA21742 and NA21743 are the same individual, most likely a collection issue\ncat hapmap3_affy6.0/{passing,excluded}_cels_sample_map.txt | sed 's/.CEL$//' | \\\n  sed 's/NA19787\\tCHEAP_p_HapMapP3Redo2_GenomeWideSNP_6_B09_235604.CEL/NA19730\\tCHEAP_p_HapMapP3Redo2_GenomeWideSNP_6_B09_235604.CEL/' | \\\n  awk '{sm=$1; if (sm in x) sm=sm\"-\"x[sm]; print $2\"\\t\"sm\".GenomeWideEx_6\"; x[$1]++}' > HapMap.GenomeWideEx_6.tsv\n\nls cels/affy6/1000\\ Genomes\\ phase\\ {1\\ and\\ 2,3}\\ cel\\ files/*.CEL | sed 's/.CEL$//' | \\\n  sed 's/.CEL$//' | awk -F/ '{print $4\"\\t\"$4\".GenomeWideSNP_6\"}' > HapMap.GenomeWideSNP_6.tsv\n```\n\nConvert IDATs to GTCs\n=====================\n\n```\ndeclare -A bpm=( [\"HumanCNV370v1\"]=\"humancnv370v1_c.bpm\"\n                 [\"HumanOmni2.5-4v1\"]=\"HumanOmni2.5-4v1_H.bpm\"\n                 [\"HumanOmni25M-8v1-1\"]=\"HumanOmni25M-8v1-1_B.bpm\" )\ndeclare -A egt=( [\"HumanCNV370v1\"]=\"HumanCNV370v1_C.egt\"\n                 [\"HumanOmni2.5-4v1\"]=\"HumanOmni2.5-4v1_H.egt\"\n           
      [\"HumanOmni25M-8v1-1\"]=\"humanomni2-5m-8v1-1_b.egt\" )\nbcftools +gtc2vcf -i $(find idats -iname *.idat) -o gtc2vcf.idat.tsv\nmkdir -p HumanCNV370v1 HumanOmni25M-8v1-1 HumanOmni2.5-4v1\nfor idat in $(cut -f1 gtc2vcf.idat.tsv | grep _Grn.idat$); do\n  chip=$(grep ^$idat gtc2vcf.idat.tsv | cut -f16)\n  mono $HOME/bin/autoconvert/AutoConvert.exe $(find idats -iname $idat) $chip ${bpm[$chip]} ${egt[$chip]}\ndone\nbcftools +gtc2vcf {HumanCNV370v1,HumanOmni25M-8v1-1,HumanOmni2.5-4v1}/*.gtc -o gtc2vcf.gtc.tsv\n```\n\nConvert GTCs to VCF\n===================\n\n```\ndeclare -A bpm=( [\"HumanCNV370v1\"]=\"humancnv370v1_c.bpm\"\n                 [\"HumanOmni2.5-4v1\"]=\"HumanOmni2.5-4v1_H.bpm\"\n                 [\"HumanOmni25M-8v1-1\"]=\"HumanOmni25M-8v1-1_B.bpm\" )\ndeclare -A egt=( [\"HumanCNV370v1\"]=\"HumanCNV370v1_C.egt\"\n                 [\"HumanOmni2.5-4v1\"]=\"HumanOmni2.5-4v1_H.egt\"\n                 [\"HumanOmni25M-8v1-1\"]=\"humanomni2-5m-8v1-1_b.egt\" )\ndeclare -A csv=( [\"HumanCNV370v1\"]=\"HumanCNV370v1_C.csv\"\n                 [\"HumanOmni2.5-4v1\"]=\"HumanOmni2.5-4v1_H.csv\"\n                 [\"HumanOmni25M-8v1-1\"]=\"humanomni25m-8v1-1_b.csv\" )\ndeclare -A sam=( [\"HumanCNV370v1\"]=\"HumanCNV370v1_C.bam\"\n                 [\"HumanOmni2.5-4v1\"]=\"HumanOmni2.5-4v1_H.bam\"\n                 [\"HumanOmni25M-8v1-1\"]=\"humanomni25m-8v1-1_b.bam\" )\nfor chip in HumanCNV370v1 HumanOmni25M-8v1-1 HumanOmni2.5-4v1; do\n  bcftools +gtc2vcf \\\n    --no-version -Ou \\\n    --fasta-ref $HOME/res/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \\\n    --bpm ${bpm[$chip]} \\\n    --egt ${egt[$chip]} \\\n    --csv ${csv[$chip]} \\\n    --sam ${sam[$chip]} \\\n    --gtcs $chip \\\n    --extra HapMap.$chip.sex \\\n    --do-not-check-bpm | \\\n    bcftools sort -Ou -T ./bcftools. 
| \\\n    bcftools norm --no-version -Ob -o HapMap.$chip.bcf -c x -f $ref && \\\n    bcftools index -f HapMap.$chip.bcf\ndone\n```\n\nConvert CELs to CHPs\n====================\n\n```\n(echo cel_files; ls cels/{,Broad_hapmap3_r2_Affy6_cels_excluded/}*.CEL) > cels.GenomeWideEx_6.lst\n(echo cel_files; ls cels/affy6/1000\\ Genomes\\ phase\\ {1\\ and\\ 2,3}\\ cel\\ files/*.CEL) > cels.GenomeWideSNP_6.lst\nfor chip in GenomeWideEx_6 GenomeWideSNP_6; do\n  mkdir -p $chip\n  apt-probeset-genotype \\\n    --out-dir $chip \\\n    --special-snps GenomeWideSNP_6.specialSNPs \\\n    --read-models-brlmmp GenomeWideSNP_6.generic_prior.txt \\\n    --chip-type $chip \\\n    --xml-file GenomeWideSNP_6.apt-probeset-genotype.AxiomGT1.xml \\\n    --cel-files cels.$chip.lst \\\n    --table-output false \\\n    --cc-chp-output \\\n    --cc-chp-out-dir $chip \\\n    --write-models\ndone\n```\n\nConvert CHPs to VCF\n===================\n\n```\nfor chip in GenomeWideEx_6 GenomeWideSNP_6; do\n  bcftools +affy2vcf \\\n    --no-version -Ou \\\n    --fasta-ref $HOME/res/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \\\n    --csv GenomeWideSNP_6.na35.annot.csv \\\n    --sam GenomeWideSNP_6.na35.annot.bam \\\n    --models $chip/AxiomGT1.snp-posteriors.txt \\\n    --report $chip/AxiomGT1.report.txt \\\n    --chps $chip \\\n    --extra HapMap.$chip.sex | \\\n    bcftools sort -Ou -T ./bcftools. | \\\n    bcftools norm --no-version -Ob -o HapMap.$chip.bcf -c x -f $ref && \\\n    bcftools index -f HapMap.$chip.bcf\ndone\n```\n"
  },
  {
    "path": "Illumina.md",
    "content": "\nArchived Human Products\n-----------------------\n\n| array                                                                                                                                                                   | date       | bpm                                       | egt                                       | csv                                       |\n|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|-------------------------------------------|-------------------------------------------|-------------------------------------------|\n| [Human-1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/Human-1_product_files>)                             | 12/21/2004 | Exon-Centric_100K_(v1.2.1).bpm            | Exon-Centric_100K_(v1.2.1).egt            | NA                                        |\n| [HumanHap240S](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanHap240S_product_files>)                   | 03/13/2006 | BDCHP-1X10-HUMANHAP240S_11216501_B.bpm    | BDCHP-1X10-HUMANHAP240S_11216501_B.egt    | BDCHP-1X10-HUMANHAP240S_11216501_B.csv    |\n| [HumanHap300_v1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanHap300_v1_product_files>)               | 03/30/2006 | BDCHP-1x10-HUMANHAP300v1-1_11219278_C.bpm | BDCHP-1x10-HUMANHAP300v1-1_11219278_C.egt | BDCHP-1x10-HUMANHAP300v1-1_11219278_C.csv |\n| [Human1M](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/Human1M_product_files>)                             | 4/24/2006  | Human1Mv1_C.bpm                           | Human1Mv1_C.egt                           | Human1Mv1_C.csv                           |\n| 
[HumanExon510S-2](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanExon510S-2_product_files>)             | 4/24/2006  | HumanExon510Sv1_D.bpm                     | HumanExon510Sv1_D.egt                     | Human510Sv1_A.csv                         |\n| [HumanHap550_v1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanHap550_v1_product_files>)               | 05/01/2006 | BDCHP-1X10-HUMANHAP550_11218540_C.bpm     | BDCHP-1X10-HUMANHAP550_11218540_C.egt     | BDCHP-1X10-HUMANHAP550_11218540_C_csv     |\n| [HumanNS-12](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanNS-12_product_files>)                       | 11/7/2006  | HumanNS-12.bpm                            | HumanNS-12.egt                            | HumanNS-12.csv                            |\n| [HumanHap300-Duo_v2](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanHap300-Duo_v2 product files>)       | 12/21/2006 | HumanHap300v2_A.bpm                       | HumanHap300v2_A.egt                       | HumanHap300v2_A.csv                       |\n| [HumanHap550-Duo_v3](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanHap550-Duo_v3_product_files>)       | 12/21/2006 | HumanHap550-2v3_B.bpm                     | HumanHap550-2v3_B.egt                     | HumanHap550-2v3_B.csv                     |\n| [HumanHap550_v3](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanHap550_v3_product_files>)               | 12/21/2006 | HumanHap550v3_A.bpm                       | HumanHap550v3_A.egt                       | HumanHap550v3_A.csv                       |\n| [HumanHap650Y_v3](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome 
Genotyping Files/Archived_Human_Products/HumanHap650Y_v3_product_files>)             | 12/21/2006 | HumanHap650Y_v3.bpm                       | HumanHap650Yv3_A.egt                      | HumanHap650Yv3_A.csv                      |\n| [HumanCNV-12_v1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanCNV-12_v1_product_files>)               | 5/15/2007  | HumanCNV12v1_C.bpm                        | HumanCNV12v1_C.egt                        | NA                                        |\n| [HumanCNV370-Duo_v1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanCNV370-Duo_v1_product_files>)       | 5/15/2007  | HumanCNV370v1_C.bpm                       | HumanCNV370v1_C.egt                       | HumanCNV370v1_C.csv                       |\n| [HumanLinkage-12](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanLinkage-12>)                           | 7/10/2007  | HumanLinkage-12 _E.bpm                    | HumanLinkage-12 _E.egt                    | NA                                        |\n| [HumanCVDSNP55](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanCVDSNP55>)                               | 3/31/2008  | CVDSNP55v1_A.bpm                          | Human CVD.egt                             | HumanCVDv1_A.csv                          |\n| [HumanCNV370-Quad_v3](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanCNV370-Quad_v3_product_files>)     | 3/17/2008  | HumanCNV370-Quadv3_C.bpm                  | HumanCNV370-Quadv3_C.egt                  | HumanCNV370-Quadv3_C.csv                  |\n| [HumanCNV-12_v2](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanCNV-12_v2_product_files>)               | 
4/3/2008   | HumanCNV12v2_B.bpm                        | NA                                        | NA                                        |\n| [Human1M-Duo_v3](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/Human1M-Duo_v3_product_files>)               | 4/4/2008   | Human1M-Duov3_B.bpm                       | NA                                        | Human1M-Duov3_B.csv                       |\n| [HumanLinkage-24](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanLinkage-24>)                           | 02/02/2010 | InfiniumLinkage-24_11419173_A.bpm         | NA                                        | InfiniumLinkage-24_11419173_A.csv         |\n| [Human610-Quad_v1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/Human610-Quad_v1_product_files>)           | 10/13/2010 | Human610-Quadv1_C.bpm                     | Human610-Quadv1_C.egt                     | Human610-Quadv1_C.csv                     |\n| [HumanOmniExpress-12v1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/HumanOmniExpress-12v1_Product_Files>) | 10/14/2010 | HumanOmniExpress-12v1_C.bpm               | HumanOmniExpress-12v1_C.egt               | HumanOmniExpress-12v1_C.csv               |\n| [Human660W-Quad_v1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_Human_Products/Human660W-Quad_v1_H_product_files>)       | 4/21/2011  | Human660W-Quad_v1_H.bpm                   | Human660W-Quad_v1_H.egt                   | Human660W-Quad_v1_H.csv                   |\n\nArchived_non-Human_Products\n---------------------------\n\n| array                                                                                                                                                                   | date      | bpm               
   | egt                                 | csv                                 |\n|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|----------------------|-------------------------------------|-------------------------------------|\n| [CanineSNP20](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_non-Human_Products/CanineSNP20_ProductFiles>)                  | 7/10/2007 | CanineSNP20_A.bpm    | CanineSNP20_A.egt                   | NA                                  | \n| [BovineSNP50VERSION1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_non-Human_Products/BOVINESNP50VERSION1_product files>) | 8/10/2007 | BovineSNP50_B.bpm    | BovineSNP50_A.egt/BovineSNP50_B.egt | BovineSNP50_A.csv/BovineSNP50_B.csv | \n| [EquineSNP50](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_non-Human_Products/EquineSNP50_product_files>)                 | 6/9/2008  | EquineSNP50_C.bpm    | EquineSNP50_C.egt                   | EquineSNP50_C.csv                   |\n| [PorcineSNP60](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_non-Human_Products/PorcineSNP60_product_files>)               | 1/7/2009  | PorcineSNP60_B.bpm   | PorcineSNP60_A.egt                  | PorcineSNP60_B.csv                  |\n| [OvineSNP50](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_non-Human_Products/OvineSNP50_product_files>)                   | 1/7/2009  | OvineSNP50_B.bpm     | OvineSNP50_A.egt                    | OvineSNP50_B.csv                    | \n| [CanineHD](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_non-Human_Products/CanineHD_Product_files>)                       | 9/2/2009  | CanineHD_A.bpm       | 
CanineHD-A.egt                      | CanineHD_A.csv                      |\n| [Maize_SNP50](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_non-Human_Products/Maize_SNP50>)                               | 2/3/2010  | MaizeSNP50_A.bpm     | MaizeSNP50_B.egt                    | MaizeSNP50_A.csv                    |\n| [BovineSNP50VERSION2](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_non-Human_Products/BovineSNP50VERSION2_product_files>) | 5/20/2010 | BovineSNP50_v2_C.bpm | BovineSNP50v2_A.egt                 | BovineSNP50_v2_C.csv                |\n| [BOVINEHD](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/Archived_non-Human_Products/BOVINEHD_Product_Files>)                       | 6/18/2010 | BovineHD_B.bpm       | BovineHD_A.egt                      | BovineHD_B.csv                      |\n\nOld Products\n------------\n\n| array                                                                                                                                                                             | date       | bpm                                   | egt                                     | csv                           |\n|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|---------------------------------------|-----------------------------------------|-------------------------------|\n| [HumanOmni5Exome-4v1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/GT_Call_Files_Current_Products/HumanOmni5Exome v1.0>)                     | 2/10/2012  | HumanOmni5Exome-4v1_A.bpm             | NA                                      | NA                            |\n| [HumanOmniExpress-12v1-1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping 
Files/GT_Call_Files_Current_Products/HumanOmniExpress-12v1.1>)              | 10/30/2012 | HumanOmniExpress-12v1-1_A.bpm         | NA                                      | NA                            |\n| [OmniExpressExome-8v1-1_15036758](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/GT_Call_Files_Current_Products/HumanOmniExpressExome-12v1.1>) | 12/17/2012 | OmniExpressExome-8v1-1_15036758_A.bpm | HumanOmniExpressExome-8v1-1_2012.12.egt | NA                            |\n| [HumanOmni25M-8v1-1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/GT_Call_Files_Current_Products/HumanOmni-2.5-8-v1.1>)                      | 2/13/2013  | HumanOmni25M-8v1-1_B.bpm              | HumanOmni2-5M-8v1-1_B.egt               | HumanOmni25M-8v1-1_B.csv      |\n| [OmniExpressExome-8v1-1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/GT_Call_Files_Current_Products/OmniExpressExome-8v1-1_B>)              | 2/5/2013   | OmniExpressExome-8v1-1_B.bpm          | HumanOmniExpressExome-8v1-1_B.egt       | OmniExpressExome-8v1-1_B.csv  |\n| [OmniExpressExome-8v1-1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/GT_Call_Files_Current_Products/HumanOmniExpressExome-8v1-1_B>)         | 2/5/2013   | OmniExpressExome-8v1-1_B.bpm          | HumanOmniExpressExome-8v1-1_B.egt       | OmniExpressExome-8v1-1_B.csv  |\n| [HumanCoreExome-12v1-0](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/GT_Call_Files_Current_Products/HumanCoreExomev1-0_A>)                   | 2/6/2013   | HumanCoreExome-12v1-0_A.bpm           | HumanCoreExome-12v1-0_A.egt             | HumanCoreExome-12v1-0_A.csv   |\n| [HumanOmniExpress-12v1-1](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/GT_Call_Files_Current_Products/OmniExpress-12v1.1_B>)                 | 2/6/2013   | HumanOmniExpress-12v1-1_B.bpm         | 
HumanOmniExpress-12v1-1_B.egt           | HumanOmniExpress-12v1-1_B.csv |\n| [PsychChip_15048346](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Whole Genome Genotyping Files/GT_Call_Files_Current_Products/HumanPsychChipv-1-0>)                       | 10/23/2013 | PsychChip_15048346_A.bpm              | NA                                      | PsychChip_15048346_A.csv      |\n\nConsortium Products\n-------------------\n\n| array                                                                    | date       | bpm                                      | egt | csv                                      |\n|--------------------------------------------------------------------------|------------|------------------------------------------|-----|------------------------------------------|\n| [ASA-24v1-0-Consort_20022506](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Genotyping_Array_Support_Files/Consortium Asian Screening Array>)        | 1/23/2018  | ASA-24v1-0-Consort_20022506_A2.bpm       | NA  | ASA-24v1-0-Consort_20022506_A2.csv       |\n| [CGCA-24v1-0_20034773](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Genotyping_Array_Support_Files/Consortium Chinese Genotyping Array>)            | 5/13/2020  | CGCA-24v1-0_20034773_A1.bpm              | NA  | CGCA-24v1-0_20034773_A1.csv              |\n| [DrugDevConsortium-24v1-2_20024394](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Genotyping_Array_Support_Files/Consortium Drug Dev Array>)         | 3/14/2018  | DrugDevConsortium-24v1-2_20024394_A1.bpm | NA  | DrugDevConsortium-24v1-2_20024394_A1.csv |\n| [GDAConfluence_20032938X375356](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Genotyping_Array_Support_Files/Global Diversity Array/GDA-Confluence>) | 3/11/2021  | GDAConfluence_20032938X375356_A2.bpm     | NA  | GDAConfluence_20032938X375356_A2.csv     |\n| [NeuroBooster_20042459](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Genotyping_Array_Support_Files/Global Diversity Array/GDA-Neuro 
Booster>)      | 7/16/2020  | NeuroBooster_20042459_A2.bpm             | NA  | NeuroBooster_20042459_A2.csv             |\n| [H3Africa_2017_20021485](<ftp://webdata:webdata@ftp.illumina.com/Public_Docs/Genotyping_Array_Support_Files/H3Africa/v1>)                                  | 10/27/2017 | H3Africa_2017_20021485_A2.bpm            | NA  | H3Africa_2017_20021485_A2.csv            |\n"
  },
  {
    "path": "LICENSE",
    "content": "The MIT License\n\nCopyright (C) 2018-2025 Giulio Genovese\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in\nall copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\nTHE SOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "gtc2vcf\n=======\n\nA set of tools to convert Illumina and Affymetrix DNA microarray intensity data files into VCF files <b>without</b> using Microsoft Windows. You can use the final output to run the pipeline to detect [mosaic chromosomal alterations](http://github.com/freeseek/mocha). If you use this tool in your publication, please cite this website. For any feedback or questions, contact the [author](mailto:giulio.genovese@gmail.com)\n\n![](gtc2vcf.png)\n\n<!--ts-->\n   * [Usage](#usage)\n   * [Installation](#installation)\n   * [Software Installation](#software-installation)\n   * [Identifying chip type for IDAT and CEL files](#identifying-chip-type-for-idat-and-cel-files)\n   * [Convert Illumina IDAT files to GTC files](#convert-illumina-idat-files-to-gtc-files)\n   * [Convert Illumina GTC files to VCF](#convert-illumina-gtc-files-to-vcf)\n   * [Convert Affymetrix CEL files to CHP files](#convert-affymetrix-cel-files-to-chp-files)\n   * [Convert Affymetrix CHP files to VCF](#convert-affymetrix-chp-files-to-vcf)\n   * [Using an alternative genome reference](#using-an-alternative-genome-reference)\n   * [Detect contamination](#detect-contamination)\n   * [Plot variants](#plot-variants)\n   * [Illumina GenCall](#illumina-gencall)\n      * [Illumina AutoConvert](#illumina-autoconvert)\n      * [Illumina AutoConvert 2.0](#illumina-autoconvert-2-0)\n      * [Illumina Array Analysis Platform Genotyping Command Line Interface](#illumina-array-analysis-platform-genotyping-command-line-interface)\n      * [Illumina Microarray Analytics Array Analysis Command Line Interface](#illumina-microarray-analytics-array-analysis-command-line-interface)\n   * [Acknowledgements](#acknowledgements)\n<!--te-->\n\nUsage\n=====\n\nIllumina data tool:\n```\nUsage: bcftools +gtc2vcf [options] [<A.gtc> ...]\n\nPlugin options:\n    -l, --list-tags                   list available FORMAT tags with description for VCF output\n    -t, --tags LIST                   list of 
output FORMAT tags [GT,GQ,IGC,BAF,LRR,NORMX,NORMY,R,THETA,X,Y]\n    -b, --bpm <file>                  BPM manifest file\n    -c, --csv <file>                  CSV manifest file (can be gzip compressed)\n    -e, --egt <file>                  EGT cluster file\n    -f, --fasta-ref <file>            reference sequence in fasta format\n        --set-cache-size <int>        select fasta cache size in bytes\n        --gc-window-size <int>        window size in bp used to compute the GC content (-1 for no estimate) [200]\n    -g, --gtcs <dir|file>             GTC genotype files from directory or list from file\n    -i, --idat                        input IDAT files rather than GTC files\n        --capacity <int>              number of variants to read from intensity files per I/O operation [32768]\n        --adjust-clusters             adjust cluster centers in (Theta, R) space (requires --bpm and --egt)\n        --use-gtc-sample-names        use sample name in GTC files rather than GTC file name\n        --do-not-check-bpm            do not check whether BPM and GTC files match manifest file name\n        --do-not-check-eof            do not check whether the BPM and EGT readers reach the end of the file\n        --genome-studio <file>        input a GenomeStudio final report file (in matrix format)\n        --no-version                  do not append version and command line to the header\n    -o, --output <file>               write output to a file [standard output]\n    -O, --output-type u|b|v|z|t[0-9]  u/b: un/compressed BCF, v/z: un/compressed VCF\n                                      t: GenomeStudio tab-delimited text output, 0-9: compression level [v]\n        --threads <int>               number of extra output compression threads [0]\n    -x, --extra <file>                write GTC metadata to a file\n    -v, --verbose                     print verbose information\n    -W, --write-index[=FMT]           Automatically index the output files [off]\n\nManifest 
options:\n        --beadset-order               output BeadSetID normalization order (requires --bpm and --csv)\n        --fasta-flank                 output flank sequence in FASTA format (requires --csv)\n    -s, --sam-flank <file>            input flank sequence alignment in SAM/BAM format (requires --csv)\n        --genome-build <assembly>     genome build ID used to update the manifest file [GRCh38]\n\nExamples:\n    bcftools +gtc2vcf -i 5434246082_R03C01_Grn.idat\n    bcftools +gtc2vcf 5434246082_R03C01.gtc\n    bcftools +gtc2vcf -b HumanOmni2.5-4v1_H.bpm -c HumanOmni2.5-4v1_H.csv\n    bcftools +gtc2vcf -e HumanOmni2.5-4v1_H.egt\n    bcftools +gtc2vcf -c GSA-24v3-0_A1.csv -e GSA-24v3-0_A1_ClusterFile.egt -f human_g1k_v37.fasta -o GSA-24v3-0_A1.vcf\n    bcftools +gtc2vcf -c HumanOmni2.5-4v1_H.csv -f human_g1k_v37.fasta 5434246082_R03C01.gtc -o 5434246082_R03C01.vcf\n    bcftools +gtc2vcf -f human_g1k_v37.fasta --genome-studio GenotypeReport.txt -o GenotypeReport.vcf\n\nExamples of manifest file options:\n    bcftools +gtc2vcf -b GSA-24v3-0_A1.bpm -c GSA-24v3-0_A1.csv --beadset-order\n    bcftools +gtc2vcf -c GSA-24v3-0_A1.csv --fasta-flank -o GSA-24v3-0_A1.fasta\n    bwa mem -M GCA_000001405.15_GRCh38_no_alt_analysis_set.fna GSA-24v3-0_A1.fasta -o GSA-24v3-0_A1.sam\n    bcftools +gtc2vcf -c GSA-24v3-0_A1.csv --sam-flank GSA-24v3-0_A1.sam -o GSA-24v3-0_A1.GRCh38.csv\n```\n\nAffymetrix data tool:\n```\nUsage: bcftools +affy2vcf [options] --csv <file> --fasta-ref <file> [<A.chp> ...]\n\nPlugin options:\n    -l, --list-tags                 list available FORMAT tags with description for VCF output\n    -t, --tags LIST                 list of output FORMAT tags [GT,CONF,BAF,LRR,NORMX,NORMY,DELTA,SIZE]\n    -c, --csv <file>                CSV manifest file (can be gzip compressed)\n    -f, --fasta-ref <file>          reference sequence in fasta format\n        --set-cache-size <int>      select fasta cache size in bytes\n        --gc-window-size <int>      window 
size in bp used to compute the GC content (-1 for no estimate) [200]\n        --probeset-ids              tab delimited file with column 'probeset_id' specifying probesets to convert\n        --calls <file>              apt-probeset-genotype calls output (can be gzip compressed)\n        --confidences <file>        apt-probeset-genotype confidences output (can be gzip compressed)\n        --summary <file>            apt-probeset-genotype summary output (can be gzip compressed)\n        --snp <file>                apt-probeset-genotype SNP posteriors output (can be gzip compressed)\n        --chps <dir|file>           input CHP files rather than tab delimited files\n        --cel <file>                input CEL files rather CHP files\n        --adjust-clusters           adjust cluster centers in (Contrast, Size) space (requires --snp)\n        --no-version                do not append version and command line to the header\n    -o, --output <file>             write output to a file [standard output]\n    -O, --output-type u|b|v|z[0-9]  u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n        --threads <int>             number of extra output compression threads [0]\n    -x, --extra <file>              write CHP metadata to a file (requires CHP files)\n    -v, --verbose                   print verbose information\n    -W, --write-index[=FMT]         Automatically index the output files [off]\n\nManifest options:\n        --fasta-flank               output flank sequence in FASTA format (requires --csv)\n    -s, --sam-flank <file>          input flank sequence alignment in SAM/BAM format (requires --csv)\n\nExamples:\n    bcftools +affy2vcf \\\n        --csv GenomeWideSNP_6.na35.annot.csv \\\n        --fasta-ref human_g1k_v37.fasta \\\n        --chps cc-chp/ \\\n        --snp AxiomGT1.snp-posteriors.txt \\\n        --output AxiomGT1.vcf \\\n        --extra report.tsv\n    bcftools +affy2vcf \\\n        --csv GenomeWideSNP_6.na35.annot.csv 
\\\n        --fasta-ref human_g1k_v37.fasta \\\n        --calls AxiomGT1.calls.txt \\\n        --confidences AxiomGT1.confidences.txt \\\n        --summary AxiomGT1.summary.txt \\\n        --snp AxiomGT1.snp-posteriors.txt \\\n        --output AxiomGT1.vcf\n\nExamples of manifest file options:\n    bcftools +affy2vcf -c GenomeWideSNP_6.na35.annot.csv --fasta-flank -o  GenomeWideSNP_6.fasta\n    bwa mem -M GCA_000001405.15_GRCh38_no_alt_analysis_set.fna GenomeWideSNP_6.fasta -o GenomeWideSNP_6.sam\n    bcftools +affy2vcf -c GenomeWideSNP_6.na35.annot.csv -s GenomeWideSNP_6.sam -o GenomeWideSNP_6.na35.annot.GRCh38.csv\n```\n\nInstallation\n============\n\nInstall basic tools (Debian/Ubuntu specific if you have admin privileges)\n```\nsudo apt install wget unzip git g++ zlib1g-dev bwa unzip samtools msitools cabextract mono-devel libgdiplus icu-devtools bcftools\n```\n\nOptionally, you can install these libraries to activate further HTSlib features\n```\nsudo apt install libbz2-dev libssl-dev liblzma-dev libgsl0-dev\n```\n\nPreparation steps\n```\nmkdir -p $HOME/bin $HOME/GRCh3{7,8} && cd /tmp\n```\n\nWe recommend compiling the source code but, wherever this is not possible, Linux x86_64 pre-compiled binaries are available for download [here](http://software.broadinstitute.org/software/gtc2vcf). However, notice that you will require BCFtools version 1.20 or newer. 
You can also download a previous version of the plugin through [bioconda](http://anaconda.org/bioconda/bcftools-gtc2vcf-plugin)\n\nDownload latest version of [HTSlib](http://github.com/samtools/htslib) and [BCFtools](http://github.com/samtools/bcftools) (if not downloaded already)\n```\nwget http://github.com/samtools/bcftools/releases/download/1.20/bcftools-1.20.tar.bz2\ntar xjvf bcftools-1.20.tar.bz2\n```\n\nDownload and compile plugins code (make sure you are using gcc version 5 or newer)\n```\ncd bcftools-1.20/\n/bin/rm -f plugins/{idat2gtc.c,gtc2vcf.{c,h},affy2vcf.c}\nwget -P plugins http://raw.githubusercontent.com/freeseek/gtc2vcf/master/{idat2gtc.c,gtc2vcf.{c,h},affy2vcf.c,BAFregress.c}\nmake\n/bin/cp bcftools plugins/{idat2gtc,gtc2vcf,affy2vcf,BAFregress}.so $HOME/bin/\n```\n\nMake sure the directory with the plugins is available to BCFtools\n```\nexport PATH=\"$HOME/bin:$PATH\"\nexport BCFTOOLS_PLUGINS=\"$HOME/bin\"\n```\n\nInstall the GRCh37 human genome reference\n```\nwget -O- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz | \\\n  gzip -d > $HOME/GRCh37/human_g1k_v37.fasta\nsamtools faidx $HOME/GRCh37/human_g1k_v37.fasta\nbwa index $HOME/GRCh37/human_g1k_v37.fasta\n```\n\nInstall the GRCh38 human genome reference (following the suggestion from [Heng Li](http://lh3.github.io/2017/11/13/which-human-reference-genome-to-use))\n```\nwget -O- ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | \\\n  gzip -d > $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna\nsamtools faidx $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna\nbwa index $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna\n```\n\nAffymetrix provides the [Analysis Power Tools 
(APT)](http://www.thermofisher.com/us/en/home/life-science/microarray-analysis/microarray-analysis-partners-programs/affymetrix-developers-network/affymetrix-power-tools.html) for free which allow to call genotypes from raw intensity data using an algorithm derived from [BRLMM-P](http://tools.thermofisher.com/content/sfs/brochures/brlmmp_whitepaper.pdf)\n```\nmkdir -p $HOME/bin && cd /tmp\nwget http://downloads.thermofisher.com/APT/APT_2.11.8/apt_2.11.8_linux_64_x86_binaries.zip\nunzip -ojd $HOME/bin apt_2.11.8_linux_64_x86_binaries.zip apt_2.11.8_linux_64_x86_binaries/bin/apt-probeset-genotype\nchmod a+x $HOME/bin/apt-probeset-genotype\n```\n\nIdentifying chip type for IDAT and CEL files\n============================================\n\nTo convert a pair of green and red IDAT files with raw Illumina intensities into a GTC file with genotype calls you need to provide both a BPM manifest file with the location of the probes and an EGT cluster file with the expected intensities of each genotype cluster. It is important to provide the correct BPM and EGT files otherwise the calling will fail possibly generating a GTC file with meaningless calls. Unfortunately newer IDAT files do not contain information about which BPM manifest file to use. The gtc2vcf bcftools plugin can be used to guess which files to use\n```\npath_to_idat_folder=\"...\"\nbcftools +gtc2vcf \\\n  -i -g $path_to_idat_folder\n```\nThis will generate a spreadsheet table with information about each IDAT file including a guess for what manifest and cluster files you should use. 
If a guess is not provided, contact the [author](mailto:giulio.genovese@gmail.com) for troubleshooting\n\nSimilarly, you can use the affy2vcf bcftools plugin to extract chip type information from CEL files\n```\npath_to_cel_folder=\"...\"\nbcftools +affy2vcf \\\n  --cel --chps $path_to_cel_folder\n```\n\nConvert Illumina IDAT files to GTC files\n========================================\n\nThe idat2gtc bcftools plugin can be used to convert Illumina IDAT files to GTC files\n```\nbpm_manifest_file=\"...\"\negt_cluster_file=\"...\"\nbcftools +idat2gtc \\\n  --bpm $bpm_manifest_file \\\n  --egt $egt_cluster_file \\\n  --idats $path_to_idat_folder \\\n  --output $path_to_gtc_folder\n```\nThe output is equivalent to the output of the Illumina GenCall algorithm while being significantly faster\n\nIf you do not have the manifest and cluster files for the Illumina IDAT files you are trying to convert, make sure to check the links [here](Illumina.md)\n\nIf you run the command with the option `--autocall-date \"\"` then the output should be deterministic and using the `--preset` option you can generate output equivalent to the output you obtain with any of the following:\n\n* [Illumina AutoConvert](#autoconvert)\n* [Illumina AutoConvert 2.0](#autoconvert-2-0)\n* [Illumina Array Analysis Platform Genotyping Command Line Interface](#iaap-cli)\n* [Illumina Microarray Analytics Array Analysis Command Line Interface](#array-analysis-cli)\n\nIf you similarly patch those tools to make them generate deterministic output, you should be able to verify that you get the same md5sum\n\nConvert Illumina GTC files to VCF\n=================================\n\nSpecifications for Illumina BPM, EGT, and GTC files were obtained through Illumina's [BeadArrayFiles](http://github.com/Illumina/BeadArrayFiles) library and [GTCtoVCF](http://github.com/Illumina/GTCtoVCF) script. 
Specifications for IDAT files were obtained through Henrik Bengtsson's [illuminaio](http://github.com/HenrikBengtsson/illuminaio) package\n```\nbpm_manifest_file=\"...\"\ncsv_manifest_file=\"...\"\negt_cluster_file=\"...\"\npath_to_gtc_folder=\"...\"\nref=\"$HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna\" # or ref=\"$HOME/GRCh37/human_g1k_v37.fasta\"\nout_prefix=\"...\"\nbcftools +gtc2vcf \\\n  --no-version -Ou \\\n  --bpm $bpm_manifest_file \\\n  --csv $csv_manifest_file \\\n  --egt $egt_cluster_file \\\n  --gtcs $path_to_gtc_folder \\\n  --fasta-ref $ref \\\n  --extra $out_prefix.tsv | \\\n  bcftools sort -Ou -T ./bcftools. | \\\n  bcftools norm --no-version -o $out_prefix.bcf -Ob -c x -f $ref --write-index\n```\nHeavy random access to the reference will be needed, so it is important that enough extra memory be available for the operating system to cache the reference or else the task can run excruciatingly slowly. Notice that the gtc2vcf bcftools plugin will drop unlocalized variants. The final VCF might contain duplicates. If this is an issue `bcftools norm -d exact` can be used to remove such variants. At least one of the BPM or the CSV manifest files has to be provided. Normalized intensities cannot be computed without the BPM manifest file. Indel alleles cannot be inferred and will be skipped without the CSV manifest file. Information about genotype cluster centers will be included in the VCF if the EGT cluster file is provided. You can use gtc2vcf to convert one GTC file at a time, but we strongly advise to convert multiple files at once as single sample VCF files will consume a lot of storage space. If you convert hundreds of GTC files at once, you can use the `--adjust-clusters` option which will recenter the genotype clusters rather than using those provided in the EGT cluster file and will compute less noisy LRR values. 
If you use the `--adjust-clusters` option and you are using the output for calling [mosaic chromosomal alterations](http://github.com/freeseek/mocha), then it is safe to turn the median BAF/LRR adjustments off during that step (i.e. use `--adjust-BAF-LRR -1`)\n\nOptionally, between the conversion and the sorting step you can include a `bcftools reheader --samples <file>` command to assign new names to the samples where `<file>` contains `old_name new_name\\n` pairs separated by whitespaces, each on a separate line, with `old_name` being the GTC file name without the `.gtc` extension in this case\n\nWhen running the conversion, the gtc2vcf plugin will double check that the SNP manifest metadata information in the GTC file matches the descriptor file name in the BPM file to make sure you are using the correct manifest file. Sometimes, due to discrepancies between the BPM file name provided by Illumina and the internal descriptor file name, this safety check fails. To turn off this feature in these cases, you can use option `--do-not-check-bpm`\n\nConvert Affymetrix CEL files to CHP files\n=========================================\n\nAffymetrix provides a best practice workflow for genotyping data generated using [SNP6](http://www.affymetrix.com/support/developer/powertools/changelog/VIGNETTE-snp6-on-axiom.html) and [Axiom](http://www.affymetrix.com/support/developer/powertools/changelog/VIGNETTE-Axiom-probeset-genotype.html) arrays. As an example, the following command will run the genotyping for the Affymetrix SNP6 array:\n```\npath_to_output_folder=\"...\"\ncel_list_file=\"...\"\napt-probeset-genotype \\\n  --analysis-files-path . 
\\\n  --xml-file GenomeWideSNP_6.apt-probeset-genotype.AxiomGT1.xml \\\n  --out-dir $path_to_output_folder \\\n  --cel-files $cel_list_file \\\n  --special-snps GenomeWideSNP_6.specialSNPs \\\n  --chip-type GenomeWideEx_6 \\\n  --chip-type GenomeWideSNP_6 \\\n  --table-output false \\\n  --cc-chp-output \\\n  --write-models \\\n  --read-models-brlmmp GenomeWideSNP_6.generic_prior.txt\n```\nAffymetrix provides Library and NetAffx Annotation files for their arrays ([here](http://www.affymetrix.com/support/technical/byproduct.affx?cat=dnaarrays), [here](http://media.affymetrix.com/analysis/downloads/lf/genotyping), and [here](http://www.thermofisher.com/us/en/home/life-science/microarray-analysis/microarray-data-analysis/genechip-array-annotation-files.html))\n\nAs an example, the following commands will obtain the files necessary to run the genotyping for the Affymetrix SNP6 array:\n```\nwget http://tools.thermofisher.com/content/sfs/supportfiles/genomewidesnp6_libraryfile.zip\nwget http://tools.thermofisher.com/content/sfs/supportfiles/SNP6_supplemental_axiom_analysis_files.zip\nwget http://tools.thermofisher.com/content/sfs/supportfiles/GenomeWideSNP_6-na35-annot-csv.zip\nunzip -oj genomewidesnp6_libraryfile.zip CD_GenomeWideSNP_6_rev3/Full/GenomeWideSNP_6/LibFiles/GenomeWideSNP_6.{cdf,chrXprobes,chrYprobes,specialSNPs}\nunzip -o SNP6_supplemental_axiom_analysis_files.zip GenomeWideSNP_6.{generic_prior.txt,apt-probeset-genotype.AxiomGT1.xml,AxiomGT1.sketch}\nunzip -o GenomeWideSNP_6-na35-annot-csv.zip GenomeWideSNP_6.na35.annot.csv\n```\n\nNote: If the program exits due to different chip types or probe counts with error message such as `Wrong CEL ChipType: expecting: 'GenomeWideSNP_6' and #######.CEL is: 'GenomeWideEx_6'` then make sure you included the option `--chip-type GenomeWideEx_6 --chip-type GenomeWideSNP_6` or `--force` to the command line to solve the problem\n\nConvert Affymetrix CHP files to VCF\n===================================\n\nThe affy2vcf 
bcftools plugin can be used to convert Affymetrix CHP files to VCF\n```\ncsv_manifest_file=\"...\" # for example csv_manifest_file=\"GenomeWideSNP_6.na35.annot.csv\"\nref=\"$HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna\" # or ref=\"$HOME/GRCh37/human_g1k_v37.fasta\"\npath_to_chp_folder=\"cc-chp\"\npath_to_txt_folder=\"...\"\nout_prefix=\"...\"\nbcftools +affy2vcf \\\n  --no-version -Ou \\\n  --csv $csv_manifest_file \\\n  --fasta-ref $ref \\\n  --chps $path_to_chp_folder \\\n  --snp $path_to_txt_folder/AxiomGT1.snp-posteriors.txt \\\n  --extra $out_prefix.tsv | \\\n  bcftools sort -Ou -T ./bcftools. | \\\n  bcftools norm --no-version -o $out_prefix.bcf -Ob -c x -f $ref --write-index\n```\nHeavy random access to the reference will be needed, so it is important that enough extra memory be available for the operating system to cache the reference or else the task can run excruciatingly slowly. The final VCF might contain duplicates. If this is an issue `bcftools norm -d exact` can be used to remove such variants. There is often no need to use the `--adjust-clusters` option for Affymetrix data as the cluster posteriors are already adjusted using the data processed by the genotype caller\n\nOptionally, between the conversion and the sorting step you can include a `bcftools reheader --samples <file>` command to assign new names to the samples where `<file>` contains `old_name new_name\\n` pairs separated by whitespaces, each on a separate line, with `old_name` being the CHP file name without the `.chp` extension\n\nUsing an alternative genome reference\n=====================================\n\nIllumina provides [GRCh38/hg38](http://support.illumina.com/bulletins/2017/04/infinium-human-genotyping-manifests-and-support-files--with-anno.html) manifests for many of its genotyping arrays. 
However, if your genotyping array is not supported for the newer reference by Illumina, you can use the `--fasta-flank` and `--sam-flank` options to realign the flank sequences from the manifest files you have and recompute the marker positions. This approach uses [flank sequence](http://support.illumina.com/bulletins/2016/05/infinium-genotyping-manifest-column-headings.html) and [strand](http://support.illumina.com/bulletins/2017/06/how-to-interpret-dna-strand-and-allele-information-for-infinium-.html) information to identify the marker [coordinates](http://support.illumina.com/bulletins/2016/06/-infinium-genotyping-array-manifest-files-what-does-chr-or-mapinfo---mean.html). It will need a sequence aligner such as `bwa` to realign the sequences and it seems to reproduce the coordinates provided by Illumina more than 99.9% of the time. Mapping information will follow the [implicit dbSNP standard](http://github.com/Illumina/GTCtoVCF#manifests). Occasionally the flank sequence provided by Illumina is incorrect and it is impossible to recover the correct marker coordinate from the flank sequence alone\n\nYou first have to generate an alignment file for the flank sequences from a CSV manifest file\n```\ncsv_manifest_file=\"...\"\nref=\"$HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna\" # or ref=\"$HOME/GRCh37/human_g1k_v37.fasta\"\nbam_alignment_file=\"...\"\nbcftools +gtc2vcf \\\n  -c $csv_manifest_file \\\n  --fasta-flank | \\\n  bwa mem -M $ref - | \\\n  samtools view -bS \\\n  -o $bam_alignment_file\n```\nNotice that you need to use the `-M` option to mark shorter split hits as secondary and you should not sort the output BAM file as gtc2vcf expects it to have the sequences in the same order as in the CSV file. 
Then you load the alignment file while converting your GTC files to VCF including the `-s $bam_alignment_file` option\n\nSome older manifest files from Illumina have thousands of markers with incorrect RefStrand annotations that will lead to incorrect genotypes. While Illumina has not explained why this is the case, it still distributes incorrect manifests. If you are using one of the following manifests\n```\nHuman1M-Duov3_H\nHuman610-Quadv1_H\nHuman660W-Quad_v1_H\nHumanCytoSNP-12v2-1_Anova\nHumanOmni1-Quad_v1-0-Multi_H\nHumanOmni1-Quad_v1-0_H\n```\nWe advise to either contact Illumina to demand a fixed version or to use gtc2vcf to realign the flank sequences\n\nAlso, Illumina assigns chromosomal positions to indels by first left aligning the flank sequences in an incoherent way (see [here](http://github.com/Illumina/GTCtoVCF/blob/develop/BPMRecord.py)). Apparently this is incoherent enough that Illumina also cannot get the coordinates of homopolymer indels right. For example, chromosome 13 ClinVar indel [rs80359507](http://www.ncbi.nlm.nih.gov/clinvar/variation/37959) is assigned to position 32913838 in the manifest file for the GSA-24v2-0 array, but it is assigned to position 32913837 in the manifest file for GSA-24v3-0 array (GRCh37 coordinates). If you want to trust genotypes at homopolymer indels, we advise to use gtc2vcf to realign the flank sequences\n\nWe also found numerous examples of markers from Illumina manifest files that are mapped to the wrong chromosome, such as markers rs10465468, rs12401272, rs185597746, rs188145685 which are localized over XY in the Illumina manifest files for the GSA-24v2-0 array and the GSA-24v3-0 array but their flank sequences map to chromosome Y. 
If you trust the flank sequences more than the coordinates from the Illumina manifest files, we advise to use gtc2vcf to realign the flank sequences\n\nThe same functionality exists for the affy2vcf tool to convert Affymetrix data\n\nDetect contamination\n====================\n\nTo detect contamination we use a model similar to the one employed by [BAFRegress](http://genome.sph.umich.edu/wiki/BAFRegress) and described in [Jun et al. 2012](http://doi.org/10.1016/j.ajhg.2012.09.004) which estimates BAF deviations at homozygous sites towards reference population means. The model needs allele frequencies which can be inferred from the BCFtools/gtc2vcf output:\n```\nbcftools +BAFregress $out_prefix.bcf\n```\nor they can be inferred from a separate resource:\n```\nbcftools +BAFregress --af 1kGP_high_coverage_Illumina.sites.bcf --tag AF $out_prefix.bcf\n```\n\nPlot variants\n=============\n\nInstall basic tools (Debian/Ubuntu specific if you have admin privileges):\n```\nsudo apt install r-cran-optparse r-cran-ggplot2 r-cran-data.table r-cran-gridextra\n```\n\nDownload R scripts\n```\n/bin/rm -f $HOME/bin/gtc2vcf_plot.R\nwget -P $HOME/bin http://raw.githubusercontent.com/freeseek/gtc2vcf/master/gtc2vcf_plot.R\nchmod a+x $HOME/bin/gtc2vcf_plot.R\n```\n\nPlot variant (for Illumina data)\n```\ngtc2vcf_plot.R \\\n  --illumina \\\n  --vcf input.vcf \\\n  --chrom 11 \\\n  --pos 66328095 \\\n  --png rs1815739.png\n```\n\n![](rs1815739.png)\n\nPlot variant (for Affymetrix data)\n```\ngtc2vcf_plot.R \\\n  --affymetrix \\\n  --vcf input.vcf \\\n  --chrom 1 \\\n  --pos 196642233 \\\n  --png rs800292.png\n```\n\n![](rs800292.png)\n\nIllumina GenCall\n================\n\nTo genotype raw Illumina IDAT intensity files using Illumina GenCall algorithms, Illumina over the course of the years has provided several command line interfaces written in the .NET language:\n- [AutoConvert](http://support.illumina.com/array/array_software/beeline/downloads.html) (2011)\n- [AutoConvert 
2.0](http://support.illumina.com/array/array_software/beeline/downloads.html) (2017)\n- [IAAP CLI](http://support.illumina.com/array/array_software/illumina-array-analysis-platform.html) (2019)\n- [Array Analysis CLI](http://support.illumina.com/array/array_software/ima-array-analysis-cli/downloads.html) (2023)\n\nWe provide instructions to install and run these interfaces. The `sed -i -e ':a' -e 'N' -e '$!ba'` installation commands are used to prevent the interfaces from timestamping the output GTC files by removing the [System.DateTime](http://learn.microsoft.com/en-us/dotnet/api/system.datetime) calls and accesses to the [CreationTime](http://learn.microsoft.com/en-us/dotnet/api/system.io.filesysteminfo.creationtime) property from the binaries, with the goal of making each execution completely reproducible. AutoConvert 2.0, IAAP-CLI, and Array Analysis CLI binaries will all perform version 1.2.0 of the normalization step and seem to produce the exact same results while AutoConvert will only perform version 1.1.2 of the normalization step yielding somewhat different results. If you want to run these binaries but fail to download them, contact the [author](mailto:giulio.genovese@gmail.com) for troubleshooting\n\nIllumina also provides the [Beeline](http://support.illumina.com/array/array_software/beeline.html) software for free and this includes the AutoConvert.exe command line executable which allows calling genotypes from raw intensity data using Illumina's proprietary GenCall algorithm. AutoConvert is almost entirely written in Mono/.Net language, except for one small mathematical function (findClosestSitesToPointsAlongAxis) which is included within a Windows PE32+ library (MathRoutines.dll). As this is [unmanaged code](http://www.mono-project.com/docs/advanced/embedding/), to be run on Linux with [Mono](http://www.mono-project.com/) it needs to be embedded in an equivalent Linux ELF64 library (libMathRoutines.dll.so) as shown below. 
This function is run as part of the [normalization](http://doi.org/10.1093/bioinformatics/btm443) of the raw intensities when sampling [400 candidate homozygotes](http://dnatech.genomecenter.ucdavis.edu/wp-content/uploads/2013/06/illumina_gt_normalization.pdf) before calling genotypes.\n\nIllumina AutoConvert\n--------------------\n\nTo run Illumina AutoConvert (version 1.6.3.1) you will need to fix the hardcoded Windows [backslashes](http://en.wikipedia.org/wiki/Backslash) into UNIX [slashes](http://en.wikipedia.org/wiki/Slash_(punctuation)), as shown below\n```\nmkdir -p $HOME/bin && cd /tmp\nwget http://support.illumina.com/content/dam/illumina-support/documents/downloads/software/beeline/autoconvert-software-v1-6-3-installer.zip\nwget http://raw.githubusercontent.com/freeseek/gtc2vcf/master/nearest_neighbor.c\nunzip -o autoconvert-software-v1-6-3-installer.zip \nmsiextract -C Illumina/AutoConvert SetupAutoConvert64_1.6.3.1.msi\nmsiextract -l SetupAutoConvert64_1.6.3.1.msi | grep DLL$ | while read dll; do mv Illumina/AutoConvert/$dll Illumina/AutoConvert/${dll%DLL}dll; done\ngcc -fPIC -shared -O2 -o Illumina/AutoConvert/libMathRoutines.dll.so nearest_neighbor.c\nsed -i 's/\\x00\\x03\\\\\\x00/\\x00\\x03\\/\\x00/' Illumina/AutoConvert/AutoCallLib.dll\nsed -i 's/G\\x00R\\x00N\\x00.\\x00i\\x00d\\x00a\\x00t\\x00/G\\x00r\\x00n\\x00.\\x00i\\x00d\\x00a\\x00t\\x00/' Illumina/AutoConvert/AutoCallLib.dll\nsed -i 's/R\\x00E\\x00D\\x00.\\x00i\\x00d\\x00a\\x00t\\x00/R\\x00e\\x00d\\x00.\\x00i\\x00d\\x00a\\x00t\\x00/' Illumina/AutoConvert/AutoCallLib.dll\nsed -i 's/\\\\\\x00M\\x00o\\x00d\\x00u\\x00l\\x00e\\x00s\\x00\\\\\\x00B\\x00S\\x00G\\x00T\\x00\\\\\\x00C\\x00l\\x00u\\x00s\\x00t\\x00e\\x00r\\x00A\\x00l\\x00g\\x00o\\x00r\\x00i\\x00t\\x00h\\x00m\\x00s\\x00\\\\\\x00/\\/\\x00M\\x00o\\x00d\\x00u\\x00l\\x00e\\x00s\\x00\\/\\x00B\\x00S\\x00G\\x00T\\x00\\/\\x00C\\x00l\\x00u\\x00s\\x00t\\x00e\\x00r\\x00A\\x00l\\x00g\\x00o\\x00r\\x00i\\x00t\\x00h\\x00m\\x00s\\x00\\/\\x00/' 
Illumina/AutoConvert/AutoCallLib.dll\nsed -i 's/\\\\\\x00M\\x00o\\x00d\\x00u\\x00l\\x00e\\x00s\\x00\\\\\\x00B\\x00S\\x00G\\x00T\\x00/\\/\\x00M\\x00o\\x00d\\x00u\\x00l\\x00e\\x00s\\x00\\/\\x00B\\x00S\\x00G\\x00T\\x00/' Illumina/AutoConvert/Modules/BSGT/ClusterAlgorithms/{GoldenGate/GGCA,InfiniumII/I2CA,GenTrain/ILCA}.dll\nsed -i 's/\\\\\\x00d\\x00a\\x00t\\x00.\\x00b\\x00i\\x00n\\x00/\\/\\x00d\\x00a\\x00t\\x00.\\x00b\\x00i\\x00n\\x00/' Illumina/AutoConvert/Modules/BSGT/ClusterAlgorithms/{GoldenGate/GGCA,InfiniumII/I2CA,GenTrain/ILCA}.dll\nsed -i -e ':a' -e 'N' -e '$!ba' -e 's/\\x28\\xa6\\x00\\x00\\x0a\\x13\\x40\\x12\\x40\\x28\\xa7\\x00\\x00\\x0a\\x72\\xad\\x12\\x00\\x70\\x28\\xa6\\x00\\x00\\x0a\\x13\\x40\\x12\\x40\\x28\\xa8\\x00\\x00\\x0a\\x28\\x23\\x00\\x00\\x0a/\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x7e\\x16\\x00\\x00\\x0a\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00/' Illumina/AutoConvert/AutoCallLib.dll\nsed -i -e ':a' -e 'N' -e '$!ba' -e 's/\\x11\\x0e\\x6f\\xe5\\x00\\x00\\x0a\\x13\\x11\\x12\\x11\\x28\\xe6\\x00\\x00\\x0a\\x72\\xad\\x12\\x00\\x70\\x11\\x0e\\x6f\\xe5\\x00\\x00\\x0a\\x13\\x12\\x12\\x12\\x28\\xe7\\x00\\x00\\x0a\\x28\\x23\\x00\\x00\\x0a/\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x7e\\x16\\x00\\x00\\x0a\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00/' Illumina/AutoConvert/AutoCallLib.dll\nrm autoconvert-software-v1-6-3-installer.zip SetupAutoConvert64_1.6.3.1.msi nearest_neighbor.c\nmv Illumina/AutoConvert $HOME/bin/\nrmdir Illumina\n```\n\nYou can run Illumina's proprietary GenCall algorithm on a single IDAT file pair\n```\nmono $HOME/bin/AutoConvert/AutoConvert.exe \\\n  $idat_green_file \\\n  $path_to_output_folder \\\n  $bpm_manifest_file \\\n  $egt_cluster_file\n```\nMake sure that the red IDAT file is in the same folder as the green IDAT file. 
Alternatively you can run on multiple IDAT file pairs\n```\nmono $HOME/bin/AutoConvert/AutoConvert.exe \\\n  $path_to_idat_folder \\\n  $path_to_output_folder \\\n  $bpm_manifest_file \\\n  $egt_cluster_file\n```\n\nIllumina AutoConvert 2.0\n------------------------\n\nTo run Illumina AutoConvert 2.0 (version 2.0.1.179) you will need to separately download an additional Mono/.Net library (Heatmap.dll) from [GenomeStudio](http://support.illumina.com/array/array_software/genomestudio.html) or the [polyploid clustering module](http://support.illumina.com/downloads/genomestudio_polyploid_clustering_module_v1-0_software.html) and include it in your binary directory, most likely due to differences in which Mono and .Net resolve library dependencies, as shown below\n```\nmkdir -p $HOME/bin && cd /tmp\nwget http://support.illumina.com/content/dam/illumina-support/documents/downloads/software/beeline/autoconvert-software-v2-0-1-installer.zip\nwget http://support.illumina.com/content/dam/illumina-support/documents/downloads/software/genomestudio/genomestudiopolyploidclusteringv1-0.msi\nwget http://raw.githubusercontent.com/freeseek/gtc2vcf/master/nearest_neighbor.c\nunzip -o autoconvert-software-v2-0-1-installer.zip\nmsiextract AutoConvertInstaller.msi\nmsiextract genomestudiopolyploidclusteringv1-0.msi\nmv Heatmap.DLL Illumina/AutoConvert\\ 2.0/\ngcc -fPIC -shared -O2 -o Illumina/AutoConvert\\ 2.0/libMathRoutines.dll.so nearest_neighbor.c\nsed -i 's/^\\(     <AutosomalCallRateThreshold>\\)0.97\\(<\\/AutosomalCallRateThreshold>\\r\\)$/\\10.0\\2/' Illumina/AutoConvert\\ 2.0/AutoCallConfig.xml\nsed -i 's/\\\\\\x00d\\x00a\\x00t\\x00.\\x00b\\x00i\\x00n\\x00/\\/\\x00d\\x00a\\x00t\\x00.\\x00b\\x00i\\x00n\\x00/' Illumina/AutoConvert\\ 2.0/{GGCA,I2CA,HDCA,ILCA,ILCA3}.dll\nsed -i -e ':a' -e 'N' -e '$!ba' -e 
's/\\x28\\xc7\\x00\\x00\\x0a\\x13\\x3f\\x12\\x3f\\x28\\xc8\\x00\\x00\\x0a\\x72\\xa8\\x15\\x00\\x70\\x28\\xc7\\x00\\x00\\x0a\\x13\\x3f\\x12\\x3f\\x28\\xc9\\x00\\x00\\x0a\\x28\\x1f\\x00\\x00\\x0a/\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x7e\\x12\\x00\\x00\\x0a\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00/' Illumina/AutoConvert\\ 2.0/AutoCallLib.dll\nmsiextract -l genomestudiopolyploidclusteringv1-0.msi | grep -v Heatmap.DLL | xargs rm\nrmdir Modules/BSPC/clusteralgorithms/*\nrmdir -p Modules/BSPC/clusteralgorithms\nrm autoconvert-software-v2-0-1-installer.zip AutoConvertInstaller.msi genomestudiopolyploidclusteringv1-0.msi nearest_neighbor.c\nmv Illumina/AutoConvert\\ 2.0 $HOME/bin/\nrmdir Illumina\n```\nWe change the autosomal call rate threshold to 0.0 to more aggressively call gender in lower quality samples\n\nIf you need to get the Heatmap.dll library from GenomeStudio instead, you can use the following code\n```\nwget ftp://webdata2:webdata2@ftp.illumina.com/downloads/software/genomestudio/genomestudio-software-v2-0-4-5-installer.zip\nunzip -oj genomestudio-software-v2-0-4-5-installer.zip\ncabextract GenomeStudioInstaller.exe\nmsiextract a0\nmv Illumina/GenomeStudio\\ 2.0/Heatmap.dll Illumina/AutoConvert\\ 2.0/\nrm genomestudio-software-v2-0-4-5-installer.zip GenomeStudioInstaller.exe {,a}0 u{0..5} Illumina/GenomeStudio\\ 2.0 -r\n```\n\nYou can run Illumina's proprietary GenCall algorithm on a single IDAT file pair\n```\nmono $HOME/bin/AutoConvert\\ 2.0/AutoConvert.exe \\\n  $idat_green_file \\\n  $path_to_output_folder \\\n  $bpm_manifest_file \\\n  $egt_cluster_file\n```\nMake sure that the red IDAT file is in the same folder as the green IDAT file. 
Alternatively you can run on multiple IDAT file pairs\n```\nmono $HOME/bin/AutoConvert\\ 2.0/AutoConvert.exe \\\n  $path_to_idat_folder \\\n  $path_to_output_folder \\\n  $bpm_manifest_file \\\n  $egt_cluster_file\n```\n\nMake sure that the IDAT files have the same name prefix as the IDAT folder name. The software might require up to 8GB of RAM to run. Illumina provides manifest (BPM) and cluster (EGT) files for their arrays [here](http://support.illumina.com/array/downloads.html). Notice that if you provide the wrong BPM file, you will get an error such as: `Normalization failed!  Unable to normalize!` and if you provide the wrong EGT file, you will get an error such as `System.Exception: Unrecoverable Error...Exiting! Unable to find manifest entry ######## in the cluster file!`\n\nIllumina Array Analysis Platform Genotyping Command Line Interface\n------------------------------------------------------------------\n\nIllumina provides the [Illumina Array Analysis Platform Genotyping Command Line Interface](http://support.illumina.com/array/array_software/illumina-array-analysis-platform.html) software for free for research use and this includes the iaap-cli 1.1.0 which runs natively on Linux\n```\nmkdir -p $HOME/bin && cd /tmp\nwget ftp://webdata2:webdata2@ftp.illumina.com/downloads/software/iaap/iaap-cli-linux-x64-1.1.0.tar.gz\ntar xzvf iaap-cli-linux-x64-1.1.0.tar.gz -C $HOME/bin/ iaap-cli-linux-x64-1.1.0/iaap-cli --strip-components=1\nsed -i -e ':a' -e 'N' -e '$!ba' -e 's/\\x28\\x17\\x01\\x00\\x0a\\x13\\x07\\x12\\x07\\x72\\xdd\\x23\\x00\\x70\\x28\\x18\\x01\\x00\\x0a/\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x7e\\x92\\x00\\x00\\x0a\\x00\\x00\\x00\\x00\\x00/' $HOME/bin/iaap-cli/ArrayAnalysis.NormToGenCall.Services.dll\nrm iaap-cli-linux-x64-1.1.0.tar.gz\n```\n\nOnce iaap-cli is properly installed in your system, run Illumina's proprietary GenCall algorithm on multiple IDAT file pairs\n```\nCLR_ICU_VERSION_OVERRIDE=\"$(uconv -V | sed 's/.* //g')\" 
LANG=\"en_US.UTF-8\" $HOME/bin/iaap-cli/iaap-cli \\\n  gencall \\\n  $bpm_manifest_file \\\n  $egt_cluster_file \\\n  $path_to_output_folder \\\n  --idat-folder $path_to_idat_folder \\\n  --output-gtc \\\n  --gender-estimate-call-rate-threshold 0.0\n```\nIt is important to set the `LANG` environment variable to `en_US.UTF-8` because, when it is set to other values, a bug in `iaap-cli` causes malformed GTC files to be generated. Due to another bug in `iaap-cli`, IDAT filenames cannot include more than two `_` characters and should be formatted as `BARCODE_POSITION_(Red|Grn).idat`. When using `iaap-cli` you cannot process old array manifest files with loci data encoded as version 5 or older, such as `HumanHap650Yv3_A.bpm`, as the corresponding code was not carried over and you will get the error `Error in reading file.  Unknown Manifest version`. The AutoConvert command line tool can read older manifest files. We change the autosomal call rate threshold to 0.0 both to more aggressively call gender in lower quality samples and to deal with an implementation issue that causes loci with null cluster scores to be included in the determination of the autosomal call rate threshold\n\nIllumina Microarray Analytics Array Analysis Command Line Interface\n-------------------------------------------------------------------\n\nIllumina provides the [Illumina Microarray Analytics Array Analysis Command Line Interface](http://support.illumina.com/array/array_software/ima-array-analysis-cli/downloads.html) software for free for research use and this includes the array-analysis-cli 2.1.0 which runs natively on Linux\n```\nmkdir -p $HOME/bin && cd /tmp\nwget http://support.illumina.com/softwaredownload.html?assetId=72f8a34f-0933-4256-bad6-73d830436c74&assetDetails=IlluminaMicroarrayAnalyticsArrayAnalysisCLIv2.1LinuxInstaller-2.1-array-analysis-cli-linux-x64-v2.1.0.tar.gz\ntar xzvf array-analysis-cli-linux-x64-v2.1.0.tar.gz -C $HOME/bin/ --strip-components=1\nsed -i -e 
':a' -e 'N' -e '$!ba' -e 's/\\x28\\x89\\x00\\x00\\x0a\\x0A\\x12\\x00\\x72\\xa3\\x15\\x00\\x70\\x28\\x8a\\x00\\x00\\x0a/\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x72\\xfc\\x0d\\x00\\x70\\x00\\x00\\x00\\x00\\x00/' $HOME/bin/array-analysis-cli//ArrayAnalysis.Core.dll\nrm array-analysis-cli-linux-x64-v2.1.0.tar.gz\n```\n\nOnce array-analysis-cli is properly installed in your system, run Illumina's proprietary GenCall algorithm on multiple IDAT file pairs\n```\n$HOME/bin/array-analysis-cli/array-analysis-cli \\\n  genotype call \\\n  --bpm-manifest $bpm_manifest_file \\\n  --cluster-file $egt_cluster_file \\\n  --idat-folder .\n```\nWe cannot change the autosomal call rate threshold to 0.0 to more aggressively call gender in lower quality samples as the default 0.97 value is hardcoded\n\nAcknowledgements\n================\n\nThis work is supported by NIH grant [R01 HG006855](http://grantome.com/grant/NIH/R01-HG006855), NIH grant [R01 MH104964](http://grantome.com/grant/NIH/R01-MH104964), NIH grant [R01 MH123451](http://grantome.com/grant/NIH/R01-MH123451), US Department of Defense Breast Cancer Research Breakthrough Award W81XWH-16-1-0316 (project BC151244), and the Stanley Center for Psychiatric Research\n"
  },
  {
    "path": "affy2vcf.c",
    "content": "/* The MIT License\n\n   Copyright (c) 2018-2025 Giulio Genovese\n\n   Author: Giulio Genovese <giulio.genovese@gmail.com>\n\n   Permission is hereby granted, free of charge, to any person obtaining a copy\n   of this software and associated documentation files (the \"Software\"), to deal\n   in the Software without restriction, including without limitation the rights\n   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n   copies of the Software, and to permit persons to whom the Software is\n   furnished to do so, subject to the following conditions:\n\n   The above copyright notice and this permission notice shall be included in\n   all copies or substantial portions of the Software.\n\n   THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n   THE SOFTWARE.\n\n */\n\n#include <getopt.h>\n#include <errno.h>\n#include <wchar.h>\n#include <sys/resource.h>\n#include <arpa/inet.h>\n#include <htslib/vcf.h>\n#include <htslib/kseq.h>\n#include <htslib/khash_str2int.h>\n#include \"bcftools.h\"\n#include \"gtc2vcf.h\"\n\n#define AFFY2VCF_VERSION \"2025-10-08\"\n\n#define TAG_LIST_DFLT \"GT,CONF,BAF,LRR,NORMX,NORMY,DELTA,SIZE\"\n#define GC_WIN_DFLT \"200\"\n\n#define VERBOSE (1 << 0)\n#define LOAD_CEL (1 << 1)\n#define PROBESET_IDS_LOADED (1 << 2)\n#define CALLS_LOADED (1 << 3)\n#define CONFIDENCES_LOADED (1 << 4)\n#define SUMMARY_LOADED (1 << 5)\n#define SNP_LOADED (1 << 6)\n#define ADJUST_CLUSTERS (1 << 7)\n#define NO_INFO_GC (1 << 8)\n#define FORMAT_GT (1 << 9)\n#define FORMAT_CONF (1 << 10)\n#define FORMAT_BAF (1 << 
11)\n#define FORMAT_LRR (1 << 12)\n#define FORMAT_NORMX (1 << 13)\n#define FORMAT_NORMY (1 << 14)\n#define FORMAT_DELTA (1 << 15)\n#define FORMAT_SIZE (1 << 16)\n\n// #%affymetrix-algorithm-param-apt-opt-use-copynumber-call-codes=0\n// #%call-code-1=NoCall:-1:2\n// #%call-code-2=AA:0:2\n// #%call-code-3=AB:1:2\n// #%call-code-4=BB:2:2\n#define GT_NC -1\n#define GT_AA 0\n#define GT_AB 1\n#define GT_BB 2\n\n// #%max-alleles=4\n// #%max-cn-states=2\n// #%call-code-1=OTV_1:-4:1\n// #%call-code-2=NoCall_1:-3:1\n// #%call-code-3=OTV:-2:2\n// #%call-code-4=NoCall:-1:2\n// #%call-code-5=AA:0:2\n// #%call-code-6=AB:1:2\n// #%call-code-7=BB:2:2\n// #%call-code-8=ZeroCN:3:0\n// #%call-code-9=A:4:1\n// #%call-code-10=B:5:1\n// #%call-code-11=C:6:1\n// #%call-code-12=AC:7:2\n// #%call-code-13=BC:8:2\n// #%call-code-14=CC:9:2\n// #%call-code-15=D:10:1\n// #%call-code-16=AD:11:2\n// #%call-code-17=BD:12:2\n// #%call-code-18=CD:13:2\n// #%call-code-19=DD:14:2\n// #%call-code-20=E:15:1\n// #%call-code-21=AE:16:2\n// #%call-code-22=BE:17:2\n// #%call-code-23=CE:18:2\n// #%call-code-24=DE:19:2\n// #%call-code-25=EE:20:2\n// #%call-code-26=F:21:1\n// #%call-code-27=AF:22:2\n// #%call-code-28=BF:23:2\n// #%call-code-29=CF:24:2\n// #%call-code-30=DF:25:2\n// #%call-code-31=EF:26:2\n// #%call-code-32=FF:27:2\nstatic const int txt_gt[32] = {GT_NC, GT_NC, GT_NC, GT_NC, GT_AA, GT_AB, GT_BB, GT_NC,\n                               GT_AA, GT_BB, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC,\n                               GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC,\n                               GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC};\nstatic const int chp_gt[16] = {-1, -1, -1, -1, -1, -1, GT_AA, GT_BB, GT_AB, -1, -1, GT_NC, -1, -1, -1, -1};\n\n/****************************************\n * hFILE READING FUNCTIONS              *\n ****************************************/\n\n// read long in network order\nstatic inline uint32_t read_long(hFILE *hfile) {\n    uint32_t 
value;\n    read_bytes(hfile, (void *)&value, sizeof(uint32_t));\n    value = ntohl(value);\n    return value;\n}\n\n// read float in network order\nstatic inline float read_float(hFILE *hfile) {\n    union {\n        uint32_t u;\n        float f;\n    } convert;\n    read_bytes(hfile, (void *)&convert.u, sizeof(uint32_t));\n    convert.u = ntohl(convert.u);\n    return convert.f;\n}\n\n// read string in network order\nstatic inline int32_t read_string8(hFILE *hfile, char **buffer) {\n    int32_t len = (int32_t)read_long(hfile);\n    if (len) {\n        *buffer = (char *)malloc((1 + len) * sizeof(char));\n        read_bytes(hfile, (void *)*buffer, len * sizeof(char));\n        (*buffer)[len] = '\\0';\n    } else {\n        *buffer = NULL;\n    }\n    return len;\n}\n\n// read wide-character string in network order\nstatic inline int32_t read_string16(hFILE *hfile, wchar_t **buffer) {\n    int32_t len = (int32_t)read_long(hfile);\n    if (len) {\n        *buffer = (wchar_t *)malloc((1 + len) * sizeof(wchar_t));\n        int i;\n        for (i = 0; i < len; i++) {\n            uint16_t cvalue;\n            read_bytes(hfile, (void *)&cvalue, sizeof(unsigned short));\n            (*buffer)[i] = (wchar_t)ntohs(cvalue);\n        }\n        (*buffer)[len] = L'\\0';\n    } else {\n        *buffer = NULL;\n    }\n    return len;\n}\n\n/****************************************\n * CEL FILE IMPLEMENTATION              *\n ****************************************/\n\n// http://www.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/index.html\n\ntypedef struct {\n    float mean __attribute__((packed));\n    float dev __attribute__((packed));\n    int16_t N;\n} Cell;\n\ntypedef struct {\n    int16_t x;\n    int16_t y;\n} Entry;\n\ntypedef struct {\n    int32_t row;\n    int32_t col;\n    float upper_left_x;\n    float upper_left_y;\n    float upper_right_x;\n    float upper_right_y;\n    float lower_left_x;\n    float lower_left_y;\n    float lower_right_x;\n    
float lower_right_y;\n    int32_t left_cell;\n    int32_t top_cell;\n    int32_t right_cell;\n    int32_t bottom_cell;\n} SubGrid;\n\ntypedef struct {\n    char *fn;\n    hFILE *hfile;\n    int32_t version;\n    int32_t num_rows;\n    int32_t num_cols;\n    int32_t num_cells;\n    int32_t n_header;\n    char *header;\n    int32_t n_algorithm;\n    char *algorithm;\n    int32_t n_parameters;\n    char *parameters;\n    int32_t cell_margin;\n    uint32_t num_outlier_cells;\n    uint32_t num_masked_cells;\n    int32_t num_sub_grids;\n    Cell *cells;\n    Entry *masked_entries;\n    Entry *outlier_entries;\n    SubGrid *sub_grids;\n} xda_cel_t;\n\nstatic xda_cel_t *xda_cel_init(const char *fn, hFILE *hfile, int flags) {\n    xda_cel_t *xda_cel = (xda_cel_t *)calloc(1, sizeof(xda_cel_t));\n    xda_cel->fn = strdup(fn);\n    xda_cel->hfile = hfile;\n\n    int32_t magic;\n    read_bytes(xda_cel->hfile, (void *)&magic, sizeof(int32_t));\n    if (magic != 64) error(\"XDA CEL file %s magic number is %d while it should be 64\\n\", xda_cel->fn, magic);\n\n    read_bytes(xda_cel->hfile, (void *)&xda_cel->version, sizeof(int32_t));\n    if (xda_cel->version != 4)\n        error(\"Cannot read XDA CEL file %s. 
Unsupported XDA CEL file format version: %d\\n\", xda_cel->fn,\n              xda_cel->version);\n\n    read_bytes(xda_cel->hfile, (void *)&xda_cel->num_rows, sizeof(int32_t));\n    read_bytes(xda_cel->hfile, (void *)&xda_cel->num_cols, sizeof(int32_t));\n    read_bytes(xda_cel->hfile, (void *)&xda_cel->num_cells, sizeof(int32_t));\n\n    read_bytes(xda_cel->hfile, (void *)&xda_cel->n_header, sizeof(int32_t));\n    xda_cel->header = (char *)malloc((1 + xda_cel->n_header) * sizeof(char));\n    read_bytes(xda_cel->hfile, (void *)xda_cel->header, xda_cel->n_header * sizeof(char));\n    xda_cel->header[xda_cel->n_header] = '\\0';\n\n    read_bytes(xda_cel->hfile, (void *)&xda_cel->n_algorithm, sizeof(int32_t));\n    xda_cel->algorithm = (char *)malloc((1 + xda_cel->n_algorithm) * sizeof(char));\n    read_bytes(xda_cel->hfile, (void *)xda_cel->algorithm, xda_cel->n_algorithm * sizeof(char));\n    xda_cel->algorithm[xda_cel->n_algorithm] = '\\0';\n\n    read_bytes(xda_cel->hfile, (void *)&xda_cel->n_parameters, sizeof(int32_t));\n    xda_cel->parameters = (char *)malloc((1 + xda_cel->n_parameters) * sizeof(char));\n    read_bytes(xda_cel->hfile, (void *)xda_cel->parameters, xda_cel->n_parameters * sizeof(char));\n    xda_cel->parameters[xda_cel->n_parameters] = '\\0';\n\n    read_bytes(xda_cel->hfile, (void *)&xda_cel->cell_margin, sizeof(int32_t));\n    read_bytes(xda_cel->hfile, (void *)&xda_cel->num_outlier_cells, sizeof(uint32_t));\n    read_bytes(xda_cel->hfile, (void *)&xda_cel->num_masked_cells, sizeof(uint32_t));\n    read_bytes(xda_cel->hfile, (void *)&xda_cel->num_sub_grids, sizeof(int32_t));\n\n    if (flags) return xda_cel;\n\n    xda_cel->cells = (Cell *)malloc(xda_cel->num_cells * sizeof(Cell));\n    read_bytes(xda_cel->hfile, (void *)xda_cel->cells, xda_cel->num_cells * sizeof(Cell));\n\n    xda_cel->masked_entries = (Entry *)malloc(xda_cel->num_masked_cells * sizeof(Entry));\n    read_bytes(xda_cel->hfile, (void *)xda_cel->masked_entries, 
xda_cel->num_masked_cells * sizeof(Entry));\n\n    xda_cel->outlier_entries = (Entry *)malloc(xda_cel->num_outlier_cells * sizeof(Entry));\n    read_bytes(xda_cel->hfile, (void *)xda_cel->outlier_entries, xda_cel->num_outlier_cells * sizeof(Entry));\n\n    xda_cel->sub_grids = (SubGrid *)malloc(xda_cel->num_sub_grids * sizeof(SubGrid));\n    read_bytes(xda_cel->hfile, (void *)xda_cel->sub_grids, xda_cel->num_sub_grids * sizeof(SubGrid));\n\n    if (!heof(xda_cel->hfile))\n        error(\"XDA CEL reader did not reach the end of file %s at position %ld\\n\", xda_cel->fn, htell(xda_cel->hfile));\n\n    return xda_cel;\n}\n\nstatic void xda_cel_destroy(xda_cel_t *xda_cel) {\n    if (!xda_cel) return;\n    free(xda_cel->fn);\n    if (hclose(xda_cel->hfile) < 0) error(\"Error closing XDA CEL file\\n\");\n    free(xda_cel->header);\n    free(xda_cel->algorithm);\n    free(xda_cel->parameters);\n    free(xda_cel->cells);\n    free(xda_cel->masked_entries);\n    free(xda_cel->outlier_entries);\n    free(xda_cel->sub_grids);\n    free(xda_cel);\n}\n\nstatic void xda_cel_print(const xda_cel_t *xda_cel, FILE *stream, int verbose) {\n    fprintf(stream, \"[CEL]\\n\");\n    fprintf(stream, \"Version=3\\n\");\n    fprintf(stream, \"\\n[HEADER]\\n\");\n    fprintf(stream, \"%s\", xda_cel->header);\n    fprintf(stream, \"\\n[INTENSITY]\\n\");\n    fprintf(stream, \"NumberCells=%d\\n\", xda_cel->num_cells);\n    fprintf(stream, \"CellHeader=X\\tY\\tMEAN\\tSTDV\\tNPIXELS\\n\");\n    int i;\n    if (!verbose)\n        fprintf(stream, \"... 
use --verbose to visualize Cell Entries ...\\n\");\n    else\n        for (i = 0; i < xda_cel->num_cells; i++)\n            fprintf(stream, \"%3d\\t%3d\\t%.1f\\t%.1f\\t%3d\\n\", i % xda_cel->num_cols, i / xda_cel->num_cols,\n                    xda_cel->cells[i].mean, xda_cel->cells[i].dev, xda_cel->cells[i].N);\n    fprintf(stream, \"\\n[MASKS]\\n\");\n    fprintf(stream, \"NumberCells=%d\\n\", xda_cel->num_masked_cells);\n    fprintf(stream, \"CellHeader=X\\tY\\n\");\n    if (!verbose)\n        fprintf(stream, \"... use --verbose to visualize Masked Entries ...\\n\");\n    else\n        for (i = 0; i < xda_cel->num_masked_cells; i++)\n            fprintf(stream, \"%d\\t%d\\n\", xda_cel->masked_entries[i].x, xda_cel->masked_entries[i].y);\n    fprintf(stream, \"\\n[OUTLIERS]\\n\");\n    fprintf(stream, \"NumberCells=%d\\n\", xda_cel->num_outlier_cells);\n    fprintf(stream, \"CellHeader=X\\tY\\n\");\n    if (!verbose)\n        fprintf(stream, \"... use --verbose to visualize Outlier Entries ...\\n\");\n    else\n        for (i = 0; i < xda_cel->num_outlier_cells; i++)\n            fprintf(stream, \"%d\\t%d\\n\", xda_cel->outlier_entries[i].x, xda_cel->outlier_entries[i].y);\n    fprintf(stream, \"\\n[MODIFIED]\\n\");\n    fprintf(stream, \"NumberCells=0\\n\");\n    fprintf(stream, \"CellHeader=X\\tY\\tORIGMEAN\\n\");\n}\n\n/****************************************\n * CHP FILE IMPLEMENTATION              *\n ****************************************/\n\n// http://www.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/index.html\n\n#define BYTE 0\n#define UBYTE 1\n#define SHORT 2\n#define USHORT 3\n#define INT 4\n#define UINT 5\n#define FLOAT 6\n#define STRING 7\n#define WSTRING 8\n\ntypedef struct {\n    wchar_t *name;\n    char *value;\n    wchar_t *mime_type;\n    int32_t n_value;\n    int8_t type;\n} Parameter;\n\ntypedef struct DataHeader DataHeader;\n\nstruct DataHeader {\n    char *data_type_identifier;\n    char *guid;\n    wchar_t *datetime;\n  
  wchar_t *locale;\n    int32_t n_parameters;\n    Parameter *parameters;\n    int32_t n_parents;\n    DataHeader *parents;\n};\n\ntypedef struct {\n    wchar_t *name;\n    int8_t type;\n    int32_t size;\n} ColHeader;\n\ntypedef struct {\n    uint32_t pos_first_element;\n    uint32_t pos_next_data_set;\n    wchar_t *name;\n    int32_t n_parameters;\n    Parameter *parameters;\n    uint32_t n_cols;\n    ColHeader *col_headers;\n    uint32_t n_rows;\n    hFILE *hfile; // this should not be destroyed\n    uint32_t n_buffer;\n    uint32_t *col_offsets;\n    char *buffer;\n} DataSet;\n\ntypedef struct {\n    uint32_t pos_next_data_group;\n    uint32_t pos_first_data_set;\n    int32_t num_data_sets;\n    wchar_t *name;\n    DataSet *data_sets;\n} DataGroup;\n\ntypedef struct {\n    wchar_t *name;\n    int8_t type;\n    int32_t size;\n} ColumnHeader;\n\ntypedef struct {\n    char *fn;\n    hFILE *hfile;\n    uint8_t magic;\n    uint8_t version;\n    int32_t num_data_groups;\n    uint32_t pos_first_data_group;\n    DataHeader data_header;\n    DataGroup *data_groups;\n    off_t size;\n    char *display_name;\n} agcc_t;\n\nstatic void agcc_read_parameters(Parameter *parameter, hFILE *hfile, int flags) {\n    read_string16(hfile, &parameter->name);\n    parameter->n_value = read_string8(hfile, &parameter->value);\n    read_string16(hfile, &parameter->mime_type);\n    if (wcscmp(parameter->mime_type, L\"text/x-calvin-integer-8\") == 0)\n        parameter->type = BYTE;\n    else if (wcscmp(parameter->mime_type, L\"text/x-calvin-unsigned-integer-8\") == 0)\n        parameter->type = UBYTE;\n    else if (wcscmp(parameter->mime_type, L\"text/x-calvin-integer-16\") == 0)\n        parameter->type = SHORT;\n    else if (wcscmp(parameter->mime_type, L\"text/x-calvin-unsigned-integer-16\") == 0)\n        parameter->type = USHORT;\n    else if (wcscmp(parameter->mime_type, L\"text/x-calvin-integer-32\") == 0)\n        parameter->type = INT;\n    else if (wcscmp(parameter->mime_type, 
L\"text/x-calvin-unsigned-integer-32\") == 0)\n        parameter->type = UINT;\n    else if (wcscmp(parameter->mime_type, L\"text/x-calvin-float\") == 0)\n        parameter->type = FLOAT;\n    else if (wcscmp(parameter->mime_type, L\"text/ascii\") == 0)\n        parameter->type = STRING;\n    else if (wcscmp(parameter->mime_type, L\"text/plain\") == 0)\n        parameter->type = WSTRING;\n    else\n        error(\"MIME type %ls not allowed\\n\", parameter->mime_type);\n\n    // drop parameters that can increase the size of the header dramatically\n    if (flags && wcsncmp(parameter->name, L\"affymetrix-algorithm-param-apt-opt-cel\", 38) == 0) {\n        free(parameter->name);\n        parameter->name = NULL;\n        parameter->n_value = 0;\n        free(parameter->value);\n        parameter->value = NULL;\n        free(parameter->mime_type);\n        parameter->mime_type = NULL;\n    }\n}\n\nstatic void agcc_read_data_header(DataHeader *data_header, hFILE *hfile, int flags) {\n    int i;\n    read_string8(hfile, &data_header->data_type_identifier);\n    read_string8(hfile, &data_header->guid);\n    read_string16(hfile, &data_header->datetime);\n    read_string16(hfile, &data_header->locale);\n\n    data_header->n_parameters = (int32_t)read_long(hfile);\n    data_header->parameters = (Parameter *)malloc(data_header->n_parameters * sizeof(Parameter));\n    for (i = 0; i < data_header->n_parameters; i++) agcc_read_parameters(&data_header->parameters[i], hfile, flags);\n\n    data_header->n_parents = (int32_t)read_long(hfile);\n    data_header->parents = (DataHeader *)malloc(data_header->n_parents * sizeof(DataHeader));\n    for (i = 0; i < data_header->n_parents; i++) agcc_read_data_header(&data_header->parents[i], hfile, flags);\n}\n\nstatic void agcc_read_data_set(DataSet *data_set, hFILE *hfile, int flags) {\n    int i;\n    data_set->pos_first_element = read_long(hfile);\n    data_set->pos_next_data_set = read_long(hfile);\n    read_string16(hfile, 
&data_set->name);\n\n    data_set->n_parameters = (int32_t)read_long(hfile);\n    data_set->parameters = (Parameter *)malloc(data_set->n_parameters * sizeof(Parameter));\n    for (i = 0; i < data_set->n_parameters; i++) agcc_read_parameters(&data_set->parameters[i], hfile, flags);\n\n    data_set->n_cols = read_long(hfile);\n    data_set->col_headers = (ColHeader *)malloc(data_set->n_cols * sizeof(ColHeader));\n    for (i = 0; i < data_set->n_cols; i++) {\n        read_string16(hfile, &data_set->col_headers[i].name);\n        read_bytes(hfile, (void *)&data_set->col_headers[i].type, sizeof(int8_t));\n        data_set->col_headers[i].size = read_long(hfile);\n    }\n    data_set->n_rows = read_long(hfile);\n\n    data_set->hfile = hfile;\n    data_set->col_offsets = (uint32_t *)malloc(data_set->n_cols * sizeof(uint32_t *));\n    data_set->n_buffer = 0;\n    for (i = 0; i < data_set->n_cols; i++) {\n        data_set->col_offsets[i] = data_set->n_buffer;\n        data_set->n_buffer += data_set->col_headers[i].size;\n    }\n    data_set->buffer = (char *)malloc(data_set->n_buffer * sizeof(char));\n\n    if (data_set->pos_next_data_set)\n        if (hseek(hfile, data_set->pos_next_data_set, SEEK_SET) < 0)\n            error(\"Fail to seek to position %d in AGCC file\\n\", data_set->pos_next_data_set);\n}\n\nstatic void agcc_read_data_group(DataGroup *data_group, hFILE *hfile, int flags) {\n    int i;\n    data_group->pos_next_data_group = read_long(hfile);\n    data_group->pos_first_data_set = read_long(hfile);\n    data_group->num_data_sets = read_long(hfile);\n    read_string16(hfile, &data_group->name);\n    if (hseek(hfile, data_group->pos_first_data_set, SEEK_SET) < 0)\n        error(\"Fail to seek to position %d in AGCC file\\n\", data_group->pos_first_data_set);\n    data_group->data_sets = (DataSet *)malloc(data_group->num_data_sets * sizeof(DataSet));\n    for (i = 0; i < data_group->num_data_sets; i++) agcc_read_data_set(&data_group->data_sets[i], hfile, 
flags);\n    if (data_group->pos_next_data_group)\n        if (hseek(hfile, data_group->pos_next_data_group, SEEK_SET) < 0)\n            error(\"Fail to seek to position %d in AGCC file\\n\", data_group->pos_next_data_group);\n}\n\nstatic agcc_t *agcc_init(const char *fn, hFILE *hfile, int flags) {\n    int i;\n    agcc_t *agcc = (agcc_t *)calloc(1, sizeof(agcc_t));\n    agcc->fn = strdup(fn);\n    agcc->hfile = hfile;\n\n    // read File Header\n    read_bytes(agcc->hfile, (void *)&agcc->magic, sizeof(uint8_t));\n    if (agcc->magic != 59) error(\"AGCC file %s magic number is %d while it should be 59\\n\", agcc->fn, agcc->magic);\n    read_bytes(agcc->hfile, (void *)&agcc->version, sizeof(uint8_t));\n    if (agcc->version != 1)\n        error(\"Cannot read AGCC file %s. Unsupported AGCC file format version: %d\\n\", agcc->fn, agcc->version);\n    agcc->num_data_groups = (int32_t)read_long(agcc->hfile);\n    agcc->pos_first_data_group = read_long(agcc->hfile);\n\n    // read Generic Data Header\n    agcc_read_data_header(&agcc->data_header, agcc->hfile, flags);\n\n    // read Data Groups\n    if (hseek(agcc->hfile, agcc->pos_first_data_group, SEEK_SET) < 0)\n        error(\"Fail to seek to position %d in AGCC %s file\\n\", agcc->pos_first_data_group, agcc->fn);\n    agcc->data_groups = (DataGroup *)malloc(agcc->num_data_groups * sizeof(DataGroup));\n    for (i = 0; i < agcc->num_data_groups; i++) agcc_read_data_group(&agcc->data_groups[i], agcc->hfile, flags);\n\n    if (!heof(agcc->hfile))\n        error(\"AGCC reader did not reach the end of file %s at position %ld\\n\", agcc->fn, htell(agcc->hfile));\n\n    if (hseek(agcc->hfile, 0L, SEEK_END) < 0) error(\"Fail to seek to end of AGCC %s file\\n\", agcc->fn);\n    agcc->size = htell(agcc->hfile);\n\n    char *ptr = strrchr(agcc->fn, '/') ? 
strrchr(agcc->fn, '/') + 1 : agcc->fn;\n    agcc->display_name = strdup(ptr);\n    ptr = strrchr(agcc->display_name, '.');\n    if (ptr && strcmp(ptr + 1, \"chp\") == 0) {\n        *ptr = '\\0';\n        ptr = strrchr(agcc->display_name, '.');\n        if (ptr && (strcmp(ptr + 1, \"AxiomGT1\") == 0 || strcmp(ptr + 1, \"birdseed-v2\") == 0)) *ptr = '\\0';\n    }\n\n    return agcc;\n}\n\nstatic void agcc_destroy_parameters(Parameter *parameters, int32_t n_parameters) {\n    int i;\n    for (i = 0; i < n_parameters; i++) {\n        free(parameters[i].name);\n        free(parameters[i].value);\n        free(parameters[i].mime_type);\n    }\n    free(parameters);\n}\n\nstatic void agcc_destroy_data_header(DataHeader *data_header) {\n    int i;\n    free(data_header->data_type_identifier);\n    free(data_header->guid);\n    free(data_header->datetime);\n    free(data_header->locale);\n    agcc_destroy_parameters(data_header->parameters, data_header->n_parameters);\n    for (i = 0; i < data_header->n_parents; i++) agcc_destroy_data_header(&data_header->parents[i]);\n    free(data_header->parents);\n}\n\nstatic void agcc_destroy_data_set(DataSet *data_set) {\n    int i;\n    free(data_set->name);\n    agcc_destroy_parameters(data_set->parameters, data_set->n_parameters);\n    for (i = 0; i < data_set->n_cols; i++) free(data_set->col_headers[i].name);\n    free(data_set->col_headers);\n    free(data_set->col_offsets);\n    free(data_set->buffer);\n}\n\nstatic void agcc_destroy_data_group(DataGroup *data_group) {\n    int i;\n    free(data_group->name);\n    for (i = 0; i < data_group->num_data_sets; i++) agcc_destroy_data_set(&data_group->data_sets[i]);\n    free(data_group->data_sets);\n}\n\nstatic void agcc_destroy(agcc_t *agcc) {\n    if (!agcc) return;\n    int i;\n    free(agcc->fn);\n    if (hclose(agcc->hfile) < 0) error(\"Error closing AGCC file\\n\");\n    agcc_destroy_data_header(&agcc->data_header);\n    for (i = 0; i < agcc->num_data_groups; i++) 
agcc_destroy_data_group(&agcc->data_groups[i]);\n    free(agcc->data_groups);\n    free(agcc->display_name);\n    free(agcc);\n}\n\nstatic void buffer_string16(const uint16_t *value, int32_t n_value, size_t *m_buffer, wchar_t **buffer) {\n    int i;\n    hts_expand(wchar_t, n_value / 2 + 1, *m_buffer, *buffer);\n    for (i = 0; i < n_value / 2; i++) (*buffer)[i] = (wchar_t)ntohs(value[i]);\n    (*buffer)[n_value / 2] = L'\\0';\n}\n\nstatic void agcc_print_parameters(const Parameter *parameters, int32_t n_parameters, FILE *stream) {\n    int i;\n    union {\n        uint32_t u;\n        float f;\n    } convert;\n    wchar_t *buffer = NULL;\n    size_t m_buffer = 0;\n    for (i = 0; i < n_parameters; i++) {\n        fprintf(stream, \"#%%%ls=\", parameters[i].name ? parameters[i].name : L\"\");\n        switch (parameters[i].type) {\n        case BYTE:\n            fprintf(stream, \"%d\\n\", (int8_t)ntohl(*(uint32_t *)parameters[i].value));\n            break;\n        case UBYTE:\n            fprintf(stream, \"%u\\n\", (uint8_t)ntohl(*(uint32_t *)parameters[i].value));\n            break;\n        case SHORT:\n            fprintf(stream, \"%d\\n\", (int16_t)ntohl(*(uint32_t *)parameters[i].value));\n            break;\n        case USHORT:\n            fprintf(stream, \"%u\\n\", (uint16_t)ntohl(*(uint32_t *)parameters[i].value));\n            break;\n        case INT:\n            fprintf(stream, \"%d\\n\", (int32_t)ntohl(*(uint32_t *)parameters[i].value));\n            break;\n        case UINT:\n            fprintf(stream, \"%u\\n\", ntohl(*(uint32_t *)parameters[i].value));\n            break;\n        case FLOAT:\n            convert.u = ntohl(*(uint32_t *)parameters[i].value);\n            fprintf(stream, \"%f\\n\", convert.f);\n            break;\n        case STRING:\n            fprintf(stream, \"%s\\n\", parameters[i].value);\n            break;\n        case WSTRING:\n            buffer_string16((uint16_t *)parameters[i].value, parameters[i].n_value, 
&m_buffer, &buffer);\n            fprintf(stream, \"%ls\\n\", buffer);\n            break;\n        default:\n            break;\n        }\n    }\n    free(buffer);\n}\n\nstatic void agcc_print_data_header(const DataHeader *data_header, FILE *stream) {\n    int i;\n    if (data_header->guid) fprintf(stream, \"#%%FileIdentifier=%s\\n\", data_header->guid);\n    fprintf(stream, \"#%%FileTypeIdentifier=%s\\n\", data_header->data_type_identifier);\n    fprintf(stream, \"#%%FileLocale=%ls\\n\", data_header->locale);\n    agcc_print_parameters(data_header->parameters, data_header->n_parameters, stream);\n    for (i = 0; i < data_header->n_parents; i++) agcc_print_data_header(&data_header->parents[i], stream);\n}\n\ntypedef void (*col_print_t)(const char *, FILE *stream);\n\nvoid agcc_print_probe_set_name(const char *s, FILE *stream) {\n    uint32_t size = ntohl(*(uint32_t *)s);\n    fwrite(s + 4, 1, size, stream);\n}\n\nvoid agcc_print_call(const char *s, FILE *stream) {\n    static const char a[16] = \"......ABA..N....\";\n    static const char b[16] = \"......ABB..C....\";\n    int c = s[0] & 0x0F;\n    fputc(a[c], stream);\n    fputc(b[c], stream);\n}\n\nvoid agcc_print_float(const char *s, FILE *stream) {\n    union {\n        uint32_t u;\n        float f;\n    } convert;\n    convert.u = ntohl(*(uint32_t *)s);\n    fprintf(stream, \"%g\", convert.f);\n}\n\nstatic void agcc_print_data_set(const DataSet *data_set, FILE *stream, int verbose) {\n    fprintf(stream, \"#%%SetName=%ls\\n\", data_set->name);\n    fprintf(stream, \"#%%Columns=%d\\n\", data_set->n_cols);\n    fprintf(stream, \"#%%Rows=%d\\n\", data_set->n_rows);\n    int i, j;\n    agcc_print_parameters(data_set->parameters, data_set->n_parameters, stream);\n    for (i = 0; i < data_set->n_cols; i++)\n        fprintf(stream, \"%ls%c\", data_set->col_headers[i].name, i + 1 < data_set->n_cols ? '\\t' : '\\n');\n    if (data_set->n_rows == 0) return;\n\n    if (!verbose) {\n        fprintf(stream, \"... 
use --verbose to visualize Data Set ...\\n\");\n        return;\n    }\n    if (wcscmp(data_set->name, L\"Genotype\") != 0) {\n        fprintf(stream, \"... can only visualize Genotype Data Set ...\\n\");\n        return;\n    }\n\n    char *col_ends = (char *)malloc(data_set->n_cols * sizeof(char *));\n    col_print_t *col_prints = (col_print_t *)malloc(data_set->n_cols * sizeof(col_print_t *));\n    for (i = 0; i < data_set->n_cols; i++) {\n        col_ends[i] = i + 1 < data_set->n_cols ? '\\t' : '\\n';\n        if (wcscmp(data_set->col_headers[i].name, L\"ProbeSetName\") == 0)\n            col_prints[i] = agcc_print_probe_set_name;\n        else if (wcscmp(data_set->col_headers[i].name, L\"Call\") == 0)\n            col_prints[i] = agcc_print_call;\n        else if (wcscmp(data_set->col_headers[i].name, L\"Confidence\") == 0)\n            col_prints[i] = agcc_print_float;\n        else if (wcscmp(data_set->col_headers[i].name, L\"Contrast\") == 0)\n            col_prints[i] = agcc_print_float;\n        else if (wcscmp(data_set->col_headers[i].name, L\"Log Ratio\") == 0)\n            col_prints[i] = agcc_print_float;\n        else if (wcscmp(data_set->col_headers[i].name, L\"Strength\") == 0)\n            col_prints[i] = agcc_print_float;\n        else if (wcscmp(data_set->col_headers[i].name, L\"Signal A\") == 0)\n            col_prints[i] = agcc_print_float;\n        else if (wcscmp(data_set->col_headers[i].name, L\"Signal B\") == 0)\n            col_prints[i] = agcc_print_float;\n        else if (wcscmp(data_set->col_headers[i].name, L\"Forced Call\") == 0)\n            col_prints[i] = agcc_print_call;\n        else\n            error(\"Unknown column type %ls in AGCC file with type %d\\n\", data_set->col_headers[i].name,\n                  data_set->col_headers[i].type);\n    }\n    if (hseek(data_set->hfile, data_set->pos_first_element, SEEK_SET) < 0)\n        error(\"Fail to seek to position %d in AGCC file\\n\", data_set->pos_first_element);\n    for (i = 
0; i < data_set->n_rows; i++) {\n        read_bytes(data_set->hfile, (void *)data_set->buffer, data_set->n_buffer);\n        for (j = 0; j < data_set->n_cols; j++) {\n            col_prints[j](data_set->buffer + data_set->col_offsets[j], stream);\n            fputc(col_ends[j], stream);\n        }\n    }\n    free(col_ends);\n    free(col_prints);\n}\n\nstatic void agcc_print_data_group(const DataGroup *data_group, FILE *stream, int verbose) {\n    fprintf(stream, \"#%%GroupName=%ls\\n\", data_group->name);\n    int i;\n    for (i = 0; i < data_group->num_data_sets; i++) agcc_print_data_set(&data_group->data_sets[i], stream, verbose);\n}\n\nstatic void agcc_print(const agcc_t *agcc, FILE *stream, int verbose) {\n    fprintf(stream, \"#%%File=%s\\n\", agcc->fn);\n    fprintf(stream, \"#%%FileSize=%ld\\n\", agcc->size);\n    fprintf(stream, \"#%%Magic=%d\\n\", agcc->magic);\n    fprintf(stream, \"#%%Version=%d\\n\", agcc->version);\n    int i;\n    agcc_print_data_header(&agcc->data_header, stream);\n    for (i = 0; i < agcc->num_data_groups; i++) agcc_print_data_group(&agcc->data_groups[i], stream, verbose);\n}\n\nstatic void chps_to_tsv(uint8_t *magic, agcc_t **agcc, int n, FILE *stream) {\n    int i, j, k;\n    // AxiomGT1 analysis has also cn-probe-chrXY-ratio_gender_meanX,\n    // cn-probe-chrXY-ratio_gender_meanY, cn-probe-chrXY-ratio_gender_ratio,\n    // cn-probe-chrXY-ratio_gender while BRLMM-P analysis has also em-cluster-chrX-het-contrast_gender\n    // em-cluster-chrX-het-contrast_gender_chrX_het_rate\n    // pm_mean\n    static const wchar_t *chipsummary[] = {L\"computed_gender\",\n                                           L\"call_rate\",\n                                           L\"total_call_rate\",\n                                           L\"het_rate\",\n                                           L\"total_het_rate\",\n                                           L\"hom_rate\",\n                                           L\"total_hom_rate\",\n      
                                     L\"cluster_distance_mean\",\n                                           L\"cluster_distance_stdev\",\n                                           L\"allele_summarization_mean\",\n                                           L\"allele_summarization_stdev\",\n                                           L\"allele_deviation_mean\",\n                                           L\"allele_deviation_stdev\",\n                                           L\"allele_mad_residuals_mean\",\n                                           L\"allele_mad_residuals_stdev\"};\n    fputs(\"chp\", stream);\n    for (j = 0; j < 15; j++) fprintf(stream, \"\\t%ls\", chipsummary[j]);\n    fputc('\\n', stream);\n    for (i = 0; i < n; i++) {\n        if (magic[i] != 59) continue;\n        if (strcmp(agcc[i]->data_header.data_type_identifier, \"affymetrix-multi-data-type-analysis\") != 0) {\n            if (strcmp(agcc[i]->data_header.data_type_identifier, \"affymetrix-calvin-intensity\") == 0\n                || strcmp(agcc[i]->data_header.data_type_identifier, \"affymetrix-calvin-multi-intensity\") == 0)\n                error(\n                    \"AGCC file %s contains calvin intensities rather multi data type analysis (use --cel to extract \"\n                    \"metadata)\\n\",\n                    agcc[i]->fn);\n            else\n                error(\"AGCC file %s does not contain multi data type analysis as data type identifier is %s\\n\",\n                      agcc[i]->fn, agcc[i]->data_header.data_type_identifier);\n        }\n        fputs(strrchr(agcc[i]->fn, '/') ? 
strrchr(agcc[i]->fn, '/') + 1 : agcc[i]->fn, stream);\n        DataHeader *data_header = &agcc[i]->data_header;\n        for (j = 0, k = 0; j < 15; j++) {\n            fputc('\\t', stream);\n            while (!data_header->parameters[k].name\n                   || wcsncmp(data_header->parameters[k].name, L\"affymetrix-chipsummary-\", 23) != 0\n                   || wcscmp(&data_header->parameters[k].name[23], chipsummary[j]) != 0) {\n                k++;\n                k %= data_header->n_parameters;\n            }\n            union {\n                uint32_t u;\n                float f;\n            } convert;\n            switch (data_header->parameters[k].type) {\n            case FLOAT:\n                convert.u = ntohl(*(uint32_t *)data_header->parameters[k].value);\n                fprintf(stream, \"%.5f\", convert.f);\n                break;\n            case STRING:\n                fputs(data_header->parameters[k].value, stream);\n                break;\n            default:\n                error(\"Unable to print parameter of type %d from %s AGCC file\\n\", data_header->parameters[k].type,\n                      agcc[i]->fn);\n                break;\n            }\n        }\n        fputc('\\n', stream);\n    }\n}\n\n/****************************************\n * PRINT CEL SUMMARY                    *\n ****************************************/\n\n// this function returns\n// fusion-experiment-name\n// pixel-cols\n// pixel-rows\n// XIN\n// YIN\n// VE\n// temp\n// power\n// scan-date\n// scanner-id\n// scanner-type\n// array-type\nstatic void parse_dat_header(char *dat_header, char *str[12], int n_str[12]) {\n    char *ss = strchr(dat_header, ' ') + 2;\n    char *se = strchr(dat_header, '\\0');\n    if (!se) goto fail;\n\n    se = strchr(ss, ':');\n    if (!se) goto fail;\n    str[0] = ss;\n    n_str[0] = se - ss;\n\n    ss = se + 5;\n    for (se = ss + 4; isspace(*se) && se >= ss; se--);\n    str[1] = ss;\n    n_str[1] = se - ss + 1;\n\n    ss = ss 
+ 9;\n    for (se = ss + 4; isspace(*se) && se >= ss; se--);\n    str[2] = ss;\n    n_str[2] = se - ss + 1;\n\n    ss = ss + 9;\n    for (se = ss + 2; isspace(*se) && se >= ss; se--);\n    str[3] = ss;\n    n_str[3] = se - ss + 1;\n\n    ss = ss + 7;\n    for (se = ss + 2; isspace(*se) && se >= ss; se--);\n    str[4] = ss;\n    n_str[4] = se - ss + 1;\n\n    ss = ss + 6;\n    for (se = ss + 2; isspace(*se) && se >= ss; se--);\n    str[5] = ss;\n    n_str[5] = se - ss + 1;\n\n    ss = ss + 3;\n    for (se = ss + 6; isspace(*se) && se >= ss; se--);\n    str[6] = ss;\n    n_str[6] = se - ss + 1;\n\n    ss = ss + 7;\n    for (se = ss + 3; isspace(*se) && se >= ss; se--);\n    str[7] = ss;\n    n_str[7] = se - ss + 1;\n\n    ss = ss + 4;\n    for (se = ss + 17; isspace(*se) && se >= ss; se--);\n    str[8] = ss;\n    n_str[8] = se - ss + 1;\n\n    ss = ss + 18;\n    se = strchr(ss, ' ');\n    if (!se) goto fail;\n    str[9] = ss;\n    n_str[9] = se - ss;\n\n    ss = se + 2;\n    se = strstr(ss, \"\\x14 \");\n    if (!se) goto fail;\n    for (se--; isspace(*se) && se >= ss; se--);\n    str[10] = ss;\n    n_str[10] = se - ss + 1;\n\n    se = strstr(ss, \"\\x14 \");\n    if (!se) goto fail;\n    ss = se + 2;\n    se = strstr(ss, \"\\x14 \");\n    if (!se) goto fail;\n    ss = se + 2;\n    se = strstr(ss, \".1sq\");\n    if (!se) goto fail;\n    str[11] = ss;\n    n_str[11] = se - ss;\n\n    return;\n\nfail:\n    error(\"DAT header malformed\\n\");\n}\n\n// http://github.com/HenrikBengtsson/affxparser/blob/master/R/parseDatHeaderString.R\nstatic void cels_to_tsv(uint8_t *magic, void **files, int n, FILE *stream) {\n    int i, j;\n    wchar_t *array_type = NULL;             // affymetrix-array-type\n    wchar_t *scanner_type = NULL;           // affymetrix-scanner-type\n    wchar_t *scanner_id = NULL;             // affymetrix-scanner-id\n    wchar_t *scan_date = NULL;              // affymetrix-scan-date\n    wchar_t *fusion_experiment_name = NULL; // 
affymetrix-fusion-experiment-name\n    size_t m_array_type = 0, m_scanner_type = 0, m_scanner_id = 0, m_scan_date = 0, m_fusion_experiment_name = 0;\n    int32_t pixel_rows = 0; // affymetrix-pixel-rows\n    int32_t pixel_cols = 0; // affymetrix-pixel-cols\n\n    char *str[12];\n    int n_str[12];\n\n    fprintf(stream,\n            \"cel\\tarray_type\\tscanner_type\\tscanner_id\\tscan_date\\tfusion_experiment_name\\tpixel_rows\\tpixel_cols\\n\");\n    for (i = 0; i < n; i++) {\n        char *ss, *se;\n        agcc_t *agcc = (agcc_t *)files[i];\n        xda_cel_t *xda_cel = (xda_cel_t *)files[i];\n        switch (magic[i]) {\n        case 59:\n            if (strcmp(agcc->data_header.data_type_identifier, \"affymetrix-calvin-intensity\") != 0\n                && strcmp(agcc->data_header.data_type_identifier, \"affymetrix-calvin-multi-intensity\") != 0)\n                error(\"AGCC file %s does not contain calvin intensities as data type identifier is %s\\n\", agcc->fn,\n                      agcc->data_header.data_type_identifier);\n            if (agcc->data_header.n_parents == 0\n                || (strcmp(agcc->data_header.parents[0].data_type_identifier, \"affymetrix-calvin-scan-acquisition\") != 0\n                    && strcmp(agcc->data_header.parents[0].data_type_identifier,\n                              \"affymetrix-calvin-multi-scan-acquisition\")\n                           != 0))\n                error(\"AGCC file %s is missing scan acquisition information as data type identifier is %s\\n\", agcc->fn,\n                      agcc->data_header.parents[0].data_type_identifier);\n\n            const Parameter *parameter;\n            for (j = 0; j < agcc->data_header.parents[0].n_parameters; j++) {\n                parameter = &agcc->data_header.parents[0].parameters[j];\n                if (wcscmp(parameter->name, L\"affymetrix-array-type\") == 0 && parameter->type == WSTRING)\n                    buffer_string16((uint16_t *)parameter->value, 
parameter->n_value, &m_array_type, &array_type);\n                else if (wcscmp(parameter->name, L\"affymetrix-scanner-type\") == 0 && parameter->type == WSTRING)\n                    buffer_string16((uint16_t *)parameter->value, parameter->n_value, &m_scanner_type, &scanner_type);\n                else if (wcscmp(parameter->name, L\"affymetrix-scanner-id\") == 0 && parameter->type == WSTRING)\n                    buffer_string16((uint16_t *)parameter->value, parameter->n_value, &m_scanner_id, &scanner_id);\n                else if (wcscmp(parameter->name, L\"affymetrix-scan-date\") == 0 && parameter->type == WSTRING)\n                    buffer_string16((uint16_t *)parameter->value, parameter->n_value, &m_scan_date, &scan_date);\n                else if (wcscmp(parameter->name, L\"affymetrix-fusion-experiment-name\") == 0\n                         && parameter->type == WSTRING)\n                    buffer_string16((uint16_t *)parameter->value, parameter->n_value, &m_fusion_experiment_name,\n                                    &fusion_experiment_name);\n                if (wcscmp(parameter->name, L\"affymetrix-pixel-rows\") == 0 && parameter->type == INT)\n                    pixel_rows = (int32_t)ntohl(*(uint32_t *)parameter->value);\n                if (wcscmp(parameter->name, L\"affymetrix-pixel-cols\") == 0 && parameter->type == INT)\n                    pixel_cols = (int32_t)ntohl(*(uint32_t *)parameter->value);\n            }\n            fputs(strrchr(agcc->fn, '/') ? 
strrchr(agcc->fn, '/') + 1 : agcc->fn, stream);\n            fputc('\\t', stream);\n            if (array_type) {\n                fprintf(stream, \"%ls\", array_type);\n                array_type[0] = L'\\0';\n            }\n            fputc('\\t', stream);\n            if (scanner_type) {\n                fprintf(stream, \"%ls\", scanner_type);\n                scanner_type[0] = L'\\0';\n            }\n            fputc('\\t', stream);\n            if (scanner_id) {\n                fprintf(stream, \"%ls\", scanner_id);\n                scanner_id[0] = L'\\0';\n            }\n            fputc('\\t', stream);\n            if (scan_date) {\n                fprintf(stream, \"%ls\", scan_date);\n                scan_date[0] = L'\\0';\n            }\n            fputc('\\t', stream);\n            if (fusion_experiment_name) {\n                fprintf(stream, \"%ls\", fusion_experiment_name);\n                fusion_experiment_name[0] = L'\\0';\n            }\n            fputc('\\t', stream);\n            if (pixel_rows) {\n                fprintf(stream, \"%d\", pixel_rows);\n                pixel_rows = 0;\n            }\n            fputc('\\t', stream);\n            if (pixel_cols) {\n                fprintf(stream, \"%d\", pixel_cols);\n                pixel_cols = 0;\n            }\n            fputc('\\n', stream);\n            break;\n        case 64:\n            ss = strstr(xda_cel->header, \"\\nDatHeader=[\");\n            if (!ss) error(\"XDA CEL file %s is missing DAT header\\n\", xda_cel->fn);\n            ss = strchr(ss + 12, ']');\n            if (!ss) error(\"XDA CEL file %s is missing DAT header\\n\", xda_cel->fn);\n            ss++;\n            se = strchr(ss, '\\n');\n            if (!se) error(\"XDA CEL file %s is missing DAT header\\n\", xda_cel->fn);\n            *se = '\\0';\n            parse_dat_header(ss, str, n_str);\n            *se = '\\n';\n            fprintf(stream, \"%s\\t%.*s\\t%.*s\\t%.*s\\t%.*s\\t%.*s\\t%.*s\\t%.*s\\n\",\n       
             strrchr(xda_cel->fn, '/') ? strrchr(xda_cel->fn, '/') + 1 : xda_cel->fn, n_str[11], str[11],\n                    n_str[10], str[10], n_str[9], str[9], n_str[8], str[8], n_str[0], str[0], n_str[1], str[1],\n                    n_str[2], str[2]);\n            break;\n        default:\n            break;\n        }\n    }\n    free(array_type);\n    free(scanner_type);\n    free(scanner_id);\n    free(scan_date);\n    free(fusion_experiment_name);\n}\n\n/****************************************\n * htsFILE READING FUNCTIONS            *\n ****************************************/\n\nstatic htsFile *unheader(const char *fn, kstring_t *str) {\n    htsFile *fp = hts_open(fn, \"r\");\n    if (fp == NULL) error(\"Could not open %s: %s\\n\", fn, strerror(errno));\n\n    do // skip header\n        if (hts_getline(fp, KS_SEP_LINE, str) <= 0) error(\"Empty file: %s\\n\", fn);\n    while (str->s[0] == '#');\n\n    return fp;\n}\n\n/************************************************\n * PROBEST IDS FILE IMPLEMENTATION              *\n ************************************************/\n\nstatic void *probeset_ids_init(const char *fn) {\n    void *probeset_ids = khash_str2int_init();\n    kstring_t str = {0, 0, NULL};\n    htsFile *fp = unheader(fn, &str);\n    int moff = 0, *off = NULL, ncols;\n    ncols = ksplit_core(str.s, '\\t', &moff, &off);\n    if (ncols < 1 || strcmp(&str.s[off[0]], \"probeset_id\"))\n        error(\"Malformed first line from probeset IDs file: %s\\n%s\\n\", fn, str.s);\n    while (hts_getline(fp, KS_SEP_LINE, &str) > 0) {\n        ncols = ksplit_core(str.s, '\\t', &moff, &off);\n        if (khash_str2int_has_key(probeset_ids, &str.s[off[0]]))\n            error(\"Probe Set %s present multiple times in file %s\\n\", &str.s[off[0]], fn);\n        khash_str2int_inc(probeset_ids, strdup(&str.s[off[0]]));\n    }\n    free(off);\n    free(str.s);\n    hts_close(fp);\n    return probeset_ids;\n}\n\n/************************************************\n 
* SNP CLUSTER POSTERIORS FILE IMPLEMENTATION   *\n ************************************************/\n\n// http://www.affymetrix.com/support/developer/powertools/changelog/SnpModelConverter_8cpp_source.html\n\ntypedef struct {\n    float xm;   // delta mean of cluster\n    float xss;  // delta variance of cluster\n    float k;    // strength of mean (pseudo-observations)\n    float v;    // strength of variance (pseudo-observations)\n    float ym;   // size mean of cluster in other dimension\n    float yss;  // size variance of cluster in other dimension\n    float xyss; // covariance of cluster in both directions\n} cluster_t;\n\ntypedef struct {\n    char *probeset_id;\n    int copynumber;\n    cluster_t aa;\n    cluster_t ab;\n    cluster_t bb;\n} snp_t;\n\ntypedef struct {\n    int is_birdseed;\n    void *probeset_id[2];\n    snp_t *snps[2];\n    int n_snps[2];\n    int m_snps[2];\n} snp_models_t;\n\nstatic inline void brlmmp_cluster_init(const char *s, const int *off, cluster_t *cluster) {\n    cluster->xm = strtof(&s[off[0]], NULL);\n    cluster->xss = strtof(&s[off[1]], NULL);\n    cluster->k = strtof(&s[off[2]], NULL);\n    cluster->v = strtof(&s[off[3]], NULL);\n    cluster->ym = strtof(&s[off[4]], NULL);\n    cluster->yss = strtof(&s[off[5]], NULL);\n    cluster->xyss = strtof(&s[off[6]], NULL);\n}\n\nstatic inline void birdseed_cluster_init(const char *s, const int *off, cluster_t *cluster) {\n    cluster->xm = strtof(&s[off[0]], NULL);\n    cluster->ym = strtof(&s[off[1]], NULL);\n    cluster->xss = strtof(&s[off[2]], NULL);\n    cluster->xyss = strtof(&s[off[3]], NULL);\n    cluster->yss = strtof(&s[off[4]], NULL);\n    cluster->k = strtof(&s[off[5]], NULL);\n    cluster->v = strtof(&s[off[5]], NULL);\n}\n\nstatic snp_models_t *snp_models_init(const char *fn) {\n    int i;\n    snp_models_t *snp_models = (snp_models_t *)calloc(1, sizeof(snp_models_t));\n    for (i = 0; i < 2; i++) {\n        snp_models->probeset_id[i] = khash_str2int_init();\n    }\n\n 
   kstring_t str = {0, 0, NULL};\n    htsFile *fp = unheader(fn, &str);\n\n    int sep1, sep2, sep3, exp_cols;\n    if (strcmp(str.s, \"id\\tBB\\tAB\\tAA\\tCV\") == 0 || strcmp(str.s, \"id\\tBB\\tAB\\tAA\\tCV\\tOTV\") == 0) {\n        if (hts_getline(fp, KS_SEP_LINE, &str) <= 0) error(\"Missing information in SNP posteriors file: %s\\n\", fn);\n        sep1 = '\\t';\n        sep2 = ',';\n        sep3 = ':';\n        exp_cols = 7;\n    } else if (!strchr(str.s, '\\t')) {\n        snp_models->is_birdseed = 1;\n        sep1 = ';';\n        sep2 = ' ';\n        sep3 = '-';\n        exp_cols = 6;\n    } else {\n        error(\"Malformed header line in SNP model file %s:\\n%s\\n\", fn, str.s);\n    }\n\n    snp_t *snp;\n    int moff1 = 0, *off1 = NULL, ncols1;\n    int moff2 = 0, *off2 = NULL, ncols2;\n    do {\n        ncols1 = ksplit_core(str.s, sep1, &moff1, &off1);\n        char *col_str = &str.s[off1[0]];\n\n        int len = strlen(col_str);\n        int copynumber;\n        if (col_str[len - 2] == sep3) {\n            char *tmp;\n            copynumber = strtol(&col_str[len - 1], &tmp, 0);\n            if (*tmp) error(\"Could not parse copynumber %s from file: %s\\n\", &col_str[len - 1], fn);\n            len -= 2;\n            col_str[len] = '\\0';\n        } else {\n            copynumber = 2;\n        }\n\n        int idx = copynumber == 2;\n        hts_expand(snp_t, snp_models->n_snps[idx] + 1, snp_models->m_snps[idx], snp_models->snps[idx]);\n        snp = &snp_models->snps[idx][snp_models->n_snps[idx]];\n        snp->probeset_id = strdup(&str.s[off1[0]]);\n        snp->copynumber = copynumber;\n        if (khash_str2int_has_key(snp_models->probeset_id[idx], snp->probeset_id))\n            error(\"Probe Set %s present multiple times in file %s\\n\", snp->probeset_id, fn);\n        khash_str2int_inc(snp_models->probeset_id[idx], snp->probeset_id);\n\n        if (ncols1 < 4 - (2 - copynumber) * snp_models->is_birdseed)\n            error(\"Missing information 
for probeset %s in SNP posteriors file: %s\\n\", str.s, fn);\n        col_str = &str.s[off1[1]];\n        ncols2 = ksplit_core(col_str, sep2, &moff2, &off2);\n\n        if (ncols2 < exp_cols) error(\"Missing information for probeset %s in SNP posteriors file: %s\\n\", str.s, fn);\n        if (snp_models->is_birdseed)\n            birdseed_cluster_init(col_str, off2, &snp->aa);\n        else\n            brlmmp_cluster_init(col_str, off2, &snp->bb);\n\n        col_str = &str.s[off1[2]];\n        if (snp_models->is_birdseed && copynumber == 1) {\n            snp->ab.xm = NAN;\n            snp->ab.xss = NAN;\n            snp->ab.k = NAN;\n            snp->ab.v = NAN;\n            snp->ab.ym = NAN;\n            snp->ab.yss = NAN;\n            snp->ab.xyss = NAN;\n        } else {\n            ncols2 = ksplit_core(col_str, sep2, &moff2, &off2);\n            if (ncols2 < exp_cols) error(\"Missing information for probeset %s in SNP posteriors file: %s\\n\", str.s, fn);\n            if (snp_models->is_birdseed)\n                birdseed_cluster_init(col_str, off2, &snp->ab);\n            else\n                brlmmp_cluster_init(col_str, off2, &snp->ab);\n            col_str = &str.s[off1[3]];\n        }\n\n        ncols2 = ksplit_core(col_str, sep2, &moff2, &off2);\n        if (ncols2 < exp_cols) error(\"Missing information for probeset %s in SNP posteriors file: %s\\n\", str.s, fn);\n        if (snp_models->is_birdseed)\n            birdseed_cluster_init(col_str, off2, &snp->bb);\n        else\n            brlmmp_cluster_init(col_str, off2, &snp->aa);\n\n        snp_models->n_snps[idx]++;\n    } while (hts_getline(fp, KS_SEP_LINE, &str) > 0);\n\n    free(off2);\n    free(off1);\n    free(str.s);\n    hts_close(fp);\n    return snp_models;\n}\n\nstatic void snp_models_destroy(snp_models_t *snp_models) {\n    int i, j;\n    for (i = 0; i < 2; i++) {\n        khash_str2int_destroy(snp_models->probeset_id[i]);\n        for (j = 0; j < snp_models->n_snps[i]; j++) 
free(snp_models->snps[i][j].probeset_id);\n        free(snp_models->snps[i]);\n    }\n    free(snp_models);\n}\n\n/****************************************\n * ANNOT.CSV FILE IMPLEMENTATION        *\n ****************************************/\n\ntypedef struct {\n    char *probeset_id;\n    char *affy_snp_id;\n    char *dbsnp_rs_id;\n    char *chromosome;\n    int position;\n    int strand;\n    char *flank;\n} record_t;\n\ntypedef struct {\n    void *probeset_id;\n    record_t *records;\n    int n_records, m_records;\n} annot_t;\n\nstatic inline char *unquote(char *str) {\n    if (strcmp(str, \"\\\"---\\\"\") == 0) return NULL;\n    char *ptr = strrchr(str, '\"');\n    if (ptr) *ptr = '\\0';\n    return str + 1;\n}\n\nstatic annot_t *annot_init(const char *fn, const char *sam_fn, const char *out_fn, int flags) {\n    annot_t *annot = NULL;\n    FILE *out_txt = get_file_handle(out_fn);\n    htsFile *hts = NULL;\n    sam_hdr_t *sam_hdr = NULL;\n    bam1_t *b = NULL;\n    if (sam_fn) {\n        hts = hts_open(sam_fn, \"r\");\n        if (hts == NULL || hts_get_format(hts)->category != sequence_data)\n            error(\"File %s does not contain sequence data\\n\", sam_fn);\n        sam_hdr = sam_hdr_read(hts);\n        if (sam_hdr == NULL) error(\"Reading header from \\\"%s\\\" failed\", sam_fn);\n        b = bam_init1();\n        if (b == NULL) error(\"Cannot create SAM record\\n\");\n    }\n    kstring_t str = {0, 0, NULL};\n\n    htsFile *fp = hts_open(fn, \"r\");\n    if (!fp) error(\"Could not read: %s\\n\", fn);\n    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0) error(\"Empty file: %s\\n\", fn);\n    const char *null_strand = \"---\";\n    while (str.s[0] == '#') {\n        if (strcmp(str.s, \"#%netaffx-annotation-tabular-format-version=1.0\") == 0) null_strand = \"---\";\n        if (strcmp(str.s, \"#%netaffx-annotation-tabular-format-version=1.5\") == 0) null_strand = \"+\";\n        if (hts && out_txt) fprintf(out_txt, \"%s\\n\", str.s);\n        
hts_getline(fp, KS_SEP_LINE, &str);\n    }\n\n    if (hts && out_txt) fprintf(out_txt, \"%s\\n\", str.s);\n\n    int probe_set_id_idx = -1;\n    int affy_snp_id_idx = -1;\n    int dbsnp_rs_id_idx = -1;\n    int chromosome_idx = -1;\n    int position_idx = -1;\n    int position_end_idx = -1;\n    int strand_idx = -1;\n    int flank_idx = -1;\n    int allele_a_idx = -1;\n    int allele_b_idx = -1;\n\n    int i, moff = 0, *off = NULL;\n    int ncols = ksplit_core(str.s, ',', &moff, &off);\n    for (i = 0; i < ncols; i++) {\n        if (strcmp(&str.s[off[i]], \"\\\"Probe Set ID\\\"\") == 0)\n            probe_set_id_idx = i;\n        else if (strcmp(&str.s[off[i]], \"\\\"Affy SNP ID\\\"\") == 0)\n            affy_snp_id_idx = i;\n        else if (strcmp(&str.s[off[i]], \"\\\"dbSNP RS ID\\\"\") == 0)\n            dbsnp_rs_id_idx = i;\n        else if (strcmp(&str.s[off[i]], \"\\\"Chromosome\\\"\") == 0)\n            chromosome_idx = i;\n        else if (strcmp(&str.s[off[i]], \"\\\"Physical Position\\\"\") == 0)\n            position_idx = i;\n        else if (strcmp(&str.s[off[i]], \"\\\"Position End\\\"\") == 0)\n            position_end_idx = i;\n        else if (strcmp(&str.s[off[i]], \"\\\"Strand\\\"\") == 0)\n            strand_idx = i;\n        else if (strcmp(&str.s[off[i]], \"\\\"Flank\\\"\") == 0)\n            flank_idx = i;\n        else if (strcmp(&str.s[off[i]], \"\\\"Allele A\\\"\") == 0)\n            allele_a_idx = i;\n        else if (strcmp(&str.s[off[i]], \"\\\"Allele B\\\"\") == 0)\n            allele_b_idx = i;\n    }\n    if (probe_set_id_idx != 0) error(\"Probe Set ID not the first column in file: %s\\n\", fn);\n    if (flank_idx == -1) error(\"Flank missing from file: %s\\n\", fn);\n    if (allele_a_idx == -1) error(\"Allele A missing from file: %s\\n\", fn);\n    if (allele_b_idx == -1) error(\"Allele B missing from file: %s\\n\", fn);\n    const char *probeset_id, *flank, *allele_a, *allele_b;\n\n    if (!hts && out_txt) {\n        while 
(hts_getline(fp, KS_SEP_LINE, &str) > 0) {\n            ncols = ksplit_core(str.s, ',', &moff, &off);\n            probeset_id = unquote(&str.s[off[probe_set_id_idx]]);\n            flank = unquote(&str.s[off[flank_idx]]);\n            if (flank) flank2fasta(probeset_id, flank, out_txt);\n        }\n    } else {\n        if (dbsnp_rs_id_idx == -1) error(\"dbSNP RS ID missing from file: %s\\n\", fn);\n        if (chromosome_idx == -1) error(\"Chromosome missing from file: %s\\n\", fn);\n        if (position_idx == -1) error(\"Physical Position missing from file: %s\\n\", fn);\n        if (strand_idx == -1) error(\"Strand missing from file: %s\\n\", fn);\n\n        if (!out_txt) {\n            annot = (annot_t *)calloc(1, sizeof(annot_t));\n            annot->probeset_id = khash_str2int_init();\n        }\n\n        int n_total = 0, n_unmapped = 0;\n        while (hts_getline(fp, KS_SEP_LINE, &str) > 0) {\n            ncols = ksplit_core(str.s, ',', &moff, &off);\n            probeset_id = unquote(&str.s[off[probe_set_id_idx]]);\n            flank = unquote(&str.s[off[flank_idx]]);\n            allele_a = unquote(&str.s[off[allele_a_idx]]);\n            allele_b = unquote(&str.s[off[allele_b_idx]]);\n            const char *chromosome = NULL;\n            int strand = -1, position = 0, idx = -1;\n            if (hts) {\n                if (!flank) {\n                    if (flags & VERBOSE) fprintf(stderr, \"Missing flank sequence for marker %s\\n\", probeset_id);\n                    n_unmapped++;\n                } else {\n                    idx = get_position(hts, sam_hdr, b, probeset_id, flank, 0, &chromosome, &position, &strand);\n                    if (idx < 0)\n                        error(\"Reading from %s failed\", sam_fn);\n                    else if (idx == 0) {\n                        if (flags & VERBOSE)\n                            fprintf(stderr, \"Unable to determine position for marker %s\\n\", probeset_id);\n                        
n_unmapped++;\n                    }\n                }\n                n_total++;\n            } else {\n                chromosome = unquote(&str.s[off[chromosome_idx]]);\n                const char *ptr = unquote(&str.s[off[position_idx]]);\n                char *tmp = NULL;\n                if (ptr) {\n                    position = strtol(ptr, &tmp, 0);\n                    if (*tmp) error(\"Could not parse position %s from file: %s\\n\", ptr, fn);\n                } else {\n                    position = 0;\n                }\n                ptr = unquote(&str.s[off[strand_idx]]);\n                if (!ptr)\n                    strand = -1;\n                else if (strcmp(ptr, \"+\") == 0)\n                    strand = 0;\n                else if (strcmp(ptr, \"-\") == 0)\n                    strand = 1;\n                else\n                    strand = -1;\n            }\n\n            if (out_txt) {\n                // \"Ref Allele\" and \"Alt Allele\" will not be updated\n                fprintf(out_txt, \"\\\"%s\\\"\", probeset_id);\n                for (i = 1; i < ncols; i++) {\n                    if (i == flank_idx) {\n                        fprintf(out_txt, \",\\\"%s\\\"\", flank);\n                    } else if (i == allele_a_idx) {\n                        fprintf(out_txt, \",\\\"%s\\\"\", allele_a);\n                    } else if (i == allele_b_idx) {\n                        fprintf(out_txt, \",\\\"%s\\\"\", allele_b);\n                    } else if (i == chromosome_idx) {\n                        if (chromosome)\n                            fprintf(out_txt, \",\\\"%s\\\"\", chromosome);\n                        else\n                            fprintf(out_txt, \",\\\"---\\\"\");\n                    } else if (i == position_idx) {\n                        if (position)\n                            fprintf(out_txt, \",\\\"%d\\\"\", position);\n                        else\n                            fprintf(out_txt, \",\\\"---\\\"\");\n    
                } else if (i == position_end_idx) {\n                        if (flank && position && idx > 0) {\n                            const char *left = strchr(flank, '[');\n                            const char *middle = strchr(flank, '/');\n                            const char *right = strchr(flank, ']');\n                            if (!left || !middle || !right) error(\"Flank sequence is malformed: %s\\n\", flank);\n\n                            fprintf(out_txt, \",\\\"%d\\\"\",\n                                    position + (int)(idx > 1 ? right - middle : middle - left + (*(left + 1) == '-'))\n                                        - 2);\n                        } else {\n                            fprintf(out_txt, \",\\\"---\\\"\");\n                        }\n                    } else if (i == strand_idx) {\n                        fprintf(out_txt, \",\\\"%s\\\"\", strand == 0 ? \"+\" : (strand == 1 ? \"-\" : null_strand));\n                    } else {\n                        fprintf(out_txt, \",%s\", &str.s[off[i]]);\n                    }\n                }\n                fprintf(out_txt, \"\\n\");\n            } else {\n                hts_expand0(record_t, annot->n_records + 1, annot->m_records, annot->records);\n                annot->records[annot->n_records].probeset_id = strdup(probeset_id);\n                if (khash_str2int_has_key(annot->probeset_id, annot->records[annot->n_records].probeset_id))\n                    error(\"Probe Set %s present multiple times in file %s\\n\",\n                          annot->records[annot->n_records].probeset_id, fn);\n                khash_str2int_inc(annot->probeset_id, annot->records[annot->n_records].probeset_id);\n                const char *dbsnp_rs_id = unquote(&str.s[off[dbsnp_rs_id_idx]]);\n                if (dbsnp_rs_id) annot->records[annot->n_records].dbsnp_rs_id = strdup(dbsnp_rs_id);\n                if (affy_snp_id_idx >= 0) {\n                    const char *affy_snp_id = 
unquote(&str.s[off[affy_snp_id_idx]]);\n                    if (affy_snp_id) annot->records[annot->n_records].affy_snp_id = strdup(affy_snp_id);\n                }\n                if (chromosome) annot->records[annot->n_records].chromosome = strdup(chromosome);\n                annot->records[annot->n_records].position = position;\n                if (flank) {\n                    annot->records[annot->n_records].flank = strdup(flank);\n                    // check whether alleles A and B need to be flipped in\n                    // the flank sequence (happens with T/C and T/G SNPs\n                    // only)\n                    char *left = strchr(annot->records[annot->n_records].flank, '[');\n                    char *middle = strchr(annot->records[annot->n_records].flank, '/');\n                    char *right = strchr(annot->records[annot->n_records].flank, ']');\n                    if (strncmp(left + 1, allele_b, middle - left - 1) == 0\n                        && strncmp(middle + 1, allele_a, right - middle - 1) == 0) {\n                        memcpy(left + 1, allele_a, right - middle - 1);\n                        *(left + (right - middle)) = '/';\n                        memcpy(left + (right - middle) + 1, allele_b, middle - left - 1);\n                    }\n                }\n                annot->records[annot->n_records].strand = strand;\n                annot->n_records++;\n            }\n        }\n        if (hts) fprintf(stderr, \"Lines   total/unmapped:\\t%d/%d\\n\", n_total, n_unmapped);\n\n        bam_destroy1(b);\n        sam_hdr_destroy(sam_hdr);\n        if (hts && hts_close(hts) < 0) error(\"closing \\\"%s\\\" failed\", fn);\n    }\n\n    free(off);\n    free(str.s);\n    hts_close(fp);\n\n    if (out_txt && out_txt != stdout && out_txt != stderr) fclose(out_txt);\n    return annot;\n}\n\nstatic void annot_destroy(annot_t *annot) {\n    int i;\n    khash_str2int_destroy(annot->probeset_id);\n    for (i = 0; i < annot->n_records; i++) 
{\n        free(annot->records[i].probeset_id);\n        free(annot->records[i].affy_snp_id);\n        free(annot->records[i].dbsnp_rs_id);\n        free(annot->records[i].chromosome);\n        free(annot->records[i].flank);\n    }\n    free(annot->records);\n    free(annot);\n}\n\n/****************************************\n * READER ITERATORS                     *\n ****************************************/\n\n#define MAX_LENGTH_PROBE_SET_ID 17\ntypedef struct {\n    int nsmpl;\n\n    DataSet **data_sets;\n    int *nrows;\n    int *is_brlmm_p;\n    htsFile *calls_fp;\n    htsFile *confidences_fp;\n    htsFile *summary_fp;\n    char probeset_id[MAX_LENGTH_PROBE_SET_ID + 1];\n\n    int *gts;\n    float *conf_arr;\n    float *norm_x_arr;\n    float *norm_y_arr;\n    float *delta_arr;\n    float *size_arr;\n    float *baf_arr;\n    float *lrr_arr;\n} varitr_t;\n\nstatic void varitr_init_common(varitr_t *varitr) {\n    varitr->gts = (int *)malloc(varitr->nsmpl * sizeof(int));\n    varitr->conf_arr = (float *)malloc(varitr->nsmpl * sizeof(float));\n    varitr->norm_x_arr = (float *)malloc(varitr->nsmpl * sizeof(float));\n    varitr->norm_y_arr = (float *)malloc(varitr->nsmpl * sizeof(float));\n    varitr->delta_arr = (float *)malloc(varitr->nsmpl * sizeof(float));\n    varitr->size_arr = (float *)malloc(varitr->nsmpl * sizeof(float));\n    varitr->baf_arr = (float *)malloc(varitr->nsmpl * sizeof(float));\n    varitr->lrr_arr = (float *)malloc(varitr->nsmpl * sizeof(float));\n}\n\nstatic varitr_t *varitr_init_cc(bcf_hdr_t *hdr, agcc_t **agcc, int n) {\n    int i;\n    varitr_t *varitr = (varitr_t *)calloc(1, sizeof(varitr_t));\n    varitr->nsmpl = n;\n    varitr->data_sets = (DataSet **)malloc(n * sizeof(DataSet *));\n    varitr->nrows = (int *)calloc(n, sizeof(int));\n    varitr->is_brlmm_p = (int *)malloc(n * sizeof(int));\n    for (i = 0; i < n; i++) {\n        if (strcmp(agcc[i]->data_header.data_type_identifier, \"affymetrix-multi-data-type-analysis\") != 0)\n       
     error(\"AGCC file %s does not contain multi data type analysis as \\n\", agcc[i]->fn);\n        if (agcc[i]->num_data_groups == 0 || wcscmp(agcc[i]->data_groups[0].name, L\"MultiData\") != 0)\n            error(\"AGCC file %s does not contain multi data\\n\", agcc[i]->fn);\n        if (agcc[i]->data_groups[0].num_data_sets == 0\n            || wcscmp(agcc[i]->data_groups[0].data_sets[0].name, L\"Genotype\") != 0)\n            error(\"AGCC file %s does not contain genotype data\\n\", agcc[i]->fn);\n        DataSet *data_set = &agcc[i]->data_groups[0].data_sets[0];\n        if (wcscmp(data_set->col_headers[0].name, L\"ProbeSetName\") != 0\n            || wcscmp(data_set->col_headers[1].name, L\"Call\") != 0\n            || wcscmp(data_set->col_headers[2].name, L\"Confidence\") != 0\n            || wcscmp(data_set->col_headers[5].name, L\"Forced Call\") != 0)\n            error(\"AGCC file %s does not contain genotype data in the expected format\\n\", agcc[i]->fn);\n        if (wcscmp(data_set->col_headers[3].name, L\"Contrast\") == 0\n            || wcscmp(data_set->col_headers[3].name, L\"Log Ratio\") == 0\n            || wcscmp(data_set->col_headers[4].name, L\"Strength\") == 0)\n            varitr->is_brlmm_p[i] = 1; // ProbeSetName / Call / Confidence / Contrast/Log Ratio\n                                       // / Strength / Forced Call\n        else if (wcscmp(data_set->col_headers[3].name, L\"Signal A\") == 0\n                 || wcscmp(data_set->col_headers[4].name, L\"Signal B\") == 0)\n            varitr->is_brlmm_p[i] = 0; // ProbeSetName / Call / Confidence / Signal A\n                                       // / Signal B / Forced Call\n        else\n            error(\"AGCC file %s does not contain intensities data in the expected format\\n\", agcc[i]->fn);\n        if (hseek(data_set->hfile, data_set->pos_first_element, SEEK_SET) < 0)\n            error(\"Fail to seek to position %d in AGCC file\\n\", data_set->pos_first_element);\n        
bcf_hdr_add_sample(hdr, agcc[i]->display_name);\n        varitr->data_sets[i] = data_set;\n    }\n    varitr_init_common(varitr);\n    return varitr;\n}\n\nstatic varitr_t *varitr_init_txt(bcf_hdr_t *hdr, const char *calls_fn, const char *confidences_fn,\n                                 const char *summary_fn) {\n    varitr_t *varitr = (varitr_t *)calloc(1, sizeof(varitr_t));\n\n    kstring_t str = {0, 0, NULL};\n    int i, moff = 0, *off = NULL, ncols;\n\n    if (calls_fn) {\n        fprintf(stderr, \"Reading genotype calls file %s\\n\", calls_fn);\n        varitr->calls_fp = unheader(calls_fn, &str);\n        ncols = ksplit_core(str.s, '\\t', &moff, &off);\n        if (strcmp(&str.s[off[0]], \"probeset_id\"))\n            error(\"Malformed first line from calls file: %s\\n%s\\n\", calls_fn, str.s);\n        varitr->nsmpl = ncols - 1;\n        for (i = 1; i < ncols; i++) {\n            char *ptr = strrchr(&str.s[off[i]], '.');\n            if (ptr && strcmp(ptr + 1, \"CEL\") == 0) *ptr = '\\0';\n            bcf_hdr_add_sample(hdr, &str.s[off[i]]);\n        }\n    }\n\n    if (confidences_fn) {\n        fprintf(stderr, \"Reading genotype confidences file %s\\n\", confidences_fn);\n        varitr->confidences_fp = unheader(confidences_fn, &str);\n        ncols = ksplit_core(str.s, '\\t', &moff, &off);\n        if (strcmp(&str.s[off[0]], \"probeset_id\"))\n            error(\"Malformed first line from confidences file: %s\\n%s\\n\", confidences_fn, str.s);\n        if (!varitr->calls_fp) {\n            varitr->nsmpl = ncols - 1;\n            for (i = 1; i < ncols; i++) {\n                char *ptr = strrchr(&str.s[off[i]], '.');\n                if (ptr && strcmp(ptr + 1, \"CEL\") == 0) *ptr = '\\0';\n                bcf_hdr_add_sample(hdr, &str.s[off[i]]);\n            }\n        }\n    }\n\n    if (summary_fn) {\n        fprintf(stderr, \"Reading allelic intensities file %s\\n\", summary_fn);\n        varitr->summary_fp = unheader(summary_fn, &str);\n        ncols 
= ksplit_core(str.s, '\\t', &moff, &off);\n        if (strcmp(&str.s[off[0]], \"probeset_id\"))\n            error(\"Malformed first line from summary file: %s\\n%s\\n\", summary_fn, str.s);\n        if (!varitr->calls_fp && !varitr->confidences_fp) {\n            varitr->nsmpl = ncols - 1;\n            for (i = 1; i < ncols; i++) {\n                char *ptr = strrchr(&str.s[off[i]], '.');\n                if (ptr && strcmp(ptr + 1, \"CEL\") == 0) *ptr = '\\0';\n                bcf_hdr_add_sample(hdr, &str.s[off[i]]);\n            }\n        }\n    }\n\n    free(str.s);\n    free(off);\n\n    varitr_init_common(varitr);\n    return varitr;\n}\n\nstatic inline void check_probe_set_id(char *dest, const char *src) {\n    if (dest[0] == '\\0') {\n        if (strlen(src) > MAX_LENGTH_PROBE_SET_ID) error(\"Probe Set Name %s is too long\\n\", src);\n        strcpy(dest, src);\n    } else {\n        if (strcmp(dest, src) != 0) error(\"Probe Set Name mismatch: %s %s\\n\", dest, src);\n    }\n}\n\nstatic int varitr_loop(varitr_t *varitr, void *probeset_ids) {\n    int i, ret = 0;\n    varitr->probeset_id[0] = '\\0';\n    if (varitr->data_sets) {\n        for (i = 0; i < varitr->nsmpl; i++) {\n            DataSet *data_set = varitr->data_sets[i];\n            uint32_t n;\n            char probeset_id[MAX_LENGTH_PROBE_SET_ID + 1];\n            do {\n                varitr->nrows[i]++;\n                // check whether you have arrived at the last element\n                if (varitr->nrows[i] > data_set->n_rows) return -1;\n                read_bytes(data_set->hfile, (void *)data_set->buffer, data_set->n_buffer);\n                n = ntohl(*(uint32_t *)&data_set->buffer[data_set->col_offsets[0]]);\n                if (n > MAX_LENGTH_PROBE_SET_ID)\n                    error(\"Probe Set Name %.*s is too long\\n\", n, &data_set->buffer[data_set->col_offsets[0] + 4]);\n                strncpy(probeset_id, &data_set->buffer[data_set->col_offsets[0] + 4], (size_t)n);\n               
 probeset_id[n] = '\\0';\n            } while (probeset_ids && !khash_str2int_has_key(probeset_ids, probeset_id));\n            check_probe_set_id(varitr->probeset_id, probeset_id);\n            varitr->gts[i] = chp_gt[data_set->buffer[data_set->col_offsets[1]] & 0x0F];\n            union {\n                uint32_t u;\n                float f;\n            } convert;\n            convert.u = ntohl(*(uint32_t *)&data_set->buffer[data_set->col_offsets[2]]);\n            varitr->conf_arr[i] = convert.f;\n            if (varitr->is_brlmm_p[i]) {\n                convert.u = ntohl(*(uint32_t *)&data_set->buffer[data_set->col_offsets[3]]);\n                varitr->delta_arr[i] = convert.f;\n                convert.u = ntohl(*(uint32_t *)&data_set->buffer[data_set->col_offsets[4]]);\n                varitr->size_arr[i] = convert.f;\n                varitr->norm_x_arr[i] = expf((varitr->size_arr[i] + varitr->delta_arr[i] * 0.5f) * (float)M_LN2);\n                varitr->norm_y_arr[i] = expf((varitr->size_arr[i] - varitr->delta_arr[i] * 0.5f) * (float)M_LN2);\n            } else {\n                convert.u = ntohl(*(uint32_t *)&data_set->buffer[data_set->col_offsets[3]]);\n                varitr->norm_x_arr[i] = convert.f;\n                convert.u = ntohl(*(uint32_t *)&data_set->buffer[data_set->col_offsets[4]]);\n                varitr->norm_y_arr[i] = convert.f;\n                float log2x = logf(varitr->norm_x_arr[i]) * (float)M_LOG2E;\n                float log2y = logf(varitr->norm_y_arr[i]) * (float)M_LOG2E;\n                varitr->delta_arr[i] = log2x - log2y;\n                varitr->size_arr[i] = (log2x + log2y) * 0.5f;\n            }\n        }\n    } else {\n        kstring_t str = {0, 0, NULL};\n        int moff = 0, *off = NULL, ncols, len;\n        kstring_t str_b = {0, 0, NULL};\n        int moff_b = 0, *off_b = NULL, ncols_b, len_b;\n        char *tmp;\n\n        // read genotypes\n        if (varitr->calls_fp) {\n            do {\n                if 
((ret = hts_getline(varitr->calls_fp, KS_SEP_LINE, &str)) < 0) goto exit;\n                ncols = ksplit_core(str.s, '\\t', &moff, &off);\n                if (ncols != 1 + varitr->nsmpl)\n                    error(\"Expected %d columns but %d columns found in the calls file\\n\", 1 + varitr->nsmpl, ncols);\n                for (i = 1; i < 1 + varitr->nsmpl; i++) {\n                    int gt = strtol(&str.s[off[i]], &tmp, 0);\n                    if (*tmp || gt < -4 || gt > 27)\n                        error(\"Could not parse genotype %s found in the calls file\\n\", &str.s[off[i]]);\n                    varitr->gts[i - 1] = txt_gt[4 + gt];\n                }\n            } while (probeset_ids && !khash_str2int_has_key(probeset_ids, &str.s[off[0]]));\n            check_probe_set_id(varitr->probeset_id, &str.s[off[0]]);\n        }\n\n        // read confidences\n        if (varitr->confidences_fp) {\n            do {\n                if ((ret = hts_getline(varitr->confidences_fp, KS_SEP_LINE, &str)) < 0) goto exit;\n                ncols = ksplit_core(str.s, '\\t', &moff, &off);\n                if (ncols != 1 + varitr->nsmpl)\n                    error(\"Expected %d columns but %d columns found in the confidences file\\n\", 1 + varitr->nsmpl,\n                          ncols);\n                for (i = 1; i < 1 + varitr->nsmpl; i++) varitr->conf_arr[i - 1] = strtof(&str.s[off[i]], &tmp);\n            } while (probeset_ids && !khash_str2int_has_key(probeset_ids, &str.s[off[0]]));\n            check_probe_set_id(varitr->probeset_id, &str.s[off[0]]);\n        }\n\n        // read intensities\n        if (varitr->summary_fp) {\n            do {\n                // skips -C/-D/-E/-F/-G summary statistics\n                do {\n                    if ((ret = hts_getline(varitr->summary_fp, KS_SEP_LINE, &str)) < 0) goto exit;\n                    ncols = ksplit_core(str.s, '\\t', &moff, &off);\n                    if (ncols != 1 + varitr->nsmpl)\n                        
error(\"Expected %d columns but %d columns found in the summary file\\n\", 1 + varitr->nsmpl,\n                              ncols);\n                    len = strlen(&str.s[off[0]]);\n                } while (str.s[off[0] + len - 2] != '-' && str.s[off[0] + len - 1] != 'A');\n                // skips probes with -A summary statistics only\n                do {\n                    // check whether the next line contains the expected -B probeset_id\n                    if ((ret = hts_getline(varitr->summary_fp, KS_SEP_LINE, &str_b)) < 0) goto exit;\n                    ncols_b = ksplit_core(str_b.s, '\\t', &moff_b, &off_b);\n                    if (ncols_b != 1 + varitr->nsmpl)\n                        error(\"Expected %d columns but %d columns found in the summary file\\n\", 1 + varitr->nsmpl,\n                              ncols_b);\n                    len_b = strlen(&str_b.s[off_b[0]]);\n                    if (str_b.s[off_b[0] + len_b - 2] == '-' && str_b.s[off_b[0] + len_b - 1] == 'B') break;\n\n                    kstring_t str_tmp = str;\n                    str = str_b;\n                    str_b = str_tmp;\n                    int len_tmp = len;\n                    len = len_b;\n                    len_b = len_tmp;\n                    int moff_tmp = moff;\n                    moff = moff_b;\n                    moff_b = moff_tmp;\n                    int *off_tmp = off;\n                    off = off_b;\n                    off_b = off_tmp;\n                    int ncols_tmp = ncols;\n                    ncols = ncols_b;\n                    ncols_b = ncols_tmp;\n                } while (1);\n\n                if (len != len_b || strncmp(&str.s[off[0]], &str_b.s[off_b[0]], len - 2) != 0)\n                    error(\"Mismatching %s and %s Probe Set IDs found in the summary file\\n\", &str.s[off[0]],\n                          &str_b.s[off_b[0]]);\n                for (i = 1; i < 1 + varitr->nsmpl; i++) {\n                    varitr->norm_x_arr[i - 1] = 
strtof(&str.s[off[i]], &tmp);\n                    if (*tmp) error(\"Could not parse intensity value %s found in the summary file\\n\", &str.s[off[i]]);\n                    varitr->norm_y_arr[i - 1] = strtof(&str_b.s[off_b[i]], &tmp);\n                    if (*tmp)\n                        error(\"Could not parse intensity value %s found in the summary file\\n\", &str_b.s[off_b[i]]);\n                    float log2x = logf(varitr->norm_x_arr[i - 1]) * (float)M_LOG2E;\n                    float log2y = logf(varitr->norm_y_arr[i - 1]) * (float)M_LOG2E;\n                    varitr->delta_arr[i - 1] = log2x - log2y;\n                    varitr->size_arr[i - 1] = (log2x + log2y) * 0.5f;\n                }\n                str.s[off[0] + len - 2] = '\\0';\n            } while (probeset_ids && !khash_str2int_has_key(probeset_ids, &str.s[off[0]]));\n            check_probe_set_id(varitr->probeset_id, &str.s[off[0]]);\n        }\n    exit:\n        free(str_b.s);\n        free(off_b);\n        free(str.s);\n        free(off);\n    }\n    return ret;\n}\n\nstatic void varitr_destroy(varitr_t *varitr) {\n    free(varitr->data_sets);\n    free(varitr->nrows);\n    free(varitr->is_brlmm_p);\n    if (varitr->calls_fp) hts_close(varitr->calls_fp);\n    if (varitr->confidences_fp) hts_close(varitr->confidences_fp);\n    if (varitr->summary_fp) hts_close(varitr->summary_fp);\n    free(varitr->gts);\n    free(varitr->conf_arr);\n    free(varitr->norm_x_arr);\n    free(varitr->norm_y_arr);\n    free(varitr->delta_arr);\n    free(varitr->size_arr);\n    free(varitr->baf_arr);\n    free(varitr->lrr_arr);\n    free(varitr);\n}\n\n/****************************************\n * OUTPUT FUNCTIONS                     *\n ****************************************/\n\nstatic bcf_hdr_t *hdr_init(const faidx_t *fai, int flags) {\n    bcf_hdr_t *hdr = bcf_hdr_init(\"w\");\n    int i, n = faidx_nseq(fai);\n    for (i = 0; i < n; i++) {\n        const char *seq = faidx_iseq(fai, i);\n        int len 
= faidx_seq_len(fai, seq);\n        bcf_hdr_printf(hdr, \"##contig=<ID=%s,length=%d>\", seq, len);\n    }\n    bcf_hdr_append(hdr, \"##INFO=<ID=ALLELE_A,Number=1,Type=Integer,Description=\\\"A allele\\\">\");\n    bcf_hdr_append(hdr, \"##INFO=<ID=ALLELE_B,Number=1,Type=Integer,Description=\\\"B allele\\\">\");\n    bcf_hdr_append(hdr, \"##INFO=<ID=DBSNP_RS_ID,Number=1,Type=String,Description=\\\"dbSNP RS ID\\\">\");\n    bcf_hdr_append(hdr, \"##INFO=<ID=AFFY_SNP_ID,Number=1,Type=String,Description=\\\"Affymetrix SNP ID\\\">\");\n    if (flags & SNP_LOADED) {\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanX_AA,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized DELTA for AA diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanX_AB,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized DELTA for AB diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanX_BB,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized DELTA for BB diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=varX_AA,Number=1,Type=Float,Description=\\\"Variance of \"\n                       \"normalized DELTA for AA diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=varX_AB,Number=1,Type=Float,Description=\\\"Variance of \"\n                       \"normalized DELTA for AB diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=varX_BB,Number=1,Type=Float,Description=\\\"Variance of \"\n                       \"normalized DELTA for BB diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=nObsMean_AA,Number=1,Type=Float,Description=\\\"Number of AA \"\n                       \"calls in training set for diploid mean\\\">\");\n        bcf_hdr_append(hdr,\n          
             \"##INFO=<ID=nObsMean_AB,Number=1,Type=Float,Description=\\\"Number of AB \"\n                       \"calls in training set for diploid mean\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=nObsMean_BB,Number=1,Type=Float,Description=\\\"Number of BB \"\n                       \"calls in training set for diploid mean\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=nObsVar_AA,Number=1,Type=Float,Description=\\\"Number of AA \"\n                       \"calls in training set for diploid variance\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=nObsVar_AB,Number=1,Type=Float,Description=\\\"Number of AB \"\n                       \"calls in training set for diploid variance\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=nObsVar_BB,Number=1,Type=Float,Description=\\\"Number of BB \"\n                       \"calls in training set for diploid variance\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanY_AA,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized SIZE for AA diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanY_AB,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized SIZE for AB diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanY_BB,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized SIZE for BB diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=varY_AA,Number=1,Type=Float,Description=\\\"Variance of \"\n                       \"normalized SIZE for AA diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=varY_AB,Number=1,Type=Float,Description=\\\"Variance of \"\n                       \"normalized SIZE for AB diploid cluster\\\">\");\n        
bcf_hdr_append(hdr,\n                       \"##INFO=<ID=varY_BB,Number=1,Type=Float,Description=\\\"Variance of \"\n                       \"normalized SIZE for BB diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=covarXY_AA,Number=1,Type=Float,Description=\\\"Covariance for \"\n                       \"AA diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=covarXY_AB,Number=1,Type=Float,Description=\\\"Covariance for \"\n                       \"AB diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=covarXY_BB,Number=1,Type=Float,Description=\\\"Covariance for \"\n                       \"BB diploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanX_AA.1,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized DELTA for AA haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanX_AB.1,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized DELTA for AB haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanX_BB.1,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized DELTA for BB haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=varX_AA.1,Number=1,Type=Float,Description=\\\"Variance of \"\n                       \"normalized DELTA for AA haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=varX_AB.1,Number=1,Type=Float,Description=\\\"Variance of \"\n                       \"normalized DELTA for AB haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=varX_BB.1,Number=1,Type=Float,Description=\\\"Variance of \"\n                       \"normalized DELTA for BB haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                      
 \"##INFO=<ID=nObsMean_AA.1,Number=1,Type=Float,Description=\\\"Number of \"\n                       \"AA calls in training set for haploid mean\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=nObsMean_AB.1,Number=1,Type=Float,Description=\\\"Number of \"\n                       \"AB calls in training set for haploid mean\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=nObsMean_BB.1,Number=1,Type=Float,Description=\\\"Number of \"\n                       \"BB calls in training set for haploid mean\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=nObsVar_AA.1,Number=1,Type=Float,Description=\\\"Number of AA \"\n                       \"calls in training set for haploid variance\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=nObsVar_AB.1,Number=1,Type=Float,Description=\\\"Number of AB \"\n                       \"calls in training set for haploid variance\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=nObsVar_BB.1,Number=1,Type=Float,Description=\\\"Number of BB \"\n                       \"calls in training set for haploid variance\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanY_AA.1,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized SIZE for AA haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanY_AB.1,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized SIZE for AB haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanY_BB.1,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized SIZE for BB haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=varY_AA.1,Number=1,Type=Float,Description=\\\"Variance of \"\n                       \"normalized SIZE for AA haploid cluster\\\">\");\n        
bcf_hdr_append(hdr,\n                       \"##INFO=<ID=varY_AB.1,Number=1,Type=Float,Description=\\\"Variance of \"\n                       \"normalized SIZE for AB haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=varY_BB.1,Number=1,Type=Float,Description=\\\"Variance of \"\n                       \"normalized SIZE for BB haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=covarXY_AA.1,Number=1,Type=Float,Description=\\\"Covariance \"\n                       \"for AA haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=covarXY_AB.1,Number=1,Type=Float,Description=\\\"Covariance \"\n                       \"for AB haploid cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=covarXY_BB.1,Number=1,Type=Float,Description=\\\"Covariance \"\n                       \"for BB haploid cluster\\\">\");\n    }\n    if (!(flags & NO_INFO_GC))\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=GC,Number=1,Type=Float,Description=\\\"GC ratio content \"\n                       \"around the variant\\\">\");\n    if ((flags & CALLS_LOADED) && (flags & FORMAT_GT))\n        bcf_hdr_append(hdr, \"##FORMAT=<ID=GT,Number=1,Type=String,Description=\\\"Genotype\\\">\");\n    if ((flags & CONFIDENCES_LOADED) && (flags & FORMAT_CONF))\n        bcf_hdr_append(hdr, \"##FORMAT=<ID=CONF,Number=1,Type=Float,Description=\\\"Genotype confidence\\\">\");\n    if (flags & SUMMARY_LOADED) {\n        if (flags & FORMAT_NORMX)\n            bcf_hdr_append(hdr,\n                           \"##FORMAT=<ID=NORMX,Number=1,Type=Float,Description=\\\"Normalized X \"\n                           \"intensity\\\">\");\n        if (flags & FORMAT_NORMY)\n            bcf_hdr_append(hdr,\n                           \"##FORMAT=<ID=NORMY,Number=1,Type=Float,Description=\\\"Normalized Y \"\n                           \"intensity\\\">\");\n        if (flags & 
FORMAT_DELTA)\n            bcf_hdr_append(hdr,\n                           \"##FORMAT=<ID=DELTA,Number=1,Type=Float,Description=\\\"Normalized \"\n                           \"contrast value\\\">\");\n        if (flags & FORMAT_SIZE)\n            bcf_hdr_append(hdr, \"##FORMAT=<ID=SIZE,Number=1,Type=Float,Description=\\\"Normalized size value\\\">\");\n    }\n    if ((flags & SUMMARY_LOADED) && (flags & SNP_LOADED)) {\n        if (flags & FORMAT_BAF)\n            bcf_hdr_append(hdr, \"##FORMAT=<ID=BAF,Number=1,Type=Float,Description=\\\"B Allele Frequency\\\">\");\n        if (flags & FORMAT_LRR)\n            bcf_hdr_append(hdr, \"##FORMAT=<ID=LRR,Number=1,Type=Float,Description=\\\"Log R Ratio\\\">\");\n    }\n    return hdr;\n}\n\n// adjust cluster centers (using apt-probeset-genotype posteriors as priors)\n// similar to\n// http://github.com/WGLab/PennCNV/blob/master/affy/bin/generate_affy_geno_cluster.pl\nstatic void adjust_clusters(const int *gts, const float *x, const float *y, int n, snp_t *snp) {\n    snp->aa.xm *= 0.2f;\n    snp->ab.xm *= 0.2f;\n    snp->bb.xm *= 0.2f;\n    snp->aa.ym *= 0.2f;\n    snp->ab.ym *= 0.2f;\n    snp->bb.ym *= 0.2f;\n    snp->aa.k = 0.2f;\n    snp->ab.k = 0.2f;\n    snp->bb.k = 0.2f;\n\n    int i;\n    for (i = 0; i < n; i++) {\n        switch (gts[i]) {\n        case GT_AA:\n            snp->aa.k++;\n            snp->aa.xm += x[i];\n            snp->aa.ym += y[i];\n            break;\n        case GT_AB:\n            snp->ab.k++;\n            snp->ab.xm += x[i];\n            snp->ab.ym += y[i];\n            break;\n        case GT_BB:\n            snp->bb.k++;\n            snp->bb.xm += x[i];\n            snp->bb.ym += y[i];\n            break;\n        default:\n            break;\n        }\n    }\n\n    snp->aa.xm /= snp->aa.k;\n    snp->ab.xm /= snp->ab.k;\n    snp->bb.xm /= snp->bb.k;\n    snp->aa.ym /= snp->aa.k;\n    snp->ab.ym /= snp->ab.k;\n    snp->bb.ym /= snp->bb.k;\n}\n\nstatic void update_info_cluster(const 
bcf_hdr_t *hdr, bcf1_t *rec, const char **info_str, const snp_t *snp) {\n    bcf_update_info_float(hdr, rec, info_str[0], &snp->aa.xm, 1);\n    bcf_update_info_float(hdr, rec, info_str[1], &snp->ab.xm, 1);\n    bcf_update_info_float(hdr, rec, info_str[2], &snp->bb.xm, 1);\n    bcf_update_info_float(hdr, rec, info_str[3], &snp->aa.xss, 1);\n    bcf_update_info_float(hdr, rec, info_str[4], &snp->ab.xss, 1);\n    bcf_update_info_float(hdr, rec, info_str[5], &snp->bb.xss, 1);\n    bcf_update_info_float(hdr, rec, info_str[6], &snp->aa.k, 1);\n    bcf_update_info_float(hdr, rec, info_str[7], &snp->ab.k, 1);\n    bcf_update_info_float(hdr, rec, info_str[8], &snp->bb.k, 1);\n    bcf_update_info_float(hdr, rec, info_str[9], &snp->aa.v, 1);\n    bcf_update_info_float(hdr, rec, info_str[10], &snp->ab.v, 1);\n    bcf_update_info_float(hdr, rec, info_str[11], &snp->bb.v, 1);\n    bcf_update_info_float(hdr, rec, info_str[12], &snp->aa.ym, 1);\n    bcf_update_info_float(hdr, rec, info_str[13], &snp->ab.ym, 1);\n    bcf_update_info_float(hdr, rec, info_str[14], &snp->bb.ym, 1);\n    bcf_update_info_float(hdr, rec, info_str[15], &snp->aa.yss, 1);\n    bcf_update_info_float(hdr, rec, info_str[16], &snp->ab.yss, 1);\n    bcf_update_info_float(hdr, rec, info_str[17], &snp->bb.yss, 1);\n    bcf_update_info_float(hdr, rec, info_str[18], &snp->aa.xyss, 1);\n    bcf_update_info_float(hdr, rec, info_str[19], &snp->ab.xyss, 1);\n    bcf_update_info_float(hdr, rec, info_str[20], &snp->bb.xyss, 1);\n}\n\n// compute LRR and BAF\n// similar to\n// http://github.com/WGLab/PennCNV/blob/master/affy/bin/normalize_affy_geno_cluster.pl\nstatic void compute_baf_lrr(const float *norm_x, const float *norm_y, int n, const snp_t *snp, int is_birdseed,\n                            float *baf, float *lrr) {\n    float aa_theta, ab_theta, bb_theta, aa_r, ab_r, bb_r;\n\n    if (is_birdseed) {\n        aa_theta = atan2f(snp->aa.ym, snp->aa.xm) * (float)M_2_PI;\n        ab_theta = atan2f(snp->ab.ym, snp->ab.xm) 
* (float)M_2_PI;\n        bb_theta = atan2f(snp->bb.ym, snp->bb.xm) * (float)M_2_PI;\n        aa_r = snp->aa.xm + snp->aa.ym;\n        ab_r = snp->ab.xm + snp->ab.ym;\n        bb_r = snp->bb.xm + snp->bb.ym;\n    } else {\n        aa_theta = atanf(expf(-snp->aa.xm * (float)M_LN2)) * (float)M_2_PI;\n        ab_theta = atanf(expf(-snp->ab.xm * (float)M_LN2)) * (float)M_2_PI;\n        bb_theta = atanf(expf(-snp->bb.xm * (float)M_LN2)) * (float)M_2_PI;\n        aa_r = expf(snp->aa.ym * (float)M_LN2) * 2.0f * coshf(snp->aa.xm * 0.5f * (float)M_LN2);\n        ab_r = expf(snp->ab.ym * (float)M_LN2) * 2.0f * coshf(snp->ab.xm * 0.5f * (float)M_LN2);\n        bb_r = expf(snp->bb.ym * (float)M_LN2) * 2.0f * coshf(snp->bb.xm * 0.5f * (float)M_LN2);\n    }\n\n    // handles chromosome Y SNPs\n    if (snp->copynumber == 1) {\n        ab_theta = (aa_theta + bb_theta) * 0.5f;\n        ab_r = (aa_r + bb_r) * 0.5f;\n    }\n\n    int i;\n    for (i = 0; i < n; i++) {\n        float ilmn_theta = atan2f(norm_y[i], norm_x[i]) * (float)M_2_PI;\n        float ilmn_r = norm_x[i] + norm_y[i];\n        get_baf_lrr(ilmn_theta, ilmn_r, aa_theta, ab_theta, bb_theta, aa_r, ab_r, bb_r, NAN, &baf[i], &lrr[i]);\n    }\n}\n\nstatic void process(faidx_t *fai, const annot_t *annot, void *probeset_ids, snp_models_t *snp_models, varitr_t *varitr,\n                    htsFile *out_fh, bcf_hdr_t *hdr, int flags, int gc_win) {\n    int i, nsmpl = bcf_hdr_nsamples(hdr);\n    if ((flags & ADJUST_CLUSTERS) && (nsmpl < 100))\n        fprintf(stderr, \"Warning: adjusting clusters with %d sample(s) is not recommended\\n\", nsmpl);\n\n    bcf1_t *rec = bcf_init();\n    char ref_base[] = {'\\0', '\\0'};\n    kstring_t allele_a = {0, 0, NULL};\n    kstring_t allele_b = {0, 0, NULL};\n    kstring_t flank = {0, 0, NULL};\n\n    int32_t *gt_arr = (int32_t *)malloc(nsmpl * 2 * sizeof(int32_t));\n    float *baf_arr = (float *)malloc(nsmpl * sizeof(float));\n    float *lrr_arr = (float *)malloc(nsmpl * 
sizeof(float));\n\n    int n_missing = 0, n_no_snp_models = 0, n_skipped = 0;\n    for (i = 0; i < annot->n_records; i++) {\n        // identify variants to use for next VCF record\n        int idx;\n        if (varitr) {\n            if (varitr_loop(varitr, probeset_ids) < 0) break;\n            int ret = khash_str2int_get(annot->probeset_id, varitr->probeset_id, &idx);\n            if (ret < 0) error(\"Probe Set %s not found in manifest file\\n\", varitr->probeset_id);\n        } else {\n            if (probeset_ids && !khash_str2int_has_key(probeset_ids, annot->records[i].probeset_id)) {\n                n_skipped++;\n                continue;\n            }\n            idx = i;\n        }\n        record_t *record = &annot->records[idx];\n\n        bcf_clear(rec);\n        rec->n_sample = nsmpl;\n        rec->rid = bcf_hdr_name2id_flexible(hdr, record->chromosome);\n        rec->pos = record->position - 1;\n        if (rec->rid < 0 || rec->pos < 0 || record->strand < 0 || !record->flank) {\n            if (flags & VERBOSE) fprintf(stderr, \"Skipping unlocalized marker %s\\n\", record->probeset_id);\n            n_skipped++;\n            continue;\n        }\n        bcf_update_id(hdr, rec, record->probeset_id);\n\n        flank.l = 0;\n        kputs(record->flank, &flank);\n        strupper(flank.s);\n        if (record->strand) flank_reverse_complement(flank.s);\n\n        int len, win = min(max(max(gc_win, strlen(flank.s)), 100), rec->pos);\n        char *ref = faidx_fetch_seq(fai, bcf_seqname(hdr, rec), rec->pos - win, rec->pos + win, &len);\n        if (!ref || len == 1)\n            error(\"faidx_fetch_seq failed at %s:%\" PRId64 \" (are you using the correct reference genome?)\\n\",\n                  bcf_seqname(hdr, rec), rec->pos + 1);\n        strupper(ref);\n        if (!(flags & NO_INFO_GC)) {\n            float gc_ratio = get_gc_ratio(&ref[max(win - gc_win, 0)], &ref[min(win + gc_win, len)]);\n            bcf_update_info_float(hdr, rec, \"GC\", 
&gc_ratio, 1);\n        }\n        ref_base[0] = ref[win];\n        int32_t allele_b_idx;\n        allele_a.l = allele_b.l = 0;\n        if (strchr(flank.s, '-')) {\n            kputc('D', &allele_a);\n            kputc('I', &allele_b);\n            int ref_is_del = get_indel_alleles(&allele_a, &allele_b, flank.s, ref, win, len, 0);\n            if (ref_is_del < 0) {\n                if (flags & VERBOSE) fprintf(stderr, \"Unable to determine alleles for indel %s\\n\", record->probeset_id);\n                n_missing++;\n            }\n            if (ref_is_del == 0) {\n                rec->pos--;\n                ref_base[0] = ref[win - 1];\n            }\n            allele_b_idx = ref_is_del < 0 ? 1 : ref_is_del;\n        } else {\n            const char *left = strchr(flank.s, '[');\n            const char *middle = strchr(flank.s, '/');\n            const char *right = strchr(flank.s, ']');\n            if (!left || !middle || !right) error(\"Flank sequence is malformed: %s\\n\", flank.s);\n            kputsn(left + 1, middle - left - 1, &allele_a);\n            kputsn(middle + 1, right - middle - 1, &allele_b);\n\n            if (middle - left == 2 && right - middle == 2) {\n                allele_b_idx = get_allele_b_idx(ref_base[0], allele_a.s, allele_b.s);\n            } else {\n                int allele_a_match = strncmp(left + 1, &ref[win], middle - left - 1) == 0;\n                int allele_b_match = strncmp(middle + 1, &ref[win], right - middle - 1) == 0;\n                if (allele_a_match && !allele_b_match) {\n                    allele_b_idx = 1;\n                } else if (!allele_a_match && allele_b_match) {\n                    allele_b_idx = 0;\n                } else if (allele_a_match && allele_b_match) {\n                    int allele_a_right =\n                        len_common_prefix(right + 1, &ref[win] + (middle - left) - 1, strlen(right + 1));\n                    int allele_b_right =\n                        len_common_prefix(right 
+ 1, &ref[win] + (right - middle) - 1, strlen(right + 1));\n                    allele_b_idx = allele_a_right > allele_b_right;\n                } else {\n                    allele_b_idx = -1;\n                }\n            }\n        }\n        free(ref);\n\n        int32_t allele_a_idx = get_allele_a_idx(allele_b_idx);\n        const char *alleles[3];\n        int nals = alleles_ab_to_vcf(alleles, ref_base, allele_a.s, allele_b.s, allele_b_idx);\n        if (nals < 0) error(\"Unable to process Probe Set %s\\n\", record->probeset_id);\n        bcf_update_alleles(hdr, rec, alleles, nals);\n        bcf_update_info_int32(hdr, rec, \"ALLELE_A\", &allele_a_idx, 1);\n        bcf_update_info_int32(hdr, rec, \"ALLELE_B\", &allele_b_idx, 1);\n        if (record->dbsnp_rs_id) bcf_update_info_string(hdr, rec, \"DBSNP_RS_ID\", record->dbsnp_rs_id);\n        if (record->affy_snp_id) bcf_update_info_string(hdr, rec, \"AFFY_SNP_ID\", record->affy_snp_id);\n\n        if (varitr) {\n            if ((varitr->data_sets || varitr->calls_fp) && flags & FORMAT_GT) {\n                for (i = 0; i < nsmpl; i++) {\n                    switch (varitr->gts[i]) {\n                    case GT_AA:\n                        gt_arr[2 * i] = bcf_gt_unphased(allele_a_idx);\n                        gt_arr[2 * i + 1] = bcf_gt_unphased(allele_a_idx);\n                        break;\n                    case GT_AB:\n                        gt_arr[2 * i] = bcf_gt_unphased(min(allele_a_idx, allele_b_idx));\n                        gt_arr[2 * i + 1] = bcf_gt_unphased(max(allele_a_idx, allele_b_idx));\n                        break;\n                    case GT_BB:\n                        gt_arr[2 * i] = bcf_gt_unphased(allele_b_idx);\n                        gt_arr[2 * i + 1] = bcf_gt_unphased(allele_b_idx);\n                        break;\n                    case GT_NC:\n                        gt_arr[2 * i] = bcf_gt_missing;\n                        gt_arr[2 * i + 1] = bcf_gt_missing;\n             
           break;\n                    default:\n                        error(\"Genotype for Probe Set ID %s is malformed: %d\\n\", record->probeset_id, varitr->gts[i]);\n                        break;\n                    }\n                }\n                bcf_update_genotypes(hdr, rec, gt_arr, nsmpl * 2);\n            }\n\n            if ((varitr->data_sets || varitr->confidences_fp) && flags & FORMAT_CONF)\n                bcf_update_format_float(hdr, rec, \"CONF\", varitr->conf_arr, nsmpl);\n\n            if (varitr->data_sets || varitr->summary_fp) {\n                if (flags & FORMAT_NORMX) bcf_update_format_float(hdr, rec, \"NORMX\", varitr->norm_x_arr, nsmpl);\n                if (flags & FORMAT_NORMY) bcf_update_format_float(hdr, rec, \"NORMY\", varitr->norm_y_arr, nsmpl);\n                if (flags & FORMAT_DELTA) bcf_update_format_float(hdr, rec, \"DELTA\", varitr->delta_arr, nsmpl);\n                if (flags & FORMAT_SIZE) bcf_update_format_float(hdr, rec, \"SIZE\", varitr->size_arr, nsmpl);\n            }\n        }\n\n        if (snp_models) {\n            int rets[2], idxs[2];\n            for (i = 0; i < 2; i++) {\n                rets[i] = khash_str2int_get(snp_models->probeset_id[i], record->probeset_id, &idxs[i]);\n            }\n            static const char *hap_info_str[] = {\n                \"meanX_AA.1\",    \"meanX_AB.1\",    \"meanX_BB.1\",    \"varX_AA.1\",    \"varX_AB.1\",    \"varX_BB.1\",\n                \"nObsMean_AA.1\", \"nObsMean_AB.1\", \"nObsMean_BB.1\", \"nObsVar_AA.1\", \"nObsVar_AB.1\", \"nObsVar_BB.1\",\n                \"meanY_AA.1\",    \"meanY_AB.1\",    \"meanY_BB.1\",    \"varY_AA.1\",    \"varY_AB.1\",    \"varY_BB.1\",\n                \"covarXY_AA.1\",  \"covarXY_AB.1\",  \"covarXY_BB.1\"};\n            static const char *dip_info_str[] = {\n                \"meanX_AA\",    \"meanX_AB\",    \"meanX_BB\",   \"varX_AA\",    \"varX_AB\",    \"varX_BB\",    \"nObsMean_AA\",\n                \"nObsMean_AB\", 
\"nObsMean_BB\", \"nObsVar_AA\", \"nObsVar_AB\", \"nObsVar_BB\", \"meanY_AA\",   \"meanY_AB\",\n                \"meanY_BB\",    \"varY_AA\",     \"varY_AB\",    \"varY_BB\",    \"covarXY_AA\", \"covarXY_AB\", \"covarXY_BB\"};\n            if (rets[0] >= 0) update_info_cluster(hdr, rec, hap_info_str, &snp_models->snps[0][idxs[0]]);\n            if (rets[1] >= 0) update_info_cluster(hdr, rec, dip_info_str, &snp_models->snps[1][idxs[1]]);\n            snp_t *snp =\n                rets[1] >= 0 ? &snp_models->snps[1][idxs[1]] : (rets[0] >= 0 ? &snp_models->snps[0][idxs[0]] : NULL);\n            if (!snp) {\n                n_no_snp_models++;\n                if (flags & VERBOSE)\n                    fprintf(stderr, \"Warning: SNP model for Probe Set ID %s was not found\\n\", record->probeset_id);\n            } else {\n                if (flags & ADJUST_CLUSTERS)\n                    adjust_clusters(varitr->gts, snp_models->is_birdseed ? varitr->norm_x_arr : varitr->delta_arr,\n                                    snp_models->is_birdseed ? 
varitr->norm_y_arr : varitr->size_arr, nsmpl, snp);\n                if (flags & SUMMARY_LOADED) {\n                    compute_baf_lrr(varitr->norm_x_arr, varitr->norm_y_arr, nsmpl, snp, snp_models->is_birdseed,\n                                    baf_arr, lrr_arr);\n                    if (flags & FORMAT_BAF) bcf_update_format_float(hdr, rec, \"BAF\", baf_arr, nsmpl);\n                    if (flags & FORMAT_LRR) bcf_update_format_float(hdr, rec, \"LRR\", lrr_arr, nsmpl);\n                }\n            }\n        }\n\n        if (bcf_write(out_fh, hdr, rec) < 0) error(\"Unable to write to output VCF file\\n\");\n    }\n    if (snp_models)\n        fprintf(stderr, \"Lines   total/missing-reference/missing-snp-posteriors/skipped:\\t%d/%d/%d/%d\\n\", i, n_missing,\n                n_no_snp_models, n_skipped);\n    else\n        fprintf(stderr, \"Lines   total/missing-reference/skipped:\\t%d/%d/%d\\n\", i, n_missing, n_skipped);\n\n    free(gt_arr);\n    free(baf_arr);\n    free(lrr_arr);\n\n    free(allele_a.s);\n    free(allele_b.s);\n    free(flank.s);\n\n    bcf_destroy(rec);\n    return;\n}\n\n/****************************************\n * PLUGIN                               *\n ****************************************/\n\nconst char *about(void) { return \"convert Affymetrix files to VCF.\\n\"; }\n\nstatic const char *usage_text(void) {\n    return \"\\n\"\n           \"About: convert Affymetrix apt-probeset-genotype output files to VCF. 
\"\n           \"(version \" AFFY2VCF_VERSION\n           \" http://github.com/freeseek/gtc2vcf)\\n\"\n           \"Usage: bcftools +affy2vcf [options] --csv <file> --fasta-ref <file> [<A.chp> ...]\\n\"\n           \"\\n\"\n           \"Plugin options:\\n\"\n           \"    -l, --list-tags                 list available FORMAT tags with description for VCF output\\n\"\n           \"    -t, --tags LIST                 list of output FORMAT tags [\" TAG_LIST_DFLT\n           \"]\\n\"\n           \"    -c, --csv <file>                CSV manifest file (can be gzip compressed)\\n\"\n           \"    -f, --fasta-ref <file>          reference sequence in fasta format\\n\"\n           \"        --set-cache-size <int>      select fasta cache size in bytes\\n\"\n           \"        --gc-window-size <int>      window size in bp used to compute the GC content (-1 for no estimate) \"\n           \"[\" GC_WIN_DFLT\n           \"]\\n\"\n           \"        --probeset-ids              tab delimited file with column 'probeset_id' specifying probesets to \"\n           \"convert\\n\"\n           \"        --calls <file>              apt-probeset-genotype calls output (can be gzip compressed)\\n\"\n           \"        --confidences <file>        apt-probeset-genotype confidences output (can be gzip compressed)\\n\"\n           \"        --summary <file>            apt-probeset-genotype summary output (can be gzip compressed)\\n\"\n           \"        --snp <file>                apt-probeset-genotype SNP posteriors output (can be gzip compressed)\\n\"\n           \"        --chps <dir|file>           input CHP files rather than tab delimited files\\n\"\n           \"        --cel <file>                input CEL files rather CHP files\\n\"\n           \"        --adjust-clusters           adjust cluster centers in (Contrast, Size) space (requires --snp)\\n\"\n           \"        --no-version                do not append version and command line to the header\\n\"\n           \"  
  -o, --output <file>             write output to a file [standard output]\\n\"\n           \"    -O, --output-type u|b|v|z[0-9]  u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level \"\n           \"[v]\\n\"\n           \"        --threads <int>             number of extra output compression threads [0]\\n\"\n           \"    -x, --extra <file>              write CHP metadata to a file (requires CHP files)\\n\"\n           \"    -v, --verbose                   print verbose information\\n\"\n           \"    -W, --write-index[=FMT]         Automatically index the output files [off]\\n\"\n           \"\\n\"\n           \"Manifest options:\\n\"\n           \"        --fasta-flank               output flank sequence in FASTA format (requires --csv)\\n\"\n           \"    -s, --sam-flank <file>          input flank sequence alignment in SAM/BAM format (requires --csv)\\n\"\n           \"\\n\"\n           \"Examples:\\n\"\n           \"    bcftools +affy2vcf \\\\\\n\"\n           \"        --csv GenomeWideSNP_6.na35.annot.csv \\\\\\n\"\n           \"        --fasta-ref human_g1k_v37.fasta \\\\\\n\"\n           \"        --chps cc-chp/ \\\\\\n\"\n           \"        --snp AxiomGT1.snp-posteriors.txt \\\\\\n\"\n           \"        --output AxiomGT1.vcf \\\\\\n\"\n           \"        --extra report.tsv\\n\"\n           \"    bcftools +affy2vcf \\\\\\n\"\n           \"        --csv GenomeWideSNP_6.na35.annot.csv \\\\\\n\"\n           \"        --fasta-ref human_g1k_v37.fasta \\\\\\n\"\n           \"        --calls AxiomGT1.calls.txt \\\\\\n\"\n           \"        --confidences AxiomGT1.confidences.txt \\\\\\n\"\n           \"        --summary AxiomGT1.summary.txt \\\\\\n\"\n           \"        --snp AxiomGT1.snp-posteriors.txt \\\\\\n\"\n           \"        --output AxiomGT1.vcf\\n\"\n           \"\\n\"\n           \"Examples of manifest file options:\\n\"\n           \"    bcftools +affy2vcf -c GenomeWideSNP_6.na35.annot.csv --fasta-flank -o  
GenomeWideSNP_6.fasta\\n\"\n           \"    bwa mem -M Homo_sapiens_assembly38.fasta GenomeWideSNP_6.fasta -o \"\n           \"GenomeWideSNP_6.sam\\n\"\n           \"    bcftools +affy2vcf -c GenomeWideSNP_6.na35.annot.csv -s GenomeWideSNP_6.sam -o \"\n           \"GenomeWideSNP_6.na35.annot.GRCh38.csv\\n\"\n           \"\\n\";\n}\n\nstatic int parse_tags(const char *str) {\n    int i, flags = 0, n;\n    char **tags = hts_readlist(str, 0, &n);\n    for (i = 0; i < n; i++) {\n        if (!strcasecmp(tags[i], \"GT\"))\n            flags |= FORMAT_GT;\n        else if (!strcasecmp(tags[i], \"CONF\"))\n            flags |= FORMAT_CONF;\n        else if (!strcasecmp(tags[i], \"NORMX\"))\n            flags |= FORMAT_NORMX;\n        else if (!strcasecmp(tags[i], \"NORMY\"))\n            flags |= FORMAT_NORMY;\n        else if (!strcasecmp(tags[i], \"DELTA\"))\n            flags |= FORMAT_DELTA;\n        else if (!strcasecmp(tags[i], \"SIZE\"))\n            flags |= FORMAT_SIZE;\n        else if (!strcasecmp(tags[i], \"LRR\"))\n            flags |= FORMAT_LRR;\n        else if (!strcasecmp(tags[i], \"BAF\"))\n            flags |= FORMAT_BAF;\n        else\n            error(\"Error parsing \\\"--tags %s\\\": the tag \\\"%s\\\" is not supported\\n\", str, tags[i]);\n        free(tags[i]);\n    }\n    if (n) free(tags);\n    return flags;\n}\n\nstatic void list_tags(void) {\n    error(\n        \"FORMAT/GT       Number:1  Type:String   ..  Genotype\\n\"\n        \"FORMAT/CONF     Number:1  Type:Float    ..  Genotype confidence\\n\"\n        \"FORMAT/BAF      Number:1  Type:Float    ..  B Allele Frequency\\n\"\n        \"FORMAT/LRR      Number:1  Type:Float    ..  Log R Ratio\\n\"\n        \"FORMAT/NORMX    Number:1  Type:Float    ..  Normalized X intensity\\n\"\n        \"FORMAT/NORMY    Number:1  Type:Float    ..  Normalized Y intensity\\n\"\n        \"FORMAT/DELTA    Number:1  Type:Float    ..  
Normalized Delta value\\n\"\n        \"FORMAT/SIZE     Number:1  Type:Float    ..  Normalized Size value\\n\");\n}\n\nint run(int argc, char *argv[]) {\n    const char *tag_list = TAG_LIST_DFLT;\n    const char *ref_fname = NULL;\n    const char *extra_fname = NULL;\n    const char *csv_fname = NULL;\n    const char *probeset_ids_fname = NULL;\n    const char *calls_fname = NULL;\n    const char *confidences_fname = NULL;\n    const char *summary_fname = NULL;\n    const char *snp_fname = NULL;\n    const char *pathname = NULL;\n    const char *output_fname = \"-\";\n    const char *sam_fname = NULL;\n    char *index_fname;\n    char *tmp;\n    int i;\n    int flags = 0;\n    int output_type = FT_VCF;\n    int clevel = -1;\n    int cache_size = 0;\n    int gc_win = (int)strtol(GC_WIN_DFLT, NULL, 0);\n    int n_threads = 0;\n    int record_cmd_line = 1;\n    int write_index = 0;\n    int fasta_flank = 0;\n    faidx_t *fai = NULL;\n    FILE *out_txt = NULL;\n\n    static struct option loptions[] = {{\"list-tags\", no_argument, NULL, 'l'},\n                                       {\"tags\", required_argument, NULL, 't'},\n                                       {\"csv\", required_argument, NULL, 'c'},\n                                       {\"fasta-ref\", required_argument, NULL, 'f'},\n                                       {\"set-cache-size\", required_argument, NULL, 1},\n                                       {\"gc-window-size\", required_argument, NULL, 2},\n                                       {\"probeset-ids\", required_argument, NULL, 3},\n                                       {\"calls\", required_argument, NULL, 4},\n                                       {\"confidences\", required_argument, NULL, 5},\n                                       {\"summary\", required_argument, NULL, 6},\n                                       {\"snp\", required_argument, NULL, 7},\n                                       {\"chps\", required_argument, NULL, 11},\n                 
                      {\"cel\", no_argument, NULL, 12},\n                                       {\"adjust-clusters\", no_argument, NULL, 13},\n                                       {\"no-version\", no_argument, NULL, 8},\n                                       {\"output\", required_argument, NULL, 'o'},\n                                       {\"output-type\", required_argument, NULL, 'O'},\n                                       {\"threads\", required_argument, NULL, 9},\n                                       {\"extra\", required_argument, NULL, 'x'},\n                                       {\"verbose\", no_argument, NULL, 'v'},\n                                       {\"fasta-flank\", no_argument, NULL, 14},\n                                       {\"sam-flank\", required_argument, NULL, 's'},\n                                       {\"write-index\", optional_argument, NULL, 'W'},\n                                       {NULL, 0, NULL, 0}};\n    int c;\n    while ((c = getopt_long(argc, argv, \"h?lt:c:f:x:o:O:vs:W::\", loptions, NULL)) >= 0) {\n        switch (c) {\n        case 'l':\n            list_tags();\n            break;\n        case 't':\n            tag_list = optarg;\n            break;\n        case 'c':\n            csv_fname = optarg;\n            break;\n        case 'f':\n            ref_fname = optarg;\n            break;\n        case 1:\n            cache_size = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --set-cache-size %s\\n\", optarg);\n            break;\n        case 2:\n            gc_win = (int)strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --gc-window-size %s\\n\", optarg);\n            if (gc_win <= 0) flags |= NO_INFO_GC;\n            break;\n        case 3:\n            probeset_ids_fname = optarg;\n            flags |= PROBESET_IDS_LOADED;\n            break;\n        case 4:\n            calls_fname = optarg;\n            flags |= CALLS_LOADED;\n            break;\n        
case 5:\n            confidences_fname = optarg;\n            flags |= CONFIDENCES_LOADED;\n            break;\n        case 6:\n            summary_fname = optarg;\n            flags |= SUMMARY_LOADED;\n            break;\n        case 7:\n            snp_fname = optarg;\n            flags |= SNP_LOADED;\n            break;\n        case 11:\n            pathname = optarg;\n            break;\n        case 12:\n            flags |= LOAD_CEL;\n            break;\n        case 13:\n            flags |= ADJUST_CLUSTERS;\n            break;\n        case 8:\n            record_cmd_line = 0;\n            break;\n        case 'o':\n            output_fname = optarg;\n            break;\n        case 'O':\n            switch (optarg[0]) {\n            case 'b':\n                output_type = FT_BCF_GZ;\n                break;\n            case 'u':\n                output_type = FT_BCF;\n                break;\n            case 'z':\n                output_type = FT_VCF_GZ;\n                break;\n            case 'v':\n                output_type = FT_VCF;\n                break;\n            default: {\n                clevel = strtol(optarg, &tmp, 10);\n                if (*tmp || clevel < 0 || clevel > 9) error(\"The output type \\\"%s\\\" not recognised\\n\", optarg);\n            }\n            }\n            if (optarg[1]) {\n                clevel = strtol(optarg + 1, &tmp, 10);\n                if (*tmp || clevel < 0 || clevel > 9)\n                    error(\"Could not parse argument: --compression-level %s\\n\", optarg + 1);\n            }\n            break;\n        case 9:\n            n_threads = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse argument: --threads %s\\n\", optarg);\n            break;\n        case 'x':\n            extra_fname = optarg;\n            break;\n        case 'v':\n            flags |= VERBOSE;\n            break;\n        case 14:\n            fasta_flank = 1;\n            break;\n        case 's':\n    
        sam_fname = optarg;\n            break;\n        case 'W':\n            if (!(write_index = write_index_parse(optarg))) error(\"Unsupported index format '%s'\\n\", optarg);\n            break;\n        case 'h':\n        case '?':\n        default:\n            error(\"%s\", usage_text());\n        }\n    }\n    flags |= parse_tags(tag_list);\n\n    int nfiles = 0;\n    char **filenames = NULL;\n    if (pathname) {\n        filenames = get_file_list(pathname, flags & LOAD_CEL ? \"CEL\" : \"chp\", &nfiles);\n    } else {\n        nfiles = argc - optind;\n        filenames = argv + optind;\n    }\n    uint8_t *magic = (uint8_t *)malloc(nfiles * sizeof(uint8_t *));\n    void **files = (void **)malloc(nfiles * sizeof(void *));\n\n    if (csv_fname) {\n        if (flags & LOAD_CEL) error(\"Cannot use --csv with --cel as CEL files cannot be converted\\n%s\", usage_text());\n        if (fasta_flank && sam_fname)\n            error(\"Only one of --fasta-flank or --sam-flank options can be used at once\\n%s\", usage_text());\n        if (!fasta_flank && !sam_fname && !ref_fname)\n            error(\"Expected one of --fasta-flank or --sam-flank or --fasta-ref options\\n%s\", usage_text());\n        if ((flags & ADJUST_CLUSTERS) && (!summary_fname || !snp_fname))\n            error(\"Expected --summary and --snp options with --adjust-clusters option\\n%s\", usage_text());\n        if (nfiles == 0 && extra_fname) error(\"Expected CHP files with --extra option\\n%s\", usage_text());\n        if (nfiles > 0 && (calls_fname || confidences_fname || summary_fname))\n            error(\n                \"Cannot load tables --calls, --confidences, --summary if CHP files provided \"\n                \"instead\\n%s\",\n                usage_text());\n    } else if (nfiles == 0) {\n        error(\"%s\", usage_text());\n    }\n\n    // beginning of plugin run\n    fprintf(stderr, \"affy2vcf \" AFFY2VCF_VERSION \" http://github.com/freeseek/gtc2vcf\\n\");\n\n    if (nfiles > 0 && 
!(flags & LOAD_CEL)) flags |= CALLS_LOADED | CONFIDENCES_LOADED | SUMMARY_LOADED;\n\n    // make sure the process is allowed to open enough files\n    struct rlimit lim;\n    getrlimit(RLIMIT_NOFILE, &lim);\n    if (nfiles + 7 > lim.rlim_max)\n        error(\"On this system you cannot open more than %ld files at once while %d is required\\n\", lim.rlim_max,\n              nfiles + 7);\n    if (nfiles + 7 > lim.rlim_cur) {\n        lim.rlim_cur = nfiles + 7;\n        setrlimit(RLIMIT_NOFILE, &lim);\n    }\n\n    annot_t *annot = NULL;\n    if (csv_fname) {\n        fprintf(stderr, \"Reading CSV file %s\\n\", csv_fname);\n        if (sam_fname) fprintf(stderr, \"Reading SAM file %s\\n\", sam_fname);\n        annot =\n            annot_init(csv_fname, sam_fname, ((sam_fname && !ref_fname) || fasta_flank) ? output_fname : NULL, flags);\n    }\n\n    for (i = 0; i < nfiles; i++) {\n        hFILE *hfile = hopen(filenames[i], \"rb\");\n        if (hfile == NULL) error(\"Could not open %s: %s\\n\", filenames[i], strerror(errno));\n        if (hpeek(hfile, (void *)&magic[i], 1) < 1) {\n            error(\"Failed to read from file %s\\n\", filenames[i]);\n        }\n        switch (magic[i]) {\n        case 59:\n            fprintf(stderr, \"Reading AGCC file %s\\n\", filenames[i]);\n            files[i] = (void *)agcc_init(filenames[i], hfile, nfiles > 1);\n            break;\n        case 64:\n            fprintf(stderr, \"Reading XDA CEL file %s\\n\", filenames[i]);\n            files[i] = (void *)xda_cel_init(filenames[i], hfile, nfiles > 1);\n            break;\n        case 65:\n            error(\"Currently unable to read XDA CHP format for file %s\\n\", filenames[i]);\n        default:\n            error(\"Expected magic numbers 59, 64 or 65 but found %d in file %s\\n\", magic[i], filenames[i]);\n        }\n    }\n\n    if (annot) {\n        if (extra_fname && !(flags & LOAD_CEL)) {\n            out_txt = get_file_handle(extra_fname);\n            chps_to_tsv(magic, 
(agcc_t **)files, nfiles, out_txt);\n        }\n        fai = fai_load(ref_fname);\n        if (!fai) error(\"Could not load the reference %s\\n\", ref_fname);\n        if (cache_size) fai_set_cache_size(fai, cache_size);\n\n        if (probeset_ids_fname) fprintf(stderr, \"Reading probeset IDs file %s\\n\", probeset_ids_fname);\n        void *probeset_ids = probeset_ids_fname ? probeset_ids_init(probeset_ids_fname) : NULL;\n        if (snp_fname) fprintf(stderr, \"Reading SNP posteriors file %s\\n\", snp_fname);\n        snp_models_t *snp_models = snp_fname ? snp_models_init(snp_fname) : NULL;\n        fprintf(stderr, \"Writing VCF file\\n\");\n        bcf_hdr_t *hdr = hdr_init(fai, flags);\n        bcf_hdr_printf(hdr, \"##CSV=%s\", strrchr(csv_fname, '/') ? strrchr(csv_fname, '/') + 1 : csv_fname);\n        if (sam_fname)\n            bcf_hdr_printf(hdr, \"##SAM=%s\", strrchr(sam_fname, '/') ? strrchr(sam_fname, '/') + 1 : sam_fname);\n        if (snp_fname)\n            bcf_hdr_printf(hdr, \"##SNP=%s\", strrchr(snp_fname, '/') ? 
strrchr(snp_fname, '/') + 1 : snp_fname);\n        if (record_cmd_line) bcf_hdr_append_version(hdr, argc, argv, \"bcftools_affy2vcf\");\n        char wmode[8];\n        set_wmode(wmode, output_type, (char *)output_fname, clevel);\n        htsFile *out_fh = hts_open(output_fname, hts_bcf_wmode(output_type));\n        if (out_fh == NULL) error(\"[%s] Error: cannot write to \\\"%s\\\": %s\\n\", __func__, output_fname, strerror(errno));\n        if (n_threads) hts_set_threads(out_fh, n_threads);\n        varitr_t *varitr = NULL;\n        if (nfiles > 0)\n            varitr = varitr_init_cc(hdr, (agcc_t **)files, nfiles);\n        else if (calls_fname || confidences_fname || summary_fname)\n            varitr = varitr_init_txt(hdr, calls_fname, confidences_fname, summary_fname);\n        if (bcf_hdr_write(out_fh, hdr) < 0) error(\"Unable to write to output VCF file\\n\");\n        if (init_index2(out_fh, hdr, output_fname, &index_fname, write_index) < 0)\n            error(\"Error: failed to initialise index for %s\\n\", output_fname);\n        if (bcf_hdr_sync(hdr) < 0)\n            error_errno(\"[%s] Failed to update header\",\n                        __func__); // updates the number of samples\n        process(fai, annot, probeset_ids, snp_models, varitr, out_fh, hdr, flags, gc_win);\n        if (varitr) varitr_destroy(varitr);\n        if (snp_models) snp_models_destroy(snp_models);\n        if (probeset_ids) khash_str2int_destroy_free(probeset_ids);\n        fai_destroy(fai);\n        bcf_hdr_destroy(hdr);\n        if (write_index) {\n            if (bcf_idx_save(out_fh) < 0) {\n                if (hts_close(out_fh) != 0)\n                    error(\"Close failed %s\\n\", strcmp(output_fname, \"-\") ? output_fname : \"stdout\");\n                error(\"Error: cannot write to index %s\\n\", index_fname);\n            }\n            free(index_fname);\n        }\n        if (hts_close(out_fh) != 0) error(\"Close failed %s\\n\", strcmp(output_fname, \"-\") ? 
output_fname : \"stdout\");\n        annot_destroy(annot);\n    }\n\n    if (!ref_fname && nfiles > 0) {\n        out_txt = get_file_handle(output_fname);\n        if (nfiles == 1) {\n            switch (magic[0]) {\n            case 59:\n                agcc_print((agcc_t *)files[0], out_txt, flags & VERBOSE);\n                break;\n            case 64:\n                xda_cel_print((xda_cel_t *)files[0], out_txt, flags & VERBOSE);\n                break;\n            default:\n                error(\"Expected magic numbers 59 or 64 but found %d in file %s\\n\", magic[0], filenames[0]);\n            }\n        } else if (flags & LOAD_CEL) {\n            cels_to_tsv(magic, files, nfiles, out_txt);\n        } else {\n            chps_to_tsv(magic, (agcc_t **)files, nfiles, out_txt);\n        }\n    }\n\n    if (pathname) {\n        for (i = 0; i < nfiles; i++) free(filenames[i]);\n        free(filenames);\n    }\n    for (i = 0; i < nfiles; i++) {\n        switch (magic[i]) {\n        case 59:\n            agcc_destroy((agcc_t *)files[i]);\n            break;\n        case 64:\n            xda_cel_destroy((xda_cel_t *)files[i]);\n            break;\n        default:\n            error(\"Expected magic numbers 59 or 64 but found %d in file %s\\n\", magic[i], filenames[i]);\n        }\n    }\n    free(magic);\n    free(files);\n    if (out_txt && out_txt != stdout && out_txt != stderr) fclose(out_txt);\n    return 0;\n}\n"
  },
  {
    "path": "gtc2vcf.c",
    "content": "/* The MIT License\n\n   Copyright (c) 2018-2026 Giulio Genovese\n\n   Author: Giulio Genovese <giulio.genovese@gmail.com>\n\n   Permission is hereby granted, free of charge, to any person obtaining a copy\n   of this software and associated documentation files (the \"Software\"), to deal\n   in the Software without restriction, including without limitation the rights\n   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n   copies of the Software, and to permit persons to whom the Software is\n   furnished to do so, subject to the following conditions:\n\n   The above copyright notice and this permission notice shall be included in\n   all copies or substantial portions of the Software.\n\n   THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n   THE SOFTWARE.\n\n */\n\n#include <getopt.h>\n#include <errno.h>\n#include <sys/resource.h>\n#include <htslib/vcf.h>\n#include <htslib/kseq.h>\n#include <htslib/khash_str2int.h>\n#include \"bcftools.h\"\n#include \"tsv2vcf.h\"\n#include \"gtc2vcf.h\"\n\n#define GTC2VCF_VERSION \"2026-01-26\"\n\n#define GT_NC 0\n#define GT_AA 1\n#define GT_AB 2\n#define GT_BB 3\n\n#define TAG_LIST_DFLT \"GT,GQ,IGC,BAF,LRR,NORMX,NORMY,R,THETA,X,Y\"\n#define GC_WIN_DFLT \"200\"\n#define CAPACITY_DFLT \"32768\"\n#define GENOME_BUILD_DFLT \"GRCh38\"\n\n#define VERBOSE (1 << 0)\n#define BPM_LOADED (1 << 1)\n#define CSV_LOADED (1 << 2)\n#define EGT_LOADED (1 << 3)\n#define LOAD_IDAT (1 << 4)\n#define ADJUST_CLUSTERS (1 << 5)\n#define GENOME_STUDIO (1 << 6)\n#define NO_INFO_GC (1 << 7)\n#define 
FORMAT_GT (1 << 8)\n#define FORMAT_GQ (1 << 9)\n#define FORMAT_IGC (1 << 10)\n#define FORMAT_BAF (1 << 11)\n#define FORMAT_LRR (1 << 12)\n#define FORMAT_NORMX (1 << 13)\n#define FORMAT_NORMY (1 << 14)\n#define FORMAT_R (1 << 15)\n#define FORMAT_THETA (1 << 16)\n#define FORMAT_X (1 << 17)\n#define FORMAT_Y (1 << 18)\n\n/****************************************\n * hFILE READING FUNCTIONS              *\n ****************************************/\n\n// read or skip a fixed length array\nstatic void read_array(hFILE *hfile, void **arr, size_t *m_arr, size_t nmemb, size_t size, size_t term) {\n    if (arr) {\n        if (!m_arr) {\n            *arr = malloc((nmemb + term) * size);\n            if (!*arr) error(\"Failed to allocate memory for array\\n\");\n        } else if (*m_arr < nmemb + term) {\n            void *tmp = realloc(*arr, (nmemb + term) * size);\n            if (!tmp) error(\"Failed to allocate memory for array\\n\");\n            *arr = tmp;\n            *m_arr = nmemb + term;\n        }\n        if (hread(hfile, *arr, nmemb * size) < nmemb * size) {\n            error(\"Failed to read %ld bytes from stream\\n\", nmemb * size);\n        }\n    } else {\n        int i, c = 0;\n        for (i = 0; i < nmemb * size; i++) c = hgetc(hfile);\n        if (c == EOF) error(\"Failed to reposition stream forward %ld bytes\\n\", nmemb * size);\n    }\n}\n\n// read or skip a length-prefixed array\nstatic void read_pfx_array(hFILE *hfile, void **arr, size_t *m_arr, size_t item_size) {\n    int32_t n;\n    if (hread(hfile, (void *)&n, 4) < 4) {\n        error(\"Failed to read 4 bytes from stream\\n\");\n    }\n    read_array(hfile, arr, m_arr, n, item_size, 0);\n}\n\n// read or skip a length-prefixed string\n// http://en.wikipedia.org/wiki/LEB128#Decode_unsigned_integer\nstatic void read_pfx_string(hFILE *hfile, char **str, size_t *m_str) {\n    uint8_t byte;\n    size_t n = 0, shift = 0;\n    while (1) {\n        if (hread(hfile, (void *)&byte, 1) < 1) {\n            
error(\"Failed to read 1 byte from stream\\n\");\n        }\n        n |= (size_t)(byte & 0x7F) << shift;\n        if (!(byte & 0x80)) break;\n        shift += 7;\n    }\n    if (n || m_str) {\n        read_array(hfile, (void **)str, m_str, n, 1, 1);\n        if (str) (*str)[n] = '\\0';\n    }\n}\n\n// check whether file is compressed with gzip\nstatic int is_gzip(hFILE *hfile) {\n    uint8_t buffer[2];\n    if (hpeek(hfile, (void *)buffer, 2) < 2) error(\"Failed to read 2 bytes from stream\\n\");\n    return (buffer[0] == 0x1f && buffer[1] == 0x8b);\n}\n\n/****************************************\n * BUFFER ARRAY IMPLEMENTATION          *\n ****************************************/\n\ntypedef struct {\n    hFILE *hfile;\n    off_t offset;\n    int32_t item_num;\n    int32_t item_offset;\n    size_t item_capacity;\n    size_t item_size;\n    char *buffer;\n} buffer_array_t;\n\nstatic buffer_array_t *buffer_array_init(hFILE *hfile, size_t capacity, size_t item_size) {\n    buffer_array_t *arr = (buffer_array_t *)malloc(1 * sizeof(buffer_array_t));\n    arr->hfile = hfile;\n    read_bytes(hfile, (void *)&arr->item_num, sizeof(int32_t));\n    arr->offset = htell(arr->hfile);\n    arr->item_offset = 0;\n    arr->item_capacity = (capacity <= 0) ? (size_t)strtol(CAPACITY_DFLT, NULL, 0) : capacity;\n    arr->item_size = item_size;\n    arr->buffer = (char *)malloc(arr->item_capacity * item_size);\n    read_bytes(hfile, (void *)arr->buffer,\n               (arr->item_num < arr->item_capacity ? 
arr->item_num : arr->item_capacity) * item_size);\n    return arr;\n}\n\nstatic int get_element(buffer_array_t *arr, void *dst, size_t item_idx) {\n    if (!arr || item_idx >= arr->item_num) {\n        return -1;\n    } else if (item_idx - arr->item_offset < arr->item_capacity) {\n        memcpy(dst, (void *)(arr->buffer + (item_idx - arr->item_offset) * arr->item_size), arr->item_size);\n        return 0;\n    }\n    arr->item_offset = item_idx;\n    if (hseek(arr->hfile, arr->offset + item_idx * arr->item_size, SEEK_SET) < 0) {\n        error(\"Fail to seek to position %ld in file\\n\", arr->offset + item_idx * arr->item_size);\n    }\n    read_bytes(arr->hfile, (void *)arr->buffer,\n               ((arr->item_num - arr->item_offset) < arr->item_capacity ? (arr->item_num - arr->item_offset)\n                                                                        : arr->item_capacity)\n                   * arr->item_size);\n    memcpy(dst, (void *)arr->buffer, arr->item_size);\n    return 0;\n}\n\nstatic void buffer_array_destroy(buffer_array_t *arr) {\n    if (!arr) return;\n    free(arr->buffer);\n    free(arr);\n}\n\n/****************************************\n * BPM FILE IMPLEMENTATION              *\n ****************************************/\n\n// http://github.com/snewhouse/glu-genetics/blob/master/glu/lib/illumina.py\n// http://github.com/Illumina/BeadArrayFiles/blob/develop/module/BeadPoolManifest.py\n\ntypedef struct {\n    int32_t version;\n    uint8_t norm_id; // Normalization lookups from manifest. 
This indexes into list of\n                     // normalization transforms read from GTC file\n    char *ilmn_id;   // IlmnID (probe identifier) of locus\n    char *name;      // Name (variant identifier) of locus\n    int32_t index;\n    char *ilmn_strand; // TOP BOT PLUS MINUS or Top Bot P M\n    char *snp;         // SNP value for locus (e.g., [A/C])\n    char *chrom;       // Chromosome for the locus (e.g., XY)\n    char *ploidy;\n    char *species;\n    char *map_info; // Mapping location of locus\n    char *customer_strand;\n    int32_t address_a;        // AddressA ID of locus\n    char *allele_a_probe_seq; // CSV files or BPM files with version 4 data block\n    int32_t address_b;        // AddressB ID of locus (0 if none)\n    char *allele_b_probe_seq; // CSV files or BPM files with version 4 data block (empty if\n                              // none)\n    char *genome_build;\n    char *source;\n    char *source_version;\n    char *source_strand;\n    char *source_seq;      // CSV files or BPM files with version 4 data block\n    char *top_genomic_seq; // CSV files or BPM files with version 4 data block\n    int32_t beadset_id;    // CSV files\n    uint8_t exp_clusters;\n    uint8_t intensity_only;\n    uint8_t assay_type; // Identifies type of assay (0 - Infinium II, 1 - Infinium I (A/T),\n                        // 2 - Infinium I (G/C)\n    uint8_t assay_type_csv;\n    float frac_a;\n    float frac_c;\n    float frac_g;\n    float frac_t;\n    char *ref_strand; // RefStrand annotation\n} LocusEntry;\n\n// retrieve assay type following (allele_a_probe_seq, source_seq) -> assay_type map\n// (...W., ...W[./.]W...) -> 1\n// (...S., ...S[./.]S...) -> 2\n// (...S., ...S[./.]W...) -> 1\n// (...S., ...W[./.]S...) -> 1\n// (...W., ...S[./.]W...) -> 2\n// (...W., ...W[./.]S...) 
-> 2\nstatic uint8_t get_assay_type(const char *allele_a_probe_seq, const char *allele_b_probe_seq, const char *source_seq) {\n    if (!allele_a_probe_seq || !source_seq) return 0xFF;\n    if (!allele_b_probe_seq) return 0;\n    const char *left = strchr(source_seq, '[');\n    const char *right = strchr(source_seq, ']');\n    if (!left || !right) error(\"Source sequence is malformed: %s\\n\", source_seq);\n    char trail_left = toupper(*(left - 1));\n    char trail_right = toupper(*(right + 1));\n    if ((trail_left == 'A' || trail_left == 'T') && (trail_right == 'A' || trail_right == 'T')) return 1;\n    if ((trail_left == 'C' || trail_left == 'G') && (trail_right == 'C' || trail_right == 'G')) return 2;\n    int i = 2;\n    while (!(iupac2bitmask(allele_a_probe_seq[strlen(allele_a_probe_seq) - i])\n             & iupac2bitmask(allele_b_probe_seq[strlen(allele_b_probe_seq) - i])))\n        i++;\n    char trail_a_probe_seq = toupper(allele_a_probe_seq[strlen(allele_a_probe_seq) - i]);\n    if (trail_a_probe_seq == 'C' || trail_a_probe_seq == 'G' || trail_a_probe_seq == 'S') return 1;\n    if (trail_a_probe_seq == 'A' || trail_a_probe_seq == 'T' || trail_a_probe_seq == 'W') return 2;\n    // these weird rule were deduced from manifests for array GDA_PGx-8v1-0_20042614\n    if (trail_a_probe_seq == 'Y' && trail_right == 'G') return 1;\n    if (trail_a_probe_seq == 'Y' && trail_right == 'T') return 1;\n    if (trail_a_probe_seq == 'Y' && trail_right == 'A') return 2;\n    if (trail_a_probe_seq == 'K' && trail_right == 'C') return 1;\n    if (trail_a_probe_seq == 'K' && trail_right == 'A') return 2;\n    if (trail_a_probe_seq == 'M' && trail_right == 'G') return 1;\n    if (trail_a_probe_seq == 'M' && trail_right == 'T') return 2;\n    if (trail_a_probe_seq == 'R' && trail_right == 'C') return 1;\n    if (trail_a_probe_seq == 'R' && trail_right == 'T') return 2;\n    fprintf(stderr, \"Warning: Unable to retrieve assay type: %s %s %s\\n\", allele_a_probe_seq, 
allele_b_probe_seq,\n            source_seq);\n    return 0xFF;\n}\n\nstatic void locusentry_read(LocusEntry *locus_entry, hFILE *hfile) {\n    locus_entry->norm_id = 0xFF;\n    read_bytes(hfile, (void *)&locus_entry->version, sizeof(int32_t));\n    if (locus_entry->version < 4 || locus_entry->version == 5 || locus_entry->version > 8)\n        error(\"Locus version %d in manifest file not supported\\n\", locus_entry->version);\n    read_pfx_string(hfile, &locus_entry->ilmn_id, NULL);\n    read_pfx_string(hfile, &locus_entry->name, NULL);\n    read_pfx_string(hfile, NULL, NULL); // ASOA\n    read_pfx_string(hfile, NULL, NULL); // ASOB\n    read_pfx_string(hfile, NULL, NULL); // LSO\n    read_bytes(hfile, (void *)&locus_entry->index, sizeof(int32_t));\n    read_pfx_string(hfile, NULL, NULL); // IllumicodeSeq\n    read_pfx_string(hfile, &locus_entry->ilmn_strand, NULL);\n    read_pfx_string(hfile, &locus_entry->snp, NULL);\n    read_pfx_string(hfile, &locus_entry->chrom, NULL);\n    read_pfx_string(hfile, &locus_entry->ploidy, NULL);\n    read_pfx_string(hfile, &locus_entry->species, NULL);\n    read_pfx_string(hfile, &locus_entry->map_info, NULL);\n    read_pfx_string(hfile, &locus_entry->top_genomic_seq, NULL); // only version 4\n    read_pfx_string(hfile, &locus_entry->customer_strand, NULL);\n    read_bytes(hfile, (void *)&locus_entry->address_a, sizeof(int32_t));\n    read_bytes(hfile, (void *)&locus_entry->address_b, sizeof(int32_t));\n    read_pfx_string(hfile, &locus_entry->allele_a_probe_seq, NULL); // only version 4\n    read_pfx_string(hfile, &locus_entry->allele_b_probe_seq, NULL); // only version 4\n    read_pfx_string(hfile, &locus_entry->genome_build, NULL);\n    read_pfx_string(hfile, &locus_entry->source, NULL);\n    read_pfx_string(hfile, &locus_entry->source_version, NULL);\n    read_pfx_string(hfile, &locus_entry->source_strand, NULL);\n    read_pfx_string(hfile, &locus_entry->source_seq, NULL); // only version 4\n    if (locus_entry->source_seq) 
{\n        char *ptr = strchr(locus_entry->source_seq, '-');\n        if (ptr && *(ptr - 1) == '/') {\n            *ptr = *(ptr - 2);\n            *(ptr - 2) = '-';\n        }\n    }\n\n    if (locus_entry->version >= 6) {\n        read_bytes(hfile, NULL, 1); // MarkerInCNVRegion\n        read_bytes(hfile, (void *)&locus_entry->exp_clusters, sizeof(int8_t));\n        read_bytes(hfile, (void *)&locus_entry->intensity_only, sizeof(int8_t));\n        read_bytes(hfile, (void *)&locus_entry->assay_type, sizeof(uint8_t));\n\n        if (locus_entry->assay_type < 0 || locus_entry->assay_type > 2)\n            error(\"Format error in reading assay type from locus entry\\n\");\n        if (locus_entry->address_b == 0 && locus_entry->assay_type != 0)\n            error(\"Manifest format error: Assay type is inconsistent with address B\\n\");\n        if (locus_entry->address_b != 0 && locus_entry->assay_type == 0)\n            error(\"Manifest format error: Assay type is inconsistent with address B\\n\");\n    } else {\n        locus_entry->assay_type =\n            get_assay_type(locus_entry->allele_a_probe_seq, locus_entry->allele_b_probe_seq, locus_entry->source_seq);\n    }\n\n    if (locus_entry->version >= 7) {\n        read_bytes(hfile, &locus_entry->frac_a, sizeof(float));\n        read_bytes(hfile, &locus_entry->frac_c, sizeof(float));\n        read_bytes(hfile, &locus_entry->frac_t, sizeof(float));\n        read_bytes(hfile, &locus_entry->frac_g, sizeof(float));\n    }\n    if (locus_entry->version >= 8) read_pfx_string(hfile, &locus_entry->ref_strand, NULL);\n}\n\ntypedef struct {\n    char *fn;\n    hFILE *hfile; // bpm file\n    htsFile *fp;  // csv file\n    int32_t version;\n    char *manifest_name;  // Name of manifest\n    char *control_config; // Control description from manifest\n    int32_t num_loci;     // Number of loci in manifest\n    int32_t *indexes;\n    char **names; // Names of loci from manifest\n    void *names2index;\n    uint8_t *norm_ids;\n  
  LocusEntry *locus_entries;\n    uint8_t *norm_lookups;\n    char **header;\n    size_t m_header;\n} bpm_t;\n\nstatic uint8_t *bpm_norm_lookups(bpm_t *bpm) {\n    int i;\n    uint8_t sorted_norm_ids[256];\n    for (i = 0; i < 256; i++) sorted_norm_ids[i] = 0xFF;\n    for (i = 0; i < bpm->num_loci; i++) {\n        int norm_id = bpm->locus_entries[i].norm_id;\n        sorted_norm_ids[norm_id] = norm_id;\n    }\n    int j = 0;\n    for (i = 0; i < 256; i++)\n        if (sorted_norm_ids[i] != 0xFF) sorted_norm_ids[j++] = sorted_norm_ids[i];\n    uint8_t *norm_lookups = (uint8_t *)malloc(256 * sizeof(uint8_t *));\n    memset((void *)norm_lookups, 0xFF, 256 * sizeof(uint8_t *));\n    for (i = 0; i < j; i++) norm_lookups[sorted_norm_ids[i]] = i;\n    return norm_lookups;\n}\n\nstatic bpm_t *bpm_init(const char *fn, int eof_check, int make_dict) {\n    bpm_t *bpm = (bpm_t *)calloc(1, sizeof(bpm_t));\n    bpm->fn = strdup(fn);\n    bpm->hfile = hopen(bpm->fn, \"rb\");\n    if (bpm->hfile == NULL) error(\"Could not open %s: %s\\n\", bpm->fn, strerror(errno));\n    if (is_gzip(bpm->hfile)) error(\"File %s is gzip compressed and currently cannot be sought\\n\", bpm->fn);\n\n    int i;\n    uint8_t buffer[4];\n    if (hread(bpm->hfile, (void *)buffer, 4) < 4) error(\"Failed to read magic number from %s file\\n\", bpm->fn);\n    if (memcmp(buffer, \"BPM\", 3) != 0) error(\"BPM file %s format identifier is bad\\n\", bpm->fn);\n    if (buffer[3] != 1) error(\"BPM file %s version is unknown\\n\", bpm->fn);\n\n    read_bytes(bpm->hfile, (void *)&bpm->version, sizeof(int32_t));\n    if (bpm->version & 0x1000) bpm->version ^= 0x1000;\n    if (bpm->version > 5 || bpm->version < 3) error(\"BPM file %s version %d is unsupported\\n\", bpm->fn, bpm->version);\n    read_pfx_string(bpm->hfile, &bpm->manifest_name, NULL);\n\n    if (bpm->version > 1) read_pfx_string(bpm->hfile, &bpm->control_config, NULL);\n\n    read_bytes(bpm->hfile, (void *)&bpm->num_loci, sizeof(int32_t));\n    
read_array(bpm->hfile, (void **)&bpm->indexes, NULL, bpm->num_loci, sizeof(int32_t), 0);\n    bpm->names = (char **)malloc(bpm->num_loci * sizeof(char *));\n    for (i = 0; i < bpm->num_loci; i++) read_pfx_string(bpm->hfile, &bpm->names[i], NULL);\n    if (make_dict) {\n        bpm->names2index = khash_str2int_init();\n        for (i = 0; i < bpm->num_loci; i++) {\n            if (khash_str2int_has_key(bpm->names2index, bpm->names[i]))\n                error(\"Illumina probe %s present multiple times in file %s\\n\", bpm->names[i], fn);\n            khash_str2int_inc(bpm->names2index, bpm->names[i]);\n        }\n    }\n    read_array(bpm->hfile, (void **)&bpm->norm_ids, NULL, bpm->num_loci, sizeof(uint8_t), 0);\n\n    bpm->locus_entries = (LocusEntry *)malloc(bpm->num_loci * sizeof(LocusEntry));\n    LocusEntry locus_entry;\n    for (i = 0; i < bpm->num_loci; i++) {\n        memset(&locus_entry, 0, sizeof(LocusEntry));\n        locusentry_read(&locus_entry, bpm->hfile);\n        int idx = locus_entry.index - 1;\n        if (idx < 0 || idx >= bpm->num_loci) error(\"Locus entry index %d is out of boundaries\\n\", locus_entry.index);\n        if (bpm->norm_ids[idx] > 100)\n            error(\"Manifest format error: read invalid normalization ID %d\\n\", bpm->norm_ids[idx]);\n        // To mimic the flawed byte-wrapping behavior from GenomeStudio, AutoCall, and\n        // IAAP, this value is allowed to overflow beyond 255, which happens with some\n        // probes in the Omni5 arrays\n        bpm->norm_ids[idx] += 100 * locus_entry.assay_type;\n        locus_entry.norm_id = bpm->norm_ids[idx];\n        memcpy(&bpm->locus_entries[idx], &locus_entry, sizeof(LocusEntry));\n    }\n    bpm->norm_lookups = bpm_norm_lookups(bpm);\n    for (i = 0; i < bpm->num_loci; i++) {\n        if (i != bpm->locus_entries[i].index - 1)\n            error(\"Manifest format error: read invalid number of assay entries\\n\");\n    }\n    if (bpm->locus_entries[0].version < 8)\n        
fprintf(stderr, \"Warning: RefStrand annotation missing from manifest file %s\\n\", bpm->fn);\n\n    read_bytes(bpm->hfile, (void *)&bpm->m_header, sizeof(int32_t));\n    bpm->header = (char **)malloc(bpm->m_header * sizeof(char *));\n    for (i = 0; i < bpm->m_header; i++) read_pfx_string(bpm->hfile, &bpm->header[i], NULL);\n\n    if (eof_check && !heof(bpm->hfile))\n        error(\n            \"BPM reader did not reach the end of file %s at position %ld\\nUse --do-not-check-eof to suppress this \"\n            \"check\\n\",\n            bpm->fn, htell(bpm->hfile));\n\n    return bpm;\n}\n\nstatic void bpm_destroy(bpm_t *bpm) {\n    if (!bpm) return;\n    int i;\n    if (bpm->hfile && hclose(bpm->hfile) < 0) error(\"Error closing BPM file %s\\n\", bpm->fn);\n    free(bpm->fn);\n    if (bpm->fp && hts_close(bpm->fp) < 0) error(\"Error closing CSV file %s\\n\", bpm->fp->fn);\n    free(bpm->manifest_name);\n    free(bpm->control_config);\n    free(bpm->indexes);\n    if (bpm->names) {\n        for (i = 0; i < bpm->num_loci; i++) free(bpm->names[i]);\n        free(bpm->names);\n    }\n    khash_str2int_destroy(bpm->names2index);\n    free(bpm->norm_ids);\n    for (i = 0; i < bpm->num_loci; i++) {\n        LocusEntry *locus_entry = &bpm->locus_entries[i];\n        free(locus_entry->ilmn_id);\n        free(locus_entry->name);\n        free(locus_entry->ilmn_strand);\n        free(locus_entry->snp);\n        free(locus_entry->chrom);\n        free(locus_entry->ploidy);\n        free(locus_entry->species);\n        free(locus_entry->map_info);\n        free(locus_entry->customer_strand);\n        free(locus_entry->allele_a_probe_seq);\n        free(locus_entry->allele_b_probe_seq);\n        free(locus_entry->genome_build);\n        free(locus_entry->source);\n        free(locus_entry->source_version);\n        free(locus_entry->source_strand);\n        free(locus_entry->source_seq);\n        free(locus_entry->top_genomic_seq);\n        free(locus_entry->ref_strand);\n    
}\n    free(bpm->locus_entries);\n    free(bpm->norm_lookups);\n    for (i = 0; i < bpm->m_header; i++) free(bpm->header[i]);\n    free(bpm->header);\n    free(bpm);\n}\n\nstatic void bpm_to_csv(const bpm_t *bpm, FILE *stream, int flags) {\n    int i;\n    for (i = 0; i < bpm->m_header; i++) fprintf(stream, \"%s\\n\", bpm->header[i]);\n    if (flags & BPM_LOADED) {\n        fprintf(stream,\n                \"Index,NormID,IlmnID,Name,IlmnStrand,SNP,AddressA_ID,AlleleA_ProbeSeq,AddressB_\"\n                \"ID,AlleleB_ProbeSeq,GenomeBuild,Chr,MapInfo,Ploidy,Species,Source,\"\n                \"SourceVersion,SourceStrand,SourceSeq,TopGenomicSeq,BeadSetID,Exp_Clusters,\"\n                \"Intensity_Only,Assay_Type,Frac A,Frac C,Frac G,Frac T,RefStrand\");\n        if (flags & CSV_LOADED) fprintf(stream, \",Assay_Type_CSV\");\n        fputc('\\n', stream);\n    } else {\n        fprintf(stream,\n                \"IlmnID,Name,IlmnStrand,SNP,AddressA_ID,AlleleA_ProbeSeq,AddressB_ID,AlleleB_\"\n                \"ProbeSeq,GenomeBuild,Chr,MapInfo,Ploidy,Species,Source,SourceVersion,\"\n                \"SourceStrand,SourceSeq,TopGenomicSeq,BeadSetID,Exp_Clusters,RefStrand\\n\");\n    }\n    if (flags & VERBOSE) {\n        kstring_t address_b = {0, 0, NULL};\n        if (flags & BPM_LOADED) {\n            for (i = 0; i < bpm->num_loci; i++) {\n                LocusEntry *locus_entry = &bpm->locus_entries[i];\n                address_b.l = 0;\n                ksprintf(&address_b, locus_entry->address_b ? 
\"%010d\" : \"\", locus_entry->address_b);\n                fprintf(stream,\n                        \"%d,%d,%s,%s,%s,%s,%010d,%-s,%s,%-s,%s,%s,%s,%s,%s,%s,%s,%s,%-s,%-s,%d,\"\n                        \"%d,%d,%d,%f,%f,%f,%f,%s\",\n                        locus_entry->index, locus_entry->norm_id, locus_entry->ilmn_id, locus_entry->name,\n                        locus_entry->ilmn_strand, locus_entry->snp, locus_entry->address_a,\n                        locus_entry->allele_a_probe_seq ? locus_entry->allele_a_probe_seq : \"\", address_b.s,\n                        locus_entry->allele_b_probe_seq ? locus_entry->allele_b_probe_seq : \"\",\n                        locus_entry->genome_build, locus_entry->chrom, locus_entry->map_info, locus_entry->ploidy,\n                        locus_entry->species, locus_entry->source, locus_entry->source_version,\n                        locus_entry->source_strand, locus_entry->source_seq ? locus_entry->source_seq : \"\",\n                        locus_entry->top_genomic_seq ? locus_entry->top_genomic_seq : \"\", locus_entry->beadset_id,\n                        locus_entry->exp_clusters, locus_entry->intensity_only, locus_entry->assay_type,\n                        locus_entry->frac_a, locus_entry->frac_c, locus_entry->frac_g, locus_entry->frac_t,\n                        locus_entry->ref_strand ? locus_entry->ref_strand : \"\");\n                if (flags & CSV_LOADED) fprintf(stream, \",%d\", locus_entry->assay_type_csv);\n                fputc('\\n', stream);\n            }\n        } else {\n            for (i = 0; i < bpm->num_loci; i++) {\n                LocusEntry *locus_entry = &bpm->locus_entries[i];\n                address_b.l = 0;\n                ksprintf(&address_b, locus_entry->address_b ? 
\"%010d\" : \"\", locus_entry->address_b);\n                fprintf(stream, \"%s,%s,%s,%s,%010d,%-s,%s,%-s,%s,%s,%s,%s,%s,%s,%s,%s,%-s,%-s,%d,%d,%s\\n\",\n                        locus_entry->ilmn_id, locus_entry->name, locus_entry->ilmn_strand, locus_entry->snp,\n                        locus_entry->address_a, locus_entry->allele_a_probe_seq, address_b.s,\n                        locus_entry->allele_b_probe_seq ? locus_entry->allele_b_probe_seq : \"\",\n                        locus_entry->genome_build, locus_entry->chrom, locus_entry->map_info, locus_entry->ploidy,\n                        locus_entry->species, locus_entry->source, locus_entry->source_version,\n                        locus_entry->source_strand, locus_entry->source_seq, locus_entry->top_genomic_seq,\n                        locus_entry->beadset_id, locus_entry->exp_clusters,\n                        locus_entry->ref_strand ? locus_entry->ref_strand : \"\");\n            }\n        }\n        free(address_b.s);\n    } else {\n        fprintf(stream, \"... 
use --verbose to visualize Assay data ...\\n\");\n    }\n    fprintf(stream, \"[Controls]\\n\");\n    fprintf(stream, \"%s\", bpm->control_config);\n}\n\n/****************************************\n * CSV FILE IMPLEMENTATION              *\n ****************************************/\n\nstatic int tsv_read_uint8(tsv_t *tsv, bcf1_t *rec, void *usr) {\n    uint8_t *uint8 = (uint8_t *)usr;\n    char tmp = *tsv->se;\n    *tsv->se = 0;\n    char *endptr;\n    *uint8 = (uint8_t)strtol(tsv->ss, &endptr, 0);\n    *tsv->se = tmp;\n    return 0;\n}\n\nstatic int tsv_read_int32(tsv_t *tsv, bcf1_t *rec, void *usr) {\n    int32_t *int32 = (int32_t *)usr;\n    char tmp = *tsv->se;\n    *tsv->se = 0;\n    char *endptr;\n    *int32 = (int32_t)strtol(tsv->ss, &endptr, 10);\n    *tsv->se = tmp;\n    return 0;\n}\n\nstatic int tsv_read_float(tsv_t *tsv, bcf1_t *rec, void *usr) {\n    float *single = (float *)usr;\n    char tmp = *tsv->se;\n    *tsv->se = 0;\n    char *endptr;\n    *single = (float)strtof(tsv->ss, &endptr);\n    *tsv->se = tmp;\n    return 0;\n}\n\nstatic int tsv_read_string(tsv_t *tsv, bcf1_t *rec, void *usr) {\n    char **str = (char **)usr;\n    if (tsv->se == tsv->ss) {\n        *str = NULL;\n    } else {\n        char tmp = *tsv->se;\n        *tsv->se = 0;\n        *str = strdup(tsv->ss);\n        *tsv->se = tmp;\n    }\n    return 0;\n}\n\n// Petr Danecek's similar implementation in bcftools/tsv2vcf.c\nstatic int csv_parse(tsv_t *tsv, bcf1_t *rec, char *str) {\n    int status = 0;\n    tsv->icol = 0;\n    tsv->ss = tsv->se = str;\n    while (*tsv->ss && tsv->icol < tsv->ncols) {\n        while (*tsv->se && *tsv->se != ',') tsv->se++;\n        if (tsv->cols[tsv->icol].setter) {\n            int ret = tsv->cols[tsv->icol].setter(tsv, rec, tsv->cols[tsv->icol].usr);\n            if (ret < 0) return -1;\n            status++;\n        }\n        if (*tsv->se) tsv->se++;\n        tsv->ss = tsv->se;\n        tsv->icol++;\n    }\n    return status ? 
0 : -1;\n}\n\nstatic void locus_merge(LocusEntry *dest, LocusEntry *src) {\n    if (src->version) dest->version = src->version;\n    if (src->norm_id != 0xFF) dest->norm_id = src->norm_id;\n    if (strcmp(dest->ilmn_id, src->ilmn_id)) {\n        error(\"BPM and CSV manifests have conflicting IDs: %s and %s\\n\", dest->ilmn_id, src->ilmn_id);\n    } else {\n        free(dest->ilmn_id);\n        dest->ilmn_id = src->ilmn_id;\n    }\n    if (src->name) {\n        free(dest->name);\n        dest->name = src->name;\n    }\n    if (src->index != 0) dest->index = src->index;\n    if (src->ilmn_strand) {\n        free(dest->ilmn_strand);\n        dest->ilmn_strand = src->ilmn_strand;\n    }\n    if (src->snp) {\n        free(dest->snp);\n        dest->snp = src->snp;\n    }\n    if (src->chrom) {\n        free(dest->chrom);\n        dest->chrom = src->chrom;\n    }\n    if (src->ploidy) {\n        free(dest->ploidy);\n        dest->ploidy = src->ploidy;\n    }\n    if (src->species) {\n        free(dest->species);\n        dest->species = src->species;\n    }\n    if (src->map_info) {\n        free(dest->map_info);\n        dest->map_info = src->map_info;\n    }\n    if (src->customer_strand) {\n        free(dest->customer_strand);\n        dest->customer_strand = src->customer_strand;\n    }\n    if (src->address_a != 0) dest->address_a = src->address_a;\n    if (src->allele_a_probe_seq) {\n        free(dest->allele_a_probe_seq);\n        dest->allele_a_probe_seq = src->allele_a_probe_seq;\n    }\n    if (src->address_b != 0) dest->address_b = src->address_b;\n    if (src->allele_b_probe_seq) {\n        free(dest->allele_b_probe_seq);\n        dest->allele_b_probe_seq = src->allele_b_probe_seq;\n    }\n    if (src->genome_build) {\n        free(dest->genome_build);\n        dest->genome_build = src->genome_build;\n    }\n    if (src->source) {\n        free(dest->source);\n        dest->source = src->source;\n    }\n    if (src->source_version) {\n        
free(dest->source_version);\n        dest->source_version = src->source_version;\n    }\n    if (src->source_strand) {\n        free(dest->source_strand);\n        dest->source_strand = src->source_strand;\n    }\n    if (src->source_seq) {\n        free(dest->source_seq);\n        dest->source_seq = src->source_seq;\n    }\n    if (src->top_genomic_seq) {\n        free(dest->top_genomic_seq);\n        dest->top_genomic_seq = src->top_genomic_seq;\n    }\n    if (src->beadset_id) dest->beadset_id = src->beadset_id;\n    if (src->exp_clusters) dest->exp_clusters = src->exp_clusters;\n    if (src->intensity_only) dest->intensity_only = src->intensity_only;\n    if (src->assay_type != 0xFF) dest->assay_type = src->assay_type;\n    if (src->assay_type_csv != 0xFF) dest->assay_type_csv = src->assay_type_csv;\n    if (src->frac_a) dest->frac_a = src->frac_a;\n    if (src->frac_c) dest->frac_c = src->frac_c;\n    if (src->frac_g) dest->frac_g = src->frac_g;\n    if (src->frac_t) dest->frac_t = src->frac_t;\n    if (src->ref_strand) {\n        free(dest->ref_strand);\n        dest->ref_strand = src->ref_strand;\n    }\n}\n\n// this line will read a CSV file and if a BPM object is provided it will fill it rather than\n// create a new one\nstatic bpm_t *bpm_csv_init(const char *fn, bpm_t *bpm, int make_dict) {\n    int bpm_available = bpm != NULL;\n    if (!bpm_available) bpm = (bpm_t *)calloc(1, sizeof(bpm_t));\n    int bpm_prev_num_loci = bpm->num_loci;\n\n    bpm->fp = hts_open(fn, \"r\");\n    if (bpm->fp == NULL) error(\"Could not open %s: %s\\n\", fn, strerror(errno));\n\n    kstring_t str = {0, 0, NULL};\n    kstring_t hdr = {0, 0, NULL};\n    if (hts_getline(bpm->fp, KS_SEP_LINE, &str) <= 0) error(\"Empty file: %s\\n\", fn);\n    if (strncmp(str.s, \"Illumina\", 8) && strncmp(str.s, \"\\\"Illumina\", 9))\n        error(\"Header of file %s is incorrect: %s\\n\", fn, str.s);\n    kputs(str.s, &hdr);\n    kputc('\\n', &hdr);\n\n    char *tmp = NULL;\n    size_t prev = 
0;\n    while (strncmp(str.s + prev, \"[Assay]\", 7)) {\n        if (strncmp(str.s + prev, \"Descriptor File Name,\", 21) == 0) {\n            free(bpm->manifest_name);\n            bpm->manifest_name = strdup(str.s + prev + 21);\n            char *ptr = strchr(bpm->manifest_name, ',');\n            if (ptr) *ptr = '\\0';\n        } else if (strncmp(str.s + prev, \"Loci Count ,\", 12) == 0) {\n            bpm->num_loci = (int)strtol(str.s + prev + 12, &tmp, 0);\n        } else if (strncmp(str.s + prev, \"Loci Count,\", 11) == 0) {\n            bpm->num_loci = (int)strtol(str.s + prev + 11, &tmp, 0);\n        }\n        if (hts_getline(bpm->fp, KS_SEP_LINE, &str) <= 0) error(\"Error reading from file: %s\\n\", fn);\n        kputs(str.s, &hdr);\n        kputc('\\n', &hdr);\n    }\n    if (bpm->num_loci == 0)\n        error(\"Could not understand number of loci from header of manifest file %s\\n\", fn);\n    else if (bpm_available && bpm_prev_num_loci != bpm->num_loci)\n        error(\"BPM manifest file has %d loci while CSV manifest file %s has %d loci\\n\", bpm_prev_num_loci, fn,\n              bpm->num_loci);\n\n    int i, moff = 0, *off = NULL;\n    for (i = 0; i < bpm->m_header; i++) free(bpm->header[i]);\n    bpm->m_header = ksplit_core(hdr.s, '\\n', &moff, &off);\n    free(bpm->header);\n    bpm->header = (char **)malloc(bpm->m_header * sizeof(char *));\n    for (i = 0; i < bpm->m_header; i++) bpm->header[i] = strdup(&hdr.s[off[i]]);\n    free(off);\n    free(hdr.s);\n\n    if (hts_getline(bpm->fp, KS_SEP_LINE, &str) <= 0) error(\"Error reading from file: %s\\n\", fn);\n\n    LocusEntry locus_entry;\n    tsv_t *tsv = tsv_init(str.s);\n    tsv_register(tsv, \"Index\", tsv_read_int32, &locus_entry.index);\n    int norm_id = tsv_register(tsv, \"NormID\", tsv_read_uint8, &locus_entry.norm_id);\n    tsv_register(tsv, \"IlmnID\", tsv_read_string, &locus_entry.ilmn_id);\n    tsv_register(tsv, \"Name\", tsv_read_string, &locus_entry.name);\n    tsv_register(tsv, 
\"IlmnStrand\", tsv_read_string, &locus_entry.ilmn_strand);\n    tsv_register(tsv, \"SNP\", tsv_read_string, &locus_entry.snp);\n    tsv_register(tsv, \"AddressA_ID\", tsv_read_int32, &locus_entry.address_a);\n    tsv_register(tsv, \"AlleleA_ProbeSeq\", tsv_read_string, &locus_entry.allele_a_probe_seq);\n    tsv_register(tsv, \"AddressB_ID\", tsv_read_int32, &locus_entry.address_b);\n    tsv_register(tsv, \"AlleleB_ProbeSeq\", tsv_read_string, &locus_entry.allele_b_probe_seq);\n    tsv_register(tsv, \"GenomeBuild\", tsv_read_string, &locus_entry.genome_build);\n    tsv_register(tsv, \"Chr\", tsv_read_string, &locus_entry.chrom);\n    tsv_register(tsv, \"MapInfo\", tsv_read_string, &locus_entry.map_info);\n    tsv_register(tsv, \"Ploidy\", tsv_read_string, &locus_entry.ploidy);\n    tsv_register(tsv, \"Species\", tsv_read_string, &locus_entry.species);\n    tsv_register(tsv, \"Source\", tsv_read_string, &locus_entry.source);\n    tsv_register(tsv, \"SourceVersion\", tsv_read_string, &locus_entry.source_version);\n    tsv_register(tsv, \"SourceStrand\", tsv_read_string, &locus_entry.source_strand);\n    tsv_register(tsv, \"SourceSeq\", tsv_read_string, &locus_entry.source_seq);\n    tsv_register(tsv, \"TopGenomicSeq\", tsv_read_string, &locus_entry.top_genomic_seq);\n    int beadset_id = tsv_register(tsv, \"BeadSetID\", tsv_read_int32, &locus_entry.beadset_id);\n    tsv_register(tsv, \"Exp_Clusters\", tsv_read_uint8, &locus_entry.exp_clusters);\n    tsv_register(tsv, \"Intensity_Only\", tsv_read_uint8, &locus_entry.intensity_only);\n    tsv_register(tsv, \"Frac A\", tsv_read_float, &locus_entry.frac_a);\n    tsv_register(tsv, \"Frac C\", tsv_read_float, &locus_entry.frac_c);\n    tsv_register(tsv, \"Frac G\", tsv_read_float, &locus_entry.frac_g);\n    tsv_register(tsv, \"Frac T\", tsv_read_float, &locus_entry.frac_t);\n    int ref_strand = tsv_register(tsv, \"RefStrand\", tsv_read_string, &locus_entry.ref_strand);\n    if (ref_strand < 0) fprintf(stderr, \"Warning: 
RefStrand annotation missing from manifest file %s\\n\", fn);\n\n    if (!bpm_available) bpm->locus_entries = (LocusEntry *)malloc(bpm->num_loci * sizeof(LocusEntry));\n    for (i = 0; i < bpm->num_loci; i++) {\n        memset(&locus_entry, 0, sizeof(LocusEntry));\n        locus_entry.norm_id = 0xFF;\n        locus_entry.assay_type = 0xFF;\n        locus_entry.assay_type_csv = 0xFF;\n        if (hts_getline(bpm->fp, KS_SEP_LINE, &str) <= 0) error(\"Error reading from file: %s\\n\", fn);\n        if (csv_parse(tsv, NULL, str.s) < 0) error(\"Could not parse the manifest file: %s\\n\", str.s);\n        if (beadset_id == 0 && locus_entry.beadset_id == 0)\n            error(\"BeadSetID value 0 for probe %s is not allowed\\n\", locus_entry.ilmn_id);\n        if (locus_entry.source_seq) {\n            char *ptr = strchr(locus_entry.source_seq, '-');\n            if (ptr && *(ptr - 1) == '/') {\n                *ptr = *(ptr - 2);\n                *(ptr - 2) = '-';\n            }\n        }\n        locus_entry.assay_type_csv =\n            get_assay_type(locus_entry.allele_a_probe_seq, locus_entry.allele_b_probe_seq, locus_entry.source_seq);\n        if (locus_entry.index == 0) locus_entry.index = i + 1;\n        int idx = locus_entry.index - 1;\n        if (idx < 0 || idx >= bpm->num_loci) error(\"Locus entry index %d is out of boundaries\\n\", idx);\n        if (!bpm_available) {\n            memcpy(&bpm->locus_entries[idx], &locus_entry, sizeof(LocusEntry));\n        } else {\n            locus_merge(&bpm->locus_entries[idx], &locus_entry);\n            if (bpm->locus_entries[idx].assay_type != 0xff\n                && bpm->locus_entries[idx].assay_type != bpm->locus_entries[idx].assay_type_csv)\n                fprintf(stderr, \"Warning: Failed to retrieve assay type %d: %s %s %s\\n\",\n                        bpm->locus_entries[idx].assay_type, bpm->locus_entries[idx].allele_a_probe_seq,\n                        bpm->locus_entries[idx].allele_b_probe_seq, 
bpm->locus_entries[idx].source_seq);\n        }\n    }\n    tsv_destroy(tsv);\n\n    if (hts_getline(bpm->fp, KS_SEP_LINE, &str) <= 0) error(\"Error reading from file: %s\\n\", fn);\n    if (strncmp(str.s, \"[Controls]\", 10) != 0)\n        error(\n            \"Missing [Controls] section from manifest file: %s\\n\"\n            \"Found the following line instead: %s\\n\",\n            fn, str.s);\n    while (hts_getline(bpm->fp, KS_SEP_LINE, &str) > 0) kputc('\\n', &str);\n    free(bpm->control_config);\n    bpm->control_config = str.s;\n\n    if (make_dict && !bpm->names2index) {\n        bpm->names2index = khash_str2int_init();\n        for (i = 0; i < bpm->num_loci; i++) {\n            if (khash_str2int_has_key(bpm->names2index, bpm->locus_entries[i].name))\n                error(\"Illumina probe %s present multiple times in file %s\\n\", bpm->locus_entries[i].name, fn);\n            khash_str2int_inc(bpm->names2index, bpm->locus_entries[i].name);\n        }\n    }\n\n    if (norm_id == 0) {\n        free(bpm->norm_lookups);\n        bpm->norm_lookups = bpm_norm_lookups(bpm);\n    }\n\n    return bpm;\n}\n\n/****************************************\n * EGT FILE IMPLEMENTATION              *\n ****************************************/\n\n// http://github.com/broadinstitute/picard/blob/master/src/main/java/picard/arrays/illumina/InfiniumEGTFile.java\n// http://github.com/Illumina/BeadArrayFiles/blob/develop/module/ClusterFile.py\n\ntypedef struct {\n    int32_t N;        // Number of samples assigned to cluster during training\n    float r_dev;      // R (intensity) std deviation value\n    float r_mean;     // R (intensity) mean value\n    float theta_dev;  // Theta std deviation value\n    float theta_mean; // Theta mean value\n} ClusterStats;\n\ntypedef struct {\n    float cluster_separation; // A score measuring the separation between genotype clusters\n    float total_score;        // The GenTrain score\n    float original_score;     // The original score 
before editing this cluster\n    uint8_t edited;           // Whether this cluster has been manually manipulated\n} ClusterScore;\n\ntypedef struct {\n    ClusterStats aa_cluster_stats; // Describes AA genotype cluster\n    ClusterStats ab_cluster_stats; // Describes AB genotype cluster\n    ClusterStats bb_cluster_stats; // Describes BB genotype cluster\n    float intensity_threshold;     // Intensity threshold for no-call\n    ClusterScore cluster_score;    // Various scores for cluster\n    int32_t address;               // Bead type identifier for probe A\n    float r_mean;                  // precomputed clusters mean\n} ClusterRecord;\n\ntypedef struct {\n    char *fn;\n    hFILE *hfile;\n    int32_t version;\n    char *gencall_version;       // The GenCall version\n    char *cluster_version;       // The clustering algorithm version\n    char *call_version;          // The genotyping algorithm version\n    char *normalization_version; // The normalization algorithm version\n    char *date_created;          // The date the cluster file was created (e.g., 3/9/2017 2:18:30 PM)\n    uint8_t is_wgt;\n    int32_t data_block_version;\n    char *opa;\n    char *manifest_name; // The manifest name used to build this cluster file\n    int32_t num_records;\n    ClusterRecord *cluster_records;\n    char **names; // Names of records from manifest\n    void *names2index;\n} egt_t;\n\nstatic void clusterscore_read(ClusterScore *clusterscore, hFILE *hfile) {\n    read_bytes(hfile, (void *)&clusterscore->cluster_separation, sizeof(float));\n    read_bytes(hfile, (void *)&clusterscore->total_score, sizeof(float));\n    read_bytes(hfile, (void *)&clusterscore->original_score, sizeof(float));\n    read_bytes(hfile, (void *)&clusterscore->edited, sizeof(uint8_t));\n}\n\nstatic void clusterrecord_read(ClusterRecord *clusterrecord, hFILE *hfile, int32_t data_block_version) {\n    read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.N, sizeof(int32_t));\n    read_bytes(hfile, 
(void *)&clusterrecord->ab_cluster_stats.N, sizeof(int32_t));\n    read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.N, sizeof(int32_t));\n    read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.r_dev, sizeof(float));\n    read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.r_dev, sizeof(float));\n    read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.r_dev, sizeof(float));\n    read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.r_mean, sizeof(float));\n    read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.r_mean, sizeof(float));\n    read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.r_mean, sizeof(float));\n    read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.theta_dev, sizeof(float));\n    read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.theta_dev, sizeof(float));\n    read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.theta_dev, sizeof(float));\n    read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.theta_mean, sizeof(float));\n    read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.theta_mean, sizeof(float));\n    read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.theta_mean, sizeof(float));\n    if (data_block_version >= 7) {\n        read_bytes(hfile, (void *)&clusterrecord->intensity_threshold, sizeof(float));\n        read_bytes(hfile, NULL, 14 * sizeof(float));\n    } else {\n        clusterrecord->intensity_threshold = NAN;\n    }\n}\n\nstatic egt_t *egt_init(const char *fn, int eof_check) {\n    int i;\n    egt_t *egt = (egt_t *)calloc(1, sizeof(egt_t));\n    egt->fn = strdup(fn);\n    egt->hfile = hopen(egt->fn, \"rb\");\n    if (egt->hfile == NULL) error(\"Could not open %s: %s\\n\", egt->fn, strerror(errno));\n    if (is_gzip(egt->hfile)) error(\"File %s is gzip compressed and currently cannot be sought\\n\", egt->fn);\n\n    read_bytes(egt->hfile, (void *)&egt->version, sizeof(int32_t));\n    if (egt->version != 3) error(\"EGT cluster file 
version %d not supported\\n\", egt->version);\n\n    read_pfx_string(egt->hfile, &egt->gencall_version, NULL);\n    read_pfx_string(egt->hfile, &egt->cluster_version, NULL);\n    read_pfx_string(egt->hfile, &egt->call_version, NULL);\n    read_pfx_string(egt->hfile, &egt->normalization_version, NULL);\n    read_pfx_string(egt->hfile, &egt->date_created, NULL);\n\n    read_bytes(egt->hfile, (void *)&egt->is_wgt, sizeof(uint8_t));\n    if (egt->is_wgt != 1) error(\"Only WGT cluster file version supported\\n\");\n\n    read_pfx_string(egt->hfile, &egt->manifest_name, NULL);\n\n    read_bytes(egt->hfile, (void *)&egt->data_block_version, sizeof(int32_t));\n    if (egt->data_block_version < 5 || egt->data_block_version == 6 || egt->data_block_version > 9)\n        error(\"Data block version %d in cluster file not supported\\n\", egt->data_block_version);\n    read_pfx_string(egt->hfile, &egt->opa, NULL);\n\n    read_bytes(egt->hfile, (void *)&egt->num_records, sizeof(int32_t));\n    egt->cluster_records = (ClusterRecord *)malloc(egt->num_records * sizeof(ClusterRecord));\n    for (i = 0; i < egt->num_records; i++)\n        clusterrecord_read(&egt->cluster_records[i], egt->hfile, egt->data_block_version);\n    for (i = 0; i < egt->num_records; i++) clusterscore_read(&egt->cluster_records[i].cluster_score, egt->hfile);\n\n    // toss useless strings such as aa_ab_bb/aa_ab/aa_bb/ab_bb\n    for (i = 0; i < egt->num_records; i++) read_pfx_string(egt->hfile, NULL, NULL);\n\n    egt->names = (char **)malloc(egt->num_records * sizeof(char *));\n    egt->names2index = khash_str2int_init();\n    for (i = 0; i < egt->num_records; i++) {\n        read_pfx_string(egt->hfile, &egt->names[i], NULL);\n        if (khash_str2int_has_key(egt->names2index, egt->names[i]))\n            error(\"Illumina probe %s present multiple times in file %s\\n\", egt->names[i], fn);\n        khash_str2int_inc(egt->names2index, egt->names[i]);\n    }\n    for (i = 0; i < egt->num_records; i++)\n        
read_bytes(egt->hfile, (void *)&egt->cluster_records[i].address, sizeof(int32_t));\n\n    int32_t aa_n, ab_n, bb_n;\n    for (i = 0; i < egt->num_records; i++) {\n        read_bytes(egt->hfile, (void *)&aa_n, sizeof(int32_t));\n        read_bytes(egt->hfile, (void *)&ab_n, sizeof(int32_t));\n        read_bytes(egt->hfile, (void *)&bb_n, sizeof(int32_t));\n        if (egt->cluster_records[i].aa_cluster_stats.N != aa_n || egt->cluster_records[i].ab_cluster_stats.N != ab_n\n            || egt->cluster_records[i].bb_cluster_stats.N != bb_n)\n            error(\"Cluster counts don't match with EGT cluster file %s\\n\", egt->fn);\n    }\n\n    if (egt->data_block_version == 9) read_bytes(egt->hfile, NULL, egt->num_records * sizeof(float));\n    if (eof_check && !heof(egt->hfile))\n        error(\n            \"EGT reader did not reach the end of file %s at position %ld\\nUse --do-not-check-eof to suppress this \"\n            \"check\\n\",\n            egt->fn, htell(egt->hfile));\n\n    for (i = 0; i < egt->num_records; i++) {\n        ClusterStats *aa = &egt->cluster_records[i].aa_cluster_stats;\n        ClusterStats *ab = &egt->cluster_records[i].ab_cluster_stats;\n        ClusterStats *bb = &egt->cluster_records[i].bb_cluster_stats;\n        egt->cluster_records[i].r_mean =\n            (aa->N * aa->r_mean + ab->N * ab->r_mean + bb->N * bb->r_mean) / (aa->N + ab->N + bb->N);\n    }\n    return egt;\n}\n\nstatic void egt_destroy(egt_t *egt) {\n    if (!egt) return;\n    int i;\n    if (hclose(egt->hfile) < 0) error(\"Error closing EGT file %s\\n\", egt->fn);\n    free(egt->fn);\n    free(egt->gencall_version);\n    free(egt->cluster_version);\n    free(egt->call_version);\n    free(egt->normalization_version);\n    free(egt->date_created);\n    free(egt->opa);\n    free(egt->manifest_name);\n    free(egt->cluster_records);\n    for (i = 0; i < egt->num_records; i++) free(egt->names[i]);\n    free(egt->names);\n    khash_str2int_destroy(egt->names2index);\n    
free(egt);\n}\n\nstatic void egt_to_csv(const egt_t *egt, FILE *stream, int verbose) {\n    fprintf(stream, \"Illumina, Inc.\\n\");\n    fprintf(stream, \"[Heading]\\n\");\n    fprintf(stream, \"Descriptor File Name,%s\\n\", strrchr(egt->fn, '/') ? strrchr(egt->fn, '/') + 1 : egt->fn);\n    fprintf(stream, \"GenCall version,%s\\n\", egt->gencall_version);\n    fprintf(stream, \"Clustering algorithm version,%s\\n\", egt->cluster_version);\n    fprintf(stream, \"Genotyping algorithm version,%s\\n\", egt->call_version);\n    fprintf(stream, \"Normalization algorithm version,%s\\n\", egt->normalization_version);\n    fprintf(stream, \"Date Manufactured,%s\\n\", egt->date_created);\n    fprintf(stream, \"Manifest name used to build this cluster file,%s\\n\", egt->manifest_name);\n    fprintf(stream, \"OPA,%s\\n\", egt->opa ? egt->opa : \"\");\n    fprintf(stream, \"Loci Count,%d\\n\", egt->num_records);\n    fprintf(stream, \"[Assay]\\n\");\n    fprintf(stream,\n            \"Name,AA.N,AA.R_dev,AA.R_mean,AA.Theta_dev,AA.Theta_mean,AB.N,AB.R_dev,AB.R_mean,AB.\"\n            \"Theta_dev,AB.Theta_mean,BB.N,BB.R_dev,BB.R_mean,BB.Theta_dev,BB.Theta_mean,Intensity \"\n            \"Threshold,Cluster Separation,GenTrain Score,Original Score,Edited,Address\\n\");\n    if (verbose) {\n        int i;\n        for (i = 0; i < egt->num_records; i++) {\n            ClusterRecord *cluster_record = &egt->cluster_records[i];\n            fprintf(stream, \"%s,%d,%f,%f,%f,%f,%d,%f,%f,%f,%f,%d,%f,%f,%f,%f,%f,%f,%f,%f,%d,%d\\n\", egt->names[i],\n                    cluster_record->aa_cluster_stats.N, cluster_record->aa_cluster_stats.r_dev,\n                    cluster_record->aa_cluster_stats.r_mean, cluster_record->aa_cluster_stats.theta_dev,\n                    cluster_record->aa_cluster_stats.theta_mean, cluster_record->ab_cluster_stats.N,\n                    cluster_record->ab_cluster_stats.r_dev, cluster_record->ab_cluster_stats.r_mean,\n                    
cluster_record->ab_cluster_stats.theta_dev, cluster_record->ab_cluster_stats.theta_mean,\n                    cluster_record->bb_cluster_stats.N, cluster_record->bb_cluster_stats.r_dev,\n                    cluster_record->bb_cluster_stats.r_mean, cluster_record->bb_cluster_stats.theta_dev,\n                    cluster_record->bb_cluster_stats.theta_mean, cluster_record->intensity_threshold,\n                    cluster_record->cluster_score.cluster_separation, cluster_record->cluster_score.total_score,\n                    cluster_record->cluster_score.original_score, cluster_record->cluster_score.edited,\n                    cluster_record->address);\n        }\n    } else {\n        fprintf(stream, \"... use --verbose to visualize Assay data ...\\n\");\n    }\n}\n\n/****************************************\n * IDAT FILE IMPLEMENTATION             *\n ****************************************/\n\n// http://github.com/snewhouse/glu-genetics/blob/master/glu/lib/illumina.py\n// http://github.com/HenrikBengtsson/illuminaio/blob/master/R/readIDAT.R\n\n#define NUM_SNPS_READ 1000 // ID_N_CORES\n// #define ... 100 // ID_BACKGROUNDS - not used\n// #define ... 101 // ID_BACKGROUND_DEVS - not used\n#define ILLUMINA_ID 102 // ID_BEAD_TYPES\n#define SD 103          // ID_DEVS\n#define MEAN 104        // ID_MEANS\n// #define ... 105 // ID_MEDIANS - not used\n// #define ... 106 // ID_N_BEADS - not used\n#define NBEADS 107 // ID_N_GOOD_BEADS\n// #define ... 
108 // ID_TRIMMED_MEANS - not used\n#define MID_BLOCK 200         // ID_ILLUMICODES\n#define RUN_INFO 300          // ID_PROCESS_HISTORY\n#define RED_GREEN 400         // ID_TENTH_PERCENTILE\n#define IDAT_SNP_MANIFEST 401 // ID_SAMPLE_BEADSET\n#define SENTRIX_BARCODE 402   // ID_BARCODE\n#define CHIP_TYPE 403         // ID_SENTRIX_FORMAT\n#define SENTRIX_POSITION 404  // ID_SECTION_LABEL\n#define BEADSET 405           // ID_BEADSET\n#define IDAT_SAMPLE_NAME 406  // ID_DNA\n#define DESCRIPTION 407       // ID_OPA\n#define IDAT_SAMPLE_PLATE 408 // ID_DNA_PLATE\n#define IDAT_SAMPLE_WELL 409  // ID_WELL\n#define IDAT_SAMPLE_COUNT 410 // ID_SAMPLE_COUNT\n// #define ... 411 // ID_DX - not used\n#define IDAT_VLN 510 // ID_VLN\n\ntypedef struct {\n    const char *chip_type;\n    int num_snps;\n    int num_mid_blocks;\n    const char *chip_type_guess;\n} chip_type_t;\n\nstatic chip_type_t chip_types[] = {\n    {\"1-95um_multi-swath_for_4x5M\", 4568350, 4568350, \"HumanOmni5-4-v1-0\"},\n    {\"1-95um_multi-swath_for_4x5M\", 4640213, 4640213, \"HumanOmni5-4v1-1\"},\n    {\"1-95um_multi-swath_for_4x5M\", 4685673, 4685673, \"InfiniumOmni5-4v1-2\"},\n    {\"1-95um_multi-swath_for_4x5M\", 4696316, 4696316, \"HumanOmni5-4-v1-0\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2266191, 2266191, \"Multi-EthnicGlobal\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2266367, 2266367, \"Multi-EthnicGlobal\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2266404, 2266404, \"Multi-EthnicGlobal\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2266406, 2266406, \"Multi-EthnicGlobal\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2268676, 2268676, \"MEGAEx_BioVU_15075710\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2315574, 2315574, \"Multi-EthnicGlobal\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2389000, 2389000, \"CCPMBiobankMEGA2_20002558X345183\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2508689, 2508689, \"GDA-8v1-0\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2550870, 2550870, \"HumanOmni2.5-8v1\"},\n  
  {\"1-95um_multi-swath_for_8x2-5M\", 2563064, 2563064, \"HumanOmni25M-8v1-1\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2575219, 2575219, \"HumanOmni2.5-8v1\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2605775, 2605775, \"HumanOmni25M-8v1-1\"},\n    {\"BeadChip 12x1\", 55300, 55300, \"humanmethylation27_270596_v1-2 ???\"},\n    {\"BeadChip 12x1Q\", 191668, 191668, \"CanineHD\"},\n    {\"BeadChip 12x1Q\", 299260, 299260, \"HumanCytoSNP-12v2-1\"},\n    {\"BeadChip 12x8\", 301084, 301084, \"HumanCore-12v1-0\"},\n    {\"BeadChip 12x8\", 304138, 304138, \"HumanExome-12v1-1\"},\n    {\"BeadChip 12x8\", 567727, 567727, \"HumanCoreExome-12-v1-0\"},\n    {\"BeadChip 12x8\", 569060, 569060, \"HumanCoreExome-12-v1-0\"},\n    {\"BeadChip 12x8\", 573012, 573012, \"HumanCoreExome-12-v1-1\"},\n    {\"BeadChip 12x8\", 576769, 576769, \"HumanCoreExome-12-v1-1\"},\n    {\"BeadChip 12x8\", 622399, 622399, \"humanmethylation450_15017482_v-1-2 ???\"},\n    {\"BeadChip 12x8\", 722405, 722405, \"HumanOmniExpress-12-v1-1\"},\n    {\"BeadChip 12x8\", 734889, 734889, \"HumanOmniExpress-12-v1-0\"},\n    {\"BeadChip 12x8\", 736136, 736136, \"HumanOmniExpress-12-v1-0\"},\n    {\"BeadChip 1x12\", 577085, 8627, \"HumanHap550v3\"},\n    {\"BeadChip 1x12\", 661182, 49163, \"HumanHap650Yv3\"},\n    {\"BeadChip 1x40\", 1129736, 57373, \"Human1Mv1\"},\n    {\"BeadChip 1x40 66\", 1078890, 52497, \"Human1Mv1\"},\n    {\"BeadChip 24x1x4\", 306776, 306776, \"InfiniumCore-24v1-2\"},\n    {\"BeadChip 24x1x4\", 527136, 527136, \"OncoArray-500K\"},\n    {\"BeadChip 24x1x4\", 577781, 577781, \"HumanCoreExome-24v1-0\"},\n    {\"BeadChip 24x1x4\", 581261, 581261, \"HumanCoreExome-24v1-2\"},\n    {\"BeadChip 24x1x4\", 582684, 582684, \"HumanCoreExome-24v1-1\"},\n    {\"BeadChip 24x1x4\", 611866, 611866, \"HumanCoreExome-24v1-4\"},\n    {\"BeadChip 24x1x4\", 623302, 623302, \"PsychChip_15048346\"},\n    {\"BeadChip 24x1x4\", 623513, 623513, \"InfiniumPsychArray-24v1-1\"},\n    {\"BeadChip 24x1x4\", 638714, 
638714, \"PsychChip_v1-1_15073391\"},\n    {\"BeadChip 24x1x4\", 647864, 647864, \"InfiniumPsychArray-24v1-3\"},\n    {\"BeadChip 24x1x4\", 663209, 663209, \"GSA-24v1-0\"},\n    {\"BeadChip 24x1x4\", 704215, 704215, \"GSA-24v3-0\"},\n    {\"BeadChip 24x1x4\", 708013, 708013, \"DeCodeGenetics_V1_20012591\"},\n    {\"BeadChip 24x1x4\", 710576, 710576, \"GSAMD-24v1-0_20011747\"},\n    {\"BeadChip 24x1x4\", 710606, 710606, \"GSAMD-24v1-0_20011747\"},\n    {\"BeadChip 24x1x4\", 710608, 710608, \"GSAMD-24v1-0_20011747\"},\n    {\"BeadChip 24x1x4\", 715653, 715653, \"HumanOmniExpress-24v1-1\"},\n    {\"BeadChip 24x1x4\", 716279, 716279, \"InfiniumOmniExpress-24v1-2\"},\n    {\"BeadChip 24x1x4\", 718963, 718963, \"HumanOmniExpress-24-v1-0\"},\n    {\"BeadChip 24x1x4\", 719234, 719234, \"HumanOmniExpress-24-v1-0\"},\n    {\"BeadChip 24x1x4\", 729110, 729110, \"ASA-24v1-0\"},\n    {\"BeadChip 24x1x4\", 733354, 733354, \"GSA-24v2-0\"},\n    {\"BeadChip 24x1x4\", 749019, 749019, \"DeCodeGenetics_V3_20032937X331991\"},\n    {\"BeadChip 24x1x4\", 751614, 751614, \"GSAMD-24v3-0-EA_20034606\"},\n    {\"BeadChip 24x1x4\", 766804, 766804, \"JSA-24v1-0\"},\n    {\"BeadChip 24x1x4\", 776509, 776509, \"ASA-24v1-0\"},\n    {\"BeadChip 24x1x4\", 780343, 780343, \"GSAMD-24v2-0_20024620\"},\n    {\"BeadChip 24x1x4\", 780509, 780509, \"GSAMD-24v2-0_20024620\"},\n    {\"BeadChip 24x1x4\", 818205, 818205, \"GSA-24v2-0\"},\n    {\"BeadChip 2x10\", 321354, 37161, \"HumanHap300v2\"},\n    {\"BeadChip 2x12\", 381079, 29275, \"HumanCNV370v1\"},\n    {\"BeadChip 2x20\", 561686, 54936, \"HumanHap550v3\"},\n    {\"BeadChip 2x6Q\", 1224000, 180026, \"Human1M-Duov3\"},\n    {\"BeadChip 2x6Q\", 1224629, 180026, \"Human1M-Duov3\"},\n    {\"BeadChip 48x4\", 730546, 730546, \"GSA-MD-48v4-0_20098041\"},\n    {\"BeadChip 4x10\", 2623923, 1300482, \"HumanOmni2.5-4v1\"},\n    {\"BeadChip 4x10\", 2623923, 1323441, \"HumanOmni2.5-4v1\"},\n    {\"BeadChip 4x10\", 2624666, 1300941, \"HumanOmni2.5-4v1\"},\n    
{\"BeadChip 4x10\", 2624666, 1323725, \"HumanOmni2.5-4v1\"},\n    {\"BeadChip 4x10\", 2624671, 1323726, \"HumanOmni2.5-4v1\"},\n    {\"BeadChip 4x10\", 2655594, 1354653, \"HumanOmni2.5-4v1\"},\n    {\"BeadChip 4X1X14\", 1186430, 1186430, \"HumanOmni1-Quad_v1-0\"},\n    {\"BeadChip 4x2Q\", 376216, 186490, \"HumanCNV370-Quadv3\"},\n    {\"BeadChip 4x3Q\", 626122, 208778, \"Human610-Quadv1\"},\n    {\"BeadChip 4x3Q\", 667447, 208778, \"Human660W-Quad_v1\"},\n    {\"BeadChip 8x5\", 1052641, 1052641, \"infinium-methylationepic-v-1-0 ???\"},\n    {\"BeadChip 8x5\", 867478, 867478, \"CytoSNP-850K\"},\n    {\"BeadChip 8x5\", 988240, 988240, \"HumanOmniExpressExome-8-v1-1\"},\n    {\"BeadChip 8x5\", 989536, 989536, \"HumanOmniExpressExome-8-v1-1\"},\n    {\"BeadChip 8x5\", 992824, 992824, \"HumanOmniExpressExome-8-v1-4\"},\n    {\"BeadChip 8x5\", 996003, 996003, \"HumanOmniExpressExome-8-v1-2\"},\n    {\"BeadChip 8x5\", 996055, 996055, \"HumanOmniExpressExome-8-v1-2\"},\n    {\"SLIDE.15028542.24x1x3\", 307984, 307984, \"HumanCore-24v1-0\"},\n    {\"SLIDE.15028542.24x1x3\", 311460, 311460, \"HumanCore-24v1-0\"},\n    {NULL, 0, 0, NULL}};\n\ntypedef struct {\n    char *run_time;\n    char *block_type;\n    char *block_pars;\n    char *block_code;\n    char *code_version;\n} RunInfo;\n\ntypedef struct {\n    char *fn;\n    hFILE *hfile;\n    int64_t version;\n    int32_t number_toc_entries;\n    uint16_t *id;\n    int64_t *toc;\n    int32_t num_snps;\n    int32_t num_mid_blocks;\n    int32_t *ilmn_id;\n    uint16_t *sd;\n    uint16_t *mean;\n    uint8_t *nbeads;\n    const uint16_t *trimmed_mean; // only used for historical purposes\n    uint8_t *mid_block;\n    uint8_t red_green[4];\n    char *snp_manifest;\n    char *sentrix_barcode;\n    char *chip_type;\n    char *sentrix_position;\n    char *beadset;\n    char *sample_name;\n    char *description;\n    char *sample_plate;\n    char *sample_well;\n    int32_t sample_count;\n    char *vln;\n    RunInfo *run_infos;\n    
int32_t m_run_infos;\n    const char *chip_type_guess;\n    const char *imaging_date;\n    const char *scanner_data;\n    void *ilmn_id2index;\n} idat_t;\n\nKHASH_MAP_INIT_INT(32, int32_t)\n\nstatic int idat_read(idat_t *idat, uint16_t id) {\n    int i;\n    for (i = 0; i < idat->number_toc_entries && id != idat->id[i]; i++);\n    if (i == idat->number_toc_entries) return -1;\n    if (hseek(idat->hfile, idat->toc[i], SEEK_SET) < 0)\n        error(\"Fail to seek to position %ld in IDAT %s file\\n\", idat->toc[i], idat->fn);\n\n    switch (id) {\n    case NUM_SNPS_READ:\n        read_bytes(idat->hfile, (void *)&idat->num_snps, sizeof(int32_t));\n        break;\n    case ILLUMINA_ID:\n        idat->ilmn_id = (int32_t *)malloc(idat->num_snps * sizeof(int32_t));\n        read_bytes(idat->hfile, (void *)idat->ilmn_id, idat->num_snps * sizeof(int32_t));\n        int ret;\n        idat->ilmn_id2index = kh_init(32);\n        khash_t(32) *hash = (khash_t(32) *)idat->ilmn_id2index;\n        for (i = 0; i < idat->num_snps; i++) {\n            khiter_t k = kh_put(32, hash, idat->ilmn_id[i], &ret);\n            if (ret < 0) error(\"Unable to insert Illumina ID %d in hash table\\n\", idat->ilmn_id[i]);\n            if (ret > 0)\n                kh_val(hash, k) = kh_size(hash) - 1;\n            else\n                error(\"Duplicate Illumina ID %d in hash table\\n\", idat->ilmn_id[i]);\n        }\n        break;\n    case SD:\n        idat->sd = (uint16_t *)malloc(idat->num_snps * sizeof(uint16_t));\n        read_bytes(idat->hfile, (void *)idat->sd, idat->num_snps * sizeof(uint16_t));\n        break;\n    case MEAN:\n        idat->mean = (uint16_t *)malloc(idat->num_snps * sizeof(uint16_t));\n        read_bytes(idat->hfile, (void *)idat->mean, idat->num_snps * sizeof(uint16_t));\n        idat->trimmed_mean = idat->mean;\n        break;\n    case NBEADS:\n        idat->nbeads = (uint8_t *)malloc(idat->num_snps * sizeof(uint8_t));\n        read_bytes(idat->hfile, (void 
*)idat->nbeads, idat->num_snps * sizeof(uint8_t));\n        break;\n    case MID_BLOCK:\n        read_bytes(idat->hfile, (void *)&idat->num_mid_blocks, sizeof(int32_t));\n        idat->mid_block = (uint8_t *)malloc(idat->num_mid_blocks * sizeof(uint8_t));\n        read_bytes(idat->hfile, (void *)idat->mid_block, idat->num_mid_blocks * sizeof(uint8_t));\n        break;\n    case RED_GREEN:\n        read_bytes(idat->hfile, (void *)&idat->red_green, 4 * sizeof(uint8_t));\n        break;\n    case IDAT_SNP_MANIFEST:\n        read_pfx_string(idat->hfile, &idat->snp_manifest, NULL);\n        break;\n    case SENTRIX_BARCODE:\n        read_pfx_string(idat->hfile, &idat->sentrix_barcode, NULL);\n        break;\n    case CHIP_TYPE:\n        read_pfx_string(idat->hfile, &idat->chip_type, NULL);\n        break;\n    case SENTRIX_POSITION:\n        read_pfx_string(idat->hfile, &idat->sentrix_position, NULL);\n        break;\n    case BEADSET:\n        read_pfx_string(idat->hfile, &idat->beadset, NULL);\n        break;\n    case IDAT_SAMPLE_NAME:\n        read_pfx_string(idat->hfile, &idat->sample_name, NULL);\n        break;\n    case DESCRIPTION:\n        read_pfx_string(idat->hfile, &idat->description, NULL);\n        break;\n    case IDAT_SAMPLE_PLATE:\n        read_pfx_string(idat->hfile, &idat->sample_plate, NULL);\n        break;\n    case IDAT_SAMPLE_WELL:\n        read_pfx_string(idat->hfile, &idat->sample_well, NULL);\n        break;\n    case IDAT_SAMPLE_COUNT:\n        read_bytes(idat->hfile, (void *)&idat->sample_count, sizeof(int32_t));\n        break;\n    case IDAT_VLN:\n        read_pfx_string(idat->hfile, &idat->vln, NULL);\n        break;\n    case RUN_INFO:\n        read_bytes(idat->hfile, (void *)&idat->m_run_infos, sizeof(int32_t));\n        idat->run_infos = (RunInfo *)calloc(idat->m_run_infos, sizeof(RunInfo));\n        for (i = 0; i < idat->m_run_infos; i++) {\n            read_pfx_string(idat->hfile, &idat->run_infos[i].run_time, NULL);\n            
read_pfx_string(idat->hfile, &idat->run_infos[i].block_type, NULL);\n            read_pfx_string(idat->hfile, &idat->run_infos[i].block_pars, NULL);\n            read_pfx_string(idat->hfile, &idat->run_infos[i].block_code, NULL);\n            read_pfx_string(idat->hfile, &idat->run_infos[i].code_version, NULL);\n        }\n        break;\n    default:\n        error(\"IDAT file format does not support TOC entry %d\\n\", id);\n        break;\n    }\n    return 0;\n}\n\nstatic idat_t *idat_init(const char *fn, int load_arrays) {\n    idat_t *idat = (idat_t *)calloc(1, sizeof(idat_t));\n    idat->fn = strdup(fn);\n    idat->hfile = hopen(idat->fn, \"rb\");\n    if (idat->hfile == NULL) error(\"Could not open %s: %s\\n\", idat->fn, strerror(errno));\n    if (is_gzip(idat->hfile)) error(\"File %s is gzip compressed and currently cannot be sought\\n\", idat->fn);\n\n    int i;\n    uint8_t buffer[4];\n    if (hread(idat->hfile, (void *)buffer, 4) < 4) error(\"Failed to read magic number from %s file\\n\", idat->fn);\n    if (memcmp(buffer, \"IDAT\", 4) != 0) error(\"IDAT file %s format identifier is bad\\n\", idat->fn);\n\n    read_bytes(idat->hfile, (void *)&idat->version, sizeof(int64_t));\n    if (idat->version < 3)\n        error(\"Cannot read IDAT file %s. 
Unsupported IDAT file format version: %ld\\n\", idat->fn, idat->version);\n\n    read_bytes(idat->hfile, (void *)&idat->number_toc_entries, sizeof(int32_t));\n    idat->id = (uint16_t *)malloc(idat->number_toc_entries * sizeof(uint16_t));\n    idat->toc = (int64_t *)malloc(idat->number_toc_entries * sizeof(int64_t));\n    for (i = 0; i < idat->number_toc_entries; i++) {\n        read_bytes(idat->hfile, (void *)&idat->id[i], sizeof(uint16_t));\n        read_bytes(idat->hfile, (void *)&idat->toc[i], sizeof(int64_t));\n    }\n\n    for (i = 0; i < idat->number_toc_entries; i++) {\n        if (!load_arrays && idat->id[i] <= MID_BLOCK) {\n            if (idat->id[i] == MID_BLOCK) {\n                if (hseek(idat->hfile, idat->toc[i], SEEK_SET) < 0)\n                    error(\"Fail to seek to position %ld in IDAT %s file\\n\", idat->toc[i], idat->fn);\n                read_bytes(idat->hfile, (void *)&idat->num_mid_blocks, sizeof(int32_t));\n            }\n            continue;\n        }\n        idat_read(idat, idat->id[i]);\n    }\n\n    if (idat->chip_type) {\n        const chip_type_t *ptr;\n        for (ptr = chip_types; ptr->chip_type; ptr++) {\n            if (strcmp(idat->chip_type, ptr->chip_type) == 0 && ptr->num_snps == idat->num_snps\n                && ptr->num_mid_blocks == idat->num_mid_blocks)\n                idat->chip_type_guess = ptr->chip_type_guess;\n        }\n    }\n\n    for (i = 0; i < idat->m_run_infos; i++) {\n        if (strcmp(idat->run_infos[i].block_type, \"Scan\") != 0) continue;\n        idat->imaging_date = idat->run_infos[i].run_time;\n        idat->scanner_data = idat->run_infos[i].block_pars;\n    }\n\n    return idat;\n}\n\nstatic void idat_destroy(idat_t *idat) {\n    if (!idat) return;\n    if (hclose(idat->hfile) < 0) error(\"Error closing IDAT file %s\\n\", idat->fn);\n    free(idat->fn);\n    free(idat->id);\n    free(idat->toc);\n    free(idat->snp_manifest);\n    free(idat->sentrix_barcode);\n    free(idat->chip_type);\n    
free(idat->sentrix_position);\n    free(idat->beadset);\n    free(idat->sample_name);\n    free(idat->description);\n    free(idat->sample_plate);\n    free(idat->sample_well);\n    free(idat->vln);\n    int i;\n    for (i = 0; i < idat->m_run_infos; i++) {\n        free(idat->run_infos[i].run_time);\n        free(idat->run_infos[i].block_type);\n        free(idat->run_infos[i].block_pars);\n        free(idat->run_infos[i].block_code);\n        free(idat->run_infos[i].code_version);\n    }\n    free(idat->run_infos);\n    free(idat->ilmn_id);\n    free(idat->sd);\n    free(idat->mean);\n    free(idat->nbeads);\n    free(idat->mid_block);\n    if (idat->ilmn_id2index) kh_destroy(32, idat->ilmn_id2index);\n    free(idat);\n}\n\nstatic void idat_to_csv(const idat_t *idat, FILE *stream, int verbose) {\n    int i;\n    fprintf(stream, \"Illumina, Inc.\\n\");\n    fprintf(stream, \"[Heading]\\n\");\n    fprintf(stream, \"Descriptor File Name,%s\\n\", strrchr(idat->fn, '/') ? strrchr(idat->fn, '/') + 1 : idat->fn);\n    fprintf(stream, \"IDAT file version,%ld\\n\", idat->version);\n    fprintf(stream, \"Number of TOC entries,%d\\n\", idat->number_toc_entries);\n    fprintf(stream, \"Probes Count,%d\\n\", idat->num_snps);\n    fprintf(stream, \"Mid Blocks Count,%d\\n\", idat->num_mid_blocks);\n    fprintf(stream, \"Red Green,%02x %02x %02x %02x\\n\", idat->red_green[0], idat->red_green[1], idat->red_green[2],\n            idat->red_green[3]);\n    fprintf(stream, \"SNP Manifest,%s\\n\", idat->snp_manifest ? idat->snp_manifest : \"\");\n    fprintf(stream, \"Sentrix Barcode,%s\\n\", idat->sentrix_barcode);\n    fprintf(stream, \"Chip Type,%s\\n\", idat->chip_type);\n    fprintf(stream, \"Sentrix Position,%s\\n\", idat->sentrix_position);\n    fprintf(stream, \"BeadSet,%s\\n\", idat->beadset ? idat->beadset : \"\");\n    fprintf(stream, \"Sample Name,%s\\n\", idat->sample_name ? idat->sample_name : \"\");\n    fprintf(stream, \"Description,%s\\n\", idat->description ? 
idat->description : \"\");\n    fprintf(stream, \"Sample Plate,%s\\n\", idat->sample_plate ? idat->sample_plate : \"\");\n    fprintf(stream, \"Sample Well,%s\\n\", idat->sample_well ? idat->sample_well : \"\");\n    fprintf(stream, \"Sample Count,%d\\n\", idat->sample_count);\n    fprintf(stream, \"Vln,%s\\n\", idat->vln ? idat->vln : \"\");\n    fprintf(stream, \"Chip Prefix (Guess),%s\\n\", idat->chip_type_guess ? idat->chip_type_guess : \"Unknown\");\n    fprintf(stream, \"[Assay]\\n\");\n    fprintf(stream, \"IlmnID,Sd,Mean,Nbeads\\n\");\n    if (verbose) {\n        for (i = 0; i < idat->num_snps; i++)\n            fprintf(stream, \"%d,%d,%d,%d\\n\", idat->ilmn_id[i], idat->sd[i], idat->mean[i], idat->nbeads[i]);\n        fprintf(stream, \"[Mid Blocks]\\n\");\n        for (i = 0; i < idat->num_mid_blocks; i++) fprintf(stream, \"%d\\n\", idat->mid_block[i]);\n    } else {\n        fprintf(stream, \"... use --verbose to visualize Assay data ...\\n\");\n        fprintf(stream, \"[Mid Blocks]\\n\");\n        fprintf(stream, \"... 
use --verbose to visualize Mid Blocks data ...\\n\");\n    }\n    fprintf(stream, \"[Run Infos]\\n\");\n    for (i = 0; i < idat->m_run_infos; i++) {\n        fprintf(stream, \"%s\\t%s\\t%s\\t%s\\t%s\\n\", idat->run_infos[i].run_time, idat->run_infos[i].block_type,\n                idat->run_infos[i].block_pars, idat->run_infos[i].block_code, idat->run_infos[i].code_version);\n    }\n}\n\nstatic void idats_to_tsv(idat_t **idats, int n, FILE *stream) {\n    fprintf(stream,\n            \"idat\\tnumber_probes\\tnumber_mid_blocks\\tred_green\\tmanifest_file\\tsentrix_\"\n            \"barcode\\tchip_type\\t\"\n            \"sentrix_position\\tbeadset\\tsample_name\\tdescription\\tsample_plate\\tsample_\"\n            \"well\\tsample_count\\tvln\\t\"\n            \"chip_type_guess\\tscan_date\\tscanner_data\\n\");\n    int i;\n    for (i = 0; i < n; i++) {\n        idat_t *idat = idats[i];\n        fprintf(stream,\n                \"%s\\t%d\\t%d\\t%02x %02x %02x \"\n                \"%02x\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%d\\t%s\\t%s\\t%s\\t%s\\n\",\n                strrchr(idat->fn, '/') ? strrchr(idat->fn, '/') + 1 : idat->fn, idat->num_snps, idat->num_mid_blocks,\n                idat->red_green[0], idat->red_green[1], idat->red_green[2], idat->red_green[3],\n                idat->snp_manifest ? idat->snp_manifest : \"\", idat->sentrix_barcode, idat->chip_type,\n                idat->sentrix_position, idat->beadset ? idat->beadset : \"\", idat->sample_name ? idat->sample_name : \"\",\n                idat->description ? idat->description : \"\", idat->sample_plate ? idat->sample_plate : \"\",\n                idat->sample_well ? idat->sample_well : \"\", idat->sample_count, idat->vln ? idat->vln : \"\",\n                idat->chip_type_guess ? idat->chip_type_guess : \"Unknown\", idat->imaging_date ? idat->imaging_date : \"\",\n                idat->scanner_data ? 
idat->scanner_data : \"\");\n    }\n}\n\n/****************************************\n * GTC FILE IMPLEMENTATION              *\n ****************************************/\n\n// http://github.com/broadinstitute/picard/blob/master/src/main/java/picard/arrays/illumina/InfiniumGTCFile.java\n// http://github.com/Illumina/BeadArrayFiles/blob/develop/docs/GTC_File_Format_v5.pdf\n// http://github.com/Illumina/BeadArrayFiles/blob/develop/module/GenotypeCalls.py\n\n#define NUM_SNPS 1\n#define PLOIDY 2      // AutoConvert 2.0\n#define PLOIDY_TYPE 3 // AutoConvert 2.0\n#define GTC_SAMPLE_NAME 10\n#define GTC_SAMPLE_PLATE 11\n#define GTC_SAMPLE_WELL 12\n#define CLUSTER_FILE 100\n#define GTC_SNP_MANIFEST 101\n#define IMAGING_DATE 200\n#define AUTOCALL_DATE 201\n#define AUTOCALL_VERSION 300\n#define NORMALIZATION_TRANSFORMS 400\n#define CONTROLS_X 500\n#define CONTROLS_Y 501\n#define RAW_X 1000\n#define RAW_Y 1001\n#define GENOTYPES 1002\n#define BASE_CALLS 1003\n#define GENOTYPE_SCORES 1004\n#define SCANNER_DATA 1005\n#define CALL_RATE 1006\n#define GENDER 1007\n#define LOGR_DEV 1008\n#define GC10 1009\n#define DX 1010\n#define SAMPLE_DATA 1011\n#define B_ALLELE_FREQS 1012   // AutoConvert 2.0\n#define LOGR_RATIOS 1013      // AutoConvert 2.0\n#define PERCENTILES_X 1014    // AutoConvert 2.0\n#define PERCENTILES_Y 1015    // AutoConvert 2.0\n#define SLIDE_IDENTIFIER 1016 // AutoConvert 2.0\n\nstatic const char *code2genotype[] = {\n    \"NC\",       \"AA\",       \"AB\",       \"BB\",       \"NULL\",     \"A\",        \"B\",        \"AAA\",\n    \"AAB\",      \"ABB\",      \"BBB\",      \"AAAA\",     \"AAAB\",     \"AABB\",     \"ABBB\",     \"BBBB\",\n    \"AAAAA\",    \"AAAAB\",    \"AAABB\",    \"AABBB\",    \"ABBBB\",    \"BBBBB\",    \"AAAAAA\",   \"AAAAAB\",\n    \"AAAABB\",   \"AAABBB\",   \"AABBBB\",   \"ABBBBB\",   \"BBBBBB\",   \"AAAAAAA\",  \"AAAAAAB\",  \"AAAAABB\",\n    \"AAAABBB\",  \"AAABBBB\",  \"AABBBBB\",  \"ABBBBBB\",  \"BBBBBBB\",  \"AAAAAAAA\", \"AAAAAAAB\", 
\"AAAAAABB\",\n    \"AAAAABBB\", \"AAAABBBB\", \"AAABBBBB\", \"AABBBBBB\", \"ABBBBBBB\", \"BBBBBBBB\"};\n\ntypedef struct {\n    int32_t version;\n    float offset_x;\n    float offset_y;\n    float scale_x;\n    float scale_y;\n    float shear;\n    float theta;\n    float cvx;\n    float cvy;\n    float nn12;\n    float rr12;\n    float taa;\n    float tbb;\n} XForm;\n\ntypedef char BaseCall[2];\n\ntypedef struct {\n    char *scanner_name;\n    int32_t pmt_green;\n    int32_t pmt_red;\n    char *scanner_version;\n    char *imaging_user;\n} ScannerData;\n\ntypedef struct {\n    float p50gc;\n    int32_t num_calls;\n    int32_t num_no_calls;\n    int32_t num_intensity_only;\n} SampleData;\n\ntypedef uint16_t Percentiles[3];\n\ntypedef struct {\n    char *fn;\n    hFILE *hfile;\n    int32_t version;\n    int32_t number_toc_entries;\n    uint16_t *id;\n    int32_t *toc;\n    int32_t num_snps;\n    int32_t ploidy;\n    int32_t ploidy_type;\n    char *sample_name;\n    char *sample_plate;\n    char *sample_well;\n    char *cluster_file;\n    char *snp_manifest;\n    char *imaging_date;\n    char *autocall_date;\n    char *autocall_version;\n    XForm *normalization_transforms;\n    size_t m_normalization_transforms;\n    uint16_t *controls_x;\n    size_t m_controls_x;\n    uint16_t *controls_y;\n    size_t m_controls_y;\n    ScannerData scanner_data;\n    float call_rate;\n    char gender;\n    float logr_dev;\n    float p10gc;\n    int32_t dx;\n    SampleData sample_data;\n    Percentiles percentiles_x;\n    Percentiles percentiles_y;\n    char *sentrix_id;\n\n    char *display_name;\n    float *sin_theta; // precomputed sine transforms\n    float *cos_theta; // precomputed cosine transforms\n\n    size_t capacity;\n    buffer_array_t *raw_x;\n    buffer_array_t *raw_y;\n    buffer_array_t *genotypes;\n    buffer_array_t *base_calls;\n    buffer_array_t *genotype_scores;\n    buffer_array_t *b_allele_freqs;\n    buffer_array_t *logr_ratios;\n} gtc_t;\n\nstatic int 
gtc_read(gtc_t *gtc, uint16_t id) {\n    int i;\n    for (i = 0; i < gtc->number_toc_entries && id != gtc->id[i]; i++);\n    if (i == gtc->number_toc_entries) return -1;\n    if (id != NUM_SNPS && id != PLOIDY && id != PLOIDY_TYPE) {\n        if (hseek(gtc->hfile, gtc->toc[i], SEEK_SET) < 0)\n            error(\"Fail to seek to position %d in GTC %s file \\n\", gtc->toc[i], gtc->fn);\n    }\n\n    switch (id) {\n    case NUM_SNPS:\n        gtc->num_snps = gtc->toc[i];\n        break;\n    case PLOIDY:\n        gtc->ploidy = gtc->toc[i];\n        break;\n    case PLOIDY_TYPE:\n        gtc->ploidy_type = gtc->toc[i];\n        break;\n    case GTC_SAMPLE_NAME:\n        read_pfx_string(gtc->hfile, &gtc->sample_name, NULL);\n        break;\n    case GTC_SAMPLE_PLATE:\n        read_pfx_string(gtc->hfile, &gtc->sample_plate, NULL);\n        break;\n    case GTC_SAMPLE_WELL:\n        read_pfx_string(gtc->hfile, &gtc->sample_well, NULL);\n        break;\n    case CLUSTER_FILE:\n        read_pfx_string(gtc->hfile, &gtc->cluster_file, NULL);\n        break;\n    case GTC_SNP_MANIFEST:\n        read_pfx_string(gtc->hfile, &gtc->snp_manifest, NULL);\n        break;\n    case IMAGING_DATE:\n        read_pfx_string(gtc->hfile, &gtc->imaging_date, NULL);\n        break;\n    case AUTOCALL_DATE:\n        read_pfx_string(gtc->hfile, &gtc->autocall_date, NULL);\n        break;\n    case AUTOCALL_VERSION:\n        read_pfx_string(gtc->hfile, &gtc->autocall_version, NULL);\n        break;\n    case NORMALIZATION_TRANSFORMS:\n        read_pfx_array(gtc->hfile, (void **)&gtc->normalization_transforms, &gtc->m_normalization_transforms,\n                       sizeof(XForm));\n        break;\n    case CONTROLS_X:\n        read_pfx_array(gtc->hfile, (void **)&gtc->controls_x, &gtc->m_controls_x, sizeof(uint16_t));\n        break;\n    case CONTROLS_Y:\n        read_pfx_array(gtc->hfile, (void **)&gtc->controls_y, &gtc->m_controls_y, sizeof(uint16_t));\n        break;\n    case RAW_X:\n      
  gtc->raw_x = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(uint16_t));\n        break;\n    case RAW_Y:\n        gtc->raw_y = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(uint16_t));\n        break;\n    case GENOTYPES:\n        gtc->genotypes = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(uint8_t));\n        break;\n    case BASE_CALLS:\n        gtc->base_calls = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(BaseCall));\n        break;\n    case GENOTYPE_SCORES:\n        gtc->genotype_scores = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(float));\n        break;\n    case SCANNER_DATA:\n        read_pfx_string(gtc->hfile, &gtc->scanner_data.scanner_name, NULL);\n        read_bytes(gtc->hfile, (void *)&gtc->scanner_data.pmt_green, sizeof(float));\n        read_bytes(gtc->hfile, (void *)&gtc->scanner_data.pmt_red, sizeof(float));\n        read_pfx_string(gtc->hfile, &gtc->scanner_data.scanner_version, NULL);\n        read_pfx_string(gtc->hfile, &gtc->scanner_data.imaging_user, NULL);\n        break;\n    case CALL_RATE:\n        read_bytes(gtc->hfile, (void *)&gtc->call_rate, sizeof(float));\n        break;\n    case GENDER:\n        read_bytes(gtc->hfile, (void *)&gtc->gender, sizeof(char));\n        break;\n    case LOGR_DEV:\n        read_bytes(gtc->hfile, (void *)&gtc->logr_dev, sizeof(float));\n        break;\n    case GC10:\n        read_bytes(gtc->hfile, (void *)&gtc->p10gc, sizeof(float));\n        break;\n    case DX:\n        read_bytes(gtc->hfile, (void *)&gtc->dx, sizeof(int32_t));\n        break;\n    case SAMPLE_DATA:\n        read_bytes(gtc->hfile, (void *)&gtc->sample_data, sizeof(SampleData));\n        break;\n    case B_ALLELE_FREQS:\n        gtc->b_allele_freqs = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(float));\n        break;\n    case LOGR_RATIOS:\n        gtc->logr_ratios = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(float));\n        break;\n    case PERCENTILES_X:\n        
read_bytes(gtc->hfile, (void *)&gtc->percentiles_x, sizeof(Percentiles));\n        break;\n    case PERCENTILES_Y:\n        read_bytes(gtc->hfile, (void *)&gtc->percentiles_y, sizeof(Percentiles));\n        break;\n    case SLIDE_IDENTIFIER:\n        read_pfx_string(gtc->hfile, &gtc->sentrix_id, NULL);\n        break;\n    default:\n        error(\"GTC file format does not support TOC entry %d\\n\", id);\n        break;\n    }\n    return 0;\n}\n\nstatic gtc_t *gtc_init(const char *fn, size_t capacity) {\n    gtc_t *gtc = (gtc_t *)calloc(1, sizeof(gtc_t));\n    gtc->fn = strdup(fn);\n    gtc->hfile = hopen(gtc->fn, \"rb\");\n    if (gtc->hfile == NULL) error(\"Could not open %s: %s\\n\", gtc->fn, strerror(errno));\n    if (is_gzip(gtc->hfile)) error(\"File %s is gzip compressed and currently cannot be sought\\n\", gtc->fn);\n\n    int i;\n    uint8_t buffer[4];\n    if (hread(gtc->hfile, (void *)buffer, 4) < 4) error(\"Failed to read magic number from %s file\\n\", gtc->fn);\n    if (memcmp(buffer, \"gtc\", 3) != 0) error(\"GTC file %s format identifier is bad\\n\", gtc->fn);\n    if (buffer[3] > 5 && buffer[3] < 3) error(\"GTC file %s version %d is unsupported\\n\", gtc->fn, buffer[3]);\n    gtc->version = (int32_t)buffer[3];\n\n    read_bytes(gtc->hfile, (void *)&gtc->number_toc_entries, sizeof(int32_t));\n    gtc->id = (uint16_t *)malloc(gtc->number_toc_entries * sizeof(uint16_t));\n    gtc->toc = (int32_t *)malloc(gtc->number_toc_entries * sizeof(int32_t));\n    for (i = 0; i < gtc->number_toc_entries; i++) {\n        read_bytes(gtc->hfile, (void *)&gtc->id[i], sizeof(uint16_t));\n        read_bytes(gtc->hfile, (void *)&gtc->toc[i], sizeof(int32_t));\n    }\n\n    gtc->capacity = capacity;\n    for (i = 0; i < gtc->number_toc_entries; i++) gtc_read(gtc, gtc->id[i]);\n\n    const char *ptr = strrchr(gtc->fn, '/') ? 
strrchr(gtc->fn, '/') + 1 : gtc->fn;\n    gtc->display_name = strndup(ptr, strlen(ptr) - 4);\n\n    gtc->sin_theta = (float *)malloc(gtc->m_normalization_transforms * sizeof(float));\n    gtc->cos_theta = (float *)malloc(gtc->m_normalization_transforms * sizeof(float));\n    for (i = 0; i < gtc->m_normalization_transforms; i++) {\n        gtc->sin_theta[i] = (float)sin((double)gtc->normalization_transforms[i].theta);\n        gtc->cos_theta[i] = (float)cos((double)gtc->normalization_transforms[i].theta);\n    }\n\n    return gtc;\n}\n\nstatic void gtc_destroy(gtc_t *gtc) {\n    if (!gtc) return;\n    if (hclose(gtc->hfile) < 0) error(\"Error closing GTC file %s\\n\", gtc->fn);\n    free(gtc->fn);\n    free(gtc->id);\n    free(gtc->toc);\n    free(gtc->sample_name);\n    free(gtc->sample_plate);\n    free(gtc->sample_well);\n    free(gtc->cluster_file);\n    free(gtc->snp_manifest);\n    free(gtc->imaging_date);\n    free(gtc->autocall_date);\n    free(gtc->autocall_version);\n    free(gtc->normalization_transforms);\n    free(gtc->controls_x);\n    free(gtc->controls_y);\n    free(gtc->scanner_data.scanner_name);\n    free(gtc->scanner_data.scanner_version);\n    free(gtc->scanner_data.imaging_user);\n    free(gtc->sentrix_id);\n\n    free(gtc->display_name);\n    free(gtc->sin_theta);\n    free(gtc->cos_theta);\n\n    buffer_array_destroy(gtc->raw_x);\n    buffer_array_destroy(gtc->raw_y);\n    buffer_array_destroy(gtc->genotypes);\n    buffer_array_destroy(gtc->base_calls);\n    buffer_array_destroy(gtc->genotype_scores);\n    buffer_array_destroy(gtc->b_allele_freqs);\n    buffer_array_destroy(gtc->logr_ratios);\n    free(gtc);\n}\n\nstatic void gtc_to_csv(const gtc_t *gtc, FILE *stream, int verbose) {\n    fprintf(stream, \"Illumina, Inc.\\n\");\n    fprintf(stream, \"[Heading]\\n\");\n    fprintf(stream, \"Descriptor File Name,%s\\n\", strrchr(gtc->fn, '/') ? 
strrchr(gtc->fn, '/') + 1 : gtc->fn);\n    fprintf(stream, \"GTC genotype file version,%d\\n\", gtc->version);\n    fprintf(stream, \"Number of TOC entries,%d\\n\", gtc->number_toc_entries);\n    fprintf(stream, \"Number of SNPs,%d\\n\", gtc->num_snps);\n    fprintf(stream, \"Ploidy,%d\\n\", gtc->ploidy);\n    fprintf(stream, \"Ploidy Type,%d\\n\", gtc->ploidy_type);\n    fprintf(stream, \"Sample name,%s\\n\", gtc->sample_name ? gtc->sample_name : \"\");\n    fprintf(stream, \"Sample plate,%s\\n\", gtc->sample_plate ? gtc->sample_plate : \"\");\n    fprintf(stream, \"Sample well,%s\\n\", gtc->sample_well ? gtc->sample_well : \"\");\n    fprintf(stream, \"Cluster file,%s\\n\", gtc->cluster_file ? gtc->cluster_file : \"\");\n    fprintf(stream, \"SNP manifest,%s\\n\", gtc->snp_manifest ? gtc->snp_manifest : \"\");\n    fprintf(stream, \"Imaging date,%s\\n\", gtc->imaging_date ? gtc->imaging_date : \"\");\n    fprintf(stream, \"AutoCall date,%s\\n\", gtc->autocall_date ? gtc->autocall_date : \"\");\n    fprintf(stream, \"AutoCall version,%s\\n\", gtc->autocall_version);\n    fprintf(stream, \"Number of normalization transforms,%ld\\n\", gtc->m_normalization_transforms);\n    fprintf(stream, \"Number of controls X,%ld\\n\", gtc->m_controls_x);\n    fprintf(stream, \"Number of controls Y,%ld\\n\", gtc->m_controls_y);\n    fprintf(stream, \"Name of the scanner,%s\\n\", gtc->scanner_data.scanner_name ? gtc->scanner_data.scanner_name : \"\");\n    fprintf(stream, \"Pmt Green,%d\\n\", gtc->scanner_data.pmt_green);\n    fprintf(stream, \"Pmt Red,%d\\n\", gtc->scanner_data.pmt_red);\n    fprintf(stream, \"Version of the scanner software used,%s\\n\",\n            gtc->scanner_data.scanner_version ? gtc->scanner_data.scanner_version : \"\");\n    fprintf(stream, \"Name of the scanner user,%s\\n\",\n            gtc->scanner_data.imaging_user ? 
gtc->scanner_data.imaging_user : \"\");\n    fprintf(stream, \"Call Rate,%f\\n\", gtc->call_rate);\n    fprintf(stream, \"Computed Gender,%c\\n\", gtc->gender);\n    fprintf(stream, \"LogR deviation,%f\\n\", gtc->logr_dev);\n    fprintf(stream, \"GenCall score - 10th percentile,%f\\n\", gtc->p10gc);\n    fprintf(stream, \"DX,%d\\n\", gtc->dx);\n    fprintf(stream, \"GenCall score - 50th percentile,%f\\n\", gtc->sample_data.p50gc);\n    fprintf(stream, \"Number of valid calls,%d\\n\", gtc->sample_data.num_calls);\n    fprintf(stream, \"Number of invalid calls,%d\\n\", gtc->sample_data.num_no_calls);\n    fprintf(stream, \"Number of loci that are \\\"Intensity Only\\\" or \\\"Zeroed\\\",%d\\n\",\n            gtc->sample_data.num_intensity_only);\n    fprintf(stream, \"P05 X,%d\\n\", gtc->percentiles_x[0]);\n    fprintf(stream, \"P50 X,%d\\n\", gtc->percentiles_x[1]);\n    fprintf(stream, \"P95 X,%d\\n\", gtc->percentiles_x[2]);\n    fprintf(stream, \"P05 Y,%d\\n\", gtc->percentiles_y[0]);\n    fprintf(stream, \"P50 Y,%d\\n\", gtc->percentiles_y[1]);\n    fprintf(stream, \"P95 Y,%d\\n\", gtc->percentiles_y[2]);\n    fprintf(stream, \"Sentrix identifier for the slide,%s\\n\", gtc->sentrix_id ? 
gtc->sentrix_id : \"\");\n    fprintf(stream, \"[Assay]\\n\");\n    fprintf(stream, \"Raw X,Raw Y,GType,Top Alleles,Score,B Allele Freq,Log R Ratio\\n\");\n    int i;\n    if (verbose) {\n        for (i = 0; i < gtc->num_snps; i++) {\n            uint16_t raw_x = 0, raw_y = 0;\n            get_element(gtc->raw_x, (void *)&raw_x, i);\n            get_element(gtc->raw_y, (void *)&raw_y, i);\n            uint8_t genotype = 0;\n            get_element(gtc->genotypes, (void *)&genotype, i);\n            BaseCall base_call = {'-', '-'};\n            get_element(gtc->base_calls, (void *)&base_call, i);\n            float genotype_score = NAN, b_allele_freq = NAN, logr_ratio = NAN;\n            get_element(gtc->genotype_scores, (void *)&genotype_score, i);\n            get_element(gtc->b_allele_freqs, (void *)&b_allele_freq, i);\n            get_element(gtc->logr_ratios, (void *)&logr_ratio, i);\n            fprintf(stream, \"%d,%d,%s,%c%c,%.10f,%.10f,%.10f\\n\", raw_x, raw_y, code2genotype[genotype], base_call[0],\n                    base_call[1], genotype_score, b_allele_freq, logr_ratio);\n        }\n    } else {\n        fprintf(stream, \"... 
use --verbose to visualize assay data ...\\n\");\n    }\n    fprintf(stream, \"[Normalization Transforms]\\n\");\n    fprintf(stream, \"Version,Offset X,Offset Y,Scale X,Scale Y,Shear,Theta,CVX,CVY,NN12,RR12,TAA,TBB\\n\");\n    if (verbose) {\n        for (i = 0; i < gtc->m_normalization_transforms; i++) {\n            fprintf(stream, \"%d,%.10f,%.10f,%.10f,%.10f,%.10f,%.10f,\", gtc->normalization_transforms[i].version,\n                    gtc->normalization_transforms[i].offset_x, gtc->normalization_transforms[i].offset_y,\n                    gtc->normalization_transforms[i].scale_x, gtc->normalization_transforms[i].scale_y,\n                    gtc->normalization_transforms[i].shear, gtc->normalization_transforms[i].theta);\n            fprintf(stream, \"%.10f,%.10f,%.10f,%.10f,%.10f,%.10f\\n\", gtc->normalization_transforms[i].cvx,\n                    gtc->normalization_transforms[i].cvy, gtc->normalization_transforms[i].nn12,\n                    gtc->normalization_transforms[i].rr12, gtc->normalization_transforms[i].taa,\n                    gtc->normalization_transforms[i].tbb);\n        }\n    } else {\n        fprintf(stream, \"... use --verbose to visualize assay data ...\\n\");\n    }\n    //    fprintf(stream, \"[Controls]\\n\");\n    //    fprintf(stream, \"Raw X,Raw Y\\n\");\n    //    if (verbose) {\n    //        for (i = 0; i < gtc->m_controls_x; i++)\n    //            fprintf(stream, \"%d,%d\\n\", gtc->controls_x[i], gtc->controls_y[i]);\n    //    } else {\n    //        fprintf(stream, \"... 
use --verbose to visualize controls data ...\\n\");\n    //    }\n}\n\nstatic void gtcs_to_tsv(gtc_t **gtcs, int n, FILE *stream) {\n    fprintf(stream,\n            \"gtc\\tnumber_snps\\tploidy\\tploidy_type\\tsample_name\\tsample_plate\\tsample_\"\n            \"well\\tcluster_file\\tsnp_manifest\\t\"\n            \"scan_date\\tautocall_date\\tautocall_version\\tnumber_normalization_\"\n            \"transforms\\tnumber_x_controls\\t\"\n            \"number_y_controls\\tscanner_name\\tpmt_green\\tpmt_red\\tscanner_software_\"\n            \"version\\tscanner_username\\tcall_rate\\t\"\n            \"computed_gender\\tlogr_deviation\\tgencall_score_10_percentile\\tdx\\tgencall_score_\"\n            \"50_percentile\\t\"\n            \"number_valid_calls\\tnumber_invalid_calls\\tnumber_intensity_only_or_zeroed_\"\n            \"loci\\tp05_x\\tp50_x\\tp95_x\\tp05_y\\t\"\n            \"p50_y\\tp95_y\\tsentrix_barcode\\n\");\n    int i;\n    for (i = 0; i < n; i++) {\n        gtc_t *gtc = gtcs[i];\n        fprintf(stream,\n                \"%s\\t%d\\t%d\\t%d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%ld\\t%ld\\t%ld\\t%s\\t%d\\t%d\\t%\"\n                \"s\\t%s\\t%f\\t%c\\t%f\\t%f\\t%d\\t%f\\t%d\\t%d\\t%d\\t%d\\t%d\\t%d\\t%d\\t%d\\t%d\\t%s\\n\",\n                strrchr(gtc->fn, '/') ? strrchr(gtc->fn, '/') + 1 : gtc->fn, gtc->num_snps, gtc->ploidy,\n                gtc->ploidy_type, gtc->sample_name ? gtc->sample_name : \"\", gtc->sample_plate ? gtc->sample_plate : \"\",\n                gtc->sample_well ? gtc->sample_well : \"\", gtc->cluster_file ? gtc->cluster_file : \"\",\n                gtc->snp_manifest ? gtc->snp_manifest : \"\", gtc->imaging_date ? gtc->imaging_date : \"\",\n                gtc->autocall_date ? gtc->autocall_date : \"\", gtc->autocall_version ? gtc->autocall_version : \"\",\n                gtc->m_normalization_transforms, gtc->m_controls_x, gtc->m_controls_y,\n                gtc->scanner_data.scanner_name ? 
gtc->scanner_data.scanner_name : \"\", gtc->scanner_data.pmt_green,\n                gtc->scanner_data.pmt_red, gtc->scanner_data.scanner_version ? gtc->scanner_data.scanner_version : \"\",\n                gtc->scanner_data.imaging_user ? gtc->scanner_data.imaging_user : \"\", gtc->call_rate, gtc->gender,\n                gtc->logr_dev, gtc->p10gc, gtc->dx, gtc->sample_data.p50gc, gtc->sample_data.num_calls,\n                gtc->sample_data.num_no_calls, gtc->sample_data.num_intensity_only, gtc->percentiles_x[0],\n                gtc->percentiles_x[1], gtc->percentiles_x[2], gtc->percentiles_y[0], gtc->percentiles_y[1],\n                gtc->percentiles_y[2], gtc->sentrix_id ? gtc->sentrix_id : \"\");\n    }\n}\n\n/****************************************\n * SAM FILE IMPLEMENTATION              *\n ****************************************/\n\nstatic bpm_t *sam_csv_init(const char *fn, bpm_t *bpm, const char *genome_build, int flags) {\n    htsFile *hts = hts_open(fn, \"r\");\n    if (hts == NULL || hts_get_format(hts)->category != sequence_data)\n        error(\"File %s does not contain sequence data\\n\", fn);\n    sam_hdr_t *sam_hdr = sam_hdr_read(hts);\n    if (sam_hdr == NULL) error(\"Reading header from \\\"%s\\\" failed\", fn);\n    bam1_t *b = bam_init1();\n    if (b == NULL) error(\"Cannot create SAM record\\n\");\n\n    kstring_t str = {0, 0, NULL};\n    const char *chromosome = NULL;\n    int i, strand = -1, position = 0, n_unmapped = 0;\n    for (i = 0; i < bpm->num_loci; i++) {\n        LocusEntry *locus_entry = &bpm->locus_entries[i];\n        int idx = get_position(hts, sam_hdr, b, locus_entry->ilmn_id, locus_entry->source_seq, 1, &chromosome,\n                               &position, &strand);\n        if (idx < 0) {\n            error(\"Reading from %s failed\", fn);\n        } else if (idx == 0) {\n            if (flags & VERBOSE) fprintf(stderr, \"Unable to determine position for marker %s\\n\", locus_entry->ilmn_id);\n            
n_unmapped++;\n        }\n        free(locus_entry->genome_build);\n        locus_entry->genome_build = strdup(genome_build);\n        free(locus_entry->chrom);\n        locus_entry->chrom = strdup(chromosome ? chromosome : \"0\");\n        free(locus_entry->map_info);\n        str.l = 0;\n        kputw(position, &str);\n        locus_entry->map_info = strdup(str.s);\n        free(locus_entry->ref_strand);\n        locus_entry->ref_strand =\n            ((strand < 0) || ((strcasecmp(locus_entry->ilmn_strand, locus_entry->source_strand) != 0) == strand))\n                ? strdup(\"+\")\n                : strdup(\"-\");\n    }\n    fprintf(stderr, \"Lines   total/unmapped:\\t%d/%d\\n\", bpm->num_loci, n_unmapped);\n    free(str.s);\n\n    bam_destroy1(b);\n    sam_hdr_destroy(sam_hdr);\n    if (hts_close(hts) < 0) error(\"closing \\\"%s\\\" failed\", fn);\n    return bpm;\n}\n\n/****************************************\n * INTENSITIES COMPUTATIONS             *\n ****************************************/\n\n// compute normalized intensities (http://github.com/Illumina/BeadArrayFiles/blob/develop/docs/GTC_File_Format_v5.pdf)\nstatic inline void raw_x_y2norm_x_y(uint16_t raw_x, uint16_t raw_y, float offset_x, float offset_y, float cos_theta,\n                                    float sin_theta, float shear, float scale_x, float scale_y, float *norm_x,\n                                    float *norm_y) {\n    float temp_x = (float)raw_x - offset_x;\n    float temp_y = (float)raw_y - offset_y;\n    float temp_x2 = cos_theta * temp_x + sin_theta * temp_y;\n    float temp_y2 = -sin_theta * temp_x + cos_theta * temp_y;\n    float temp_x3 = temp_x2 - shear * temp_y2;\n    *norm_x = temp_x3 < 0.0f ? 0.0f : temp_x3 / scale_x;\n    *norm_y = temp_y2 < 0.0f ? 
0.0f : temp_y2 / scale_y;\n}\n\n// compute Theta and R from raw intensities\nstatic inline void norm_x_y2ilmn_theta_r(float norm_x, float norm_y, float *ilmn_theta, float *ilmn_r) {\n    *ilmn_theta = (float)(atan2((double)norm_y, (double)norm_x) * M_2_PI);\n    *ilmn_r = norm_x + norm_y;\n}\n\nstatic void adjust_clusters(const uint8_t *gts, const float *ilmn_theta, const float *ilmn_r, int n,\n                            ClusterRecord *cluster_record) {\n    cluster_record->aa_cluster_stats.N = 0;\n    cluster_record->ab_cluster_stats.N = 0;\n    cluster_record->bb_cluster_stats.N = 0;\n    cluster_record->aa_cluster_stats.theta_mean *= 0.2f;\n    cluster_record->ab_cluster_stats.theta_mean *= 0.2f;\n    cluster_record->bb_cluster_stats.theta_mean *= 0.2f;\n    cluster_record->aa_cluster_stats.r_mean *= 0.2f;\n    cluster_record->ab_cluster_stats.r_mean *= 0.2f;\n    cluster_record->bb_cluster_stats.r_mean *= 0.2f;\n\n    int i;\n    for (i = 0; i < n; i++) {\n        switch (gts[i]) {\n        case GT_AA:\n            cluster_record->aa_cluster_stats.N++;\n            cluster_record->aa_cluster_stats.theta_mean += ilmn_theta[i];\n            cluster_record->aa_cluster_stats.r_mean += ilmn_r[i];\n            break;\n        case GT_AB:\n            cluster_record->ab_cluster_stats.N++;\n            cluster_record->ab_cluster_stats.theta_mean += ilmn_theta[i];\n            cluster_record->ab_cluster_stats.r_mean += ilmn_r[i];\n            break;\n        case GT_BB:\n            cluster_record->bb_cluster_stats.N++;\n            cluster_record->bb_cluster_stats.theta_mean += ilmn_theta[i];\n            cluster_record->bb_cluster_stats.r_mean += ilmn_r[i];\n            break;\n        default:\n            break;\n        }\n    }\n\n    cluster_record->aa_cluster_stats.theta_mean /= ((float)cluster_record->aa_cluster_stats.N + 0.2f);\n    cluster_record->ab_cluster_stats.theta_mean /= ((float)cluster_record->ab_cluster_stats.N + 0.2f);\n    
cluster_record->bb_cluster_stats.theta_mean /= ((float)cluster_record->bb_cluster_stats.N + 0.2f);\n    cluster_record->aa_cluster_stats.r_mean /= ((float)cluster_record->aa_cluster_stats.N + 0.2f);\n    cluster_record->ab_cluster_stats.r_mean /= ((float)cluster_record->ab_cluster_stats.N + 0.2f);\n    cluster_record->bb_cluster_stats.r_mean /= ((float)cluster_record->bb_cluster_stats.N + 0.2f);\n}\n\n/****************************************\n * CONVERSION UTILITIES                 *\n ****************************************/\n\nstatic inline char rev_allele(char allele) {\n    static const char allele_complement[128] = {\n        0, 0,   0, 0,   0,   0, 0, 0,   0, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n        0, 0,   0, 0,   0,   0, 0, 0,   0, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n        0, 'T', 0, 'G', 'D', 0, 0, 'C', 0, 'I', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'A', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n    };\n    if (allele > 95) return 0;\n    return allele_complement[(int)allele];\n}\n\nstatic void gtcs_to_gs(gtc_t **gtc, int n, const bpm_t *bpm, const egt_t *egt, FILE *stream, int flags) {\n    int i, j;\n\n    // print header\n    fputs(\"Index\\tName\\tAddress\\tChr\\tPosition\", stream);\n    if (flags & EGT_LOADED) fputs(\"\\tGenTrain Score\", stream);\n    if (flags & BPM_LOADED) fputs(\"\\tFrac A\\tFrac C\\tFrac G\\tFrac T\", stream);\n    for (i = 0; i < n; i++) {\n        if (flags & FORMAT_GT) fprintf(stream, \"\\t%s.GType\", gtc[i]->display_name);\n        if (flags & FORMAT_IGC) fprintf(stream, \"\\t%s.Score\", gtc[i]->display_name);\n        if ((flags & BPM_LOADED) && (flags & FORMAT_THETA)) fprintf(stream, \"\\t%s.Theta\", gtc[i]->display_name);\n        if ((flags & BPM_LOADED) && (flags & FORMAT_R)) fprintf(stream, \"\\t%s.R\", gtc[i]->display_name);\n        if (flags & FORMAT_BAF) fprintf(stream, \"\\t%s.B Allele Freq\", gtc[i]->display_name);\n        if (flags & FORMAT_LRR) 
fprintf(stream, \"\\t%s.Log R Ratio\", gtc[i]->display_name);\n        if (flags & FORMAT_X) fprintf(stream, \"\\t%s.X Raw\", gtc[i]->display_name);\n        if (flags & FORMAT_Y) fprintf(stream, \"\\t%s.Y Raw\", gtc[i]->display_name);\n        if ((flags & BPM_LOADED) && (flags & FORMAT_NORMX)) fprintf(stream, \"\\t%s.X\", gtc[i]->display_name);\n        if ((flags & BPM_LOADED) && (flags & FORMAT_NORMY)) fprintf(stream, \"\\t%s.Y\", gtc[i]->display_name);\n        if (flags & FORMAT_GT)\n            fprintf(stream, \"\\t%s.Top Alleles\\t%s.Plus/Minus Alleles\", gtc[i]->display_name, gtc[i]->display_name);\n    }\n    fputc('\\n', stream);\n\n    // print loci\n    for (j = 0; j < bpm->num_loci; j++) {\n        LocusEntry *locus_entry = &bpm->locus_entries[j];\n        int norm_id = locus_entry && bpm->norm_lookups && bpm->locus_entries[j].norm_id != 0xFF\n                          ? bpm->norm_lookups[bpm->locus_entries[j].norm_id]\n                          : -1;\n        ClusterRecord *cluster_record = NULL;\n        if (flags & EGT_LOADED) {\n            int idx;\n            int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx);\n            if (ret < 0) error(\"Illumina probe %s not found in cluster file\\n\", locus_entry->name);\n            cluster_record = &egt->cluster_records[idx];\n        }\n        int strand = !locus_entry->ref_strand ? -1\n                                              : (strcmp(locus_entry->ref_strand, \"+\") == 0\n                                                     ? 0\n                                                     : (strcmp(locus_entry->ref_strand, \"-\") == 0 ? 1 : -1));\n        if (strand < 0) error(\"Unable to process reference strand %s\\n\", locus_entry->ref_strand);\n        fprintf(stream, \"%d\\t%s\\t%d\\t%s\\t%s\", bpm->indexes ? 
bpm->indexes[j] : j, locus_entry->name,\n                locus_entry->address_a, locus_entry->chrom, locus_entry->map_info);\n        if (cluster_record) fprintf(stream, \"\\t%f\", cluster_record->cluster_score.total_score);\n        if (flags & BPM_LOADED)\n            fprintf(stream, \"\\t%f\\t%f\\t%f\\t%f\", locus_entry->frac_a, locus_entry->frac_c, locus_entry->frac_g,\n                    locus_entry->frac_t);\n        uint16_t raw_x, raw_y;\n        float norm_x, norm_y, ilmn_r, ilmn_theta, baf, lrr;\n        for (i = 0; i < n; i++) {\n            uint8_t genotype;\n            get_element(gtc[i]->genotypes, (void *)&genotype, j);\n            float genotype_score;\n            get_element(gtc[i]->genotype_scores, (void *)&genotype_score, j);\n            BaseCall base_call;\n            get_element(gtc[i]->base_calls, (void *)&base_call, j);\n            get_element(gtc[i]->raw_x, (void *)&raw_x, j);\n            get_element(gtc[i]->raw_y, (void *)&raw_y, j);\n            norm_x = -NAN;\n            norm_y = -NAN;\n            ilmn_theta = -NAN;\n            ilmn_r = -NAN;\n            baf = -NAN;\n            lrr = -NAN;\n            if ((raw_x || raw_y) && norm_id >= 0) {\n                XForm *xform = &gtc[i]->normalization_transforms[norm_id];\n                raw_x_y2norm_x_y(raw_x, raw_y, xform->offset_x, xform->offset_y, gtc[i]->cos_theta[norm_id],\n                                 gtc[i]->sin_theta[norm_id], xform->shear, xform->scale_x, xform->scale_y, &norm_x,\n                                 &norm_y);\n                norm_x_y2ilmn_theta_r(norm_x, norm_y, &ilmn_theta, &ilmn_r);\n                if (cluster_record)\n                    get_baf_lrr(ilmn_theta, ilmn_r, cluster_record->aa_cluster_stats.theta_mean,\n                                cluster_record->ab_cluster_stats.theta_mean,\n                                cluster_record->bb_cluster_stats.theta_mean, cluster_record->aa_cluster_stats.r_mean,\n                                
cluster_record->ab_cluster_stats.r_mean, cluster_record->bb_cluster_stats.r_mean,\n                                locus_entry->intensity_only ? cluster_record->r_mean : NAN, &baf, &lrr);\n            }\n            if (isnan(baf)) get_element(gtc[i]->b_allele_freqs, (void *)&baf, j);\n            if (isnan(lrr)) get_element(gtc[i]->logr_ratios, (void *)&lrr, j);\n\n            char allele_a = strand ? rev_allele(locus_entry->snp[1]) : locus_entry->snp[1];\n            char allele_b = strand ? rev_allele(locus_entry->snp[3]) : locus_entry->snp[3];\n            BaseCall ref_call;\n            switch (genotype) {\n            case GT_NC:\n                ref_call[0] = '-';\n                ref_call[1] = '-';\n                break;\n            case GT_AA:\n                ref_call[0] = allele_a;\n                ref_call[1] = allele_a;\n                break;\n            case GT_AB:\n                ref_call[0] = allele_a;\n                ref_call[1] = allele_b;\n                break;\n            case GT_BB:\n                ref_call[0] = allele_b;\n                ref_call[1] = allele_b;\n                break;\n            default:\n                error(\"Unable to process marker %s\\n\", locus_entry->name);\n                break;\n            }\n            if (flags & FORMAT_GT) fprintf(stream, \"\\t%s\", code2genotype[genotype]);\n            if (flags & FORMAT_IGC) fprintf(stream, \"\\t%f\", genotype_score);\n            if ((flags & BPM_LOADED) && (flags & FORMAT_THETA)) fprintf(stream, \"\\t%f\", ilmn_theta);\n            if ((flags & BPM_LOADED) && (flags & FORMAT_R)) fprintf(stream, \"\\t%f\", ilmn_r);\n            if (flags & FORMAT_BAF) fprintf(stream, \"\\t%f\", baf);\n            if (flags & FORMAT_LRR) fprintf(stream, \"\\t%f\", lrr);\n            if (flags & FORMAT_X) fprintf(stream, \"\\t%u\", raw_x);\n            if (flags & FORMAT_Y) fprintf(stream, \"\\t%u\", raw_y);\n            if ((flags & BPM_LOADED) && (flags & FORMAT_NORMX)) 
fprintf(stream, \"\\t%f\", norm_x);\n            if ((flags & BPM_LOADED) && (flags & FORMAT_NORMY)) fprintf(stream, \"\\t%f\", norm_y);\n            if (flags & FORMAT_GT)\n                fprintf(stream, \"\\t%c%c\\t%c%c\", base_call[0], base_call[1], ref_call[0], ref_call[1]);\n        }\n        fputc('\\n', stream);\n    }\n}\n\nstatic bcf_hdr_t *hdr_init(const faidx_t *fai, int flags) {\n    bcf_hdr_t *hdr = bcf_hdr_init(\"w\");\n    int i, n = faidx_nseq(fai);\n    for (i = 0; i < n; i++) {\n        const char *seq = faidx_iseq(fai, i);\n        int len = faidx_seq_len(fai, seq);\n        bcf_hdr_printf(hdr, \"##contig=<ID=%s,length=%d>\", seq, len);\n    }\n\n    bcf_hdr_append(hdr, \"##INFO=<ID=ALLELE_A,Number=1,Type=Integer,Description=\\\"A allele\\\">\");\n    bcf_hdr_append(hdr, \"##INFO=<ID=ALLELE_B,Number=1,Type=Integer,Description=\\\"B allele\\\">\");\n    if (flags & BPM_LOADED) {\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=FRAC_A,Number=1,Type=Float,Description=\\\"Fraction of the A \"\n                       \"nucleotide in the top genomic sequence\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=FRAC_C,Number=1,Type=Float,Description=\\\"Fraction of the C \"\n                       \"nucleotide in the top genomic sequence\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=FRAC_G,Number=1,Type=Float,Description=\\\"Fraction of the G \"\n                       \"nucleotide in the top genomic sequence\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=FRAC_T,Number=1,Type=Float,Description=\\\"Fraction of the T \"\n                       \"nucleotide in the top genomic sequence\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=NORM_ID,Number=1,Type=Integer,Description=\\\"Normalization \"\n                       \"lookups from manifest\\\">\");\n    }\n    if (flags & CSV_LOADED) {\n        bcf_hdr_append(hdr,\n                  
     \"##INFO=<ID=BEADSET_ID,Number=1,Type=Integer,Description=\\\"Bead set ID \"\n                       \"for normalization\\\">\");\n    }\n    if ((flags & BPM_LOADED) | (flags & CSV_LOADED)) {\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=INTENSITY_ONLY,Number=0,Type=Flag,Description=\\\"Locus with intensity information \"\n                       \"only and no genotyping information\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=ASSAY_TYPE,Number=1,Type=Integer,Description=\\\"Identifies type of \"\n                       \"assay (0 - Infinium II, 1 - Infinium I (A/T), 2 - Infinium I (G/C)\\\">\");\n    }\n    if (flags & EGT_LOADED) {\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=GenTrain_Score,Number=1,Type=Float,Description=\\\"The SNP \"\n                       \"cluster quality from the GenTrain clustering algorithm\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=Orig_Score,Number=1,Type=Float,Description=\\\"The original \"\n                       \"GenTrain score for the SNP before edits\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=Edited,Number=0,Type=Flag,Description=\\\"The SNP was edited \"\n                       \"after identifying clustering positions\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=Cluster_Sep,Number=1,Type=Float,Description=\\\"The cluster \"\n                       \"separation measurement for the SNP that ranges between 0 and 1\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=N_AA,Number=1,Type=Integer,Description=\\\"Number of AA calls \"\n                       \"in training set\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=N_AB,Number=1,Type=Integer,Description=\\\"Number of AB calls \"\n                       \"in training set\\\">\");\n        bcf_hdr_append(hdr,\n                       
\"##INFO=<ID=N_BB,Number=1,Type=Integer,Description=\\\"Number of BB calls \"\n                       \"in training set\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=devR_AA,Number=1,Type=Float,Description=\\\"Standard \"\n                       \"deviation of normalized R for AA cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=devR_AB,Number=1,Type=Float,Description=\\\"Standard \"\n                       \"deviation of normalized R for AB cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=devR_BB,Number=1,Type=Float,Description=\\\"Standard \"\n                       \"deviation of normalized R for BB cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=devTHETA_AA,Number=1,Type=Float,Description=\\\"Standard \"\n                       \"deviation of normalized THETA for AA cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=devTHETA_AB,Number=1,Type=Float,Description=\\\"Standard \"\n                       \"deviation of normalized THETA for AB cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=devTHETA_BB,Number=1,Type=Float,Description=\\\"Standard \"\n                       \"deviation of normalized THETA for BB cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanR_AA,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized R for AA cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanR_AB,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized R for AB cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanR_BB,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized R for BB cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       
\"##INFO=<ID=meanTHETA_AA,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized THETA for AA cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanTHETA_AB,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized THETA for AB cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=meanTHETA_BB,Number=1,Type=Float,Description=\\\"Mean of \"\n                       \"normalized THETA for BB cluster\\\">\");\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=Intensity_Threshold,Number=1,Type=Float,Description=\\\"The \"\n                       \"intensity threshold value\\\">\");\n    }\n    if (!(flags & NO_INFO_GC))\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=GC,Number=1,Type=Float,Description=\\\"GC ratio content \"\n                       \"around the variant\\\">\");\n\n    if (flags & FORMAT_GT) bcf_hdr_append(hdr, \"##FORMAT=<ID=GT,Number=1,Type=String,Description=\\\"Genotype\\\">\");\n    if (flags & FORMAT_GQ)\n        bcf_hdr_append(hdr, \"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\\\"Genotype Quality\\\">\");\n    if (flags & FORMAT_IGC)\n        bcf_hdr_append(hdr,\n                       \"##FORMAT=<ID=IGC,Number=1,Type=Float,Description=\\\"Illumina GenCall \"\n                       \"Confidence Score\\\">\");\n    if (flags & FORMAT_BAF)\n        bcf_hdr_append(hdr, \"##FORMAT=<ID=BAF,Number=1,Type=Float,Description=\\\"B Allele Frequency\\\">\");\n    if (flags & FORMAT_LRR) bcf_hdr_append(hdr, \"##FORMAT=<ID=LRR,Number=1,Type=Float,Description=\\\"Log R Ratio\\\">\");\n    if ((flags & BPM_LOADED) | (flags & GENOME_STUDIO)) {\n        if (flags & FORMAT_NORMX)\n            bcf_hdr_append(hdr,\n                           \"##FORMAT=<ID=NORMX,Number=1,Type=Float,Description=\\\"Normalized X \"\n                           \"intensity\\\">\");\n        if (flags & FORMAT_NORMY)\n            
bcf_hdr_append(hdr,\n                           \"##FORMAT=<ID=NORMY,Number=1,Type=Float,Description=\\\"Normalized Y \"\n                           \"intensity\\\">\");\n        if (flags & FORMAT_R)\n            bcf_hdr_append(hdr, \"##FORMAT=<ID=R,Number=1,Type=Float,Description=\\\"Normalized R value\\\">\");\n        if (flags & FORMAT_THETA)\n            bcf_hdr_append(hdr,\n                           \"##FORMAT=<ID=THETA,Number=1,Type=Float,Description=\\\"Normalized \"\n                           \"Theta value\\\">\");\n    }\n    if (flags & FORMAT_X) bcf_hdr_append(hdr, \"##FORMAT=<ID=X,Number=1,Type=Integer,Description=\\\"Raw X intensity\\\">\");\n    if (flags & FORMAT_Y) bcf_hdr_append(hdr, \"##FORMAT=<ID=Y,Number=1,Type=Integer,Description=\\\"Raw Y intensity\\\">\");\n    return hdr;\n}\n\nstatic int gts_to_gt_arr(int32_t *gt_arr, const uint8_t *gts, int n, int allele_a_idx, int allele_b_idx) {\n    int i;\n    for (i = 0; i < n; i++) {\n        switch (gts[i]) {\n        case GT_NC:\n            gt_arr[2 * i] = bcf_gt_missing;\n            gt_arr[2 * i + 1] = bcf_gt_missing;\n            break;\n        case GT_AA:\n            gt_arr[2 * i] = bcf_gt_unphased(allele_a_idx);\n            gt_arr[2 * i + 1] = bcf_gt_unphased(allele_a_idx);\n            break;\n        case GT_AB:\n            gt_arr[2 * i] = bcf_gt_unphased(min(allele_a_idx, allele_b_idx));\n            gt_arr[2 * i + 1] = bcf_gt_unphased(max(allele_a_idx, allele_b_idx));\n            break;\n        case GT_BB:\n            gt_arr[2 * i] = bcf_gt_unphased(allele_b_idx);\n            gt_arr[2 * i + 1] = bcf_gt_unphased(allele_b_idx);\n            break;\n        default:\n            return -1;\n        }\n    }\n    return 0;\n}\n\nstatic int locus2bcf(const LocusEntry *locus_entry, const ClusterRecord *cluster_record, const bcf_hdr_t *hdr,\n                     const faidx_t *fai, int gc_win, int flags, kstring_t *allele_a, kstring_t *allele_b,\n                     kstring_t 
*flank, int32_t *allele_a_idx, int32_t *allele_b_idx, bcf1_t *rec) {\n    rec->rid = bcf_hdr_name2id_flexible(hdr, locus_entry->chrom);\n    char *endptr;\n    rec->pos = strtol(locus_entry->map_info, &endptr, 0) - 1;\n    if (locus_entry->map_info == endptr)\n        error(\"Map info %s for marker %s is not understood\\n\", locus_entry->map_info, locus_entry->ilmn_id);\n    int strand =\n        !locus_entry->ref_strand\n            ? -1\n            : (strcmp(locus_entry->ref_strand, \"+\") == 0 ? 0 : (strcmp(locus_entry->ref_strand, \"-\") == 0 ? 1 : -1));\n    if (rec->rid < 0 || rec->pos < 0) {\n        if (flags & VERBOSE) fprintf(stderr, \"Skipping unlocalized marker %s\\n\", locus_entry->ilmn_id);\n        return -1;\n    }\n    bcf_update_id(hdr, rec, locus_entry->name);\n\n    int len,\n        win = min(max(100, locus_entry->source_seq ? max(gc_win, strlen(locus_entry->source_seq)) : gc_win), rec->pos);\n    char *ref = faidx_fetch_seq(fai, bcf_seqname(hdr, rec), rec->pos - win, rec->pos + win, &len);\n    if (!ref || len == 1)\n        error(\"faidx_fetch_seq failed at %s:%\" PRId64 \" (are you using the correct reference genome?)\\n\",\n              bcf_seqname(hdr, rec), rec->pos + 1);\n    strupper(ref);\n    if (!(flags & NO_INFO_GC)) {\n        float gc_ratio = get_gc_ratio(&ref[max(win - gc_win, 0)], &ref[min(win + gc_win, len)]);\n        bcf_update_info_float(hdr, rec, \"GC\", &gc_ratio, 1);\n    }\n\n    char ref_base[] = {'\\0', '\\0'};\n    ref_base[0] = ref[win];\n    allele_a->l = allele_b->l = 0;\n    kputc(locus_entry->snp[1], allele_a);\n    kputc(locus_entry->snp[3], allele_b);\n    int is_indel = allele_a->s[0] == 'D' || allele_a->s[0] == 'I' || allele_b->s[0] == 'D' || allele_b->s[0] == 'I';\n    int ref_is_del = -1;\n    if (is_indel && strand >= 0 && locus_entry->source_seq && strchr(locus_entry->source_seq, '-')) {\n        flank->l = 0;\n        kputs(locus_entry->source_seq, flank);\n        strupper(flank->s);\n        if 
((strcasecmp(locus_entry->ilmn_strand, locus_entry->source_strand) != 0) != strand)\n            flank_reverse_complement(flank->s);\n        int shift = flank_left_shift(flank->s);\n\n        ref_is_del = get_indel_alleles(allele_a, allele_b, flank->s, ref, win, len, shift);\n        if (ref_is_del == 0) {\n            rec->pos--;\n            ref_base[0] = ref[win - 1];\n        }\n        *allele_b_idx = ref_is_del < 0 ? 1 : ref_is_del ^ (locus_entry->snp[3] == 'D');\n    } else {\n        if (allele_a->s[0] == 'N' && allele_b->s[0] == 'A') {\n            allele_a->s[0] = '.';\n            allele_b->s[0] = '.';\n        } else if (is_indel) {\n            ref_base[0] = allele_a->s[0];\n        } else {\n            if (strand < 0) {\n                if (strcmp(locus_entry->ilmn_strand, \"BOT\") == 0 || strcmp(locus_entry->ilmn_strand, \"Bot\") == 0) {\n                    allele_a->s[0] = rev_nt(allele_a->s[0]);\n                    allele_b->s[0] = rev_nt(allele_b->s[0]);\n                }\n                strand = get_strand_from_top_alleles(allele_a->s, allele_b->s, ref, win, len);\n                if (strand < 0) {\n                    if (flags & VERBOSE)\n                        fprintf(stderr, \"Unable to determine reference strand for SNP %s\\n\", locus_entry->ilmn_id);\n                    allele_a->s[0] = '.';\n                    allele_b->s[0] = '.';\n                }\n            }\n            if (strand == 1) {\n                allele_a->s[0] = rev_nt(allele_a->s[0]);\n                allele_b->s[0] = rev_nt(allele_b->s[0]);\n            }\n        }\n        *allele_b_idx = get_allele_b_idx(ref_base[0], allele_a->s, allele_b->s);\n    }\n    free(ref);\n\n    *allele_a_idx = get_allele_a_idx(*allele_b_idx);\n    const char *alleles[3];\n    int nals = alleles_ab_to_vcf(alleles, ref_base, allele_a->s, allele_b->s, *allele_b_idx);\n    if (nals < 0) error(\"Unable to process marker %s\\n\", locus_entry->ilmn_id);\n    bcf_update_alleles(hdr, rec, 
alleles, nals);\n    bcf_update_info_int32(hdr, rec, \"ALLELE_A\", allele_a_idx, 1);\n    bcf_update_info_int32(hdr, rec, \"ALLELE_B\", allele_b_idx, 1);\n\n    if (flags & BPM_LOADED) {\n        bcf_update_info_float(hdr, rec, \"FRAC_A\", &locus_entry->frac_a, 1);\n        bcf_update_info_float(hdr, rec, \"FRAC_C\", &locus_entry->frac_c, 1);\n        bcf_update_info_float(hdr, rec, \"FRAC_G\", &locus_entry->frac_g, 1);\n        bcf_update_info_float(hdr, rec, \"FRAC_T\", &locus_entry->frac_t, 1);\n        bcf_update_info_int32(hdr, rec, \"NORM_ID\", &locus_entry->norm_id, 1);\n    }\n    if (flags & CSV_LOADED) {\n        bcf_update_info_int32(hdr, rec, \"BEADSET_ID\", &locus_entry->beadset_id, 1);\n    }\n    if ((flags & BPM_LOADED) | (flags & CSV_LOADED)) {\n        int32_t assay_type = (int32_t)(flags & BPM_LOADED ? locus_entry->assay_type : locus_entry->assay_type_csv);\n        bcf_update_info_flag(hdr, rec, \"INTENSITY_ONLY\", NULL, locus_entry->intensity_only);\n        bcf_update_info_int32(hdr, rec, \"ASSAY_TYPE\", &assay_type, 1);\n    }\n    if (flags & EGT_LOADED) {\n        bcf_update_info_float(hdr, rec, \"GenTrain_Score\", &cluster_record->cluster_score.total_score, 1);\n        bcf_update_info_float(hdr, rec, \"Orig_Score\", &cluster_record->cluster_score.original_score, 1);\n        if (cluster_record->cluster_score.edited) bcf_update_info_flag(hdr, rec, \"Edited\", NULL, 1);\n        bcf_update_info_float(hdr, rec, \"Cluster_Sep\", &cluster_record->cluster_score.cluster_separation, 1);\n        bcf_update_info_int32(hdr, rec, \"N_AA\", &cluster_record->aa_cluster_stats.N, 1);\n        bcf_update_info_int32(hdr, rec, \"N_AB\", &cluster_record->ab_cluster_stats.N, 1);\n        bcf_update_info_int32(hdr, rec, \"N_BB\", &cluster_record->bb_cluster_stats.N, 1);\n        bcf_update_info_float(hdr, rec, \"devR_AA\", &cluster_record->aa_cluster_stats.r_dev, 1);\n        bcf_update_info_float(hdr, rec, \"devR_AB\", 
&cluster_record->ab_cluster_stats.r_dev, 1);\n        bcf_update_info_float(hdr, rec, \"devR_BB\", &cluster_record->bb_cluster_stats.r_dev, 1);\n        bcf_update_info_float(hdr, rec, \"devTHETA_AA\", &cluster_record->aa_cluster_stats.theta_dev, 1);\n        bcf_update_info_float(hdr, rec, \"devTHETA_AB\", &cluster_record->ab_cluster_stats.theta_dev, 1);\n        bcf_update_info_float(hdr, rec, \"devTHETA_BB\", &cluster_record->bb_cluster_stats.theta_dev, 1);\n        bcf_update_info_float(hdr, rec, \"meanR_AA\", &cluster_record->aa_cluster_stats.r_mean, 1);\n        bcf_update_info_float(hdr, rec, \"meanR_AB\", &cluster_record->ab_cluster_stats.r_mean, 1);\n        bcf_update_info_float(hdr, rec, \"meanR_BB\", &cluster_record->bb_cluster_stats.r_mean, 1);\n        bcf_update_info_float(hdr, rec, \"meanTHETA_AA\", &cluster_record->aa_cluster_stats.theta_mean, 1);\n        bcf_update_info_float(hdr, rec, \"meanTHETA_AB\", &cluster_record->ab_cluster_stats.theta_mean, 1);\n        bcf_update_info_float(hdr, rec, \"meanTHETA_BB\", &cluster_record->bb_cluster_stats.theta_mean, 1);\n        bcf_update_info_float(hdr, rec, \"Intensity_Threshold\", &cluster_record->intensity_threshold, 1);\n    }\n\n    if (is_indel && ref_is_del < 0) {\n        if (flags & VERBOSE) fprintf(stderr, \"Unable to determine alleles for indel %s\\n\", locus_entry->ilmn_id);\n        return 1;\n    }\n    return 0;\n}\n\nstatic void gtcs_to_vcf(faidx_t *fai, const bpm_t *bpm, const egt_t *egt, gtc_t **gtc, int n, htsFile *out_fh,\n                        bcf_hdr_t *hdr, int flags, int gc_win) {\n    int i, j;\n    uint8_t *gts = (uint8_t *)malloc(n * sizeof(uint8_t));\n    int32_t *gt_arr = (int32_t *)malloc(n * 2 * sizeof(int32_t));\n    int32_t *gq_arr = (int32_t *)malloc(n * sizeof(int32_t));\n    float *igc_arr = (float *)malloc(n * sizeof(float));\n    float *baf_arr = (float *)malloc(n * sizeof(float));\n    float *lrr_arr = (float *)malloc(n * sizeof(float));\n    float *norm_x_arr = 
(float *)malloc(n * sizeof(float));\n    float *norm_y_arr = (float *)malloc(n * sizeof(float));\n    float *ilmn_r_arr = (float *)malloc(n * sizeof(float));\n    float *ilmn_theta_arr = (float *)malloc(n * sizeof(float));\n    int32_t *raw_x_arr = (int32_t *)malloc(n * sizeof(int32_t));\n    int32_t *raw_y_arr = (int32_t *)malloc(n * sizeof(int32_t));\n\n    bcf1_t *rec = bcf_init();\n    kstring_t allele_a = {0, 0, NULL};\n    kstring_t allele_b = {0, 0, NULL};\n    kstring_t flank = {0, 0, NULL};\n    int32_t allele_a_idx, allele_b_idx;\n    int n_missing = 0, n_skipped = 0;\n    for (j = 0; j < bpm->num_loci; j++) {\n        bcf_clear(rec);\n        LocusEntry *locus_entry = &bpm->locus_entries[j];\n        int norm_id = bpm->norm_lookups && bpm->locus_entries[j].norm_id != 0xFF\n                          ? bpm->norm_lookups[bpm->locus_entries[j].norm_id]\n                          : -1;\n        ClusterRecord *cluster_record = NULL;\n        if (flags & EGT_LOADED) {\n            int idx;\n            int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx);\n            if (ret < 0) error(\"Illumina probe %s not found in cluster file\\n\", locus_entry->name);\n            cluster_record = &egt->cluster_records[idx];\n        }\n        switch (locus2bcf(locus_entry, cluster_record, hdr, fai, gc_win, flags, &allele_a, &allele_b, &flank,\n                          &allele_a_idx, &allele_b_idx, rec)) {\n        case -1:\n            n_skipped++;\n            continue;\n        case 1:\n            n_missing++;\n            break;\n        }\n\n        uint16_t raw_x, raw_y;\n        rec->n_sample = n;\n        for (i = 0; i < n; i++) {\n            get_element(gtc[i]->genotypes, (void *)&gts[i], j);\n            get_element(gtc[i]->genotype_scores, (void *)&igc_arr[i], j);\n            gq_arr[i] = (int)(-10 * log10(1 - igc_arr[i]) + .5);\n            if (gq_arr[i] < 0) gq_arr[i] = 0;\n            if (gq_arr[i] > 50) gq_arr[i] = 50;\n            
get_element(gtc[i]->raw_x, (void *)&raw_x, j);\n            get_element(gtc[i]->raw_y, (void *)&raw_y, j);\n            raw_x_arr[i] = (int32_t)raw_x;\n            raw_y_arr[i] = (int32_t)raw_y;\n            norm_x_arr[i] = -NAN;\n            norm_y_arr[i] = -NAN;\n            ilmn_r_arr[i] = -NAN;\n            ilmn_theta_arr[i] = -NAN;\n            baf_arr[i] = -NAN;\n            lrr_arr[i] = -NAN;\n            if ((raw_x || raw_y) && norm_id >= 0) {\n                XForm *xform = &gtc[i]->normalization_transforms[norm_id];\n                raw_x_y2norm_x_y(raw_x, raw_y, xform->offset_x, xform->offset_y, gtc[i]->cos_theta[norm_id],\n                                 gtc[i]->sin_theta[norm_id], xform->shear, xform->scale_x, xform->scale_y,\n                                 &norm_x_arr[i], &norm_y_arr[i]);\n                norm_x_y2ilmn_theta_r(norm_x_arr[i], norm_y_arr[i], &ilmn_theta_arr[i], &ilmn_r_arr[i]);\n                if (cluster_record)\n                    get_baf_lrr(ilmn_theta_arr[i], ilmn_r_arr[i], cluster_record->aa_cluster_stats.theta_mean,\n                                cluster_record->ab_cluster_stats.theta_mean,\n                                cluster_record->bb_cluster_stats.theta_mean, cluster_record->aa_cluster_stats.r_mean,\n                                cluster_record->ab_cluster_stats.r_mean, cluster_record->bb_cluster_stats.r_mean,\n                                locus_entry->intensity_only ? 
cluster_record->r_mean : NAN, &baf_arr[i], &lrr_arr[i]);\n            }\n            if (isnan(baf_arr[i])) get_element(gtc[i]->b_allele_freqs, (void *)&baf_arr[i], j);\n            if (isnan(lrr_arr[i])) get_element(gtc[i]->logr_ratios, (void *)&lrr_arr[i], j);\n        }\n\n        if ((flags & ADJUST_CLUSTERS) && norm_id >= 0 && cluster_record && !bpm->locus_entries[j].intensity_only) {\n            adjust_clusters(gts, ilmn_theta_arr, ilmn_r_arr, n, cluster_record);\n            for (i = 0; i < n; i++) {\n                if (!isnan(ilmn_theta_arr[i]) && !isnan(ilmn_r_arr[i]))\n                    get_baf_lrr(ilmn_theta_arr[i], ilmn_r_arr[i], cluster_record->aa_cluster_stats.theta_mean,\n                                cluster_record->ab_cluster_stats.theta_mean,\n                                cluster_record->bb_cluster_stats.theta_mean, cluster_record->aa_cluster_stats.r_mean,\n                                cluster_record->ab_cluster_stats.r_mean, cluster_record->bb_cluster_stats.r_mean,\n                                locus_entry->intensity_only ? 
cluster_record->r_mean : NAN, &baf_arr[i], &lrr_arr[i]);\n            }\n        }\n\n        if (!bpm->locus_entries[j].intensity_only) {\n            gts_to_gt_arr(gt_arr, gts, n, allele_a_idx, allele_b_idx);\n            bcf_update_genotypes(hdr, rec, gt_arr, n * 2);\n            bcf_update_format_int32(hdr, rec, \"GQ\", gq_arr, n);\n            bcf_update_format_float(hdr, rec, \"IGC\", igc_arr, n);\n        }\n        bcf_update_format_float(hdr, rec, \"BAF\", baf_arr, n);\n        bcf_update_format_float(hdr, rec, \"LRR\", lrr_arr, n);\n        bcf_update_format_float(hdr, rec, \"NORMX\", norm_x_arr, n);\n        bcf_update_format_float(hdr, rec, \"NORMY\", norm_y_arr, n);\n        bcf_update_format_float(hdr, rec, \"R\", ilmn_r_arr, n);\n        bcf_update_format_float(hdr, rec, \"THETA\", ilmn_theta_arr, n);\n        bcf_update_format_int32(hdr, rec, \"X\", raw_x_arr, n);\n        bcf_update_format_int32(hdr, rec, \"Y\", raw_y_arr, n);\n        if (bcf_write(out_fh, hdr, rec) < 0) error(\"Unable to write to output VCF file\\n\");\n    }\n    fprintf(stderr, \"Lines   total/missing-reference/skipped:\\t%d/%d/%d\\n\", bpm->num_loci, n_missing, n_skipped);\n\n    free(gts);\n    free(gt_arr);\n    free(gq_arr);\n    free(igc_arr);\n    free(baf_arr);\n    free(lrr_arr);\n    free(norm_x_arr);\n    free(norm_y_arr);\n    free(ilmn_r_arr);\n    free(ilmn_theta_arr);\n    free(raw_x_arr);\n    free(raw_y_arr);\n\n    free(allele_a.s);\n    free(allele_b.s);\n    free(flank.s);\n\n    bcf_destroy(rec);\n    bcf_hdr_destroy(hdr);\n}\n\n#define GS_GT 0\n#define GS_TOP_STRAND 1\n#define GS_REF_STRAND 2\n#define GS_IGC 3\n#define GS_BAF 4\n#define GS_LRR 5\n#define GS_NORMX 6\n#define GS_NORMY 7\n#define GS_R 8\n#define GS_THETA 9\n#define GS_X 10\n#define GS_Y 11\n\ntypedef struct {\n    int *col2sample;\n    int type;\n    void *ptr;\n} gs_col_t;\n\nstatic int tsv_setter_gs_col(tsv_t *tsv, bcf1_t *rec, void *usr) {\n    gs_col_t *gs_col = (gs_col_t *)usr;\n    
uint8_t *gts;\n    char *strand_alleles, *endptr;\n    switch (gs_col->type) {\n    case GS_GT:\n        gts = (uint8_t *)gs_col->ptr + gs_col->col2sample[tsv->icol];\n        if ((tsv->ss[0] == 'N' && tsv->ss[1] == 'C') || (tsv->ss[0] == '-' && tsv->ss[1] == '-'))\n            *gts = GT_NC;\n        else if (tsv->ss[0] == 'A' && tsv->ss[1] == 'A')\n            *gts = GT_AA;\n        else if (tsv->ss[0] == 'A' && tsv->ss[1] == 'B')\n            *gts = GT_AB;\n        else if (tsv->ss[0] == 'B' && tsv->ss[1] == 'B')\n            *gts = GT_BB;\n        else\n            return -1;\n        break;\n    case GS_TOP_STRAND:\n    case GS_REF_STRAND:\n        strand_alleles = (char *)gs_col->ptr + 2 * gs_col->col2sample[tsv->icol];\n        strand_alleles[0] = tsv->ss[0];\n        strand_alleles[1] = tsv->ss[1];\n        break;\n    case GS_IGC:\n    case GS_BAF:\n    case GS_LRR:\n    case GS_NORMX:\n    case GS_NORMY:\n    case GS_R:\n    case GS_THETA:\n        ((float *)gs_col->ptr + gs_col->col2sample[tsv->icol])[0] = strtof(tsv->ss, &endptr);\n        if (tsv->ss == endptr) return -1;\n        break;\n    case GS_X:\n    case GS_Y:\n        ((int32_t *)gs_col->ptr + gs_col->col2sample[tsv->icol])[0] = strtol(tsv->ss, &endptr, 0);\n        if (tsv->ss == endptr) return -1;\n        break;\n    default:\n        return -1;\n    }\n    return 0;\n}\n\nstatic int tsv_setter_chrom_flexible(tsv_t *tsv, bcf1_t *rec, void *usr) {\n    char tmp = *tsv->se;\n    *tsv->se = 0;\n    rec->rid = bcf_hdr_name2id_flexible((bcf_hdr_t *)usr, tsv->ss);\n    *tsv->se = tmp;\n    return rec->rid == -1 ? 
-1 : 0;\n}\n\nstatic int tsv_setter_ilmn_strand(tsv_t *tsv, bcf1_t *rec, void *usr) {\n    char **strand = (char **)usr;\n    *strand = tsv->ss;\n    return 0;\n}\n\nstatic int tsv_setter_snp(tsv_t *tsv, bcf1_t *rec, void *usr) {\n    char **snp = (char **)usr;\n    if (strncmp(tsv->ss, \"[N/A]\", 5) == 0)\n        *snp = NULL;\n    else\n        *snp = tsv->ss;\n    return 0;\n}\n\nstatic int tsv_register_all(tsv_t *tsv, const char *id, tsv_setter_t setter, void *usr) {\n    int i, n = 0;\n    for (i = 0; i < tsv->ncols; i++) {\n        if (!tsv->cols[i].name || strcasecmp(tsv->cols[i].name, id)) continue;\n        tsv->cols[i].setter = setter;\n        tsv->cols[i].usr = usr;\n        n++;\n    }\n    return n ? 0 : -1;\n}\n\n// adapted from Petr Danecek's implementation of tsv_parse() in bcftools/tsv2vcf.c\nstatic int tsv_parse_delimiter(tsv_t *tsv, bcf1_t *rec, char *str, int delimiter) {\n    int status = 0;\n    tsv->icol = 0;\n    tsv->ss = tsv->se = str;\n    while (*tsv->ss && tsv->icol < tsv->ncols) {\n        if (delimiter)\n            while (*tsv->se && (*tsv->se) != delimiter) tsv->se++;\n        else\n            while (*tsv->se && !isspace(*tsv->se)) tsv->se++;\n        if (tsv->cols[tsv->icol].setter) {\n            int ret = tsv->cols[tsv->icol].setter(tsv, rec, tsv->cols[tsv->icol].usr);\n            if (ret < 0) return -1;\n            status++;\n        }\n        if (delimiter)\n            tsv->se++;\n        else\n            while (*tsv->se && isspace(*tsv->se)) tsv->se++;\n        tsv->ss = tsv->se;\n        tsv->icol++;\n    }\n    return status ? 
0 : -1;\n}\n\nstatic void gs_to_vcf(faidx_t *fai, const bpm_t *bpm, const egt_t *egt, htsFile *gs_fh, htsFile *out_fh, bcf_hdr_t *hdr,\n                      const char *output_fname, char *index_fname, int write_index, int flags, int gc_win) {\n    // read the header of the table\n    kstring_t line = {0, 0, NULL};\n    if (hts_getline(gs_fh, KS_SEP_LINE, &line) <= 0) error(\"Empty file: %s\\n\", gs_fh->fn);\n    int i, moff = 0, *off = NULL, ncols = ksplit_core(line.s, '\\t', &moff, &off);\n    kstring_t str = {0, 0, NULL};\n    int *col2sample = (int *)malloc(sizeof(int) * ncols);\n    for (i = 0; i < ncols; i++) {\n        char *ptr;\n        if (i > 0) kputc(',', &str);\n        if ((ptr = strrchr(&line.s[off[i]], '.'))) {\n            *ptr++ = '\\0';\n            if ((bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, &line.s[off[i]]) < 0)) bcf_hdr_add_sample(hdr, &line.s[off[i]]);\n            if (strcmp(ptr, \"GType\") == 0)\n                kputs(\"GT\", &str);\n            else if (strcmp(ptr, \"Score\") == 0 || strcmp(ptr, \"GC Score\") == 0)\n                kputs(\"IGC\", &str);\n            else if (strcmp(ptr, \"Theta\") == 0 || strcmp(ptr, \"Theta Illumina\") == 0)\n                kputs(\"THETA\", &str);\n            else if (strcmp(ptr, \"R\") == 0 || strcmp(ptr, \"R Illumina\") == 0)\n                kputc('R', &str);\n            else if (strcmp(ptr, \"X Raw\") == 0 || strcmp(ptr, \"Raw X\") == 0)\n                kputc('X', &str);\n            else if (strcmp(ptr, \"Y Raw\") == 0 || strcmp(ptr, \"Raw Y\") == 0)\n                kputc('Y', &str);\n            else if (strcmp(ptr, \"X\") == 0)\n                kputs(\"NORMX\", &str);\n            else if (strcmp(ptr, \"Y\") == 0)\n                kputs(\"NORMY\", &str);\n            else if (strcmp(ptr, \"B Allele Freq\") == 0)\n                kputs(\"BAF\", &str);\n            else if (strcmp(ptr, \"Log R Ratio\") == 0)\n                kputs(\"LRR\", &str);\n            else if (strcmp(ptr, \"Top Alleles\") 
== 0)\n                kputs(\"TOP_STRAND\", &str);\n            else if (strcmp(ptr, \"Plus/Minus Alleles\") == 0)\n                kputs(\"REF_STRAND\", &str);\n            else if (strcmp(ptr, \"Import Calls\") == 0)\n                kputc('-', &str);\n            else if (strcmp(ptr, \"Concordance\") == 0)\n                kputc('-', &str);\n            else if (strcmp(ptr, \"Orig Call\") == 0)\n                kputc('-', &str);\n            else if (strcmp(ptr, \"CNV Value\") == 0)\n                kputc('-', &str);\n            else if (strcmp(ptr, \"CNV Confidence\") == 0)\n                kputc('-', &str);\n            else\n                error(\"Could not recognize FORMAT field: %s\\n\", ptr);\n            col2sample[i] = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, &line.s[off[i]]);\n        } else {\n            ptr = &line.s[off[i]];\n            if (strcmp(ptr, \"Index\") == 0)\n                kputc('-', &str);\n            else if (strcmp(ptr, \"Name\") == 0 || strcmp(ptr, \"SNP Name\") == 0)\n                kputs(\"ID\", &str);\n            else if (strcmp(ptr, \"Address\") == 0)\n                kputc('-', &str);\n            else if (strcmp(ptr, \"Chr\") == 0 || strcmp(ptr, \"Chromosome\") == 0)\n                kputs(\"CHROM\", &str);\n            else if (strcmp(ptr, \"Manifest\") == 0)\n                kputc('-', &str);\n            else if (strcmp(ptr, \"Position\") == 0)\n                kputs(\"POS\", &str);\n            else if (strcmp(ptr, \"GenTrain Score\") == 0)\n                kputs(\"GENTRAIN_SCORE\", &str);\n            else if (strcmp(ptr, \"Frac A\") == 0)\n                kputs(\"FRAC_A\", &str);\n            else if (strcmp(ptr, \"Frac C\") == 0)\n                kputs(\"FRAC_C\", &str);\n            else if (strcmp(ptr, \"Frac G\") == 0)\n                kputs(\"FRAC_G\", &str);\n            else if (strcmp(ptr, \"Frac T\") == 0)\n                kputs(\"FRAC_T\", &str);\n            else if (strcmp(ptr, \"IlmnStrand\") == 0 || 
strcmp(ptr, \"ILMN Strand\") == 0)\n                kputs(\"STRAND\", &str);\n            else if (strcmp(ptr, \"SNP\") == 0)\n                kputs(\"SNP\", &str);\n            else\n                error(\"Could not recognize INFO field: %s\\n\", ptr);\n            col2sample[i] = -1;\n        }\n    }\n    free(off);\n    if (bcf_hdr_sync(hdr) < 0) error_errno(\"[%s] Failed to update header\",\n                                           __func__); // updates the number of samples\n    int n = bcf_hdr_nsamples(hdr);\n\n    tsv_t *tsv = tsv_init(str.s);\n    if (tsv_register(tsv, \"CHROM\", tsv_setter_chrom_flexible, hdr) < 0) error(\"Expected Chr or Chromosome column\\n\");\n    if (tsv_register(tsv, \"POS\", tsv_setter_pos, NULL) < 0) error(\"Expected Position column\\n\");\n    if (tsv_register(tsv, \"ID\", tsv_setter_id, hdr) < 0 && bpm)\n        error(\"Expected Name or SNP Name column when using --genome-studio with --bpm/--csv\\n\");\n\n    char *ilmn_strand = NULL;\n    tsv_register(tsv, \"STRAND\", tsv_setter_ilmn_strand, &ilmn_strand);\n    char *snp = NULL;\n    tsv_register(tsv, \"SNP\", tsv_setter_snp, &snp);\n\n    float total_score;\n    int gentrain_score = tsv_register(tsv, \"GENTRAIN_SCORE\", tsv_read_float, &total_score);\n    if (gentrain_score)\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=GenTrain_Score,Number=1,Type=Float,Description=\\\"The SNP \"\n                       \"cluster quality from the GenTrain clustering algorithm\\\">\");\n    float frac[4];\n    int frac_a = tsv_register(tsv, \"FRAC_A\", tsv_read_float, &frac[0]);\n    if (frac_a == 0)\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=FRAC_A,Number=1,Type=Float,Description=\\\"Fraction of the A \"\n                       \"nucleotide in the top genomic sequence\\\">\");\n    int frac_c = tsv_register(tsv, \"FRAC_C\", tsv_read_float, &frac[1]);\n    if (frac_c == 0)\n        bcf_hdr_append(hdr,\n                       
\"##INFO=<ID=FRAC_C,Number=1,Type=Float,Description=\\\"Fraction of the C \"\n                       \"nucleotide in the top genomic sequence\\\">\");\n    int frac_g = tsv_register(tsv, \"FRAC_G\", tsv_read_float, &frac[2]);\n    if (frac_g == 0)\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=FRAC_G,Number=1,Type=Float,Description=\\\"Fraction of the G \"\n                       \"nucleotide in the top genomic sequence\\\">\");\n    int frac_t = tsv_register(tsv, \"FRAC_T\", tsv_read_float, &frac[3]);\n    if (frac_t == 0)\n        bcf_hdr_append(hdr,\n                       \"##INFO=<ID=FRAC_T,Number=1,Type=Float,Description=\\\"Fraction of the T \"\n                       \"nucleotide in the top genomic sequence\\\">\");\n\n    if (bcf_hdr_write(out_fh, hdr) < 0) error(\"Unable to write to output VCF file\\n\");\n    if (init_index2(out_fh, hdr, output_fname, &index_fname, write_index) < 0)\n        error(\"Error: failed to initialise index for %s\\n\", output_fname);\n\n    uint8_t *gts = (uint8_t *)malloc(n * sizeof(uint8_t));\n    char *top_strand_alleles = (char *)malloc(n * 2 * sizeof(char));\n    const char *strand_alleles = top_strand_alleles;\n    char *ref_strand_alleles = (char *)malloc(n * 2 * sizeof(char));\n    int32_t *gt_arr = (int32_t *)malloc(n * 2 * sizeof(int32_t));\n    int32_t *gq_arr = (int32_t *)malloc(n * sizeof(int32_t));\n    float *igc_arr = (float *)malloc(n * sizeof(float));\n    float *baf_arr = (float *)malloc(n * sizeof(float));\n    float *lrr_arr = (float *)malloc(n * sizeof(float));\n    float *norm_x_arr = (float *)malloc(n * sizeof(float));\n    float *norm_y_arr = (float *)malloc(n * sizeof(float));\n    float *ilmn_r_arr = (float *)malloc(n * sizeof(float));\n    float *ilmn_theta_arr = (float *)malloc(n * sizeof(float));\n    int32_t *raw_x_arr = (int32_t *)malloc(n * sizeof(int32_t));\n    int32_t *raw_y_arr = (int32_t *)malloc(n * sizeof(int32_t));\n\n    int gs_input[12], gs_output[12];\n\n    
gs_col_t gs_gts = {col2sample, GS_GT, gts};\n    gs_input[GS_GT] = !tsv_register_all(tsv, \"GT\", tsv_setter_gs_col, &gs_gts);\n    if (!gs_input[GS_GT]) error(\"Expected GType column\\n\");\n\n    gs_col_t gs_top_strand = {col2sample, GS_TOP_STRAND, top_strand_alleles};\n    gs_input[GS_TOP_STRAND] = !tsv_register_all(tsv, \"TOP_STRAND\", tsv_setter_gs_col, &gs_top_strand);\n\n    gs_col_t gs_ref_strand = {col2sample, GS_REF_STRAND, ref_strand_alleles};\n    gs_input[GS_REF_STRAND] = !tsv_register_all(tsv, \"REF_STRAND\", tsv_setter_gs_col, &gs_ref_strand);\n    if (gs_input[GS_REF_STRAND]) strand_alleles = ref_strand_alleles;\n\n    gs_col_t gs_igc = {col2sample, GS_IGC, igc_arr};\n    gs_input[GS_IGC] = !tsv_register_all(tsv, \"IGC\", tsv_setter_gs_col, &gs_igc);\n\n    gs_col_t gs_baf = {col2sample, GS_BAF, baf_arr};\n    gs_input[GS_BAF] = !tsv_register_all(tsv, \"BAF\", tsv_setter_gs_col, &gs_baf);\n\n    gs_col_t gs_lrr = {col2sample, GS_LRR, lrr_arr};\n    gs_input[GS_LRR] = !tsv_register_all(tsv, \"LRR\", tsv_setter_gs_col, &gs_lrr);\n\n    gs_col_t gs_norm_x = {col2sample, GS_NORMX, norm_x_arr};\n    gs_input[GS_NORMX] = !tsv_register_all(tsv, \"NORMX\", tsv_setter_gs_col, &gs_norm_x);\n\n    gs_col_t gs_norm_y = {col2sample, GS_NORMY, norm_y_arr};\n    gs_input[GS_NORMY] = !tsv_register_all(tsv, \"NORMY\", tsv_setter_gs_col, &gs_norm_y);\n\n    gs_col_t gs_ilmn_r = {col2sample, GS_R, ilmn_r_arr};\n    gs_input[GS_R] = !tsv_register_all(tsv, \"R\", tsv_setter_gs_col, &gs_ilmn_r);\n\n    gs_col_t gs_ilmn_theta = {col2sample, GS_THETA, ilmn_theta_arr};\n    gs_input[GS_THETA] = !tsv_register_all(tsv, \"THETA\", tsv_setter_gs_col, &gs_ilmn_theta);\n\n    gs_col_t gs_raw_x = {col2sample, GS_X, raw_x_arr};\n    gs_input[GS_X] = !tsv_register_all(tsv, \"X\", tsv_setter_gs_col, &gs_raw_x);\n\n    gs_col_t gs_raw_y = {col2sample, GS_Y, raw_y_arr};\n    gs_input[GS_Y] = !tsv_register_all(tsv, \"Y\", tsv_setter_gs_col, &gs_raw_y);\n\n    gs_output[GS_GT] = flags & 
FORMAT_GT;\n    gs_output[GS_IGC] = (flags & FORMAT_IGC) && gs_input[GS_IGC];\n    gs_output[GS_X] = (flags & FORMAT_X) && gs_input[GS_X];\n    gs_output[GS_Y] = (flags & FORMAT_Y) && gs_input[GS_Y];\n    gs_output[GS_NORMX] = (flags & FORMAT_NORMX) && gs_input[GS_NORMX];\n    gs_output[GS_NORMY] = (flags & FORMAT_NORMY) && gs_input[GS_NORMY];\n    gs_output[GS_R] = (flags & FORMAT_R) && (gs_input[GS_R] || (gs_input[GS_NORMX] && gs_input[GS_NORMY]));\n    gs_output[GS_THETA] = (flags & FORMAT_THETA) && (gs_input[GS_THETA] || (gs_input[GS_NORMX] && gs_input[GS_NORMY]));\n    gs_output[GS_BAF] =\n        (flags & FORMAT_BAF)\n        && (gs_input[GS_BAF]\n            || (egt && ((gs_input[GS_NORMX] && gs_input[GS_NORMY]) || (gs_input[GS_R] && gs_input[GS_THETA]))));\n    gs_output[GS_LRR] =\n        (flags & FORMAT_LRR)\n        && (gs_input[GS_LRR]\n            || (egt && ((gs_input[GS_NORMX] && gs_input[GS_NORMY]) || (gs_input[GS_R] && gs_input[GS_THETA]))));\n\n    int compute_ilmn_theta_r =\n        gs_input[GS_NORMX] && gs_input[GS_NORMY]\n        && (gs_output[GS_R] || gs_output[GS_THETA] || (egt && (gs_output[GS_BAF] || gs_output[GS_LRR])));\n    int compute_baf_lrr = ((gs_input[GS_NORMX] && gs_input[GS_NORMY]) || (gs_input[GS_R] || gs_input[GS_THETA])) && egt\n                          && (gs_output[GS_BAF] || gs_output[GS_LRR]);\n\n    bcf1_t *rec = bcf_init();\n    kstring_t allele_a = {0, 0, NULL};\n    kputc('.', &allele_a);\n    kstring_t allele_b = {0, 0, NULL};\n    kputc('.', &allele_b);\n    kstring_t flank = {0, 0, NULL};\n    int32_t allele_a_idx, allele_b_idx;\n    int n_total = 0, n_missing = 0, n_skipped = 0;\n    while (hts_getline(gs_fh, KS_SEP_LINE, &line) > 0) {\n        if (line.s[0] == '#') continue; // skip comments\n        n_total++;\n        bcf_clear(rec);\n        rec->n_sample = n;\n\n        int intensity_only = 0;\n        if (!tsv_parse_delimiter(tsv, rec, line.s, '\\t')) {\n            if (bpm) {\n                int idx;\n      
          int ret = khash_str2int_get(bpm->names2index, rec->d.id, &idx);\n                if (ret < 0) error(\"Illumina probe %s not found in manifest file\\n\", rec->d.id);\n                LocusEntry *locus_entry = &bpm->locus_entries[idx];\n                intensity_only = locus_entry->intensity_only;\n                ClusterRecord *cluster_record = NULL;\n                if (flags & EGT_LOADED) {\n                    int idx;\n                    int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx);\n                    if (ret < 0) error(\"Illumina probe %s not found in cluster file\\n\", locus_entry->name);\n                    cluster_record = &egt->cluster_records[idx];\n                }\n                switch (locus2bcf(locus_entry, cluster_record, hdr, fai, gc_win, flags, &allele_a, &allele_b, &flank,\n                                  &allele_a_idx, &allele_b_idx, rec)) {\n                case -1:\n                    n_skipped++;\n                    continue;\n                case 1:\n                    n_missing++;\n                    break;\n                }\n                if (compute_ilmn_theta_r)\n                    for (i = 0; i < n; i++)\n                        norm_x_y2ilmn_theta_r(norm_x_arr[i], norm_y_arr[i], &ilmn_theta_arr[i], &ilmn_r_arr[i]);\n                if (compute_baf_lrr) {\n                    if ((flags & ADJUST_CLUSTERS) && !locus_entry->intensity_only)\n                        adjust_clusters(gts, ilmn_theta_arr, ilmn_r_arr, n, cluster_record);\n                    for (i = 0; i < n; i++) {\n                        if (!isnan(ilmn_theta_arr[i]) && !isnan(ilmn_r_arr[i])) {\n                            get_baf_lrr(\n                                ilmn_theta_arr[i], ilmn_r_arr[i], cluster_record->aa_cluster_stats.theta_mean,\n                                cluster_record->ab_cluster_stats.theta_mean,\n                                cluster_record->bb_cluster_stats.theta_mean, 
cluster_record->aa_cluster_stats.r_mean,\n                                cluster_record->ab_cluster_stats.r_mean, cluster_record->bb_cluster_stats.r_mean,\n                                locus_entry->intensity_only ? cluster_record->r_mean : NAN, &baf_arr[i], &lrr_arr[i]);\n                        } else {\n                            baf_arr[i] = -NAN;\n                            lrr_arr[i] = -NAN;\n                        }\n                    }\n                }\n            } else {\n                if (rec->rid < 0 || rec->pos < 0) {\n                    if (flags & VERBOSE) fprintf(stderr, \"Skipping unlocalized marker %s\\n\", rec->d.id);\n                    n_skipped++;\n                    continue;\n                }\n\n                // determine A and B alleles\n                allele_a.s[0] = '.';\n                allele_b.s[0] = '.';\n                if (ilmn_strand && snp) {\n                    if (strncmp(ilmn_strand, \"BOT\", 3) == 0) {\n                        allele_a.s[0] = rev_nt(snp[1]);\n                        allele_b.s[0] = rev_nt(snp[3]);\n                    } else {\n                        allele_a.s[0] = snp[1];\n                        allele_b.s[0] = snp[3];\n                    }\n                } else {\n                    for (i = 0; i < n; i++) {\n                        switch (gts[i]) {\n                        case GT_NC:\n                            break;\n                        case GT_AA:\n                            allele_a.s[0] = strand_alleles[2 * i];\n                            break;\n                        case GT_AB:\n                            allele_a.s[0] = strand_alleles[2 * i];\n                            allele_b.s[0] = strand_alleles[2 * i + 1];\n                            break;\n                        case GT_BB:\n                            allele_b.s[0] = strand_alleles[2 * i];\n                            break;\n                        default:\n                            
error(\"Unable to process marker %s\\n\", rec->d.id);\n                            break;\n                        }\n                    }\n                }\n                int len, win = min(max(100, gc_win), rec->pos);\n                char *ref = faidx_fetch_seq(fai, bcf_seqname(hdr, rec), rec->pos - win, rec->pos + win, &len);\n                if (!ref || len == 1)\n                    error(\"faidx_fetch_seq failed at %s:%\" PRId64 \" (are you using the correct reference genome?)\\n\",\n                          bcf_seqname(hdr, rec), rec->pos + 1);\n                strupper(ref);\n                if (!(flags & NO_INFO_GC)) {\n                    float gc_ratio = get_gc_ratio(&ref[max(win - gc_win, 0)], &ref[min(win + gc_win, len)]);\n                    bcf_update_info_float(hdr, rec, \"GC\", &gc_ratio, 1);\n                }\n                char ref_base[] = {ref[win], '\\0'};\n                int is_indel =\n                    allele_a.s[0] == 'D' || allele_a.s[0] == 'I' || allele_b.s[0] == 'D' || allele_b.s[0] == 'I';\n                if (is_indel) {\n                    if (allele_a.s[0] == '.') {\n                        allele_a.s[0] = allele_b.s[0] == 'D' ? 'I' : 'D';\n                    }\n                    if (allele_b.s[0] == '.') {\n                        allele_b.s[0] = allele_a.s[0] == 'D' ? 'I' : 'D';\n                    }\n                    ref_base[0] = allele_a.s[0];\n                    n_missing++;\n                } else if ((ilmn_strand && snp) || strand_alleles == top_strand_alleles) {\n                    if (allele_a.s[0] == '.' 
|| allele_b.s[0] == '.') {\n                        allele_a.s[0] = '.';\n                        allele_b.s[0] = '.';\n                    } else {\n                        int strand = get_strand_from_top_alleles(allele_a.s, allele_b.s, ref, win, len);\n                        if (strand < 0) {\n                            if (flags & VERBOSE)\n                                fprintf(stderr, \"Unable to determine reference strand for SNP %s\\n\", rec->d.id);\n                            allele_a.s[0] = '.';\n                            allele_b.s[0] = '.';\n                        } else if (strand == 1) {\n                            allele_a.s[0] = rev_nt(allele_a.s[0]);\n                            allele_b.s[0] = rev_nt(allele_b.s[0]);\n                        }\n                    }\n                }\n                free(ref);\n\n                allele_b_idx = get_allele_b_idx(ref_base[0], allele_a.s, allele_b.s);\n                allele_a_idx = get_allele_a_idx(allele_b_idx);\n                const char *alleles[3];\n                int nals = alleles_ab_to_vcf(alleles, ref_base, allele_a.s, allele_b.s, allele_b_idx);\n                if (nals < 0) error(\"Unable to process marker %s\\n\", rec->d.id);\n                bcf_update_alleles(hdr, rec, alleles, nals);\n                bcf_update_info_int32(hdr, rec, \"ALLELE_A\", &allele_a_idx, 1);\n                bcf_update_info_int32(hdr, rec, \"ALLELE_B\", &allele_b_idx, 1);\n\n                if (gentrain_score == 0) bcf_update_info_float(hdr, rec, \"GenTrain_Score\", &total_score, 1);\n                if (frac_a == 0) bcf_update_info_float(hdr, rec, \"FRAC_A\", &frac[0], 1);\n                if (frac_c == 0) bcf_update_info_float(hdr, rec, \"FRAC_C\", &frac[1], 1);\n                if (frac_g == 0) bcf_update_info_float(hdr, rec, \"FRAC_G\", &frac[2], 1);\n                if (frac_t == 0) bcf_update_info_float(hdr, rec, \"FRAC_T\", &frac[3], 1);\n            }\n\n            if (!intensity_only) {\n      
          if (allele_a_idx >= 0 && allele_b_idx >= 0) {\n                    gts_to_gt_arr(gt_arr, gts, n, allele_a_idx, allele_b_idx);\n                } else {\n                    for (i = 0; i < n; i++) {\n                        gt_arr[2 * i] = bcf_gt_missing;\n                        gt_arr[2 * i + 1] = bcf_gt_missing;\n                    }\n                }\n                bcf_update_genotypes(hdr, rec, gt_arr, n * 2);\n\n                if (gs_output[GS_IGC]) {\n                    for (i = 0; i < n; i++) {\n                        gq_arr[i] = (int)(-10 * log10(1 - igc_arr[i]) + .5);\n                        if (gq_arr[i] < 0) gq_arr[i] = 0;\n                        if (gq_arr[i] > 50) gq_arr[i] = 50;\n                    }\n                    bcf_update_format_float(hdr, rec, \"IGC\", (float *)gs_igc.ptr, n);\n                    if (flags && FORMAT_GQ) bcf_update_format_int32(hdr, rec, \"GQ\", gq_arr, n);\n                }\n            }\n            if (gs_output[GS_BAF]) bcf_update_format_float(hdr, rec, \"BAF\", baf_arr, n);\n            if (gs_output[GS_LRR]) bcf_update_format_float(hdr, rec, \"LRR\", lrr_arr, n);\n            if (gs_output[GS_NORMX]) bcf_update_format_float(hdr, rec, \"NORMX\", norm_x_arr, n);\n            if (gs_output[GS_NORMY]) bcf_update_format_float(hdr, rec, \"NORMY\", norm_y_arr, n);\n            if (gs_output[GS_R]) bcf_update_format_float(hdr, rec, \"R\", ilmn_r_arr, n);\n            if (gs_output[GS_THETA]) bcf_update_format_float(hdr, rec, \"THETA\", ilmn_theta_arr, n);\n            if (gs_output[GS_X]) bcf_update_format_int32(hdr, rec, \"X\", raw_x_arr, n);\n            if (gs_output[GS_Y]) bcf_update_format_int32(hdr, rec, \"Y\", raw_y_arr, n);\n            if (bcf_write(out_fh, hdr, rec) < 0) error(\"Unable to write to output VCF file\\n\");\n        } else {\n            if (flags & VERBOSE) fprintf(stderr, \"Failed to process marker %s\\n\", rec->d.id);\n            n_skipped++;\n        }\n    }\n    
fprintf(stderr, \"Lines   total/missing-reference/skipped:\\t%d/%d/%d\\n\", n_total, n_missing, n_skipped);\n    free(line.s);\n\n    free(col2sample);\n    free(gts);\n    free(gt_arr);\n    free(gq_arr);\n    free(igc_arr);\n    free(baf_arr);\n    free(lrr_arr);\n    free(norm_x_arr);\n    free(norm_y_arr);\n    free(ilmn_r_arr);\n    free(ilmn_theta_arr);\n    free(raw_x_arr);\n    free(raw_y_arr);\n    free(top_strand_alleles);\n    free(ref_strand_alleles);\n    tsv_destroy(tsv);\n    free(str.s);\n\n    free(allele_a.s);\n    free(allele_b.s);\n    free(flank.s);\n\n    bcf_destroy(rec);\n    bcf_hdr_destroy(hdr);\n    if (hts_close(gs_fh) < 0) error(\"Error: close failed: %s\\n\", gs_fh->fn);\n}\n\n/****************************************\n * PLUGIN                               *\n ****************************************/\n\nconst char *about(void) { return \"Convert Illumina GTC files to VCF.\\n\"; }\n\nstatic const char *usage_text(void) {\n    return \"\\n\"\n           \"About: convert Illumina GTC files containing intensity data into VCF. 
\"\n           \"(version \" GTC2VCF_VERSION\n           \" http://github.com/freeseek/gtc2vcf)\\n\"\n           \"Usage: bcftools +gtc2vcf [options] [<A.gtc> ...]\\n\"\n           \"\\n\"\n           \"Plugin options:\\n\"\n           \"    -l, --list-tags                   list available FORMAT tags with description for VCF output\\n\"\n           \"    -t, --tags LIST                   list of output FORMAT tags [\" TAG_LIST_DFLT\n           \"]\\n\"\n           \"    -b, --bpm <file>                  BPM manifest file\\n\"\n           \"    -c, --csv <file>                  CSV manifest file (can be gzip compressed)\\n\"\n           \"    -e, --egt <file>                  EGT cluster file\\n\"\n           \"    -f, --fasta-ref <file>            reference sequence in fasta format\\n\"\n           \"        --set-cache-size <int>        select fasta cache size in bytes\\n\"\n           \"        --gc-window-size <int>        window size in bp used to compute the GC content (-1 for no \"\n           \"estimate) [\" GC_WIN_DFLT\n           \"]\\n\"\n           \"    -g, --gtcs <dir|file>             GTC genotype files from directory or list from file\\n\"\n           \"    -i, --idat                        input IDAT files rather than GTC files\\n\"\n           \"        --capacity <int>              number of variants to read from intensity files per I/O operation \"\n           \"[\" CAPACITY_DFLT\n           \"]\\n\"\n           \"        --adjust-clusters             adjust cluster centers in (Theta, R) space (requires --bpm and \"\n           \"--egt)\\n\"\n           \"        --use-gtc-sample-names        use sample name in GTC files rather than GTC file name\\n\"\n           \"        --do-not-check-bpm            do not check whether BPM and GTC files match manifest file name\\n\"\n           \"        --do-not-check-eof            do not check whether the BPM and EGT readers reach the end of the \"\n           \"file\\n\"\n           \"        
--genome-studio <file>        input a GenomeStudio final report file (in matrix format)\\n\"\n           \"        --no-version                  do not append version and command line to the header\\n\"\n           \"    -o, --output <file>               write output to a file [standard output]\\n\"\n           \"    -O, --output-type u|b|v|z|t[0-9]  u/b: un/compressed BCF, v/z: un/compressed VCF\\n\"\n           \"                                      t: GenomeStudio tab-delimited text output, 0-9: compression level \"\n           \"[v]\\n\"\n           \"        --threads <int>               number of extra output compression threads [0]\\n\"\n           \"    -x, --extra <file>                write GTC metadata to a file\\n\"\n           \"    -v, --verbose                     print verbose information\\n\"\n           \"    -W, --write-index[=FMT]           Automatically index the output files [off]\\n\"\n           \"\\n\"\n           \"Manifest options:\\n\"\n           \"        --beadset-order               output BeadSetID normalization order (requires --bpm and --csv)\\n\"\n           \"        --fasta-flank                 output flank sequence in FASTA format (requires --csv)\\n\"\n           \"    -s, --sam-flank <file>            input flank sequence alignment in SAM/BAM format (requires --csv)\\n\"\n           \"        --genome-build <assembly>     genome build ID used to update the manifest file [\" GENOME_BUILD_DFLT\n           \"]\\n\"\n           \"\\n\"\n           \"Examples:\\n\"\n           \"    bcftools +gtc2vcf -i 5434246082_R03C01_Grn.idat\\n\"\n           \"    bcftools +gtc2vcf 5434246082_R03C01.gtc\\n\"\n           \"    bcftools +gtc2vcf -b HumanOmni2.5-4v1_H.bpm -c HumanOmni2.5-4v1_H.csv\\n\"\n           \"    bcftools +gtc2vcf -e HumanOmni2.5-4v1_H.egt\\n\"\n           \"    bcftools +gtc2vcf -c GSA-24v3-0_A1.csv -e GSA-24v3-0_A1_ClusterFile.egt -f human_g1k_v37.fasta -o \"\n           \"GSA-24v3-0_A1.vcf\\n\"\n           \"    
bcftools +gtc2vcf -c HumanOmni2.5-4v1_H.csv -f human_g1k_v37.fasta 5434246082_R03C01.gtc -o \"\n           \"5434246082_R03C01.vcf\\n\"\n           \"    bcftools +gtc2vcf -f human_g1k_v37.fasta --genome-studio GenotypeReport.txt -o GenotypeReport.vcf\\n\"\n           \"\\n\"\n           \"Examples of manifest file options:\\n\"\n           \"    bcftools +gtc2vcf -b GSA-24v3-0_A1.bpm -c GSA-24v3-0_A1.csv --beadset-order\\n\"\n           \"    bcftools +gtc2vcf -c GSA-24v3-0_A1.csv --fasta-flank -o GSA-24v3-0_A1.fasta\\n\"\n           \"    bwa mem -M Homo_sapiens_assembly38.fasta GSA-24v3-0_A1.fasta -o GSA-24v3-0_A1.sam\\n\"\n           \"    bcftools +gtc2vcf -c GSA-24v3-0_A1.csv --sam-flank GSA-24v3-0_A1.sam -o GSA-24v3-0_A1.GRCh38.csv\\n\"\n           \"\\n\";\n}\n\nstatic int parse_tags(const char *str) {\n    int i, flags = 0, n;\n    char **tags = hts_readlist(str, 0, &n);\n    for (i = 0; i < n; i++) {\n        if (!strcasecmp(tags[i], \"GT\"))\n            flags |= FORMAT_GT;\n        else if (!strcasecmp(tags[i], \"GQ\"))\n            flags |= FORMAT_GQ;\n        else if (!strcasecmp(tags[i], \"IGC\"))\n            flags |= FORMAT_IGC;\n        else if (!strcasecmp(tags[i], \"X\"))\n            flags |= FORMAT_X;\n        else if (!strcasecmp(tags[i], \"Y\"))\n            flags |= FORMAT_Y;\n        else if (!strcasecmp(tags[i], \"NORMX\"))\n            flags |= FORMAT_NORMX;\n        else if (!strcasecmp(tags[i], \"NORMY\"))\n            flags |= FORMAT_NORMY;\n        else if (!strcasecmp(tags[i], \"R\"))\n            flags |= FORMAT_R;\n        else if (!strcasecmp(tags[i], \"THETA\"))\n            flags |= FORMAT_THETA;\n        else if (!strcasecmp(tags[i], \"LRR\"))\n            flags |= FORMAT_LRR;\n        else if (!strcasecmp(tags[i], \"BAF\"))\n            flags |= FORMAT_BAF;\n        else\n            error(\"Error parsing \\\"--tags %s\\\": the tag \\\"%s\\\" is not supported\\n\", str, tags[i]);\n        free(tags[i]);\n    }\n    if (n) 
free(tags);\n    return flags;\n}\n\nstatic void list_tags(void) {\n    error(\n        \"FORMAT/GT       Number:1  Type:String   ..  Genotype\\n\"\n        \"FORMAT/GQ       Number:1  Type:Integer  ..  Genotype Quality\\n\"\n        \"FORMAT/IGC      Number:1  Type:Float    ..  Illumina GenCall Confidence Score\\n\"\n        \"FORMAT/BAF      Number:1  Type:Float    ..  B Allele Frequency\\n\"\n        \"FORMAT/LRR      Number:1  Type:Float    ..  Log R Ratio\\n\"\n        \"FORMAT/NORMX    Number:1  Type:Float    ..  Normalized X intensity\\n\"\n        \"FORMAT/NORMY    Number:1  Type:Float    ..  Normalized Y intensity\\n\"\n        \"FORMAT/R        Number:1  Type:Float    ..  Normalized R value\\n\"\n        \"FORMAT/THETA    Number:1  Type:Float    ..  Normalized Theta value\\n\"\n        \"FORMAT/X        Number:1  Type:Integer  ..  Raw X intensity\\n\"\n        \"FORMAT/Y        Number:1  Type:Integer  ..  Raw Y intensity\\n\");\n}\n\nint run(int argc, char *argv[]) {\n    const char *tag_list = TAG_LIST_DFLT;\n    const char *bpm_fname = NULL;\n    const char *csv_fname = NULL;\n    const char *egt_fname = NULL;\n    const char *gs_fname = NULL;\n    const char *output_fname = \"-\";\n    const char *ref_fname = NULL;\n    const char *pathname = NULL;\n    const char *extra_fname = NULL;\n    const char *sam_fname = NULL;\n    const char *genome_build = GENOME_BUILD_DFLT;\n    char *index_fname;\n    char *tmp;\n    int i, j;\n    int flags = 0;\n    int output_type = FT_VCF;\n    int clevel = -1;\n    size_t capacity = 0;\n    int cache_size = 0;\n    int gc_win = (int)strtol(GC_WIN_DFLT, NULL, 0);\n    int gtc_sample_names = 0;\n    int bpm_check = 1;\n    int eof_check = 1;\n    int n_threads = 0;\n    int record_cmd_line = 1;\n    int write_index = 0;\n    int binary_to_csv = 0;\n    int beadset_order = 0;\n    int fasta_flank = 0;\n    faidx_t *fai = NULL;\n    htsFile *out_fh = NULL;\n    FILE *out_txt = NULL;\n\n    static struct option loptions[] 
= {\n        {\"list-tags\", no_argument, NULL, 'l'},          {\"tags\", required_argument, NULL, 't'},\n        {\"bpm\", required_argument, NULL, 'b'},          {\"csv\", required_argument, NULL, 'c'},\n        {\"egt\", required_argument, NULL, 'e'},          {\"fasta-ref\", required_argument, NULL, 'f'},\n        {\"set-cache-size\", required_argument, NULL, 1}, {\"gc-window-size\", required_argument, NULL, 2},\n        {\"gtcs\", required_argument, NULL, 'g'},         {\"idat\", no_argument, NULL, 'i'},\n        {\"capacity\", required_argument, NULL, 3},       {\"adjust-clusters\", no_argument, NULL, 4},\n        {\"use-gtc-sample-names\", no_argument, NULL, 5}, {\"do-not-check-bpm\", no_argument, NULL, 6},\n        {\"do-not-check-eof\", no_argument, NULL, 7},     {\"genome-studio\", required_argument, NULL, 8},\n        {\"no-version\", no_argument, NULL, 9},           {\"output\", required_argument, NULL, 'o'},\n        {\"output-type\", required_argument, NULL, 'O'},  {\"threads\", required_argument, NULL, 10},\n        {\"extra\", required_argument, NULL, 'x'},        {\"verbose\", no_argument, NULL, 'v'},\n        {\"beadset-order\", no_argument, NULL, 12},       {\"fasta-flank\", no_argument, NULL, 13},\n        {\"sam-flank\", required_argument, NULL, 's'},    {\"genome-build\", required_argument, NULL, 14},\n        {\"write-index\", optional_argument, NULL, 'W'},  {NULL, 0, NULL, 0}};\n    int c;\n    while ((c = getopt_long(argc, argv, \"h?lt:b:c:e:f:g:io:O:x:vs:W::\", loptions, NULL)) >= 0) {\n        switch (c) {\n        case 'l':\n            list_tags();\n            break;\n        case 't':\n            tag_list = optarg;\n            break;\n        case 'b':\n            bpm_fname = optarg;\n            break;\n        case 'c':\n            csv_fname = optarg;\n            break;\n        case 'e':\n            egt_fname = optarg;\n            break;\n        case 'f':\n            ref_fname = optarg;\n            break;\n        case 
1:\n            cache_size = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --set-cache-size %s\\n\", optarg);\n            break;\n        case 2:\n            gc_win = (int)strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --gc-window-size %s\\n\", optarg);\n            if (gc_win <= 0) flags |= NO_INFO_GC;\n            break;\n        case 'g':\n            pathname = optarg;\n            break;\n        case 'i':\n            flags |= LOAD_IDAT;\n            break;\n        case 3:\n            capacity = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --capacity %s\\n\", optarg);\n            break;\n        case 4:\n            flags |= ADJUST_CLUSTERS;\n            break;\n        case 5:\n            gtc_sample_names = 1;\n            break;\n        case 6:\n            bpm_check = 0;\n            break;\n        case 7:\n            eof_check = 0;\n            break;\n        case 8:\n            gs_fname = optarg;\n            break;\n        case 9:\n            record_cmd_line = 0;\n            break;\n        case 'o':\n            output_fname = optarg;\n            break;\n        case 'O':\n            switch (optarg[0]) {\n            case 'b':\n                output_type = FT_BCF_GZ;\n                break;\n            case 'u':\n                output_type = FT_BCF;\n                break;\n            case 'z':\n                output_type = FT_VCF_GZ;\n                break;\n            case 'v':\n                output_type = FT_VCF;\n                break;\n            case 't':\n                output_type = FT_TAB_TEXT;\n                break;\n            default: {\n                clevel = strtol(optarg, &tmp, 10);\n                if (*tmp || clevel < 0 || clevel > 9) error(\"The output type \\\"%s\\\" not recognised\\n\", optarg);\n            }\n            }\n            if (optarg[1]) {\n                clevel = strtol(optarg + 1, &tmp, 10);\n              
  if (*tmp || clevel < 0 || clevel > 9)\n                    error(\"Could not parse argument: --compression-level %s\\n\", optarg + 1);\n            }\n            break;\n        case 10:\n            n_threads = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse argument: --threads %s\\n\", optarg);\n            break;\n        case 'x':\n            extra_fname = optarg;\n            break;\n        case 'v':\n            flags |= VERBOSE;\n            break;\n        case 12:\n            beadset_order = 1;\n            break;\n        case 13:\n            fasta_flank = 1;\n            break;\n        case 's':\n            sam_fname = optarg;\n            break;\n        case 14:\n            genome_build = optarg;\n            break;\n        case 'W':\n            if (!(write_index = write_index_parse(optarg))) error(\"Unsupported index format '%s'\\n\", optarg);\n            break;\n        case 'h':\n        case '?':\n        default:\n            error(\"%s\", usage_text());\n        }\n    }\n    if ((((bpm_fname != NULL) || (csv_fname != NULL)) + (egt_fname != NULL) + (argc - optind > 0) + (pathname != NULL)\n         == 1)\n        && ref_fname == NULL && gs_fname == NULL)\n        binary_to_csv = 1;\n    if (sam_fname && (csv_fname == NULL)) error(\"The --sam-flank option requires the --csv option\\n%s\", usage_text());\n    if (binary_to_csv) {\n        if (beadset_order && (bpm_fname == NULL || csv_fname == NULL))\n            error(\"The --beadset-order option requires both the --bpm and the --csv options\\n%s\", usage_text());\n        if (fasta_flank && (csv_fname == NULL))\n            error(\"The --fasta-flank option requires the --csv option\\n%s\", usage_text());\n        if (beadset_order + fasta_flank + (sam_fname != NULL) > 1)\n            error(\n                \"Only one of --beadset-order or --fasta-flank or --sam-flank options can be \"\n                \"used at once\\n%s\",\n                usage_text());\n   
 } else {\n        if (flags & LOAD_IDAT)\n            error(\"The --idat option can only be used alone or with option --gtcs\\n%s\", usage_text());\n        if (beadset_order)\n            error(\"The --beadset-order option can only be used with options --bpm and --csv\\n%s\", usage_text());\n        if (fasta_flank)\n            error(\"The --fasta-flank option can only be used with options --bpm and --csv\\n%s\", usage_text());\n        if (!bpm_fname && !csv_fname && !gs_fname)\n            error(\"Manifest file required when converting to VCF\\n%s\", usage_text());\n        if (!egt_fname && (flags & ADJUST_CLUSTERS))\n            error(\"Cluster file required when adjusting cluster centers\\n%s\", usage_text());\n        if (gs_fname && (argc - optind > 0 || pathname))\n            error(\"If a GenomeStudio final report file is provided, do not pass GTC files\\n%s\", usage_text());\n        if (gs_fname && output_type == FT_TAB_TEXT)\n            error(\"If a GenomeStudio final report file is provided, you cannot output in GenomeStudio format\\n%s\",\n                  usage_text());\n        if (argc - optind > 0 && pathname)\n            error(\"GTC files cannot be listed through both command interface and file list\\n%s\", usage_text());\n        if (!gs_fname && output_type != FT_TAB_TEXT && extra_fname) out_txt = get_file_handle(extra_fname);\n    }\n    flags |= parse_tags(tag_list);\n\n    // beginning of plugin run\n    fprintf(stderr, \"gtc2vcf \" GTC2VCF_VERSION \" http://github.com/freeseek/gtc2vcf\\n\");\n\n    int nfiles = 0;\n    char **filenames = NULL;\n    if (pathname) {\n        filenames = get_file_list(pathname, flags & LOAD_IDAT ? 
\"idat\" : \"gtc\", &nfiles);\n    } else {\n        nfiles = argc - optind;\n        filenames = argv + optind;\n    }\n    void **files = (void **)malloc(nfiles * sizeof(void *));\n\n    // make sure the process is allowed to open enough files\n    struct rlimit lim;\n    getrlimit(RLIMIT_NOFILE, &lim);\n    if (nfiles + 10 > lim.rlim_max)\n        error(\"On this system you cannot open more than %ld files at once while %d is required\\n\", lim.rlim_max,\n              nfiles + 10);\n    if (nfiles + 10 > lim.rlim_cur) {\n        lim.rlim_cur = nfiles + 10;\n        fprintf(stderr, \"Adjusting the limit of how many files can be open at once to %ld\\n\", lim.rlim_cur);\n        setrlimit(RLIMIT_NOFILE, &lim);\n    }\n\n    if ((flags & ADJUST_CLUSTERS) && nfiles < 100)\n        fprintf(stderr, \"Warning: adjusting clusters with %d sample(s) is not recommended\\n\", nfiles);\n\n    if (binary_to_csv || output_type == FT_TAB_TEXT) {\n        out_txt = get_file_handle(output_fname);\n    } else {\n        char wmode[8];\n        set_wmode(wmode, output_type, (char *)output_fname, clevel);\n        out_fh = hts_open(output_fname, hts_bcf_wmode(output_type));\n        if (out_fh == NULL) error(\"[%s] Error: cannot write to \\\"%s\\\": %s\\n\", __func__, output_fname, strerror(errno));\n        if (n_threads) hts_set_threads(out_fh, n_threads);\n        if (!ref_fname) error(\"VCF output requires the --fasta-ref option\\n\");\n        fai = fai_load(ref_fname);\n        if (!fai) error(\"Could not load the reference %s\\n\", ref_fname);\n        if (cache_size) fai_set_cache_size(fai, cache_size);\n        if (extra_fname) out_txt = get_file_handle(extra_fname);\n    }\n\n    bpm_t *bpm = NULL;\n    if (bpm_fname) {\n        fprintf(stderr, \"Reading BPM file %s\\n\", bpm_fname);\n        bpm = bpm_init(bpm_fname, eof_check, gs_fname != NULL);\n        flags |= BPM_LOADED;\n        if (binary_to_csv && !csv_fname) bpm_to_csv(bpm, out_txt, flags);\n    }\n\n    if 
(csv_fname) {\n        fprintf(stderr, \"Reading CSV file %s\\n\", csv_fname);\n        bpm = bpm_csv_init(csv_fname, bpm, gs_fname != NULL);\n        flags |= CSV_LOADED;\n        if (binary_to_csv && !sam_fname && !beadset_order && !fasta_flank) bpm_to_csv(bpm, out_txt, flags);\n    }\n\n    // output source sequences in FASTA format to be realigned by bwa mem\n    if (fasta_flank) {\n        for (i = 0; i < bpm->num_loci; i++)\n            flank2fasta(bpm->locus_entries[i].ilmn_id, bpm->locus_entries[i].source_seq, out_txt);\n    }\n\n    // input source sequence alignments in SAM format to generate new coordinates for the\n    // CSV manifest file\n    if (sam_fname) {\n        fprintf(stderr, \"Reading SAM file %s\\n\", sam_fname);\n        bpm = sam_csv_init(sam_fname, bpm, genome_build, flags);\n        if (binary_to_csv) bpm_to_csv(bpm, out_txt, flags);\n    }\n\n    // the BeadSet normalization order is the only information in the BPM manifest file\n    // missing from the CSV manifest file\n    kstring_t str = {0, 0, NULL};\n    if ((flags & BPM_LOADED) && (flags & CSV_LOADED)) {\n        int32_t norm_id_to_beadset_id[100] = {0};\n        for (i = 0; i < bpm->num_loci; i++) {\n            uint8_t norm_id = bpm->norm_ids[i] % 100;\n            if (norm_id_to_beadset_id[norm_id] != 0\n                && norm_id_to_beadset_id[norm_id] != bpm->locus_entries[i].beadset_id) {\n                if (norm_id > 4) // exception for possible overflow with Omni5 arrays\n                    error(\"Normalization ID %d corresponds to multiple BeadSet IDs %d and %d\\n\", norm_id,\n                          norm_id_to_beadset_id[norm_id], bpm->locus_entries[i].beadset_id);\n                if (bpm->norm_ids[i] < 100) continue;\n            }\n            norm_id_to_beadset_id[norm_id] = bpm->locus_entries[i].beadset_id;\n        }\n        for (i = 0, j = 0; i < 100; i++) {\n            if (norm_id_to_beadset_id[i] == 0) continue;\n            if (i != j) 
error(\"Normalization ID %d not corresponding to any BeadSet ID\", j);\n            if (i > 0) kputc(',', &str);\n            kputw(norm_id_to_beadset_id[i], &str);\n            j++;\n        }\n        if (beadset_order && out_txt) fprintf(out_txt, \"%s,%s\\n\", bpm->manifest_name, str.s);\n    }\n    if ((flags & ADJUST_CLUSTERS) && !(flags & BPM_LOADED))\n        error(\"Cannot adjust clusters as couldn't generate the normalization lookup table\\n\");\n\n    egt_t *egt = NULL;\n    if (egt_fname) {\n        fprintf(stderr, \"Reading EGT file %s\\n\", egt_fname);\n        egt = egt_init(egt_fname, eof_check);\n        if (binary_to_csv)\n            egt_to_csv(egt, out_txt, flags & VERBOSE);\n        else\n            flags |= EGT_LOADED;\n    }\n\n    if (bpm && egt) {\n        if (bpm->num_loci < egt->num_records)\n            fprintf(stderr, \"Warning: Manifest file includes less loci (%d) than records in the cluster file (%d)\\n\",\n                    bpm->num_loci, egt->num_records);\n        else if (bpm->num_loci > egt->num_records)\n            error(\"Manifest file includes more loci (%d) than records in the cluster file (%d)\\n\", bpm->num_loci,\n                  egt->num_records);\n    }\n\n    if (gs_fname) flags |= GENOME_STUDIO;\n\n    for (i = 0; i < nfiles; i++) {\n        if (flags & LOAD_IDAT) {\n            fprintf(stderr, \"Reading IDAT file %s\\n\", filenames[i]);\n            idat_t *idat = idat_init(filenames[i], nfiles == 1);\n            files[i] = (void *)idat;\n        } else {\n            fprintf(stderr, \"Reading GTC file %s\\n\", filenames[i]);\n            gtc_t *gtc = gtc_init(filenames[i], capacity);\n            // GenCall fills the GTC SNP manifest with the BPM file name rather than\n            // the BPM manifest name\n            if (bpm && bpm->fn && bpm_check\n                && strncmp(bpm->manifest_name, gtc->snp_manifest, strlen(bpm->manifest_name))\n                && strcmp(strrchr(bpm->fn, '/') ? 
strrchr(bpm->fn, '/') + 1 : bpm->fn, gtc->snp_manifest))\n                error(\n                    \"Manifest name %s in BPM file %s does not match manifest name %s in GTC \"\n                    \"file %s\\nUse --do-not-check-bpm to suppress this check\\n\",\n                    bpm->manifest_name, bpm->fn, gtc->snp_manifest, gtc->fn);\n            files[i] = (void *)gtc;\n        }\n    }\n\n    if (binary_to_csv && nfiles > 0) {\n        if (flags & LOAD_IDAT) {\n            if (nfiles == 1)\n                idat_to_csv((idat_t *)files[0], out_txt, flags & VERBOSE);\n            else\n                idats_to_tsv((idat_t **)files, nfiles, out_txt);\n        } else {\n            if (nfiles == 1)\n                gtc_to_csv((gtc_t *)files[0], out_txt, flags & VERBOSE);\n            else\n                gtcs_to_tsv((gtc_t **)files, nfiles, out_txt);\n        }\n    }\n\n    if (!binary_to_csv) {\n        if (nfiles == 1) fprintf(stderr, \"Warning: it is recommended to convert multiple GTC files at once\\n\");\n        if (output_type == FT_TAB_TEXT) {\n            fprintf(stderr, \"Writing GenomeStudio final report file\\n\");\n            gtcs_to_gs((gtc_t **)files, nfiles, bpm, egt, out_txt, flags);\n        } else {\n            fprintf(stderr, \"Writing VCF file\\n\");\n            bcf_hdr_t *hdr = hdr_init(fai, flags);\n            if (bpm_fname)\n                bcf_hdr_printf(hdr, \"##BPM=%s\", strrchr(bpm_fname, '/') ? strrchr(bpm_fname, '/') + 1 : bpm_fname);\n            if (csv_fname)\n                bcf_hdr_printf(hdr, \"##CSV=%s\", strrchr(csv_fname, '/') ? strrchr(csv_fname, '/') + 1 : csv_fname);\n            if (egt_fname)\n                bcf_hdr_printf(hdr, \"##EGT=%s\", strrchr(egt_fname, '/') ? strrchr(egt_fname, '/') + 1 : egt_fname);\n            if (sam_fname)\n                bcf_hdr_printf(hdr, \"##SAM=%s\", strrchr(sam_fname, '/') ? 
strrchr(sam_fname, '/') + 1 : sam_fname);\n            if ((flags & BPM_LOADED) && (flags & CSV_LOADED)) bcf_hdr_printf(hdr, \"##BeadSet_Order=%s\", str.s);\n            if (record_cmd_line) bcf_hdr_append_version(hdr, argc, argv, \"bcftools_gtc2vcf\");\n            if (gs_fname) {\n                htsFile *gs_fh = hts_open(gs_fname, \"r\");\n                bcf_hdr_printf(hdr, \"##GenomeStudio=%s\",\n                               strrchr(gs_fname, '/') ? strrchr(gs_fname, '/') + 1 : gs_fname);\n                gs_to_vcf(fai, bpm, egt, gs_fh, out_fh, hdr, output_fname, index_fname, write_index, flags, gc_win);\n            } else {\n                if (extra_fname) gtcs_to_tsv((gtc_t **)files, nfiles, out_txt);\n                for (i = 0; i < nfiles; i++) {\n                    gtc_t *gtc = (gtc_t *)files[i];\n                    const char *sample_name =\n                        (gtc_sample_names && gtc->sample_name) ? gtc->sample_name : gtc->display_name;\n                    if (bcf_hdr_add_sample(hdr, sample_name) < 0)\n                        error(\"GTC files must correspond to different samples\\n\");\n                }\n                if (bcf_hdr_write(out_fh, hdr) < 0) error(\"Unable to write to output VCF file\\n\");\n                if (init_index2(out_fh, hdr, output_fname, &index_fname, write_index) < 0)\n                    error(\"Error: failed to initialise index for %s\\n\", output_fname);\n                gtcs_to_vcf(fai, bpm, egt, (gtc_t **)files, nfiles, out_fh, hdr, flags, gc_win);\n            }\n            if (write_index) {\n                if (bcf_idx_save(out_fh) < 0) {\n                    if (hts_close(out_fh) != 0)\n                        error(\"Close failed %s\\n\", strcmp(output_fname, \"-\") ? 
output_fname : \"stdout\");\n                    error(\"Error: cannot write to index %s\\n\", index_fname);\n                }\n                free(index_fname);\n            }\n            if (hts_close(out_fh) != 0) error(\"Close failed %s\\n\", strcmp(output_fname, \"-\") ? output_fname : \"stdout\");\n        }\n    }\n\n    free(str.s);\n    fai_destroy(fai);\n    egt_destroy(egt);\n    bpm_destroy(bpm);\n    if (pathname) {\n        for (i = 0; i < nfiles; i++) free(filenames[i]);\n        free(filenames);\n    }\n    for (i = 0; i < nfiles; i++) {\n        if (flags & LOAD_IDAT)\n            idat_destroy((idat_t *)files[i]);\n        else\n            gtc_destroy((gtc_t *)files[i]);\n    }\n    free(files);\n    if (out_txt && out_txt != stdout && out_txt != stderr) fclose(out_txt);\n    return 0;\n}\n"
  },
  {
    "path": "gtc2vcf.h",
    "content": "/* The MIT License\n\n   Copyright (c) 2018-2025 Giulio Genovese\n\n   Author: Giulio Genovese <giulio.genovese@gmail.com>\n\n   Permission is hereby granted, free of charge, to any person obtaining a copy\n   of this software and associated documentation files (the \"Software\"), to deal\n   in the Software without restriction, including without limitation the rights\n   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n   copies of the Software, and to permit persons to whom the Software is\n   furnished to do so, subject to the following conditions:\n\n   The above copyright notice and this permission notice shall be included in\n   all copies or substantial portions of the Software.\n\n   THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n   THE SOFTWARE.\n\n */\n\n#include <dirent.h>\n#include <htslib/hfile.h>\n#include <htslib/faidx.h>\n#include <htslib/sam.h>\n\n#define min(a, b)                                                                                                      \\\n    ({                                                                                                                 \\\n        __typeof__(a) _a = (a);                                                                                        \\\n        __typeof__(b) _b = (b);                                                                                        \\\n        _a < _b ? 
_a : _b;                                                                                             \\\n    })\n\n#define max(a, b)                                                                                                      \\\n    ({                                                                                                                 \\\n        __typeof__(a) _a = (a);                                                                                        \\\n        __typeof__(b) _b = (b);                                                                                        \\\n        _a > _b ? _a : _b;                                                                                             \\\n    })\n\n// tests the end-of-file indicator for an hFILE\nstatic inline int heof(hFILE *hfile) {\n    if (hgetc(hfile) == EOF) return 1;\n    hfile->begin--;\n    return 0;\n}\n\n// read or skip a fixed number of bytes\nstatic inline void read_bytes(hFILE *hfile, void *buffer, size_t nbytes) {\n    if (buffer) {\n        if (hread(hfile, buffer, nbytes) < nbytes) {\n            error(\"Failed to read %ld bytes from stream\\n\", nbytes);\n        }\n    } else {\n        int i, c = 0;\n        for (i = 0; i < nbytes; i++) c = hgetc(hfile);\n        if (c == EOF) error(\"Failed to reposition stream forward %ld bytes\\n\", nbytes);\n    }\n}\n\nstatic inline char **get_file_list(const char *pathname, const char *extension, int *nfiles) {\n    char **filenames = NULL;\n    DIR *d = opendir(pathname);\n    if (d) { // check if d is a directory\n        struct dirent *dir;\n        int mfiles = 0;\n        int p = strlen(pathname);\n        while ((dir = readdir(d))) {\n            const char *ptr = strrchr(dir->d_name, '.');\n            if (ptr && strcmp(ptr + 1, extension) == 0) {\n                hts_expand0(char *, *nfiles + 1, mfiles, filenames);\n                int q = strlen(dir->d_name);\n                filenames[*nfiles] = (char *)malloc((p + 
q + 2) * sizeof(char));\n                memcpy(filenames[*nfiles], pathname, p);\n                filenames[*nfiles][p] = '/';\n                memcpy(filenames[*nfiles] + p + 1, dir->d_name, q + 1);\n                (*nfiles)++;\n            }\n        }\n        closedir(d);\n    } else {\n        filenames = hts_readlines(pathname, nfiles);\n        if (!filenames) error(\"Failed to read from file %s\\n\", pathname);\n    }\n    if (*nfiles == 0) error(\"No .%s files found in %s\\n\", extension, pathname);\n    return filenames;\n}\n\nstatic inline FILE *get_file_handle(const char *str) {\n    if (!str) return NULL;\n    FILE *ret;\n    if (strcmp(str, \"-\") == 0) {\n        ret = stdout;\n    } else {\n        ret = fopen(str, \"w\");\n        if (!ret) error(\"Failed to open %s: %s\\n\", str, strerror(errno));\n    }\n    return ret;\n}\n\nstatic inline void flank2fasta(const char *name, const char *flank, FILE *stream) {\n    if (!flank) return;\n    const char *left = strchr(flank, '[');\n    const char *middle = strchr(flank, '/');\n    const char *right = strchr(flank, ']');\n    fprintf(stream, \"@%s:1\\n\", name);\n    if (!left && !middle && !right) {\n        fprintf(stream, \"%s\\n\", flank);\n        return;\n    }\n    if (!left || !middle || !right) error(\"Flank sequence is malformed: %s\\n\", flank);\n    if (*(middle - 1) == '-')\n        fprintf(stream, \"%.*s%s\\n\", (int)(left - flank), flank, right + 1);\n    else\n        fprintf(stream, \"%.*s%.*s%s\\n\", (int)(left - flank), flank, (int)(middle - left) - 1, left + 1, right + 1);\n    fprintf(stream, \"@%s:2\\n\", name);\n    if (*(middle - 1) == '-')\n        fprintf(stream, \"%.*s%.*s%s\\n\", (int)(left - flank), flank, (int)(right - middle) - 1, middle + 1, right + 1);\n    else\n        fprintf(stream, \"%.*s%.*s%s\\n\", (int)(left - flank), flank, (int)(right - middle) - 1, middle + 1, right + 1);\n}\n\nstatic inline int bcf_hdr_name2id_flexible(const bcf_hdr_t *hdr, char *chr) {\n  
  if (!chr) return -1;\n    char buf[] = {'c', 'h', 'r', '\\0', '\\0', '\\0'};\n    int rid = bcf_hdr_name2id(hdr, chr);\n    if (rid >= 0) return rid;\n    if (strncmp(chr, \"chr\", 3) == 0) rid = bcf_hdr_name2id(hdr, chr + 3);\n    if (rid >= 0) return rid;\n    strncpy(buf + 3, chr, 2);\n    rid = bcf_hdr_name2id(hdr, buf);\n    if (rid >= 0) return rid;\n    if (strcmp(chr, \"23\") == 0 || strcmp(chr, \"25\") == 0 || strcmp(chr, \"XY\") == 0 || strcmp(chr, \"XX\") == 0\n        || strcmp(chr, \"PAR1\") == 0 || strcmp(chr, \"PAR2\") == 0) {\n        rid = bcf_hdr_name2id(hdr, \"X\");\n        if (rid >= 0) return rid;\n        rid = bcf_hdr_name2id(hdr, \"chrX\");\n    } else if (strcmp(chr, \"24\") == 0) {\n        rid = bcf_hdr_name2id(hdr, \"Y\");\n        if (rid >= 0) return rid;\n        rid = bcf_hdr_name2id(hdr, \"chrY\");\n    } else if (strcmp(chr, \"26\") == 0 || strcmp(chr, \"MT\") == 0 || strcmp(chr, \"chrM\") == 0) {\n        rid = bcf_hdr_name2id(hdr, \"MT\");\n        if (rid >= 0) return rid;\n        rid = bcf_hdr_name2id(hdr, \"chrM\");\n    }\n    return rid;\n}\n\nstatic inline char rev_nt(char iupac) {\n    static const char iupac_complement[128] = {\n        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,\n        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,\n        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, '-',  0x2E, '/',\n        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,\n        0x40, 'T',  'V',  'G',  'H',  0x45, 0x46, 'C',  'D',  0x49, 0x4A, 'M',  0x4C, 'K',  'N',  0x4F,\n        0x50, 0x51, 'Y',  'S',  'A',  0x55, 'B',  'W',  0x58, 'R',  0x5A, ']',  0x5C, '[',  0x5E, 0x5F,\n        0x60, 't',  'v',  'g',  'h',  0x65, 0x66, 'c',  'd',  0x69, 0x6A, 'm',  0x6C, 'k',  'n',  0x6F,\n        0x70, 0x71, 'y',  's',  'a',  0x75, 'b',  'w',  0x78, 'r',  
0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,\n    };\n    return iupac_complement[(int)(iupac & 0x7F)];\n}\n\nstatic inline char mask_nt(char iupac) {\n    static const char iupac_mask[128] = {\n        0, 0, 0,  0, 0,  0, 0, 0, 0,  0, 0, 0,  0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0,\n        0, 0, 0,  0, 0,  0, 0, 0, 0,  0, 0, 0,  0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0,\n        0, 1, 14, 2, 13, 0, 0, 4, 11, 0, 0, 12, 0, 3, 15, 0, 0, 0, 5, 6, 8, 0, 7, 9, 0, 10, 0, 0, 0, 0, 0, 0,\n        0, 1, 14, 2, 13, 0, 0, 4, 11, 0, 0, 12, 0, 3, 15, 0, 0, 0, 5, 6, 8, 0, 7, 9, 0, 10, 0, 0, 0, 0, 0, 0,\n    };\n    return iupac_mask[(int)(iupac & 0x7F)];\n}\n\n#define MAX_LENGTH_LEFT_ALLELE 256\nstatic inline void flank_reverse_complement(char *flank) {\n    // swap alleles, but only if first allele is one base pair long\n    char *left = strchr(flank, '[');\n    char *middle = strchr(flank, '/');\n    char *right = strchr(flank, ']');\n    if (!left || !middle || !right) error(\"Flank sequence is malformed: %s\\n\", flank);\n\n    char buf[MAX_LENGTH_LEFT_ALLELE];\n    if (middle - left - 1 > MAX_LENGTH_LEFT_ALLELE) error(\"Cannot swap alleles in flank sequence %s\\n\", flank);\n    memmove((void *)buf, left + 1, middle - left - 1);\n    memmove((void *)left + 1, middle + 1, right - middle - 1);\n    *(left + (right - middle)) = '/';\n    memmove(left + (right - middle) + 1, (void *)buf, middle - left - 1);\n\n    size_t i, len = strlen(flank);\n    for (i = 0; i < len / 2; i++) {\n        char tmp = flank[i];\n        flank[i] = rev_nt(flank[len - i - 1]);\n        flank[len - i - 1] = rev_nt(tmp);\n    }\n    if (len % 2 == 1) flank[len / 2] = rev_nt(flank[len / 2]);\n}\n\n// this is the weird way Illumina left shifts indels\n// http://github.com/Illumina/GTCtoVCF/blob/develop/BPMRecord.py\nstatic inline int flank_left_shift(char *flank) {\n    char *left = strchr(flank, '[');\n    char *middle = strchr(flank, '/');\n    char *right = 
strchr(flank, ']');\n    if (!left || !middle || !right) error(\"Flank sequence is malformed: %s\\n\", flank);\n\n    int n = 0;\n    int len = (int)(right - middle) - 1;\n    while ((left - flank >= len) && (strncmp(left - len, middle + 1, len) == 0)) {\n        memmove(left - len, left, right - left + 1);\n        left -= len;\n        middle -= len;\n        right -= len;\n        memmove(right + 1, middle + 1, len);\n        n += len;\n    }\n\n    const char *ptr;\n    char nt = *(middle + 1);\n    for (ptr = middle + 2; ptr < right; ptr++)\n        if (*ptr != nt) nt = -1;\n    while (nt > 0 && *(left - 1) == nt) {\n        memmove(left - 1, left, right - left + 1);\n        *right = nt;\n        left--;\n        middle--;\n        right--;\n        n++;\n    }\n    return n;\n}\n\n// returns 1 if the first sequence is the best alignment, and 2 if the second sequence is\n// if neither sequence is better or neither provides an alignment, it returns 0\n// if it fails to read from the hts file, it returns -1\nstatic inline int get_position(htsFile *hts, sam_hdr_t *sam_hdr, bam1_t *b, const char *name, const char *flank,\n                               int left_shift, const char **chromosome, int *position, int *strand) {\n    const char *left = strchr(flank, '[');\n    const char *middle = strchr(flank, '/');\n    const char *right = strchr(flank, ']');\n    int cnv = !left && !middle && !right;\n    if (!cnv && (!left || !middle || !right)) error(\"Flank sequence is malformed: %s\\n\", flank);\n    const char *chromosome_pair[2];\n    int position_pair[2], strand_pair[2];\n    int64_t aln_score_pair[2];\n    int idx = -1, ret;\n    while (idx < 1 - cnv && (ret = sam_read1(hts, sam_hdr, b)) >= 0) {\n        const char *qname = bam_get_qname(b);\n        if (b->core.flag & BAM_FSECONDARY || b->core.flag & BAM_FSUPPLEMENTARY) continue;\n        int qname_l = strlen(qname);\n        if (strncmp(qname, name, qname_l - 2) != 0)\n            error(\"Query ID %.*s 
found in SAM file but %s expected\\n\", qname_l - 2, qname, name);\n        idx = qname[qname_l - 1] == '1' ? 0 : (qname[qname_l - 1] == '2' ? 1 : -1);\n        if (idx < 0) error(\"Query ID %s found in SAM file does not end with :1 or :2\\n\", qname);\n\n        chromosome_pair[idx] = sam_hdr_tid2name(sam_hdr, b->core.tid);\n        position_pair[idx] = 0;\n        strand_pair[idx] = -1;\n        if (!(b->core.flag & BAM_FUNMAP)) {\n            strand_pair[idx] = bam_is_rev(b);\n            int n_cigar = b->core.n_cigar;\n            const uint32_t *cigar = bam_get_cigar(b);\n            position_pair[idx] = b->core.pos;\n\n            int qlen =\n                cnv ? (strlen(flank) + 1) / 2 : (bam_is_rev(b) ? strlen(flank) - (right - flank) : left - flank + 1);\n            if (strchr(flank, '-')) {\n                if (left_shift) {\n                    int len = (int)(right - middle) - 1;\n                    char nt = toupper(*(middle + 1));\n                    const char *ptr;\n                    for (ptr = middle + 2; ptr < right; ptr++)\n                        if (*ptr != nt) nt = -1;\n                    if (bam_is_rev(b)) {\n                        ptr = right + 1;\n                        while (strncasecmp(middle + 1, ptr, len) == 0) {\n                            qlen -= len;\n                            ptr += len;\n                        }\n                        while (nt > 0 && toupper(*ptr) == nt) {\n                            qlen--;\n                            ptr++;\n                        }\n                    } else {\n                        ptr = left - len;\n                        while (ptr >= flank && (strncasecmp(ptr, middle + 1, len) == 0)) {\n                            qlen -= len;\n                            ptr -= len;\n                        }\n                        ptr += len - 1;\n                        while (nt > 0 && toupper(*ptr) == nt) {\n                            qlen--;\n                            
ptr--;\n                        }\n                    }\n                }\n                if (idx == 0) qlen--;\n            }\n\n            int k;\n            for (k = 0; k < n_cigar && qlen > 1; k++) {\n                int type = bam_cigar_type(bam_cigar_op(cigar[k]));\n                int len = bam_cigar_oplen(cigar[k]);\n                if ((type & 1) && (type & 2)) { // consume reference sequence ( case M )\n                    position_pair[idx] += min(len, qlen);\n                    qlen -= len;\n                } else if (type & 1) { // consume query sequence ( case I )\n                    qlen -= len;\n                    if (qlen <= 0) // we skipped the base pair that needed\n                                   // to be localized\n                    {\n                        position_pair[idx] = 0;\n                    }\n                } else if (type & 2) {\n                    position_pair[idx] += len; // consume reference sequence ( case D )\n                }\n            }\n            if (qlen == 1) position_pair[idx]++;\n        }\n        uint8_t *as = bam_aux_get(b, \"AS\");\n        aln_score_pair[idx] = bam_aux2i(as);\n    }\n    if (ret < -1) return -1;\n\n    if (!cnv\n        && ((aln_score_pair[0] == aln_score_pair[1] && position_pair[0] != position_pair[1])\n            || (position_pair[0] == 0 && position_pair[1] == 0))) {\n        idx = -1;\n        *chromosome = NULL;\n        *position = 0;\n        *strand = -1;\n    } else {\n        idx = cnv ? 
0 : (aln_score_pair[1] > aln_score_pair[0]);\n        *chromosome = chromosome_pair[idx];\n        *position = position_pair[idx];\n        *strand = strand_pair[idx];\n    }\n    return idx + 1;\n}\n\nstatic inline void strupper(char *str) {\n    char *s = str;\n    while (*s) {\n        *s = toupper((unsigned char)*s);\n        s++;\n    }\n}\n\nstatic inline float get_gc_ratio(const char *beg, const char *end) {\n    int at_cnt = 0, cg_cnt = 0;\n    const char *ptr;\n    for (ptr = beg; ptr < end; ptr++) {\n        int c = toupper(*ptr);\n        if (c == 'A' || c == 'T') at_cnt++;\n        if (c == 'C' || c == 'G') cg_cnt++;\n    }\n    return (float)(cg_cnt) / (float)(at_cnt + cg_cnt);\n}\n\nstatic inline int len_common_suffix(const char *s1, const char *s2, size_t n) {\n    int ret = 0;\n    while (ret < n && *s1 == *s2) {\n        s1--;\n        s2--;\n        ret++;\n    }\n    return ret;\n}\n\nstatic inline int len_common_prefix(const char *s1, const char *s2, size_t n) {\n    int ret = 0;\n    while (ret < n && *s1 == *s2) {\n        s1++;\n        s2++;\n        ret++;\n    }\n    return ret;\n}\n\n// http://github.com/Illumina/GTCtoVCF/blob/develop/BPMRecord.py\n// For an insertion relative to the reference, the position of the base immediately 5' to the\n// insertion (on the plus strand) is given. 
For a deletion relative to the reference, the\n// position of the most 5' deleted base (on the plus strand) is given\nstatic inline int get_indel_alleles(kstring_t *allele_a, kstring_t *allele_b, const char *flank, const char *ref,\n                                    int win, int len, int shift) {\n    const char *left = strchr(flank, '[');\n    const char *middle = strchr(flank, '/');\n    const char *right = strchr(flank, ']');\n    if (!left || !middle || !right) error(\"Flank sequence is malformed: %s\\n\", flank);\n\n    int del_left = len_common_suffix(left - 1, &ref[win], left - flank);\n    int del_right = len_common_prefix(right + 1, &ref[win] + 1, strlen(right + 1));\n    int ins_match = strncmp(middle + 1, &ref[win], right - middle - 1) == 0; // same as indel_sequence_match\n    int ins_left = len_common_suffix(left - 1, &ref[win] - 1, left - flank);\n    int ins_right = len_common_prefix(right + 1, &ref[win] + (right - middle) - 1, strlen(right + 1));\n    int ref_is_del = (del_left >= ins_left) && (del_right >= ins_right);\n    if ((ref_is_del && del_left * del_right == 0) || (!ref_is_del && (!ins_match || ins_left * ins_right == 0))) {\n        // computes it again but with shifted coordinates to better match Illumina's _calculate_is_deletion()\n        del_left = len_common_suffix(left - 1, &ref[win - shift], left - flank);\n        del_right = len_common_prefix(right + 1, &ref[win - shift] + 1, strlen(right + 1));\n        ref_is_del = (del_left >= ins_left) && (del_right >= ins_right);\n        if ((ref_is_del && del_left * del_right == 0) || (!ref_is_del && (!ins_match || ins_left * ins_right == 0)))\n            return -1;\n    }\n    int allele_b_is_del = allele_b->s[0] == 'D';\n    allele_a->l = allele_b->l = 0;\n    kputc(ref[win - 1 + ref_is_del], allele_a);\n    kputc(ref[win - 1 + ref_is_del], allele_b);\n    kputsn(ref_is_del ? middle + 1 : &ref[win], right - middle - 1, allele_b_is_del ? 
allele_a : allele_b);\n    return ref_is_del;\n}\n\nstatic inline int get_allele_b_idx(char ref_base, char *allele_a, char *allele_b) {\n    if (*allele_a == '.' && *allele_b == '.') {\n        return -1;\n    } else if (*allele_a == 'D' || *allele_a == 'I' || *allele_b == 'D' || *allele_b == 'I') {\n        return 1;\n    } else if (*allele_a == ref_base) {\n        return 1;\n    } else if (*allele_b == ref_base) {\n        return 0;\n    } else if (*allele_a == '.') {\n        *allele_a = ref_base;\n        return 1;\n    } else if (*allele_b == '.') {\n        *allele_b = ref_base;\n        return 0;\n    } else {\n        return 2;\n    }\n}\n\nstatic inline int get_allele_a_idx(int allele_b_idx) {\n    switch (allele_b_idx) {\n    case 0:\n        return 1;\n    case 1:\n        return 0;\n    case 2:\n        return 1;\n    default:\n        return -1;\n    }\n}\n\nstatic inline int alleles_ab_to_vcf(const char **alleles, const char *ref_base, const char *allele_a,\n                                    const char *allele_b, int allele_b_idx) {\n    switch (allele_b_idx) {\n    case -1:\n        alleles[0] = ref_base;\n        return 1;\n    case 0:\n        alleles[0] = allele_b;\n        if (*allele_a == '.') return 1;\n        alleles[1] = allele_a;\n        return 2;\n    case 1:\n        alleles[0] = allele_a;\n        if (*allele_b == '.') return 1;\n        alleles[1] = allele_b;\n        return 2;\n    case 2:\n        alleles[0] = ref_base;\n        alleles[1] = allele_a;\n        alleles[2] = allele_b;\n        return 3;\n    default:\n        return -1;\n    }\n}\n\n// Petr Danecek's similar implementation in bcftools/plugins/fixref.c\n// http://www.illumina.com/documents/products/technotes/technote_topbot.pdf\nstatic inline int get_strand_from_top_alleles(char *allele_a, char *allele_b, const char *ref, int win, int len) {\n    int i;\n    char ref_base = ref[win];\n    int ia = (int)mask_nt(*allele_a);\n    int ib = (int)mask_nt(*allele_b);\n    
int ir = (int)mask_nt(ref_base);\n\n    // as alleles must be designated on the TOP strand, the only acceptable pairs are (A,C),\n    // (A,G), (A, T), (C, G)\n    switch (ia | ib) {\n    case 1 | 2: // A and C\n    case 1 | 4: // A and G\n        if (ir == ia || ir == ib)\n            return 0;\n        else if (ref_base == rev_nt(*allele_a) || ref_base == rev_nt(*allele_b))\n            return 1;\n        else\n            return -1; // Reference allele is not A/C/G/T\n        break;\n    case 1 | 8: // A and T\n    case 2 | 4: // C and G\n        for (i = 1; i <= win; i++) {\n            int ra = (int)mask_nt(ref[win - i]);\n            int rb = (int)mask_nt(ref[win + i]);\n            if (ra == 15 || rb == 15 || ra == rb) continue; // N\n            switch (ra | rb) {\n            case 1 | 2:              // A and C\n            case 1 | 4:              // A and G\n            case 2 | 8:              // C and T\n            case 4 | 8:              // G and T\n                return ra & (2 | 4); // A or T\n            case 1 | 8:              // A and T\n            case 2 | 4:              // C and G\n                continue;\n            default:\n                return -1; // Flanking reference alleles are not valid alleles for TOP/BOT strand determination\n            }\n        }\n        return -1; // Unable to determine reference sequence strand\n    default:\n        return -1; // Alleles are not TOP alleles\n    }\n}\n\n// compute BAF and LRR from Theta and R as explained in Peiffer, D. A. et al. High-resolution genomic profiling of\n// chromosomal aberrations using Infinium whole-genome genotyping. Genome Res. 
16, 1136–1148 (2006)\nstatic inline void get_baf_lrr(float ilmn_theta, float ilmn_r, float aa_theta, float ab_theta, float bb_theta,\n                               float aa_r, float ab_r, float bb_r, float r_mean, float *baf, float *lrr) {\n    float r_ref;\n    if (ilmn_theta == ab_theta) {\n        r_ref = ab_r;\n        *baf = 0.5f;\n    } else if (ilmn_theta < ab_theta) {\n        float slope = (aa_r - ab_r) / (aa_theta - ab_theta);\n        float b = aa_r - (aa_theta * slope);\n        r_ref = (slope * ilmn_theta) + b;\n        *baf = 0.5f - (ab_theta - ilmn_theta) * 0.5f / (ab_theta - aa_theta);\n    } else if (ilmn_theta > ab_theta) {\n        float slope = (ab_r - bb_r) / (ab_theta - bb_theta);\n        float b = ab_r - (ab_theta * slope);\n        r_ref = (slope * ilmn_theta) + b;\n        *baf = 1.0f - (bb_theta - ilmn_theta) * 0.5f / (bb_theta - ab_theta);\n    } else {\n        *lrr = -NAN;\n        *baf = -NAN;\n        return;\n    }\n    // for non-polymorphic (Illumina) markers we compute the LRR using the clusters mean\n    *lrr = logf(ilmn_r / (isnan(r_mean) ? r_ref : r_mean)) * (float)M_LOG2E;\n}\n"
  },
  {
    "path": "gtc2vcf_plot.R",
    "content": "#!/usr/bin/env Rscript\n###\n#  The MIT License\n#\n#  Copyright (C) 2019-2025 Giulio Genovese\n#\n#  Author: Giulio Genovese <giulio.genovese@gmail.com>\n#\n#  Permission is hereby granted, free of charge, to any person obtaining a copy\n#  of this software and associated documentation files (the \"Software\"), to deal\n#  in the Software without restriction, including without limitation the rights\n#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n#  copies of the Software, and to permit persons to whom the Software is\n#  furnished to do so, subject to the following conditions:\n#\n#  The above copyright notice and this permission notice shall be included in\n#  all copies or substantial portions of the Software.\n#\n#  THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n#  THE SOFTWARE.\n###\n\noptions(error = function() {traceback(3); q(\"no\", 1)})\n\ngtc2vcf_plot_version <- '2025-08-19'\n\nsuppressPackageStartupMessages(library(optparse))\nsuppressPackageStartupMessages(library(data.table))\nsuppressPackageStartupMessages(library(ggplot2))\nsuppressPackageStartupMessages(library(grid))\nsuppressPackageStartupMessages(library(gridExtra))\nif (capabilities()[['cairo']]) options(bitmapType = 'cairo')\n\nparser <- OptionParser('usage: gtc2vcf_plot.R [options] --illumina|--affymetrix --vcf <file.vcf> --chrom <string> --pos <integer> --pdf|--png <file>')\nparser <- add_option(parser, c('--vcf'), type = 'character', help = 'input VCF file', metavar = '<file.vcf>')\nparser <- add_option(parser, c('--illumina'), 
action = 'store_true', default = FALSE, help = 'whether the input VCF file contains Illumina data')\nparser <- add_option(parser, c('--affymetrix'), action = 'store_true', default = FALSE, help = 'whether the input VCF file contains Affymetrix data')\nparser <- add_option(parser, c('--birdseed'), action = 'store_true', default = FALSE, help = 'whether the input VCF file contains Affymetrix data from Birdseed')\nparser <- add_option(parser, c('--pdf'), type = 'character', help = 'output PDF file', metavar = '<file.pdf>')\nparser <- add_option(parser, c('--png'), type = 'character', help = 'output PNG file', metavar = '<file.png>')\nparser <- add_option(parser, c('--width'), type = 'double', default = 7.0, help = 'inches width of the output file [7.0]', metavar = '<float>')\nparser <- add_option(parser, c('--height'), type = 'double', default = 7.0, help = 'inches height of the output file [7.0]', metavar = '<float>')\nparser <- add_option(parser, c('--fontsize'), type = 'integer', default = 12, help = 'font size [12]', metavar = '<integer>')\nparser <- add_option(parser, c('--chrom'), type = 'character', help = 'chromosome', metavar = '<string>')\nparser <- add_option(parser, c('--pos'), type = 'integer', help = 'chromosome position', metavar = '<integer>')\nparser <- add_option(parser, c('--id'), type = 'character', help = 'variant ID', metavar = '<string>')\nparser <- add_option(parser, c('--samples'), type = 'character', help = 'comma-separated list of samples to include', metavar = '<list>')\nparser <- add_option(parser, c('--samples-file'), type = 'character', help = 'file with list of samples to include', metavar = '<file>')\nparser <- add_option(parser, c('--minimal'), action = 'store_true', default = FALSE, help = 'only plot NORMX/NORMY and BAF/LRR plots')\nparser <- add_option(parser, c('--zcall'), action = 'store_true', default = FALSE, help = 'plot ZCall thresholds')\nargs <- parse_args(parser, commandArgs(trailingOnly = TRUE), 
convert_hyphens_to_underscores = TRUE)\n\nwrite(paste('gtc2vcf_plot.R', gtc2vcf_plot_version, 'http://github.com/freeseek/gtc2vcf'), stderr())\n\n# make sure VCF is passed\nif (is.null(args$vcf)) {print_help(parser); stop('option --vcf is required')}\nif (is.null(args$chrom)) {print_help(parser); stop('option --chrom is required')}\nif (is.null(args$pos)) {print_help(parser); stop('option --pos is required')}\nif (args$illumina && args$affymetrix) {print_help(parser); stop('cannot use --illumina and --affymetrix at the same time')}\nif (args$illumina && args$birdseed) {print_help(parser); stop('cannot use --illumina and --birdseed at the same time')}\nif (args$affymetrix && args$zcall) {print_help(parser); stop('cannot use --affymetrix and --zcall at the same time')}\nif (is.null(args$pdf) && is.null(args$png)) {print_help(parser); stop('either --pdf or --png is required')}\nif (!is.null(args$pdf) && !is.null(args$png)) {print_help(parser); stop('cannot use --pdf and --png at the same time')}\nif (!is.null(args$png) && !capabilities('png')) {print_help(parser); stop('unable to start device PNG: no png support in this version of R\\nyou need to reinstall R with support for PNG to use the --png option\\n')}\nif (!is.null(args$samples) && !is.null(args$samples_file)) {print_help(parser); stop('cannot use --samples and --samples-file at the same time')}\n\nbase <- c('CHROM', 'POS', 'ID')\nif (args$illumina) {\n  info <- c('meanR_AA', 'meanR_AB', 'meanR_BB', 'meanTHETA_AA', 'meanTHETA_AB', 'meanTHETA_BB', 'devR_AA', 'devR_AB', 'devR_BB', 'devTHETA_AA', 'devTHETA_AB', 'devTHETA_BB')\n  format <- c('GT', 'X', 'Y', 'NORMX', 'NORMY', 'R', 'THETA', 'BAF', 'LRR')\n  if (args$zcall) {\n    info <- c(info, c('zthresh_X', 'zthresh_Y'))\n  }\n} else if (args$affymetrix) {\n  info <- c('meanX_AA', 'meanX_AB', 'meanX_BB', 'meanY_AA', 'meanY_AB', 'meanY_BB', 'varX_AA', 'varX_AB', 'varX_BB', 'varY_AA', 'varY_AB', 'varY_BB', 'covarXY_AA', 'covarXY_AB', 'covarXY_BB')\n  info <- c(info, 
paste0(info, '.1'))\n  format <- c('GT', 'NORMX', 'NORMY', 'DELTA', 'SIZE', 'BAF', 'LRR')\n} else {\n  info <- c()\n  format <- c('GT', 'BAF', 'LRR')\n}\n\nfmt <- paste0('\"[%', paste(base, collapse = '\\\\t%'), paste(c('', info), collapse = '\\\\t%INFO/'), paste(c('', format), collapse = '\\\\t%'), '\\\\n]\"')\nnames <- c(base, info, format)\ncmd <- paste0('bcftools query --format ', fmt, ' ', args$vcf, ' -r ', args$chrom, ':', args$pos, '-', args$pos)\nif (!is.null(args$samples)) cmd <- paste(cmd, '--samples', args$samples)\nif (!is.null(args$samples_file)) cmd <- paste(cmd, '--samples-file', args$samples_file)\nwrite(paste('Command:', cmd), stderr())\nif (packageVersion('data.table') < '1.11.6') {\n  df <- setNames(fread(cmd, sep = '\\t', header = FALSE, na.strings = '.', data.table = FALSE), names)\n} else {\n  df <- setNames(fread(cmd = cmd, sep = '\\t', header = FALSE, na.strings = '.', data.table = FALSE), names)\n}\nif (!is.null(args$id)) {\n  if (!(args$id %in% unique(df$ID))) stop('Specified ID not present at specified location')\n  df <- df[df$ID == args$id,]\n} else {\n  if ( length(unique(df$ID)) > 1 ) stop('More than one variant at the specified position, use --id to specify which variant to plot')\n}\nv <- sapply(df[, info], unique)\n\nif (args$illumina) {\n  p1 <- ggplot(df, aes(x = Y, y = X, color = GT, shape = GT)) +\n    geom_point(size = .5) +\n    scale_x_continuous(limits = c(0, NA), expand = expand_scale(mult = c(0, .05))) +\n    scale_y_continuous(limits = c(0, NA), expand = expand_scale(mult = c(0, .05))) +\n    theme_bw(base_size = args$fontsize) +\n    theme(legend.position = 'none')\n  p2 <- ggplot(df, aes(x = NORMY, y = NORMX, color = GT, shape = GT)) +\n    geom_point(size = .5) +\n    scale_x_continuous(limits = c(0, NA), expand = expand_scale(mult = c(0, .05))) +\n    scale_y_continuous(limits = c(0, NA), expand = expand_scale(mult = c(0, .05))) +\n    theme_bw(base_size = args$fontsize) +\n    theme(legend.position = 'none')\n  if 
(args$zcall) {\n    zthresh_X <- unique(df$zthresh_X)\n    zthresh_Y <- unique(df$zthresh_Y)\n    p2 <- p2 + geom_vline(xintercept = zthresh_Y, color = 'gray') +\n      geom_hline(yintercept = zthresh_X, color = 'gray')\n  }\n  p3 <- ggplot(df, aes(x = THETA, y = R, color = GT, shape = GT)) +\n    geom_point(size = .5) +\n    scale_x_continuous(limits = c(0,1), expand = expand_scale(0)) +\n    theme_bw(base_size = args$fontsize) +\n    theme(legend.position = 'none')\n  for (gt in c('AA', 'AB', 'BB')) {\n    t <- seq(0, 2*pi, length.out = 100)\n    x <- unname(v[paste0('meanTHETA_', gt)]) + unname(v[paste0('devTHETA_', gt)])*cos(t)\n    y <- unname(v[paste0('meanR_', gt)]) + unname(v[paste0('devR_', gt)])*sin(t)\n    p3 <- p3 + annotate('path', x = x, y = y)\n  }\n} else if (args$affymetrix) {\n  p2 <- ggplot(df, aes(x = NORMX, y = NORMY, color = GT, shape = GT)) +\n    geom_point(size = .5) +\n    scale_x_continuous(limits = c(0, NA), expand = expand_scale(mult = c(0, .05))) +\n    scale_y_continuous(limits = c(0, NA), expand = expand_scale(mult = c(0, .05))) +\n    theme_bw(base_size = args$fontsize) +\n    theme(legend.position = 'none')\n  p3 <- ggplot(df, aes(x = DELTA, y = SIZE, color = GT, shape = GT)) +\n    geom_point(size = .5) +\n    theme_bw(base_size = args$fontsize) +\n    theme(legend.position = 'none')\n  for (gt in c('AA', 'AB', 'BB', 'AA.1', 'BB.1')) {\n    a <- unname(v[paste0('varX_', gt)])\n    b <- unname(v[paste0('covarXY_', gt)])\n    c <- unname(v[paste0('varY_', gt)])\n    lambda1 <- (a+c)/2 + sqrt(((a-c)/2)^2+b^2)\n    lambda2 <- (a+c)/2 - sqrt(((a-c)/2)^2+b^2)\n    theta <- atan2(lambda1 - a, b)\n    t <- seq(0, 2*pi, length.out = 100)\n    x <- unname(v[paste0('meanX_', gt)]) + sqrt(lambda1)*cos(theta)*cos(t) - sqrt(lambda2)*sin(theta)*sin(t)\n    y <- unname(v[paste0('meanY_', gt)]) + sqrt(lambda1)*sin(theta)*cos(t) + sqrt(lambda2)*cos(theta)*sin(t)\n    if (args$birdseed) {\n      p2 <- p2 + annotate('path', x = x, y = y)\n    } else 
{\n      p3 <- p3 + annotate('path', x = x, y = y)\n    }\n  }\n}\np4 <- ggplot(df, aes(x = BAF, y = LRR, color = GT, shape = GT)) +\n  geom_point(size = .5) +\n  theme_bw(base_size = args$fontsize) +\n  theme(legend.position = 'bottom', legend.box = 'horizontal')\n\nif (!is.null(args$pdf)) {\n  pdf(args$pdf, width = args$width, height = args$height)\n} else {\n  png(args$png, width = args$width, height = args$height, units = 'in', res = 150)\n}\n\nif (args$minimal) {\n  grid.arrange(p2, p4, nrow = 2, ncol = 1, heights = c(3, 4), top = unique(df$ID))\n} else {\n  if (args$illumina) grid.arrange(p1, p2, p3, p4, nrow = 4, ncol = 1, heights = c(3, 3, 3, 4), top = unique(df$ID))\n  else if (args$affymetrix) grid.arrange(p2, p3, p4, nrow = 3, ncol = 1, heights = c(3, 3, 4), top = unique(df$ID))\n  else grid.arrange(p4, nrow = 1, ncol = 1, top = unique(df$ID))\n}\ninvisible(dev.off())\n"
  },
  {
    "path": "idat2gtc.c",
    "content": "/* The MIT License\n\n   Copyright (c) 2024-2026 Giulio Genovese\n\n   Author: Giulio Genovese <giulio.genovese@gmail.com>\n\n   Permission is hereby granted, free of charge, to any person obtaining a copy\n   of this software and associated documentation files (the \"Software\"), to deal\n   in the Software without restriction, including without limitation the rights\n   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n   copies of the Software, and to permit persons to whom the Software is\n   furnished to do so, subject to the following conditions:\n\n   The above copyright notice and this permission notice shall be included in\n   all copies or substantial portions of the Software.\n\n   THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n   THE SOFTWARE.\n\n */\n\n// the code in this file reimplements functionalities and ideas present in:\n// - AutoConvert (v1.6.3.1)\n// - GTCtoVCF\n// - BeadArrayFiles\n// these resources were provided by Illumina without license restrictions\n\n// the code in this file can be used as a replacement of the Illumina AutoCall software to convert IDAT intensity files\n// into GTC genotype files for Infinium arrays which was implemented over time in different proprietary software:\n// - AutoConvert (v1.6.3.1) - http://support.illumina.com/downloads/beeline_software_v10.html\n// - AutoConvert 2.0 (v2.0.1.179) - http://support.illumina.com/downloads/beeline-software-2-0.html\n// - IAAP CLI (v1.1) - http://support.illumina.com/downloads/iaap-genotyping-cli.html\n// - Array Analysis 
CLI (v2.1) -\n// http://support.illumina.com/downloads/illumina-microarray-analytics-array-analysis-cli-v2-installers.html\n\n// the Illumina AutoCall software performs three main steps:\n// - Normalization\n// - Genotyping\n// - Gender Estimation\n// if AutoConvert and AutoConvert 2.0 are run without an input cluster file, only the normalization will be performed\n\n// the normalization, clustering, and genotype calling functionalities of Illumina AutoCall were covered by the\n// following patents:\n// - http://patents.google.com/patent/US7035740 - covers normalization algorithm (2024-05-05)\n// - http://patents.google.com/patent/US7467117 - divisional, covers clustering and genotyping (2024-03-24)\n// - http://patents.google.com/patent/US20050216207 - same as US7035740\n// - http://patents.google.com/patent/US20060224529 - same as US7467117\n\n// GenCall GenTrain 2.0 uses the following algorithms:\n// - Normalization algorithm (version 1.1.2)\n// - Clustering algorithm (version 6.3.1)\n// - Genotyping algorithm (version 6.3.0)\n// GenCall GenTrain 3.0 uses the following algorithms:\n// - Normalization algorithm version 1.2.0\n// - Clustering algorithm version 7.0.0\n// - Genotyping algorithm version 7.0.0\n\n// the Illumina GenCall Source Code (http://support.illumina.com/downloads/gencall_software.html) includes:\n// - NormalizationGoldenGate.cs - normalization routines (version 1.1.0)\n// - NormalizationInfinium.cs - normalization routines (version 1.1.2)\n// - GenTrain60.cs - clustering (version 6.3.1) and genotyping (6.3.0) routines\n// - Utils.cs - closest points to axis, MATLAB robust fit, and other MATLAB routines\n\n// the InfiniumIDATParser Java implementation of the normalization algorithm (version 1.1.2) by Jay Carey includes:\n// - InfiniumIDATParser.java - IDAT parsing routines (2010-02-25)\n// - InfiniumNormalization.java - normalization routines (version 1.1.2) (2010-01-07)\n// - InfiniumUtils.java - closest points to axis, MATLAB robust fit, and 
other MATLAB routines (2010-01-08)\n// this software was used in the 1000 Genomes project (Supplementary chapter 5.3 of http://doi.org/10.1038/nature15394)\n// as part of the intensity rank sum test (IRS test) in the Genome STRiP software\n\n// the differences between the normalization algorithm version 1.1.2 and version 1.2.0 are:\n// - the original implementation of the madsigma function for robust line fitting is updated as it was updated in MATLAB\n// - HandleScale will not use loci with missing data anymore for sub-bead pool bins with less than 192 loci\n// - NormalizeSingleBinSingleChannel handles Infinium I (A/T and C/G) probes for sub-bead pool bins with less than 192\n//   loci, for which version 1.1.2 would previously not attempt to compute a background intensity offset\n\n// each AutoCall software determines gender in a slightly different way:\n// - AutoConvert (v1.6.3.1) - only uses X chromosome heterozygosity and checks whether it is higher than 0.1\n// - AutoConvert 2.0 (v2.0.1.179) - checks whether Y chromosome intensity R values are higher than 0.3 if autosomal call\n// rate is higher than 0.97\n// - IAAP CLI (v1.1) - same as above but there is a bug in the determination of the autosomal call rate that includes\n// loci with null cluster scores as missing\n// - Array Analysis CLI (v2.1) - same as above but with the bug removed\n// we follow the approach of AutoConvert 2.0 and Array Analysis CLI as default and allow the user to use the approach of\n// AutoConvert if requested. For inexplicable reasons, AutoConvert 2.0, IAAP CLI, and Array Analysis CLI downsample to\n// 10000 random autosomal loci to estimate the autosomal call rate; this behavior can be suppressed by setting the\n// autosomal call rate threshold from 0.97 to 0.0. 
However, this cannot be done with Array Analysis CLI\n\n// to replicate the functionality for interoperability purposes, the following bugs were reimplemented:\n// matlab_robustfit0 deviates from the original MATLAB implementation (statrobustfit) to match Illumina implementation\n// (robustLineFit) when input option addconst/calcoffset is false by erroneously summing the vector into a scalar and\n// causing the adjfactor variable to be always equal to 100.0; normalization IDs are allowed to overflow beyond 255,\n// which happens with some probes in the Omni5 arrays, which can cause some Infinium I (G/C) probes to be normalized\n// together with some Infinium II probes; probe pairs with missing values are still used in the normalization step as\n// probes with zero values; the additional code included in GenTrain 3.0 in the Illumina implementation\n// (NormalizeSingleBinSingleChannel) calls MATLAB function trimmean on an array where some values are artificially set\n// to zero for no good reasons while other values are left out; when determining scale_x with GenTrain 2.0 for\n// normalization bins with less than 192 loci we include failed loci as AA loci\n\n/****************************************\n * LITERATURE MENTIONING NORMALIZATION  *\n ****************************************/\n\n// http://doi.org/10.1101/sqb.2003.68.69\n// Fan,J.B. et al. (2003) Highly parallel SNP genotyping. 
Cold Spring Harb Symp Quant Biol, 68, 69–78\n// first document that mentions GenCall and GenTrain\n\n// http://patents.google.com/patent/US7035740\n// Kermani 2005, Artificial intelligence and global normalization methods for genotyping\n// explains how normalization works\n\n// http://patents.google.com/patent/US7467117\n// Kermani 2006, Artificial intelligence and global normalization methods for genotyping\n// also explains how normalization works(???)\n\n// http://www.illumina.com/Documents/products/technotes/technote_gencall_data_analysis_software.pdf\n// Illumina 2005, Illumina GenCall Data Analysis Software\n// it does not describe the normalization but it refers to it\n\n// http://doi.org/10.1016/j.mrfmmm.2004.07.022\n// Shen 2005, High-throughput SNP genotyping on universal bead arrays\n// introduces the GenTrain algorithm. It explains the GenScores are computed using fuzzy logic\n\n// http://doi.org/10.1038/sj.ejhg.5201528\n// Moorhead et al. 2006, Optimal genotype determination in highly multiplexed SNP data\n// in the supplement a normalization procedure very similar to Illumina's is proposed\n\n// http://dnatech.genomecenter.ucdavis.edu/wp-content/uploads/2013/06/illumina_gt_normalization.pdf\n// http://dnatech.genomecenter.ucdavis.edu/documents/illumina_gt_normalization.pdf\n// Illumina 2006, Illumina’s Genotyping Data Normalization Methods\n// has color versions of the patent figures with details that are missing from the patent including the use of 400\n// homozygotes\n\n// http://doi.org/10.1101/gr.5402306\n// Peiffer et al. 
2006, High-resolution genomic profiling of chromosomal aberrations using Infinium whole-genome\n// genotyping\n// explains Illumina normalization with minimum details\n\n// http://www.illumina.com/documents/products/technotes/technote_cnv_algorithms.pdf\n// Illumina 2007, DNA Copy Number and Loss of Heterozygosity Analysis Algorithms\n// explains how LRR and BAF behave over CNVs\n\n// http://doi.org/10.1093/bioinformatics/btm443\n// Teo et al. 2007, A genotype calling algorithm for the Illumina BeadArray platform\n// explains Illumina normalization with details that are missing from the patent including the use of 400 homozygotes\n// (paper about Illuminus caller)\n\n// http://doi.org/10.1101/gr.5686107\n// Oosting et al. 2007, High-resolution copy number analysis of paraffin-embedded archival tissue using SNP BeadArrays\n// explains an alternative normalization strategy\n\n// http://doi.org/10.1101/gr.6861907\n// Wang et al. 2007, PennCNV: An integrated hidden Markov model designed for high-resolution copy number variation\n// detection in whole-genome SNP genotyping data\n// explains Illumina normalization with minimum details\n\n// http://doi.org/10.1093/bioinformatics/btn386\n// Giannoulatou et al. 2008 GenoSNP: a variational Bayes within-sample SNP genotyping algorithm that does not require a\n// reference population\n// explains an alternative normalization strategy still based on beadpools (paper about GenoSNP caller)\n\n// http://doi.org/10.1186/1471-2105-9-409\n// Staaf et al. 2008 Normalization of Illumina Infinium whole-genome SNP data improves copy number estimates and allelic\n// intensity ratios\n// explains Illumina normalization with minimum details\n\n// http://www.illumina.com/documents/products/technotes/technote_gentrain2.pdf\n// Illumina 2009, Improved Cluster Generation with Gentrain2\n// explains Gentrain 2.0\n\n// http://doi.org/10.1093/bioinformatics/btp470\n// Ritchie et al. 
2009 R/Bioconductor software for Illumina’s Infinium whole-genome genotyping BeadChips\n// explains an alternative normalization strategy\n\n// http://doi.org/10.1093/nar/gkp552\n// LaFramboise et al. 2009 Single nucleotide polymorphism arrays: a decade of biological, computational and\n// technological advances\n// explains Illumina normalization with minimum details but defines it as \"The computational\n// workhorse in the Illumina protocol\"\n\n// http://support.illumina.com/documents/products/technotes/technote_array_analysis_workflows.pdf\n// Illumina 2011, Microarray Data Analysis Workflows\n// explains how IDAT are converted to GTC with AutoCall\n\n// http://doi.org/10.1186/1471-2105-12-68\n// Ritchie et al. 2011 Comparing genotyping algorithms for Illumina’s Infinium whole-genome SNP BeadChips\n// explains Illumina normalization with minimum details (paper comparing GenCall GenTrain 1.0, Infinium, GenoSNP, CRLMM)\n\n// http://doi.org/10.1007/978-1-61779-555-8_29\n// Teo 2011 Genotype Calling for the Illumina Platform\n// explains Illumina normalization with details that are missing from the patent including the use of 400 homozygotes\n\n// http://doi.org/10.1093/bioinformatics/bts479\n// Goldstein et al. 2012 zCall: a rare variant caller for array-based genotyping\n// uses Illumina normalization but no details provided\n\n// http://doi.org/10.1093/bioinformatics/btr673\n// Li et al. 2012, M3: an improved SNP calling algorithm for Illumina BeadArray data\n// explains Illumina normalization with minimum details (paper about M3 caller)\n\n// http://doi.org/10.1093/bioinformatics/bts180\n// Shah et al. 2012, optiCall: a robust genotype-calling algorithm for rare, low-frequency and common variants\n// explains Illumina normalization with minimum details (paper about optiCall caller which uses Illumina normalization)\n\n// http://doi.org/10.1093/bioinformatics/btu107\n// Zhou et al. 
2014, iCall: a genotype-calling algorithm for rare, low-frequency and common variants on the Illumina\n// exome array paper about iCall which uses Illumina normalization\n\n// http://web.stat.tamu.edu/sheather/PDF/WZhou_MSProject.pdf\n// Zhou 2014, Segmentation-Based Detection of Mosaic Chromosomal Abnormality in Bladder Cancer Cells Using Whole Genome\n// SNP Array includes explanation of the normalization following Illumina's technical note\n\n// http://doi.org/10.1111/pbi.12183\n// Wang,S. et al. (2014) Characterization of polyploid wheat genomic diversity using a high-density 90,000 single\n// nucleotide polymorphism array. Plant Biotechnol J, 12, 787–796. introduces the polyploid clustering algorithm\n// released by Illumina on 2013-10-07\n\n// http://emea.illumina.com/content/dam/illumina-marketing/documents/products/technotes/gentrain3-technical-note-370-2016-015.pdf\n// Illumina 2016, Improved Genotype Clustering with GenTrain 3.0\n// explains that with less than 192 loci in a single normalization bin it will perform an affine normalization with two\n// degrees of freedom rather than six\n\n// http://www.illumina.com/content/dam/illumina/gcs/assembled-assets/marketing-literature/gentrain-tech-note-m-gl-01258/gentrain-tech-note-m-gl-01258.pdf\n// Illumina 2023, Genotype clustering with GenTrain 3.0\n// explains that with less than 192 loci in a single normalization bin it will perform an affine normalization with two\n// degrees of freedom rather than six\n\n#include <ctype.h>\n#include <getopt.h>\n#include <errno.h>\n#include <time.h>\n#include <dirent.h>\n#include <math.h>\n#include <float.h>\n#include <htslib/hts.h>\n#include <htslib/hfile.h>\n#include <htslib/khash.h>\n#include <htslib/ksort.h>\n#include <htslib/khash_str2int.h>\n#include \"bcftools.h\"\n#define IDAT2GTC_VERSION \"2026-01-26\"\n\n#define AUTOCALL_DATE_FORMAT_DFLT \"%m/%d/%y %#I:%M %p\" // equivalent to \"MM/dd/yyyy h:mm tt\"\n#define AUTOCALL_VERSION_DFLT 
\"3.0.0\"\n\nKSORT_INIT_GENERIC(float)\nKSORT_INIT_GENERIC(int)\n\n// void error(const char *format, ...)\n//{\n//     va_list ap;\n//     va_start(ap, format);\n//     vfprintf(stderr, format, ap);\n//     va_end(ap);\n//     exit(-1);\n// }\n//\n// static inline int iupac2bitmask(char iupac)\n//{\n//     const int A = 1;\n//     const int C = 2;\n//     const int G = 4;\n//     const int T = 8;\n//     if ( iupac >= 97 ) iupac -= 32;\n//     if ( iupac == 'A' ) return A;\n//     if ( iupac == 'C' ) return C;\n//     if ( iupac == 'G' ) return G;\n//     if ( iupac == 'T' ) return T;\n//     if ( iupac == 'M' ) return A|C;\n//     if ( iupac == 'R' ) return A|G;\n//     if ( iupac == 'W' ) return A|T;\n//     if ( iupac == 'S' ) return C|G;\n//     if ( iupac == 'Y' ) return C|T;\n//     if ( iupac == 'K' ) return G|T;\n//     if ( iupac == 'V' ) return A|C|G;\n//     if ( iupac == 'H' ) return A|C|T;\n//     if ( iupac == 'D' ) return A|G|T;\n//     if ( iupac == 'B' ) return C|G|T;\n//     if ( iupac == 'N' ) return A|C|G|T;\n//     return -1;\n// }\n//\n///**\n// *  mkdir_p() - create new directory for a file $fname\n// *  @fname:   the file name to create the directory for, the part after last \"/\" is ignored\n// */\n// void mkdir_p(const char *fmt, ...)\n//{\n//    va_list ap;\n//    va_start(ap, fmt);\n//    int n = vsnprintf(NULL, 0, fmt, ap) + 2;\n//    va_end(ap);\n//\n//    char *path = (char*)malloc(n);\n//    va_start(ap, fmt);\n//    vsnprintf(path, n, fmt, ap);\n//    va_end(ap);\n//\n//    char *tmp = strdup(path), *p = tmp+1;\n//    while (*p)\n//    {\n//        while (*p && *p!='/') p++;\n//        if ( !*p ) break;\n//        char ctmp = *p;\n//        *p = 0;\n//        int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);\n//        if ( ret!=0 && errno!=EEXIST ) error(\"Error creating directory %s: %s\\n\", path,strerror(errno));\n//        *p = ctmp;\n//        while ( *p && *p=='/' ) p++;\n//    }\n//    free(tmp);\n//    
free(path);\n//}\n\n/****************************************\n * hFILE READING FUNCTIONS              *\n ****************************************/\n\nstatic inline ssize_t HTS_RESULT_USED md5_hread(hFILE *fp, void *buffer, size_t nbytes, hts_md5_context *md5) {\n    ssize_t ret = hread(fp, buffer, nbytes);\n    if (md5 && ret > 0) hts_md5_update(md5, buffer, ret);\n    return ret;\n}\n\nstatic inline int md5_hgetc(hFILE *fp, hts_md5_context *md5) {\n    int c = hgetc(fp);\n    if (md5 && c != EOF) hts_md5_update(md5, &c, 1);\n    return c;\n}\n\n// read or skip a fixed number of bytes\nstatic void read_bytes(hFILE *hfile, void *buffer, size_t nbytes, hts_md5_context *md5) {\n    if (buffer) {\n        if (md5_hread(hfile, buffer, nbytes, md5) < nbytes) {\n            error(\"Failed to read %ld bytes from stream\\n\", nbytes);\n        }\n    } else {\n        int i, c = 0;\n        for (i = 0; i < nbytes; i++) c = md5_hgetc(hfile, md5);\n        if (c == EOF) error(\"Failed to reposition stream forward %ld bytes\\n\", nbytes);\n    }\n}\n\n// tests the end-of-file indicator for an hFILE\nstatic int heof(hFILE *hfile) {\n    if (hgetc(hfile) == EOF) return 1;\n    hfile->begin--;\n    return 0;\n}\n\n// read or skip a fixed length array\nstatic void read_array(hFILE *hfile, void **arr, size_t *m_arr, size_t nmemb, size_t size, size_t term,\n                       hts_md5_context *md5) {\n    if (arr) {\n        if (!m_arr) {\n            *arr = malloc((nmemb + term) * size);\n            if (!*arr) error(\"Failed to allocate memory for array\\n\");\n        } else if (*m_arr < nmemb + term) {\n            void *tmp = realloc(*arr, (nmemb + term) * size);\n            if (!tmp) error(\"Failed to allocate memory for array\\n\");\n            *arr = tmp;\n            *m_arr = nmemb + term;\n        }\n        if (md5_hread(hfile, *arr, nmemb * size, md5) < nmemb * size) {\n            error(\"Failed to read %ld bytes from stream\\n\", nmemb * size);\n        }\n    } 
else {\n        int i, c = 0;\n        for (i = 0; i < nmemb * size; i++) c = md5_hgetc(hfile, md5);\n        if (c == EOF) error(\"Failed to reposition stream forward %ld bytes\\n\", nmemb * size);\n    }\n}\n\n// read or skip a length-prefixed string\n// http://en.wikipedia.org/wiki/LEB128#Decode_unsigned_integer\nstatic void read_pfx_string(hFILE *hfile, char **str, size_t *m_str, hts_md5_context *md5) {\n    uint8_t byte;\n    size_t n = 0, shift = 0;\n    while (1) {\n        if (md5_hread(hfile, (void *)&byte, 1, md5) < 1) {\n            error(\"Failed to read 1 byte from stream\\n\");\n        }\n        n |= (size_t)(byte & 0x7F) << shift;\n        if (!(byte & 0x80)) break;\n        shift += 7;\n    }\n    if (n || m_str) {\n        read_array(hfile, (void **)str, m_str, n, 1, 1, md5);\n        if (str) (*str)[n] = '\\0';\n    }\n}\n\n// check whether file is compressed with gzip\nstatic int is_gzip(hFILE *hfile) {\n    uint8_t buffer[2];\n    if (hpeek(hfile, (void *)buffer, 2) < 2) error(\"Failed to read 2 bytes from stream\\n\");\n    return (buffer[0] == 0x1f && buffer[1] == 0x8b);\n}\n\nstatic inline int hwrite_uint16(hFILE *hfile, uint16_t num) { return hwrite(hfile, &num, sizeof(uint16_t)); }\n\nstatic inline int hwrite_int32(hFILE *hfile, int32_t num) { return hwrite(hfile, &num, sizeof(int32_t)); }\n\n// http://en.wikipedia.org/wiki/LEB128#Encode_unsigned_integer\nstatic int hwrite_pfx_string(hFILE *hfile, const char *str) {\n    if (!str) {\n        hputc(0, hfile);\n        return 0;\n    }\n    size_t n = strlen(str);\n    size_t value = n;\n    int ret = n;\n    do {\n        uint8_t byte = value & 0x7f;\n        value >>= 7;\n        if (value) byte ^= 0x80;\n        if (hputc(byte, hfile) == EOF) return -1;\n        ret++;\n    } while (value);\n    if (hwrite(hfile, str, n) < 0) return -1;\n    return ret;\n}\n\n/****************************************\n * IDAT FILE IMPLEMENTATION             *\n 
****************************************/\n\n// http://github.com/snewhouse/glu-genetics/blob/master/glu/lib/illumina.py\n// http://github.com/HenrikBengtsson/illuminaio/blob/master/R/readIDAT.R\n// /humgen/cnp04/sandbox/bobh/idat_parser/src/edu/mit/broad/gapcore/apps/infinium_idat_parser/InfiniumIDATParser.java\n\n#define NUM_SNPS_READ 1000 // ID_N_CORES\n// #define ... 100 // ID_BACKGROUNDS - not used\n// #define ... 101 // ID_BACKGROUND_DEVS - not used\n#define ILLUMINA_ID 102 // ID_BEAD_TYPES\n#define SD 103          // ID_DEVS\n#define MEAN 104        // ID_MEANS\n// #define ... 105 // ID_MEDIANS - not used\n// #define ... 106 // ID_N_BEADS - not used\n#define NBEADS 107 // ID_N_GOOD_BEADS\n// #define ... 108 // ID_TRIMMED_MEANS - not used\n#define MID_BLOCK 200         // ID_ILLUMICODES\n#define RUN_INFO 300          // ID_PROCESS_HISTORY\n#define RED_GREEN 400         // ID_TENTH_PERCENTILE\n#define IDAT_SNP_MANIFEST 401 // ID_SAMPLE_BEADSET\n#define SENTRIX_BARCODE 402   // ID_BARCODE\n#define CHIP_TYPE 403         // ID_SENTRIX_FORMAT\n#define SENTRIX_POSITION 404  // ID_SECTION_LABEL\n#define BEADSET 405           // ID_BEADSET\n#define IDAT_SAMPLE_NAME 406  // ID_DNA\n#define DESCRIPTION 407       // ID_OPA\n#define IDAT_SAMPLE_PLATE 408 // ID_DNA_PLATE\n#define IDAT_SAMPLE_WELL 409  // ID_WELL\n#define IDAT_SAMPLE_COUNT 410 // ID_SAMPLE_COUNT\n// #define ... 
411 // ID_DX - not used\n#define IDAT_VLN 510 // ID_VLN\n\ntypedef struct {\n    const char *chip_type;\n    int num_snps;\n    int num_mid_blocks;\n    const char *chip_type_guess;\n} chip_type_t;\n\nstatic chip_type_t chip_types[] = {\n    {\"1-95um_multi-swath_for_4x5M\", 4568350, 4568350, \"HumanOmni5-4-v1-0\"},\n    {\"1-95um_multi-swath_for_4x5M\", 4640213, 4640213, \"HumanOmni5-4v1-1\"},\n    {\"1-95um_multi-swath_for_4x5M\", 4685673, 4685673, \"InfiniumOmni5-4v1-2\"},\n    {\"1-95um_multi-swath_for_4x5M\", 4696316, 4696316, \"HumanOmni5-4-v1-0\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2266191, 2266191, \"Multi-EthnicGlobal\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2266367, 2266367, \"Multi-EthnicGlobal\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2266404, 2266404, \"Multi-EthnicGlobal\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2266406, 2266406, \"Multi-EthnicGlobal\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2268676, 2268676, \"MEGAEx_BioVU_15075710\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2315574, 2315574, \"Multi-EthnicGlobal\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2389000, 2389000, \"CCPMBiobankMEGA2_20002558X345183\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2508689, 2508689, \"GDA-8v1-0\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2550870, 2550870, \"HumanOmni2.5-8v1\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2563064, 2563064, \"HumanOmni25M-8v1-1\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2575219, 2575219, \"HumanOmni2.5-8v1\"},\n    {\"1-95um_multi-swath_for_8x2-5M\", 2605775, 2605775, \"HumanOmni25M-8v1-1\"},\n    {\"BeadChip 12x1\", 55300, 55300, \"humanmethylation27_270596_v1-2 ???\"},\n    {\"BeadChip 12x1Q\", 191668, 191668, \"CanineHD\"},\n    {\"BeadChip 12x1Q\", 299260, 299260, \"HumanCytoSNP-12v2-1\"},\n    {\"BeadChip 12x8\", 301084, 301084, \"HumanCore-12v1-0\"},\n    {\"BeadChip 12x8\", 304138, 304138, \"HumanExome-12v1-1\"},\n    {\"BeadChip 12x8\", 567727, 567727, \"HumanCoreExome-12-v1-0\"},\n    {\"BeadChip 
12x8\", 569060, 569060, \"HumanCoreExome-12-v1-0\"},\n    {\"BeadChip 12x8\", 573012, 573012, \"HumanCoreExome-12-v1-1\"},\n    {\"BeadChip 12x8\", 576769, 576769, \"HumanCoreExome-12-v1-1\"},\n    {\"BeadChip 12x8\", 622399, 622399, \"humanmethylation450_15017482_v-1-2 ???\"},\n    {\"BeadChip 12x8\", 722405, 722405, \"HumanOmniExpress-12-v1-1\"},\n    {\"BeadChip 12x8\", 734889, 734889, \"HumanOmniExpress-12-v1-0\"},\n    {\"BeadChip 12x8\", 736136, 736136, \"HumanOmniExpress-12-v1-0\"},\n    {\"BeadChip 1x12\", 577085, 8627, \"HumanHap550v3\"},\n    {\"BeadChip 1x12\", 661182, 49163, \"HumanHap650Yv3\"},\n    {\"BeadChip 1x40\", 1129736, 57373, \"Human1Mv1\"},\n    {\"BeadChip 1x40 66\", 1078890, 52497, \"Human1Mv1\"},\n    {\"BeadChip 24x1x4\", 306776, 306776, \"InfiniumCore-24v1-2\"},\n    {\"BeadChip 24x1x4\", 527136, 527136, \"OncoArray-500K\"},\n    {\"BeadChip 24x1x4\", 577781, 577781, \"HumanCoreExome-24v1-0\"},\n    {\"BeadChip 24x1x4\", 581261, 581261, \"HumanCoreExome-24v1-2\"},\n    {\"BeadChip 24x1x4\", 582684, 582684, \"HumanCoreExome-24v1-1\"},\n    {\"BeadChip 24x1x4\", 611866, 611866, \"HumanCoreExome-24v1-4\"},\n    {\"BeadChip 24x1x4\", 623302, 623302, \"PsychChip_15048346\"},\n    {\"BeadChip 24x1x4\", 623513, 623513, \"InfiniumPsychArray-24v1-1\"},\n    {\"BeadChip 24x1x4\", 638714, 638714, \"PsychChip_v1-1_15073391\"},\n    {\"BeadChip 24x1x4\", 647864, 647864, \"InfiniumPsychArray-24v1-3\"},\n    {\"BeadChip 24x1x4\", 663209, 663209, \"GSA-24v1-0\"},\n    {\"BeadChip 24x1x4\", 704215, 704215, \"GSA-24v3-0\"},\n    {\"BeadChip 24x1x4\", 708013, 708013, \"DeCodeGenetics_V1_20012591\"},\n    {\"BeadChip 24x1x4\", 710576, 710576, \"GSAMD-24v1-0_20011747\"},\n    {\"BeadChip 24x1x4\", 710606, 710606, \"GSAMD-24v1-0_20011747\"},\n    {\"BeadChip 24x1x4\", 710608, 710608, \"GSAMD-24v1-0_20011747\"},\n    {\"BeadChip 24x1x4\", 715653, 715653, \"HumanOmniExpress-24v1-1\"},\n    {\"BeadChip 24x1x4\", 716279, 716279, \"InfiniumOmniExpress-24v1-2\"},\n 
   {\"BeadChip 24x1x4\", 718963, 718963, \"HumanOmniExpress-24-v1-0\"},\n    {\"BeadChip 24x1x4\", 719234, 719234, \"HumanOmniExpress-24-v1-0\"},\n    {\"BeadChip 24x1x4\", 729110, 729110, \"ASA-24v1-0\"},\n    {\"BeadChip 24x1x4\", 733354, 733354, \"GSA-24v2-0\"},\n    {\"BeadChip 24x1x4\", 749019, 749019, \"DeCodeGenetics_V3_20032937X331991\"},\n    {\"BeadChip 24x1x4\", 751614, 751614, \"GSAMD-24v3-0-EA_20034606\"},\n    {\"BeadChip 24x1x4\", 766804, 766804, \"JSA-24v1-0\"},\n    {\"BeadChip 24x1x4\", 776509, 776509, \"ASA-24v1-0\"},\n    {\"BeadChip 24x1x4\", 780343, 780343, \"GSAMD-24v2-0_20024620\"},\n    {\"BeadChip 24x1x4\", 780509, 780509, \"GSAMD-24v2-0_20024620\"},\n    {\"BeadChip 24x1x4\", 818205, 818205, \"GSA-24v2-0\"},\n    {\"BeadChip 2x10\", 321354, 37161, \"HumanHap300v2\"},\n    {\"BeadChip 2x12\", 381079, 29275, \"HumanCNV370v1\"},\n    {\"BeadChip 2x20\", 561686, 54936, \"HumanHap550v3\"},\n    {\"BeadChip 2x6Q\", 1224000, 180026, \"Human1M-Duov3\"},\n    {\"BeadChip 2x6Q\", 1224629, 180026, \"Human1M-Duov3\"},\n    {\"BeadChip 48x4\", 730546, 730546, \"GSA-MD-48v4-0_20098041\"},\n    {\"BeadChip 4x10\", 2623923, 1300482, \"HumanOmni2.5-4v1\"},\n    {\"BeadChip 4x10\", 2623923, 1323441, \"HumanOmni2.5-4v1\"},\n    {\"BeadChip 4x10\", 2624666, 1300941, \"HumanOmni2.5-4v1\"},\n    {\"BeadChip 4x10\", 2624666, 1323725, \"HumanOmni2.5-4v1\"},\n    {\"BeadChip 4x10\", 2624671, 1323726, \"HumanOmni2.5-4v1\"},\n    {\"BeadChip 4x10\", 2655594, 1354653, \"HumanOmni2.5-4v1\"},\n    {\"BeadChip 4X1X14\", 1186430, 1186430, \"HumanOmni1-Quad_v1-0\"},\n    {\"BeadChip 4x2Q\", 376216, 186490, \"HumanCNV370-Quadv3\"},\n    {\"BeadChip 4x3Q\", 626122, 208778, \"Human610-Quadv1\"},\n    {\"BeadChip 4x3Q\", 667447, 208778, \"Human660W-Quad_v1\"},\n    {\"BeadChip 8x5\", 1052641, 1052641, \"infinium-methylationepic-v-1-0 ???\"},\n    {\"BeadChip 8x5\", 867478, 867478, \"CytoSNP-850K\"},\n    {\"BeadChip 8x5\", 988240, 988240, \"HumanOmniExpressExome-8-v1-1\"},\n 
   {\"BeadChip 8x5\", 989536, 989536, \"HumanOmniExpressExome-8-v1-1\"},\n    {\"BeadChip 8x5\", 992824, 992824, \"HumanOmniExpressExome-8-v1-4\"},\n    {\"BeadChip 8x5\", 996003, 996003, \"HumanOmniExpressExome-8-v1-2\"},\n    {\"BeadChip 8x5\", 996055, 996055, \"HumanOmniExpressExome-8-v1-2\"},\n    {\"SLIDE.15028542.24x1x3\", 307984, 307984, \"HumanCore-24v1-0\"},\n    {\"SLIDE.15028542.24x1x3\", 311460, 311460, \"HumanCore-24v1-0\"},\n    {NULL, 0, 0, NULL}};\n\ntypedef struct {\n    char *run_time;\n    char *block_type;\n    char *block_pars;\n    char *block_code;\n    char *code_version;\n} RunInfo;\n\ntypedef struct {\n    char *fn;\n    hFILE *hfile;\n    int64_t version;\n    int32_t number_toc_entries;\n    uint16_t *id;\n    int64_t *toc;\n    int32_t num_snps;\n    int32_t num_mid_blocks;\n    int32_t *ilmn_id;\n    uint16_t *sd;\n    uint16_t *mean;\n    uint8_t *nbeads;\n    const uint16_t *trimmed_mean; // only used for historical purposes\n    uint8_t *mid_block;\n    uint8_t red_green[4];\n    char *snp_manifest;\n    char *sentrix_barcode;\n    char *chip_type;\n    char *sentrix_position;\n    char *beadset;\n    char *sample_name;\n    char *description;\n    char *sample_plate;\n    char *sample_well;\n    int32_t sample_count;\n    char *vln;\n    RunInfo *run_infos;\n    int32_t m_run_infos;\n    const char *chip_type_guess;\n    const char *imaging_date;\n    const char *scanner_data;\n    void *ilmn_id2index;\n} idat_t;\n\nKHASH_MAP_INIT_INT(32, int32_t)\n\nstatic int idat_read(idat_t *idat, uint16_t id) {\n    int i;\n    for (i = 0; i < idat->number_toc_entries && id != idat->id[i]; i++);\n    if (i == idat->number_toc_entries) return -1;\n    if (hseek(idat->hfile, idat->toc[i], SEEK_SET) < 0)\n        error(\"Fail to seek to position %ld in IDAT %s file\\n\", idat->toc[i], idat->fn);\n\n    switch (id) {\n    case NUM_SNPS_READ:\n        read_bytes(idat->hfile, (void *)&idat->num_snps, sizeof(int32_t), NULL);\n        break;\n    case 
ILLUMINA_ID:\n        idat->ilmn_id = (int32_t *)malloc(idat->num_snps * sizeof(int32_t));\n        read_bytes(idat->hfile, (void *)idat->ilmn_id, idat->num_snps * sizeof(int32_t), NULL);\n        int ret;\n        idat->ilmn_id2index = kh_init(32);\n        khash_t(32) *hash = (khash_t(32) *)idat->ilmn_id2index;\n        for (i = 0; i < idat->num_snps; i++) {\n            khiter_t k = kh_put(32, hash, idat->ilmn_id[i], &ret);\n            if (ret < 0) error(\"Unable to insert Illumina ID %d in hash table\\n\", idat->ilmn_id[i]);\n            if (ret > 0)\n                kh_val(hash, k) = kh_size(hash) - 1;\n            else\n                error(\"Duplicate Illumina ID %d in hash table\\n\", idat->ilmn_id[i]);\n        }\n        break;\n    case SD:\n        idat->sd = (uint16_t *)malloc(idat->num_snps * sizeof(uint16_t));\n        read_bytes(idat->hfile, (void *)idat->sd, idat->num_snps * sizeof(uint16_t), NULL);\n        break;\n    case MEAN:\n        idat->mean = (uint16_t *)malloc(idat->num_snps * sizeof(uint16_t));\n        read_bytes(idat->hfile, (void *)idat->mean, idat->num_snps * sizeof(uint16_t), NULL);\n        idat->trimmed_mean = idat->mean;\n        break;\n    case NBEADS:\n        idat->nbeads = (uint8_t *)malloc(idat->num_snps * sizeof(uint8_t));\n        read_bytes(idat->hfile, (void *)idat->nbeads, idat->num_snps * sizeof(uint8_t), NULL);\n        break;\n    case MID_BLOCK:\n        read_bytes(idat->hfile, (void *)&idat->num_mid_blocks, sizeof(int32_t), NULL);\n        idat->mid_block = (uint8_t *)malloc(idat->num_mid_blocks * sizeof(uint8_t));\n        read_bytes(idat->hfile, (void *)idat->mid_block, idat->num_mid_blocks * sizeof(uint8_t), NULL);\n        break;\n    case RED_GREEN:\n        read_bytes(idat->hfile, (void *)&idat->red_green, 4 * sizeof(uint8_t), NULL);\n        break;\n    case IDAT_SNP_MANIFEST:\n        read_pfx_string(idat->hfile, &idat->snp_manifest, NULL, NULL);\n        break;\n    case SENTRIX_BARCODE:\n        
read_pfx_string(idat->hfile, &idat->sentrix_barcode, NULL, NULL);\n        break;\n    case CHIP_TYPE:\n        read_pfx_string(idat->hfile, &idat->chip_type, NULL, NULL);\n        break;\n    case SENTRIX_POSITION:\n        read_pfx_string(idat->hfile, &idat->sentrix_position, NULL, NULL);\n        break;\n    case BEADSET:\n        read_pfx_string(idat->hfile, &idat->beadset, NULL, NULL);\n        break;\n    case IDAT_SAMPLE_NAME:\n        read_pfx_string(idat->hfile, &idat->sample_name, NULL, NULL);\n        break;\n    case DESCRIPTION:\n        read_pfx_string(idat->hfile, &idat->description, NULL, NULL);\n        break;\n    case IDAT_SAMPLE_PLATE:\n        read_pfx_string(idat->hfile, &idat->sample_plate, NULL, NULL);\n        break;\n    case IDAT_SAMPLE_WELL:\n        read_pfx_string(idat->hfile, &idat->sample_well, NULL, NULL);\n        break;\n    case IDAT_SAMPLE_COUNT:\n        read_bytes(idat->hfile, (void *)&idat->sample_count, sizeof(int32_t), NULL);\n        break;\n    case IDAT_VLN:\n        read_pfx_string(idat->hfile, &idat->vln, NULL, NULL);\n        break;\n    case RUN_INFO:\n        read_bytes(idat->hfile, (void *)&idat->m_run_infos, sizeof(int32_t), NULL);\n        idat->run_infos = (RunInfo *)calloc(idat->m_run_infos, sizeof(RunInfo));\n        for (i = 0; i < idat->m_run_infos; i++) {\n            read_pfx_string(idat->hfile, &idat->run_infos[i].run_time, NULL, NULL);\n            read_pfx_string(idat->hfile, &idat->run_infos[i].block_type, NULL, NULL);\n            read_pfx_string(idat->hfile, &idat->run_infos[i].block_pars, NULL, NULL);\n            read_pfx_string(idat->hfile, &idat->run_infos[i].block_code, NULL, NULL);\n            read_pfx_string(idat->hfile, &idat->run_infos[i].code_version, NULL, NULL);\n        }\n        break;\n    default:\n        error(\"IDAT file format does not support TOC entry %d\\n\", id);\n        break;\n    }\n    return 0;\n}\n\nstatic idat_t *idat_init(const char *fn, int load_arrays) {\n    
idat_t *idat = (idat_t *)calloc(1, sizeof(idat_t));\n    idat->fn = strdup(fn);\n    idat->hfile = hopen(idat->fn, \"rb\");\n    if (idat->hfile == NULL) error(\"Could not open %s: %s\\n\", idat->fn, strerror(errno));\n    if (is_gzip(idat->hfile)) error(\"File %s is gzip compressed and currently cannot be sought\\n\", idat->fn);\n\n    int i;\n    uint8_t buffer[4];\n    if (hread(idat->hfile, (void *)buffer, 4) < 4) error(\"Failed to read magic number from %s file\\n\", idat->fn);\n    if (memcmp(buffer, \"IDAT\", 4) != 0) error(\"IDAT file %s format identifier is bad\\n\", idat->fn);\n\n    read_bytes(idat->hfile, (void *)&idat->version, sizeof(int64_t), NULL);\n    if (idat->version < 3)\n        error(\"Cannot read IDAT file %s. Unsupported IDAT file format version: %ld\\n\", idat->fn, idat->version);\n\n    read_bytes(idat->hfile, (void *)&idat->number_toc_entries, sizeof(int32_t), NULL);\n    idat->id = (uint16_t *)malloc(idat->number_toc_entries * sizeof(uint16_t));\n    idat->toc = (int64_t *)malloc(idat->number_toc_entries * sizeof(int64_t));\n    for (i = 0; i < idat->number_toc_entries; i++) {\n        read_bytes(idat->hfile, (void *)&idat->id[i], sizeof(uint16_t), NULL);\n        read_bytes(idat->hfile, (void *)&idat->toc[i], sizeof(int64_t), NULL);\n    }\n\n    for (i = 0; i < idat->number_toc_entries; i++) {\n        if (!load_arrays && idat->id[i] <= MID_BLOCK) {\n            if (idat->id[i] == MID_BLOCK) {\n                if (hseek(idat->hfile, idat->toc[i], SEEK_SET) < 0)\n                    error(\"Fail to seek to position %ld in IDAT %s file\\n\", idat->toc[i], idat->fn);\n                read_bytes(idat->hfile, (void *)&idat->num_mid_blocks, sizeof(int32_t), NULL);\n            }\n            continue;\n        }\n        idat_read(idat, idat->id[i]);\n    }\n\n    if (idat->chip_type) {\n        const chip_type_t *ptr;\n        for (ptr = chip_types; ptr->chip_type; ptr++) {\n            if (strcmp(idat->chip_type, ptr->chip_type) == 0 && 
ptr->num_snps == idat->num_snps\n                && ptr->num_mid_blocks == idat->num_mid_blocks)\n                idat->chip_type_guess = ptr->chip_type_guess;\n        }\n    }\n\n    for (i = 0; i < idat->m_run_infos; i++) {\n        if (strcmp(idat->run_infos[i].block_type, \"Scan\") != 0) continue;\n        idat->imaging_date = idat->run_infos[i].run_time;\n        idat->scanner_data = idat->run_infos[i].block_pars;\n    }\n\n    return idat;\n}\n\nstatic void idat_destroy(idat_t *idat) {\n    if (!idat) return;\n    if (hclose(idat->hfile) < 0) error(\"Error closing IDAT file %s\\n\", idat->fn);\n    free(idat->fn);\n    free(idat->id);\n    free(idat->toc);\n    free(idat->snp_manifest);\n    free(idat->sentrix_barcode);\n    free(idat->chip_type);\n    free(idat->sentrix_position);\n    free(idat->beadset);\n    free(idat->sample_name);\n    free(idat->description);\n    free(idat->sample_plate);\n    free(idat->sample_well);\n    free(idat->vln);\n    int i;\n    for (i = 0; i < idat->m_run_infos; i++) {\n        free(idat->run_infos[i].run_time);\n        free(idat->run_infos[i].block_type);\n        free(idat->run_infos[i].block_pars);\n        free(idat->run_infos[i].block_code);\n        free(idat->run_infos[i].code_version);\n    }\n    free(idat->run_infos);\n    free(idat->ilmn_id);\n    free(idat->sd);\n    free(idat->mean);\n    free(idat->nbeads);\n    free(idat->mid_block);\n    if (idat->ilmn_id2index) kh_destroy(32, idat->ilmn_id2index);\n    free(idat);\n}\n\nstatic void idat_to_csv(const idat_t *idat, FILE *stream, int verbose) {\n    int i;\n    fprintf(stream, \"Illumina, Inc.\\n\");\n    fprintf(stream, \"[Heading]\\n\");\n    fprintf(stream, \"Descriptor File Name,%s\\n\", strrchr(idat->fn, '/') ? 
strrchr(idat->fn, '/') + 1 : idat->fn);\n    fprintf(stream, \"IDAT file version,%ld\\n\", idat->version);\n    fprintf(stream, \"Number of TOC entries,%d\\n\", idat->number_toc_entries);\n    fprintf(stream, \"Probes Count,%d\\n\", idat->num_snps);\n    fprintf(stream, \"Mid Blocks Count,%d\\n\", idat->num_mid_blocks);\n    fprintf(stream, \"Red Green,%02x %02x %02x %02x\\n\", idat->red_green[0], idat->red_green[1], idat->red_green[2],\n            idat->red_green[3]);\n    fprintf(stream, \"SNP Manifest,%s\\n\", idat->snp_manifest ? idat->snp_manifest : \"\");\n    fprintf(stream, \"Sentrix Barcode,%s\\n\", idat->sentrix_barcode);\n    fprintf(stream, \"Chip Type,%s\\n\", idat->chip_type);\n    fprintf(stream, \"Sentrix Position,%s\\n\", idat->sentrix_position);\n    fprintf(stream, \"BeadSet,%s\\n\", idat->beadset ? idat->beadset : \"\");\n    fprintf(stream, \"Sample Name,%s\\n\", idat->sample_name ? idat->sample_name : \"\");\n    fprintf(stream, \"Description,%s\\n\", idat->description ? idat->description : \"\");\n    fprintf(stream, \"Sample Plate,%s\\n\", idat->sample_plate ? idat->sample_plate : \"\");\n    fprintf(stream, \"Sample Well,%s\\n\", idat->sample_well ? idat->sample_well : \"\");\n    fprintf(stream, \"Sample Count,%d\\n\", idat->sample_count);\n    fprintf(stream, \"Vln,%s\\n\", idat->vln ? idat->vln : \"\");\n    fprintf(stream, \"Chip Prefix (Guess),%s\\n\", idat->chip_type_guess ? idat->chip_type_guess : \"Unknown\");\n    fprintf(stream, \"[Assay]\\n\");\n    fprintf(stream, \"IlmnID,Sd,Mean,Nbeads\\n\");\n    if (verbose) {\n        for (i = 0; i < idat->num_snps; i++)\n            fprintf(stream, \"%d,%d,%d,%d\\n\", idat->ilmn_id[i], idat->sd[i], idat->mean[i], idat->nbeads[i]);\n        fprintf(stream, \"[Mid Blocks]\\n\");\n        for (i = 0; i < idat->num_mid_blocks; i++) fprintf(stream, \"%d\\n\", idat->mid_block[i]);\n    } else {\n        fprintf(stream, \"... 
use --verbose to visualize Assay data ...\\n\");\n        fprintf(stream, \"[Mid Blocks]\\n\");\n        fprintf(stream, \"... use --verbose to visualize Mid Blocks data ...\\n\");\n    }\n    fprintf(stream, \"[Run Infos]\\n\");\n    for (i = 0; i < idat->m_run_infos; i++) {\n        fprintf(stream, \"%s\\t%s\\t%s\\t%s\\t%s\\n\", idat->run_infos[i].run_time, idat->run_infos[i].block_type,\n                idat->run_infos[i].block_pars, idat->run_infos[i].block_code, idat->run_infos[i].code_version);\n    }\n}\n\nstatic void idats_to_tsv(idat_t **idats, int n, FILE *stream) {\n    fprintf(stream,\n            \"idat\\tnumber_probes\\tnumber_mid_blocks\\tred_green\\tmanifest_file\\tsentrix_\"\n            \"barcode\\tchip_type\\t\"\n            \"sentrix_position\\tbeadset\\tsample_name\\tdescription\\tsample_plate\\tsample_\"\n            \"well\\tsample_count\\tvln\\t\"\n            \"chip_type_guess\\tscan_date\\tscanner_data\\n\");\n    int i;\n    for (i = 0; i < n; i++) {\n        idat_t *idat = idats[i];\n        fprintf(stream,\n                \"%s\\t%d\\t%d\\t%02x %02x %02x \"\n                \"%02x\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%d\\t%s\\t%s\\t%s\\t%s\\n\",\n                strrchr(idat->fn, '/') ? strrchr(idat->fn, '/') + 1 : idat->fn, idat->num_snps, idat->num_mid_blocks,\n                idat->red_green[0], idat->red_green[1], idat->red_green[2], idat->red_green[3],\n                idat->snp_manifest ? idat->snp_manifest : \"\", idat->sentrix_barcode, idat->chip_type,\n                idat->sentrix_position, idat->beadset ? idat->beadset : \"\", idat->sample_name ? idat->sample_name : \"\",\n                idat->description ? idat->description : \"\", idat->sample_plate ? idat->sample_plate : \"\",\n                idat->sample_well ? idat->sample_well : \"\", idat->sample_count, idat->vln ? idat->vln : \"\",\n                idat->chip_type_guess ? idat->chip_type_guess : \"Unknown\", idat->imaging_date ? 
idat->imaging_date : \"\",\n                idat->scanner_data ? idat->scanner_data : \"\");\n    }\n}\n\n/****************************************\n * GTC FILE IMPLEMENTATION              *\n ****************************************/\n\n// http://github.com/broadinstitute/picard/blob/master/src/main/java/picard/arrays/illumina/InfiniumGTCFile.java\n// http://github.com/Illumina/BeadArrayFiles/blob/develop/docs/GTC_File_Format_v5.pdf\n// http://github.com/Illumina/BeadArrayFiles/blob/develop/module/GenotypeCalls.py\n\n#define NUM_SNPS 1\n#define PLOIDY 2      // AutoConvert 2.0\n#define PLOIDY_TYPE 3 // AutoConvert 2.0\n#define GTC_SAMPLE_NAME 10\n#define GTC_SAMPLE_PLATE 11\n#define GTC_SAMPLE_WELL 12\n#define CLUSTER_FILE 100\n#define GTC_SNP_MANIFEST 101\n#define IMAGING_DATE 200\n#define AUTOCALL_DATE 201\n#define AUTOCALL_VERSION 300\n#define NORMALIZATION_TRANSFORMS 400\n#define CONTROLS_X 500\n#define CONTROLS_Y 501\n#define RAW_X 1000\n#define RAW_Y 1001\n#define GENOTYPES 1002\n#define BASE_CALLS 1003\n#define GENOTYPE_SCORES 1004\n#define SCANNER_DATA 1005\n#define CALL_RATE 1006\n#define GENDER 1007\n#define LOGR_DEV 1008\n#define GC10 1009\n#define DX 1010\n#define SAMPLE_DATA 1011\n#define B_ALLELE_FREQS 1012   // AutoConvert 2.0\n#define LOGR_RATIOS 1013      // AutoConvert 2.0\n#define PERCENTILES_X 1014    // AutoConvert 2.0\n#define PERCENTILES_Y 1015    // AutoConvert 2.0\n#define SLIDE_IDENTIFIER 1016 // AutoConvert 2.0\n\n// static const char *code2genotype[] = {\n//     \"NC\",       \"AA\",       \"AB\",       \"BB\",       \"NULL\",     \"A\",        \"B\",        \"AAA\",\n//     \"AAB\",      \"ABB\",      \"BBB\",      \"AAAA\",     \"AAAB\",     \"AABB\",     \"ABBB\",     \"BBBB\",\n//     \"AAAAA\",    \"AAAAB\",    \"AAABB\",    \"AABBB\",    \"ABBBB\",    \"BBBBB\",    \"AAAAAA\",   \"AAAAAB\",\n//     \"AAAABB\",   \"AAABBB\",   \"AABBBB\",   \"ABBBBB\",   \"BBBBBB\",   \"AAAAAAA\",  \"AAAAAAB\",  \"AAAAABB\",\n//     \"AAAABBB\",  
\"AAABBBB\",  \"AABBBBB\",  \"ABBBBBB\",  \"BBBBBBB\",  \"AAAAAAAA\", \"AAAAAAAB\", \"AAAAAABB\",\n//     \"AAAAABBB\", \"AAAABBBB\", \"AAABBBBB\", \"AABBBBBB\", \"ABBBBBBB\", \"BBBBBBBB\"};\n\ntypedef struct {\n    int32_t version;\n    float offset_x;\n    float offset_y;\n    float scale_x;\n    float scale_y;\n    float shear;\n    float theta;\n    float cvx;\n    float cvy;\n    float nn12;\n    float rr12;\n    float taa;\n    float tbb;\n} XForm;\n\ntypedef char BaseCall[2];\n\ntypedef struct {\n    char *scanner_name;\n    int32_t pmt_green;\n    int32_t pmt_red;\n    char *scanner_version;\n    char *imaging_user;\n} ScannerData;\n\ntypedef struct {\n    float p50gc;\n    int32_t num_calls;\n    int32_t num_no_calls;\n    int32_t num_intensity_only;\n} SampleData;\n\ntypedef uint16_t Percentiles[3];\n\ntypedef struct {\n    char *fn;\n    hFILE *hfile;\n    int32_t version;\n    int32_t number_toc_entries;\n    uint16_t *id;\n    int32_t *toc;\n    int32_t num_snps;\n    int32_t ploidy;\n    int32_t ploidy_type;\n    char *sample_name;\n    char *sample_plate;\n    char *sample_well;\n    char *cluster_file;\n    char *snp_manifest;\n    char *imaging_date;\n    char *autocall_date;\n    char *autocall_version;\n    XForm *normalization_transforms;\n    size_t m_normalization_transforms;\n    uint16_t *controls_x;\n    size_t m_controls_x;\n    uint16_t *controls_y;\n    size_t m_controls_y;\n    ScannerData scanner_data;\n    float call_rate;\n    char gender;\n    float logr_dev;\n    float p10gc;\n    int32_t dx;\n    SampleData sample_data;\n    Percentiles percentiles_x;\n    Percentiles percentiles_y;\n    char *sentrix_id;\n\n    char *display_name;\n    float *sin_theta; // precomputed sine transforms\n    float *cos_theta; // precomputed cosine transforms\n\n    uint16_t *raw_x;\n    size_t m_raw_x;\n    uint16_t *raw_y;\n    size_t m_raw_y;\n    uint8_t *genotypes;\n    size_t m_genotypes;\n    BaseCall *base_calls;\n    size_t m_base_calls;\n   
 float *genotype_scores;\n    size_t m_genotype_scores;\n    float *b_allele_freqs;\n    size_t m_b_allele_freqs;\n    float *logr_ratios;\n    size_t m_logr_ratios;\n} gtc_t;\n\n// returns the length of a string including the variable-length prefix encoding the number of characters\nstatic int leb128_strlen(const char *s) {\n    if (!s) return 1;\n    size_t n = strlen(s);\n    size_t value = n++;\n    while (value >>= 7) n++;\n    return n;\n}\n\nstatic int gtc_write(const gtc_t *gtc, const char *fn, int gtc_file_version) {\n    hFILE *hfile = hopen(fn, \"wb\");\n    if (hfile == NULL) error(\"Could not open %s: %s\\n\", fn, strerror(errno));\n    const uint8_t header[4] = {'g', 't', 'c', gtc_file_version};\n    if (hwrite(hfile, header, 4) < 0) return -1;\n    int32_t number_toc_entries = gtc_file_version == 3 ? 24 : 31;\n    if (hwrite_int32(hfile, number_toc_entries) < 0) return -1;\n    int offset = 4 + sizeof(int32_t) + number_toc_entries * (sizeof(uint16_t) + sizeof(int32_t));\n    if (hwrite_uint16(hfile, NUM_SNPS) < 0) return -1;\n    if (hwrite_int32(hfile, gtc->num_snps) < 0) return -1;\n    if (gtc_file_version != 3) {\n        if (hwrite_uint16(hfile, PLOIDY) < 0) return -1;\n        if (hwrite_int32(hfile, gtc->ploidy) < 0) return -1;\n        if (hwrite_uint16(hfile, PLOIDY_TYPE) < 0) return -1;\n        if (hwrite_int32(hfile, gtc->ploidy_type) < 0) return -1;\n    }\n    if (hwrite_uint16(hfile, GTC_SAMPLE_NAME) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += leb128_strlen(gtc->sample_name);\n    if (hwrite_uint16(hfile, GTC_SAMPLE_PLATE) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += leb128_strlen(gtc->sample_plate);\n    if (hwrite_uint16(hfile, GTC_SAMPLE_WELL) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += leb128_strlen(gtc->sample_well);\n    if (hwrite_uint16(hfile, CLUSTER_FILE) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 
0) return -1;\n    offset += leb128_strlen(gtc->cluster_file);\n    if (hwrite_uint16(hfile, GTC_SNP_MANIFEST) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += leb128_strlen(gtc->snp_manifest);\n    if (hwrite_uint16(hfile, IMAGING_DATE) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += leb128_strlen(gtc->imaging_date);\n    if (hwrite_uint16(hfile, AUTOCALL_DATE) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += leb128_strlen(gtc->autocall_date);\n    if (hwrite_uint16(hfile, AUTOCALL_VERSION) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += leb128_strlen(gtc->autocall_version);\n    if (hwrite_uint16(hfile, NORMALIZATION_TRANSFORMS) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += gtc->m_normalization_transforms * sizeof(XForm) + sizeof(int32_t);\n    if (hwrite_uint16(hfile, CONTROLS_X) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += gtc->m_controls_x * sizeof(uint16_t) + sizeof(int32_t);\n    if (hwrite_uint16(hfile, CONTROLS_Y) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += gtc->m_controls_y * sizeof(uint16_t) + sizeof(int32_t);\n    if (hwrite_uint16(hfile, RAW_X) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += gtc->num_snps * sizeof(uint16_t) + sizeof(int32_t);\n    if (hwrite_uint16(hfile, RAW_Y) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += gtc->num_snps * sizeof(uint16_t) + sizeof(int32_t);\n    if (hwrite_uint16(hfile, GENOTYPES) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += gtc->num_snps * sizeof(uint8_t) + sizeof(int32_t);\n    if (hwrite_uint16(hfile, BASE_CALLS) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += gtc->num_snps * sizeof(BaseCall) + sizeof(int32_t);\n   
 if (hwrite_uint16(hfile, GENOTYPE_SCORES) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += gtc->num_snps * sizeof(float) + sizeof(int32_t);\n    if (hwrite_uint16(hfile, SCANNER_DATA) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += leb128_strlen(gtc->scanner_data.scanner_name) + sizeof(float) + sizeof(float)\n              + leb128_strlen(gtc->scanner_data.scanner_version) + leb128_strlen(gtc->scanner_data.imaging_user);\n    if (hwrite_uint16(hfile, CALL_RATE) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += sizeof(float);\n    if (hwrite_uint16(hfile, GENDER) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += sizeof(char);\n    if (hwrite_uint16(hfile, LOGR_DEV) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += sizeof(float);\n    if (hwrite_uint16(hfile, GC10) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += sizeof(float);\n    if (hwrite_uint16(hfile, DX) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += sizeof(int32_t);\n    if (hwrite_uint16(hfile, SAMPLE_DATA) < 0) return -1;\n    if (hwrite_int32(hfile, offset) < 0) return -1;\n    offset += sizeof(SampleData);\n    if (gtc_file_version != 3) {\n        if (hwrite_uint16(hfile, B_ALLELE_FREQS) < 0) return -1;\n        if (hwrite_int32(hfile, offset) < 0) return -1;\n        offset += gtc->num_snps * sizeof(float) + sizeof(int32_t);\n        if (hwrite_uint16(hfile, LOGR_RATIOS) < 0) return -1;\n        if (hwrite_int32(hfile, offset) < 0) return -1;\n        offset += gtc->num_snps * sizeof(float) + sizeof(int32_t);\n        if (hwrite_uint16(hfile, PERCENTILES_X) < 0) return -1;\n        if (hwrite_int32(hfile, offset) < 0) return -1;\n        offset += sizeof(Percentiles);\n        if (hwrite_uint16(hfile, PERCENTILES_Y) < 0) return -1;\n        if 
(hwrite_int32(hfile, offset) < 0) return -1;\n        offset += sizeof(Percentiles);\n        if (hwrite_uint16(hfile, SLIDE_IDENTIFIER) < 0) return -1;\n        if (hwrite_int32(hfile, offset) < 0) return -1;\n        offset += leb128_strlen(gtc->sentrix_id);\n    }\n\n    if (hwrite_pfx_string(hfile, gtc->sample_name) < 0) return -1;\n    if (hwrite_pfx_string(hfile, gtc->sample_plate) < 0) return -1;\n    if (hwrite_pfx_string(hfile, gtc->sample_well) < 0) return -1;\n    if (hwrite_pfx_string(hfile, gtc->cluster_file) < 0) return -1;\n    if (hwrite_pfx_string(hfile, gtc->snp_manifest) < 0) return -1;\n    if (hwrite_pfx_string(hfile, gtc->imaging_date) < 0) return -1;\n    if (hwrite_pfx_string(hfile, gtc->autocall_date) < 0) return -1;\n    if (hwrite_pfx_string(hfile, gtc->autocall_version) < 0) return -1;\n    if (hwrite(hfile, (const void *)&gtc->m_normalization_transforms, sizeof(int32_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)gtc->normalization_transforms, gtc->m_normalization_transforms * sizeof(XForm)) < 0)\n        return -1;\n    if (hwrite(hfile, (const void *)&gtc->m_controls_x, sizeof(int32_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)gtc->controls_x, gtc->m_controls_x * sizeof(uint16_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)&gtc->m_controls_y, sizeof(int32_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)gtc->controls_y, gtc->m_controls_y * sizeof(uint16_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)&gtc->num_snps, sizeof(int32_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)gtc->raw_x, gtc->num_snps * sizeof(uint16_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)&gtc->num_snps, sizeof(int32_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)gtc->raw_y, gtc->num_snps * sizeof(uint16_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)&gtc->num_snps, sizeof(int32_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)gtc->genotypes, gtc->num_snps * 
sizeof(uint8_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)&gtc->num_snps, sizeof(int32_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)gtc->base_calls, gtc->num_snps * sizeof(BaseCall)) < 0) return -1;\n    if (hwrite(hfile, (const void *)&gtc->num_snps, sizeof(int32_t)) < 0) return -1;\n    if (hwrite(hfile, (const void *)gtc->genotype_scores, gtc->num_snps * sizeof(float)) < 0) return -1;\n    if (hwrite_pfx_string(hfile, gtc->scanner_data.scanner_name) < 0) return -1;\n    if (hwrite(hfile, &gtc->scanner_data.pmt_green, sizeof(float)) < 0) return -1;\n    if (hwrite(hfile, &gtc->scanner_data.pmt_red, sizeof(float)) < 0) return -1;\n    if (hwrite_pfx_string(hfile, gtc->scanner_data.scanner_version) < 0) return -1;\n    if (hwrite_pfx_string(hfile, gtc->scanner_data.imaging_user) < 0) return -1;\n    if (hwrite(hfile, &gtc->call_rate, sizeof(float)) < 0) return -1;\n    if (hwrite(hfile, &gtc->gender, sizeof(char)) < 0) return -1;\n    if (hwrite(hfile, &gtc->logr_dev, sizeof(float)) < 0) return -1;\n    if (hwrite(hfile, &gtc->p10gc, sizeof(float)) < 0) return -1;\n    if (hwrite(hfile, &gtc->dx, sizeof(int32_t)) < 0) return -1;\n    if (hwrite(hfile, &gtc->sample_data, sizeof(SampleData)) < 0) return -1;\n    if (gtc_file_version != 3) {\n        if (hwrite(hfile, (const void *)&gtc->num_snps, sizeof(int32_t)) < 0) return -1;\n        if (hwrite(hfile, (const void *)gtc->b_allele_freqs, gtc->num_snps * sizeof(float)) < 0) return -1;\n        if (hwrite(hfile, (const void *)&gtc->num_snps, sizeof(int32_t)) < 0) return -1;\n        if (hwrite(hfile, (const void *)gtc->logr_ratios, gtc->num_snps * sizeof(float)) < 0) return -1;\n        if (hwrite(hfile, (const void *)gtc->percentiles_x, sizeof(Percentiles)) < 0) return -1;\n        if (hwrite(hfile, (const void *)gtc->percentiles_y, sizeof(Percentiles)) < 0) return -1;\n        if (hwrite_pfx_string(hfile, gtc->sentrix_id) < 0) return -1;\n    }\n    if (hclose(hfile) < 0) error(\"Error 
closing GTC file %s\\n\", fn);\n    return 0;\n}\n\nstatic void gtc_destroy(gtc_t *gtc) {\n    if (!gtc) return;\n    if (gtc->hfile && hclose(gtc->hfile) < 0) error(\"Error closing GTC file %s\\n\", gtc->fn);\n    free(gtc->fn);\n    free(gtc->id);\n    free(gtc->toc);\n    free(gtc->sample_name);\n    free(gtc->sample_plate);\n    free(gtc->sample_well);\n    free(gtc->cluster_file);\n    free(gtc->snp_manifest);\n    free(gtc->imaging_date);\n    free(gtc->autocall_date);\n    free(gtc->autocall_version);\n    free(gtc->normalization_transforms);\n    free(gtc->controls_x);\n    free(gtc->controls_y);\n    free(gtc->scanner_data.scanner_name);\n    free(gtc->scanner_data.scanner_version);\n    free(gtc->scanner_data.imaging_user);\n    free(gtc->sentrix_id);\n\n    free(gtc->display_name);\n    free(gtc->sin_theta);\n    free(gtc->cos_theta);\n\n    free(gtc->raw_x);\n    free(gtc->raw_y);\n    free(gtc->genotypes);\n    free(gtc->base_calls);\n    free(gtc->genotype_scores);\n    free(gtc->b_allele_freqs);\n    free(gtc->logr_ratios);\n    free(gtc);\n}\n\n/****************************************\n * BPM FILE IMPLEMENTATION              *\n ****************************************/\n\n// http://github.com/snewhouse/glu-genetics/blob/master/glu/lib/illumina.py\n// http://github.com/Illumina/BeadArrayFiles/blob/develop/module/BeadPoolManifest.py\n\ntypedef struct {\n    int32_t version;\n    uint8_t norm_id; // Normalization lookups from manifest. 
This indexes into list of\n                     // normalization transforms read from GTC file\n    char *ilmn_id;   // IlmnID (probe identifier) of locus\n    char *name;      // Name (variant identifier) of locus\n    int32_t index;\n    char *ilmn_strand; // TOP BOT PLUS MINUS or Top Bot P M\n    char *snp;         // SNP value for locus (e.g., [A/C])\n    char *chrom;       // Chromosome for the locus (e.g., XY)\n    char *ploidy;\n    char *species;\n    char *map_info; // Mapping location of locus\n    char *customer_strand;\n    int32_t address_a;        // AddressA ID of locus\n    char *allele_a_probe_seq; // CSV files or BPM files with version 4 data block\n    int32_t address_b;        // AddressB ID of locus (0 if none)\n    char *allele_b_probe_seq; // CSV files or BPM files with version 4 data block (empty if\n                              // none)\n    char *genome_build;\n    char *source;\n    char *source_version;\n    char *source_strand;\n    char *source_seq;      // CSV files or BPM files with version 4 data block\n    char *top_genomic_seq; // CSV files or BPM files with version 4 data block\n    int32_t beadset_id;    // CSV files\n    uint8_t exp_clusters;\n    uint8_t intensity_only;\n    uint8_t assay_type; // Identifies type of assay (0 - Infinium II, 1 - Infinium I (A/T),\n                        // 2 - Infinium I (G/C)\n    uint8_t assay_type_csv;\n    float frac_a;\n    float frac_c;\n    float frac_g;\n    float frac_t;\n    char *ref_strand; // RefStrand annotation\n} LocusEntry;\n\n// retrieve assay type following (allele_a_probe_seq, source_seq) -> assay_type map\n// (...W., ...W[./.]W...) -> 1\n// (...S., ...S[./.]S...) -> 2\n// (...S., ...S[./.]W...) -> 1\n// (...S., ...W[./.]S...) -> 1\n// (...W., ...S[./.]W...) -> 2\n// (...W., ...W[./.]S...) 
-> 2\nstatic uint8_t get_assay_type(const char *allele_a_probe_seq, const char *allele_b_probe_seq, const char *source_seq) {\n    if (!allele_a_probe_seq || !source_seq) return 0xFF;\n    if (!allele_b_probe_seq) return 0;\n    const char *left = strchr(source_seq, '[');\n    const char *right = strchr(source_seq, ']');\n    if (!left || !right) error(\"Source sequence is malformed: %s\\n\", source_seq);\n    char trail_left = toupper(*(left - 1));\n    char trail_right = toupper(*(right + 1));\n    if ((trail_left == 'A' || trail_left == 'T') && (trail_right == 'A' || trail_right == 'T')) return 1;\n    if ((trail_left == 'C' || trail_left == 'G') && (trail_right == 'C' || trail_right == 'G')) return 2;\n    int i = 2;\n    while (!(iupac2bitmask(allele_a_probe_seq[strlen(allele_a_probe_seq) - i])\n             & iupac2bitmask(allele_b_probe_seq[strlen(allele_b_probe_seq) - i])))\n        i++;\n    char trail_a_probe_seq = toupper(allele_a_probe_seq[strlen(allele_a_probe_seq) - i]);\n    if (trail_a_probe_seq == 'C' || trail_a_probe_seq == 'G' || trail_a_probe_seq == 'S') return 1;\n    if (trail_a_probe_seq == 'A' || trail_a_probe_seq == 'T' || trail_a_probe_seq == 'W') return 2;\n    // these weird rule were deduced from manifests for array GDA_PGx-8v1-0_20042614\n    if (trail_a_probe_seq == 'Y' && trail_right == 'G') return 1;\n    if (trail_a_probe_seq == 'Y' && trail_right == 'T') return 1;\n    if (trail_a_probe_seq == 'Y' && trail_right == 'A') return 2;\n    if (trail_a_probe_seq == 'K' && trail_right == 'C') return 1;\n    if (trail_a_probe_seq == 'K' && trail_right == 'A') return 2;\n    if (trail_a_probe_seq == 'M' && trail_right == 'G') return 1;\n    if (trail_a_probe_seq == 'M' && trail_right == 'T') return 2;\n    if (trail_a_probe_seq == 'R' && trail_right == 'C') return 1;\n    if (trail_a_probe_seq == 'R' && trail_right == 'T') return 2;\n    fprintf(stderr, \"Warning: Unable to retrieve assay type: %s %s %s\\n\", allele_a_probe_seq, 
allele_b_probe_seq,\n            source_seq);\n    return 0xFF;\n}\n\nstatic void locusentry_read(LocusEntry *locus_entry, hFILE *hfile, hts_md5_context *md5) {\n    locus_entry->norm_id = 0xFF;\n    read_bytes(hfile, (void *)&locus_entry->version, sizeof(int32_t), md5);\n    if (locus_entry->version < 4 || locus_entry->version == 5 || locus_entry->version > 8)\n        error(\"Locus version %d in manifest file not supported\\n\", locus_entry->version);\n    read_pfx_string(hfile, &locus_entry->ilmn_id, NULL, md5);\n    read_pfx_string(hfile, &locus_entry->name, NULL, md5);\n    read_pfx_string(hfile, NULL, NULL, md5);\n    read_pfx_string(hfile, NULL, NULL, md5);\n    read_pfx_string(hfile, NULL, NULL, md5);\n    read_bytes(hfile, (void *)&locus_entry->index, sizeof(int32_t), md5);\n    read_pfx_string(hfile, NULL, NULL, md5);\n    read_pfx_string(hfile, &locus_entry->ilmn_strand, NULL, md5);\n    read_pfx_string(hfile, &locus_entry->snp, NULL, md5);\n    read_pfx_string(hfile, &locus_entry->chrom, NULL, md5);\n    read_pfx_string(hfile, &locus_entry->ploidy, NULL, md5);\n    read_pfx_string(hfile, &locus_entry->species, NULL, md5);\n    read_pfx_string(hfile, &locus_entry->map_info, NULL, md5);\n    read_pfx_string(hfile, &locus_entry->top_genomic_seq, NULL, md5); // only version 4\n    read_pfx_string(hfile, &locus_entry->customer_strand, NULL, md5);\n    read_bytes(hfile, (void *)&locus_entry->address_a, sizeof(int32_t), md5);\n    read_bytes(hfile, (void *)&locus_entry->address_b, sizeof(int32_t), md5);\n    read_pfx_string(hfile, &locus_entry->allele_a_probe_seq, NULL, md5); // only version 4\n    read_pfx_string(hfile, &locus_entry->allele_b_probe_seq, NULL, md5); // only version 4\n    read_pfx_string(hfile, &locus_entry->genome_build, NULL, md5);\n    read_pfx_string(hfile, &locus_entry->source, NULL, md5);\n    read_pfx_string(hfile, &locus_entry->source_version, NULL, md5);\n    read_pfx_string(hfile, &locus_entry->source_strand, NULL, md5);\n    
read_pfx_string(hfile, &locus_entry->source_seq, NULL, md5); // only version 4\n    if (locus_entry->source_seq) {\n        char *ptr = strchr(locus_entry->source_seq, '-');\n        if (ptr && *(ptr - 1) == '/') {\n            *ptr = *(ptr - 2);\n            *(ptr - 2) = '-';\n        }\n    }\n\n    if (locus_entry->version >= 6) {\n        read_bytes(hfile, NULL, 1, md5);\n        read_bytes(hfile, (void *)&locus_entry->exp_clusters, sizeof(int8_t), md5);\n        read_bytes(hfile, (void *)&locus_entry->intensity_only, sizeof(int8_t), md5);\n        read_bytes(hfile, (void *)&locus_entry->assay_type, sizeof(uint8_t), md5);\n\n        if (locus_entry->assay_type < 0 || locus_entry->assay_type > 2)\n            error(\"Format error in reading assay type from locus entry\\n\");\n        if (locus_entry->address_b == 0 && locus_entry->assay_type != 0)\n            error(\"Manifest format error: Assay type is inconsistent with address B\\n\");\n        if (locus_entry->address_b != 0 && locus_entry->assay_type == 0)\n            error(\"Manifest format error: Assay type is inconsistent with address B\\n\");\n    } else {\n        locus_entry->assay_type =\n            get_assay_type(locus_entry->allele_a_probe_seq, locus_entry->allele_b_probe_seq, locus_entry->source_seq);\n    }\n\n    if (locus_entry->version >= 7) {\n        read_bytes(hfile, &locus_entry->frac_a, sizeof(float), md5);\n        read_bytes(hfile, &locus_entry->frac_c, sizeof(float), md5);\n        read_bytes(hfile, &locus_entry->frac_t, sizeof(float), md5);\n        read_bytes(hfile, &locus_entry->frac_g, sizeof(float), md5);\n    }\n    if (locus_entry->version >= 8) read_pfx_string(hfile, &locus_entry->ref_strand, NULL, md5);\n}\n\ntypedef struct {\n    char *fn;\n    hFILE *hfile; // bpm file\n    htsFile *fp;  // csv file\n    int32_t version;\n    char *manifest_name;  // Name of manifest\n    char *control_config; // Control description from manifest\n    int32_t num_loci;     // Number of 
loci in manifest\n    int32_t *indexes;\n    char **names; // Names of loci from manifest\n    void *names2index;\n    uint8_t *norm_ids;\n    LocusEntry *locus_entries;\n    uint8_t *norm_lookups;\n    char **header;\n    size_t m_header;\n    char unsigned md5_buf[16];\n} bpm_t;\n\nstatic uint8_t *bpm_norm_lookups(bpm_t *bpm) {\n    int i;\n    uint8_t sorted_norm_ids[256];\n    for (i = 0; i < 256; i++) sorted_norm_ids[i] = 0xFF;\n    for (i = 0; i < bpm->num_loci; i++) {\n        int norm_id = bpm->locus_entries[i].norm_id;\n        sorted_norm_ids[norm_id] = norm_id;\n    }\n    int j = 0;\n    for (i = 0; i < 256; i++)\n        if (sorted_norm_ids[i] != 0xFF) sorted_norm_ids[j++] = sorted_norm_ids[i];\n    uint8_t *norm_lookups = (uint8_t *)malloc(256 * sizeof(uint8_t *));\n    memset((void *)norm_lookups, 0xFF, 256 * sizeof(uint8_t *));\n    for (i = 0; i < j; i++) norm_lookups[sorted_norm_ids[i]] = i;\n    return norm_lookups;\n}\n\nstatic bpm_t *bpm_init(const char *fn, int eof_check, int make_dict, int checksum) {\n    bpm_t *bpm = (bpm_t *)calloc(1, sizeof(bpm_t));\n    bpm->fn = strdup(fn);\n    bpm->hfile = hopen(bpm->fn, \"rb\");\n    if (bpm->hfile == NULL) error(\"Could not open %s: %s\\n\", bpm->fn, strerror(errno));\n    if (is_gzip(bpm->hfile)) error(\"File %s is gzip compressed and currently cannot be sought\\n\", bpm->fn);\n\n    hts_md5_context *md5 = checksum ? 
hts_md5_init() : NULL;\n\n    int i;\n    uint8_t buffer[4];\n    if (md5_hread(bpm->hfile, (void *)buffer, 4, md5) < 4) error(\"Failed to read magic number from %s file\\n\", bpm->fn);\n    if (memcmp(buffer, \"BPM\", 3) != 0) error(\"BPM file %s format identifier is bad\\n\", bpm->fn);\n    if (buffer[3] != 1) error(\"BPM file %s version is unknown\\n\", bpm->fn);\n\n    read_bytes(bpm->hfile, (void *)&bpm->version, sizeof(int32_t), md5);\n    if (bpm->version & 0x1000) bpm->version ^= 0x1000;\n    if (bpm->version > 5 || bpm->version < 3) error(\"BPM file %s version %d is unsupported\\n\", bpm->fn, bpm->version);\n    read_pfx_string(bpm->hfile, &bpm->manifest_name, NULL, md5);\n\n    if (bpm->version > 1) read_pfx_string(bpm->hfile, &bpm->control_config, NULL, md5);\n\n    read_bytes(bpm->hfile, (void *)&bpm->num_loci, sizeof(int32_t), md5);\n    read_array(bpm->hfile, (void **)&bpm->indexes, NULL, bpm->num_loci, sizeof(int32_t), 0, md5);\n    bpm->names = (char **)malloc(bpm->num_loci * sizeof(char *));\n    for (i = 0; i < bpm->num_loci; i++) read_pfx_string(bpm->hfile, &bpm->names[i], NULL, md5);\n    if (make_dict) {\n        bpm->names2index = khash_str2int_init();\n        for (i = 0; i < bpm->num_loci; i++) {\n            if (khash_str2int_has_key(bpm->names2index, bpm->names[i]))\n                error(\"Illumina probe %s present multiple times in file %s\\n\", bpm->names[i], fn);\n            khash_str2int_inc(bpm->names2index, bpm->names[i]);\n        }\n    }\n    read_array(bpm->hfile, (void **)&bpm->norm_ids, NULL, bpm->num_loci, sizeof(uint8_t), 0, md5);\n\n    bpm->locus_entries = (LocusEntry *)malloc(bpm->num_loci * sizeof(LocusEntry));\n    LocusEntry locus_entry;\n    for (i = 0; i < bpm->num_loci; i++) {\n        memset(&locus_entry, 0, sizeof(LocusEntry));\n        locusentry_read(&locus_entry, bpm->hfile, md5);\n        int idx = locus_entry.index - 1;\n        if (idx < 0 || idx >= bpm->num_loci) error(\"Locus entry index %d is out of 
boundaries\\n\", locus_entry.index);\n        if (bpm->norm_ids[idx] > 100)\n            error(\"Manifest format error: read invalid normalization ID %d\\n\", bpm->norm_ids[idx]);\n        // To mimic the flawed byte-wrapping behavior from GenomeStudio, AutoCall, and\n        // IAAP, this value is allowed to overflow beyond 255, which happens with some\n        // probes in the Omni5 arrays\n        bpm->norm_ids[idx] += 100 * locus_entry.assay_type;\n        locus_entry.norm_id = bpm->norm_ids[idx];\n        memcpy(&bpm->locus_entries[idx], &locus_entry, sizeof(LocusEntry));\n    }\n    bpm->norm_lookups = bpm_norm_lookups(bpm);\n    for (i = 0; i < bpm->num_loci; i++) {\n        if (i != bpm->locus_entries[i].index - 1)\n            error(\"Manifest format error: read invalid number of assay entries\\n\");\n    }\n    if (bpm->locus_entries[0].version < 8)\n        fprintf(stderr, \"Warning: RefStrand annotation missing from manifest file %s\\n\", bpm->fn);\n\n    read_bytes(bpm->hfile, (void *)&bpm->m_header, sizeof(int32_t), md5);\n    bpm->header = (char **)malloc(bpm->m_header * sizeof(char *));\n    for (i = 0; i < bpm->m_header; i++) read_pfx_string(bpm->hfile, &bpm->header[i], NULL, md5);\n\n    if (!heof(bpm->hfile)) {\n        if (eof_check)\n            error(\n                \"BPM reader did not reach the end of file %s at position %ld\\nUse --do-not-check-eof to suppress this \"\n                \"check\\n\",\n                bpm->fn, htell(bpm->hfile));\n        if (checksum)\n            while (md5_hgetc(bpm->hfile, md5) != EOF);\n    }\n\n    if (md5) {\n        hts_md5_final(bpm->md5_buf, md5);\n        hts_md5_destroy(md5);\n    }\n\n    return bpm;\n}\n\nstatic void bpm_destroy(bpm_t *bpm) {\n    if (!bpm) return;\n    int i;\n    if (bpm->hfile && hclose(bpm->hfile) < 0) error(\"Error closing BPM file %s\\n\", bpm->fn);\n    free(bpm->fn);\n    if (bpm->fp && hts_close(bpm->fp) < 0) error(\"Error closing CSV file %s\\n\", bpm->fp->fn);\n    
free(bpm->manifest_name);\n    free(bpm->control_config);\n    free(bpm->indexes);\n    if (bpm->names) {\n        for (i = 0; i < bpm->num_loci; i++) free(bpm->names[i]);\n        free(bpm->names);\n    }\n    khash_str2int_destroy(bpm->names2index);\n    free(bpm->norm_ids);\n    for (i = 0; i < bpm->num_loci; i++) {\n        LocusEntry *locus_entry = &bpm->locus_entries[i];\n        free(locus_entry->ilmn_id);\n        free(locus_entry->name);\n        free(locus_entry->ilmn_strand);\n        free(locus_entry->snp);\n        free(locus_entry->chrom);\n        free(locus_entry->ploidy);\n        free(locus_entry->species);\n        free(locus_entry->map_info);\n        free(locus_entry->customer_strand);\n        free(locus_entry->allele_a_probe_seq);\n        free(locus_entry->allele_b_probe_seq);\n        free(locus_entry->genome_build);\n        free(locus_entry->source);\n        free(locus_entry->source_version);\n        free(locus_entry->source_strand);\n        free(locus_entry->source_seq);\n        free(locus_entry->top_genomic_seq);\n        free(locus_entry->ref_strand);\n    }\n    free(bpm->locus_entries);\n    free(bpm->norm_lookups);\n    for (i = 0; i < bpm->m_header; i++) free(bpm->header[i]);\n    free(bpm->header);\n    free(bpm);\n}\n\n/****************************************\n * EGT FILE IMPLEMENTATION              *\n ****************************************/\n\n// http://github.com/broadinstitute/picard/blob/master/src/main/java/picard/arrays/illumina/InfiniumEGTFile.java\n// http://github.com/Illumina/BeadArrayFiles/blob/develop/module/ClusterFile.py\n\ntypedef struct {\n    int32_t N;        // Number of samples assigned to cluster during training\n    float r_dev;      // R (intensity) std deviation value\n    float r_mean;     // R (intensity) mean value\n    float theta_dev;  // Theta std devation value\n    float theta_mean; // Theta mean value\n} ClusterStats;\n\ntypedef struct {\n    float cluster_separation; // A score measure 
the separation between genotype clusters\n    float total_score;        // The GenTrain score\n    float original_score;     // The original score before editing this cluster\n    uint8_t edited;           // Whether this cluster has been manually manipulated\n} ClusterScore;\n\ntypedef struct {\n    ClusterStats aa_cluster_stats; // Describes AA genotype cluster\n    ClusterStats ab_cluster_stats; // Describes AB genotype cluster\n    ClusterStats bb_cluster_stats; // Describes BB genotype cluster\n    float intensity_threshold;     // Intensity threshold for no-call\n    ClusterScore cluster_score;    // Various scores for cluster\n    int32_t address;               // Bead type identifier for probe A\n    float r_mean;                  // precomputed clusters mean\n} ClusterRecord;\n\ntypedef struct {\n    char *fn;\n    hFILE *hfile;\n    int32_t version;\n    char *gencall_version;       // The GenCall version\n    char *cluster_version;       // The clustering algorithm version\n    char *call_version;          // The genotyping algorithm version\n    char *normalization_version; // The normalization algorithm version\n    char *date_created;          // The date the cluster file was created (e.g., 3/9/2017 2:18:30 PM)\n    uint8_t is_wgt;\n    int32_t data_block_version;\n    char *opa;\n    char *manifest_name; // The manifest name used to build this cluster file\n    int32_t num_records;\n    ClusterRecord *cluster_records;\n    char **names; // Names of records from manifest\n    void *names2index;\n    char unsigned md5_buf[16];\n} egt_t;\n\nstatic void clusterscore_read(ClusterScore *clusterscore, hFILE *hfile, hts_md5_context *md5) {\n    read_bytes(hfile, (void *)&clusterscore->cluster_separation, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterscore->total_score, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterscore->original_score, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterscore->edited, sizeof(uint8_t), 
md5);\n}\n\nstatic void clusterrecord_read(ClusterRecord *clusterrecord, hFILE *hfile, int32_t data_block_version,\n                               hts_md5_context *md5) {\n    read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.N, sizeof(int32_t), md5);\n    read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.N, sizeof(int32_t), md5);\n    read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.N, sizeof(int32_t), md5);\n    read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.r_dev, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.r_dev, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.r_dev, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.r_mean, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.r_mean, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.r_mean, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.theta_dev, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.theta_dev, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.theta_dev, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.theta_mean, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.theta_mean, sizeof(float), md5);\n    read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.theta_mean, sizeof(float), md5);\n    if (data_block_version >= 7) {\n        read_bytes(hfile, (void *)&clusterrecord->intensity_threshold, sizeof(float), md5);\n        read_bytes(hfile, NULL, 14 * sizeof(float), md5);\n    } else {\n        clusterrecord->intensity_threshold = NAN;\n    }\n}\n\nstatic egt_t *egt_init(const char *fn, int eof_check, int checksum) {\n    int i;\n    egt_t *egt = (egt_t *)calloc(1, sizeof(egt_t));\n    egt->fn = 
strdup(fn);\n    egt->hfile = hopen(egt->fn, \"rb\");\n    if (egt->hfile == NULL) error(\"Could not open %s: %s\\n\", egt->fn, strerror(errno));\n    if (is_gzip(egt->hfile)) error(\"File %s is gzip compressed and currently cannot be sought\\n\", egt->fn);\n\n    hts_md5_context *md5 = checksum ? hts_md5_init() : NULL;\n\n    read_bytes(egt->hfile, (void *)&egt->version, sizeof(int32_t), md5);\n    if (egt->version != 3) error(\"EGT cluster file version %d not supported\\n\", egt->version);\n\n    read_pfx_string(egt->hfile, &egt->gencall_version, NULL, md5);\n    read_pfx_string(egt->hfile, &egt->cluster_version, NULL, md5);\n    read_pfx_string(egt->hfile, &egt->call_version, NULL, md5);\n    read_pfx_string(egt->hfile, &egt->normalization_version, NULL, md5);\n    read_pfx_string(egt->hfile, &egt->date_created, NULL, md5);\n\n    read_bytes(egt->hfile, (void *)&egt->is_wgt, sizeof(uint8_t), md5);\n    if (egt->is_wgt != 1) error(\"Only WGT cluster file version supported\\n\");\n\n    read_pfx_string(egt->hfile, &egt->manifest_name, NULL, md5);\n\n    read_bytes(egt->hfile, (void *)&egt->data_block_version, sizeof(int32_t), md5);\n    if (egt->data_block_version < 5 || egt->data_block_version == 6 || egt->data_block_version > 9)\n        error(\"Data block version %d in cluster file not supported\\n\", egt->data_block_version);\n    read_pfx_string(egt->hfile, &egt->opa, NULL, md5);\n\n    read_bytes(egt->hfile, (void *)&egt->num_records, sizeof(int32_t), md5);\n    egt->cluster_records = (ClusterRecord *)malloc(egt->num_records * sizeof(ClusterRecord));\n    for (i = 0; i < egt->num_records; i++)\n        clusterrecord_read(&egt->cluster_records[i], egt->hfile, egt->data_block_version, md5);\n    for (i = 0; i < egt->num_records; i++) clusterscore_read(&egt->cluster_records[i].cluster_score, egt->hfile, md5);\n\n    // toss useless strings such as aa_ab_bb/aa_ab/aa_bb/ab_bb\n    for (i = 0; i < egt->num_records; i++) read_pfx_string(egt->hfile, NULL, NULL, 
md5);\n\n    egt->names = (char **)malloc(egt->num_records * sizeof(char *));\n    egt->names2index = khash_str2int_init();\n    for (i = 0; i < egt->num_records; i++) {\n        read_pfx_string(egt->hfile, &egt->names[i], NULL, md5);\n        if (khash_str2int_has_key(egt->names2index, egt->names[i]))\n            error(\"Illumina probe %s present multiple times in file %s\\n\", egt->names[i], fn);\n        khash_str2int_inc(egt->names2index, egt->names[i]);\n    }\n    for (i = 0; i < egt->num_records; i++)\n        read_bytes(egt->hfile, (void *)&egt->cluster_records[i].address, sizeof(int32_t), md5);\n\n    int32_t aa_n, ab_n, bb_n;\n    for (i = 0; i < egt->num_records; i++) {\n        read_bytes(egt->hfile, (void *)&aa_n, sizeof(int32_t), md5);\n        read_bytes(egt->hfile, (void *)&ab_n, sizeof(int32_t), md5);\n        read_bytes(egt->hfile, (void *)&bb_n, sizeof(int32_t), md5);\n        if (egt->cluster_records[i].aa_cluster_stats.N != aa_n || egt->cluster_records[i].ab_cluster_stats.N != ab_n\n            || egt->cluster_records[i].bb_cluster_stats.N != bb_n)\n            error(\"Cluster counts don't match with EGT cluster file %s\\n\", egt->fn);\n    }\n\n    if (egt->data_block_version == 9) read_bytes(egt->hfile, NULL, egt->num_records * sizeof(float), md5);\n    if (eof_check && !heof(egt->hfile))\n        error(\n            \"EGT reader did not reach the end of file %s at position %ld\\nUse --do-not-check-eof to suppress this \"\n            \"check\\n\",\n            egt->fn, htell(egt->hfile));\n    if (!heof(egt->hfile)) {\n        if (eof_check)\n            error(\n                \"EGT reader did not reach the end of file %s at position %ld\\nUse --do-not-check-eof to suppress this \"\n                \"check\\n\",\n                egt->fn, htell(egt->hfile));\n        if (checksum)\n            while (md5_hgetc(egt->hfile, md5) != EOF);\n    }\n\n    if (md5) {\n        hts_md5_final(egt->md5_buf, md5);\n        hts_md5_destroy(md5);\n    
}\n\n    for (i = 0; i < egt->num_records; i++) {\n        ClusterStats *aa = &egt->cluster_records[i].aa_cluster_stats;\n        ClusterStats *ab = &egt->cluster_records[i].ab_cluster_stats;\n        ClusterStats *bb = &egt->cluster_records[i].bb_cluster_stats;\n        egt->cluster_records[i].r_mean =\n            (aa->N * aa->r_mean + ab->N * ab->r_mean + bb->N * bb->r_mean) / (aa->N + ab->N + bb->N);\n    }\n    return egt;\n}\n\nstatic void egt_destroy(egt_t *egt) {\n    if (!egt) return;\n    int i;\n    if (hclose(egt->hfile) < 0) error(\"Error closing EGT file %s\\n\", egt->fn);\n    free(egt->fn);\n    free(egt->gencall_version);\n    free(egt->cluster_version);\n    free(egt->call_version);\n    free(egt->normalization_version);\n    free(egt->date_created);\n    free(egt->opa);\n    free(egt->manifest_name);\n    free(egt->cluster_records);\n    for (i = 0; i < egt->num_records; i++) free(egt->names[i]);\n    free(egt->names);\n    khash_str2int_destroy(egt->names2index);\n    free(egt);\n}\n\n// static void egt_to_csv(const egt_t *egt, FILE *stream, int verbose) {\n//     fprintf(stream, \"Illumina, Inc.\\n\");\n//     fprintf(stream, \"[Heading]\\n\");\n//     fprintf(stream, \"Descriptor File Name,%s\\n\", strrchr(egt->fn, '/') ? strrchr(egt->fn, '/') + 1 : egt->fn);\n//     fprintf(stream, \"GenCall version,%s\\n\", egt->gencall_version);\n//     fprintf(stream, \"Clustering algorithm version,%s\\n\", egt->cluster_version);\n//     fprintf(stream, \"Genotyping algorithm version,%s\\n\", egt->call_version);\n//     fprintf(stream, \"Normalization algorithm version,%s\\n\", egt->normalization_version);\n//     fprintf(stream, \"Date Manufactured,%s\\n\", egt->date_created);\n//     fprintf(stream, \"Manifest name used to build this cluster file,%s\\n\", egt->manifest_name);\n//     fprintf(stream, \"OPA,%s\\n\", egt->opa ? 
egt->opa : \"\");\n//     fprintf(stream, \"Loci Count,%d\\n\", egt->num_records);\n//     fprintf(stream, \"[Assay]\\n\");\n//     fprintf(stream,\n//             \"Name,AA.N,AA.R_dev,AA.R_mean,AA.Theta_dev,AA.Theta_mean,AB.N,AB.R_dev,AB.R_mean,AB.\"\n//             \"Theta_dev,AB.Theta_mean,BB.N,BB.R_dev,BB.R_mean,BB.Theta_dev,BB.Theta_mean,Intensity \"\n//             \"Threshold,Cluster Separation,GenTrain Score,Original Score,Edited,Address\\n\");\n//     if (verbose) {\n//         int i;\n//         for (i = 0; i < egt->num_records; i++) {\n//             ClusterRecord *cluster_record = &egt->cluster_records[i];\n//             fprintf(stream, \"%s,%d,%f,%f,%f,%f,%d,%f,%f,%f,%f,%d,%f,%f,%f,%f,%f,%f,%f,%f,%d,%d\\n\", egt->names[i],\n//                     cluster_record->aa_cluster_stats.N, cluster_record->aa_cluster_stats.r_dev,\n//                     cluster_record->aa_cluster_stats.r_mean, cluster_record->aa_cluster_stats.theta_dev,\n//                     cluster_record->aa_cluster_stats.theta_mean, cluster_record->ab_cluster_stats.N,\n//                     cluster_record->ab_cluster_stats.r_dev, cluster_record->ab_cluster_stats.r_mean,\n//                     cluster_record->ab_cluster_stats.theta_dev, cluster_record->ab_cluster_stats.theta_mean,\n//                     cluster_record->bb_cluster_stats.N, cluster_record->bb_cluster_stats.r_dev,\n//                     cluster_record->bb_cluster_stats.r_mean, cluster_record->bb_cluster_stats.theta_dev,\n//                     cluster_record->bb_cluster_stats.theta_mean, cluster_record->intensity_threshold,\n//                     cluster_record->cluster_score.cluster_separation, cluster_record->cluster_score.total_score,\n//                     cluster_record->cluster_score.original_score, cluster_record->cluster_score.edited,\n//                     cluster_record->address);\n//         }\n//     } else {\n//         fprintf(stream, \"... 
use --verbose to visualize Assay data ...\\n\");\n//     }\n// }\n\n/****************************************\n * MATLAB ROBUST FIT ROUTINES           *\n ****************************************/\n\n// the code for these routines was derived from the Statistics and Machine Learning Toolbox in MATLAB\n// Illumina implemented the whole robustfit() function despite the fact that they only needed\n// the one dimensional case of it that could easily do away with matrices\n// the implementation here reimplements one dimensional linear regression to solve the linear least squares problem\n// and so doing away with the need for matrix routines for computing the QR matrix factorization\n// the original implementation of robustfit() from Tom Lane was the one adopted in GenTrain 2.0:\n// http://github.com/iarsenal95/computer_vision/blob/master/final_project/MATLAB/boosting/weightedstats/private/statrobustfit.m\n// in 2002 Tom Lane realized that the MATLAB implementation of the madsigma() sub-routine was problematic:\n// http://groups.google.com/g/comp.soft-sys.matlab/c/Raf-VYUh9yY/m/gIi16wAR4VQJ\n// this must have led to the new version of madsigma() being adopted in GenTrain 3.0:\n// http://github.com/stephane-on/Spectral_analysis/blob/master/statrobustfit.m\n\ninline static double sqr(double x) { return x * x; }\n\ninline static float sqrf(float x) { return x * x; }\n\n// equivalent to MATLAB linsolve(x,y)\n// http://www.mathworks.com/help/matlab/ref/linsolve.html\nstatic int matlab_linsolve0(int n, const float *x, const float *y, double *m) {\n    int i;\n    double sumx2 = 0.0;\n    double sumxy = 0.0;\n    for (i = 0; i < n; i++) {\n        sumx2 += sqr((double)x[i]);\n        sumxy += (double)x[i] * (double)y[i];\n    }\n    if (sumx2 == 0) return 1;\n    *m = sumxy / sumx2;\n    return 0;\n}\n\n// equivalent to MATLAB linsolve([ones(n,1), x],y)\n// http://www.mathworks.com/help/matlab/ref/linsolve.html\nstatic int matlab_linsolve1(int n, const float *x, const float 
*y, double *b, double *m) {\n    int i;\n    double sumx2 = 0.0;\n    double sumxy = 0.0;\n    double sumx = 0.0;\n    double sumy = 0.0;\n    for (i = 0; i < n; i++) {\n        sumx2 += sqr((double)x[i]);\n        sumxy += (double)y[i] * (double)x[i];\n        sumx += (double)x[i];\n        sumy += (double)y[i];\n    }\n    double denom = (double)n * sumx2 - sumx * sumx;\n    if (denom == 0) return 1;\n    *m = (n * sumxy - sumx * sumy) / denom;\n    *b = (sumy * sumx2 - sumx * sumxy) / denom;\n    return 0;\n}\n\n// equivalent to MATLAB wfit(y,x,w) which is equivalent to linsolve(diag(sqrt(w))*x,diag(sqrt(w))*y)\n// stats/private/statrobustfit.m\nstatic int matlab_wfit0(int n, const float *y, const float *x, const double *w, double *m) {\n    int i;\n    double wsumx2 = 0.0;\n    double wsumxy = 0.0;\n    for (i = 0; i < n; i++) {\n        wsumx2 += w[i] * sqr((double)x[i]);\n        wsumxy += w[i] * (double)x[i] * (double)y[i];\n    }\n    if (wsumx2 == 0) return 1;\n    *m = wsumxy / wsumx2;\n    return 0;\n}\n\n// equivalent to MATLAB wfit(y,[ones(n,1),x],w) which is equivalent to\n// linsolve(diag(sqrt(w))*[ones(n,1),x],diag(sqrt(w))*y) stats/private/statrobustfit.m\nstatic int matlab_wfit1(int n, const float *y, const float *x, const double *w, double *b, double *m) {\n    int i;\n    double wsumx2 = 0.0;\n    double wsumxy = 0.0;\n    double wsumx = 0.0;\n    double wsumy = 0.0;\n    double wsum = 0.0;\n    for (i = 0; i < n; i++) {\n        wsumx2 += w[i] * sqr((double)x[i]);\n        wsumxy += w[i] * (double)x[i] * (double)y[i];\n        wsumx += w[i] * (double)x[i];\n        wsumy += w[i] * (double)y[i];\n        wsum += w[i];\n    }\n    double denom = wsum * wsumx2 - wsumx * wsumx;\n    if (denom == 0) return 1;\n    *m = (wsum * wsumxy - wsumx * wsumy) / denom;\n    *b = (wsumy * wsumx2 - wsumx * wsumxy) / denom;\n    return 0;\n}\n\n// http://www.mathworks.com/help/stats/nanmean.html\nstatic float matlab_nanmean(int n, const float *vals) {\n    if (n 
== 0) return NAN;\n    int i, j;\n    double sum = 0.0;\n    for (i = 0, j = 0; i < n; i++) {\n        if (!isnan(vals[i])) {\n            sum += vals[i];\n            j++;\n        }\n    }\n    return (float)(sum / (double)j);\n}\n\n// http://www.mathworks.com/help/matlab/ref/mean.html\nstatic float matlab_mean(int n, const float *vals) {\n    if (n == 0) return NAN;\n    int i;\n    double sum = 0.0;\n    for (i = 0; i < n; i++) sum += vals[i];\n    return (float)(sum / (double)n);\n}\n\n// the input array does not need to be sorted\n// http://www.mathworks.com/help/matlab/ref/median.html\nstatic float matlab_median(int n, float *vals) {\n    if (n == 0) return 0.0f;\n    ks_introsort_float((size_t)n, vals);\n    if (n % 2 == 1) return vals[n / 2];\n    return (vals[n / 2 - 1] + vals[n / 2]) * 0.5f;\n}\n\n// stats/private/statrobustfit.m\n// function s = madsigma(r,p)\n// %MADSIGMA    Compute sigma estimate using MAD of residuals from 0\n// rs = sort(abs(r));\n// s = median(rs(max(1,p):end)) / 0.6745; % 0.6745 ~ qnorm(0.75)\nstatic double matlab_madsigma_new(int n, const double *r, int p) {\n    int i;\n    float *rs = (float *)malloc(n * sizeof(float));\n    for (i = 0; i < n; i++) rs[i] = (float)fabs(r[i]);\n    ks_introsort_float((size_t)n, rs);\n    double s = (double)matlab_median(n - (p - 1), rs + (p - 1)) / 0.6745;\n    if (s == 0.0) s = 0.5 * (double)matlab_mean(n, rs);\n    free(rs);\n    return s;\n}\n\n// a separate implementation from Illumina can be found in function madsigma in file Utils.cs\n// the code follows the original implementation from Tom Lane in 2000\n// stats/private/statrobustfit.m\n// function s = madsigma(r,p);\n// %MADSIGMA    Compute sigma estimate using MAD of residuals\n// m = median(r);\n// rs = sort(abs(r-m));\n// if (abs(m) > rs(end))\n//     % Unexpectedly all residuals are very small\n//     rs = sort(abs(r));\n// end\n// s = median(rs(p:end)) / 0.6745; % 0.6745 ~ qnorm(0.75)\n// if (s==0), s = .5*mean(rs); end\nstatic 
double matlab_madsigma_old(int n, const double *r, int p) {\n    int i;\n    float *rs = (float *)malloc(n * sizeof(float));\n    for (i = 0; i < n; i++) rs[i] = (float)r[i];\n    float m = matlab_median((size_t)n, rs);\n    for (i = 0; i < n; i++) rs[i] = fabsf(rs[i] - m);\n    ks_introsort_float((size_t)n, rs);\n    if (fabsf(m) > rs[n - 1]) {\n        for (i = 0; i < n; i++) rs[i] = fabsf((float)r[i]);\n        ks_introsort_float((size_t)n, rs);\n    }\n    double s = (double)matlab_median(n - (p - 1), rs + (p - 1)) / 0.6745;\n    if (s == 0.0) s = 0.5 * (double)matlab_mean(n, rs);\n    free(rs);\n    return s;\n}\n\n// roughly equivalent to MATLAB robustfit(x,y,'bisquare',4.685,'off')\n// http://www.mathworks.com/help/stats/robustfit.html\n// stats/private/statrobustfit.m\n// stats/private/statrobustwfun.m\nstatic void matlab_robustfit0(int n, const float *x, const float *y, double (*madsigma)(int, const double *, int),\n                              float *out_m) {\n    int i;\n    double *r = (double *)malloc(n * sizeof(double));\n    double *w = (double *)malloc(n * sizeof(double));\n    double m, m0 = 0.0;\n    if (matlab_linsolve0(n, x, y, &m)) error(\"Error while running linsolve0\\n\");\n    // [Q,R] = qr(x,0);\n    // R = [sqrt(sum(x.^2)]\n    // E = X/R = [x/sqrt(sum(x.^2)]\n    // h = min(.9999, sum(E.*E,2)) = min(.9999, x.^2 / sum(x.^2))\n    // adjfactor = 1 ./ sqrt(1-h)\n    // as GenCall messed up the implementation, here we use instead\n    // h = min(.9999, sum(E.*E)) = min(.9999, sum(x.^2) / sum(x.^2)) = .9999\n    // adjfactor = 1 / sqrt(1 - 0.9999) ~ 100;\n    double adjfactor = 100.0 + 24832 * DBL_EPSILON;\n\n    int iter = 0;\n    do {\n        // as Illumina messed up the implementation, here we use adjfactor instead of adjfactor[i]\n        for (i = 0; i < n; i++) r[i] = ((double)y[i] - m * (double)x[i]) * adjfactor;\n        double s = madsigma(n, r, 1);\n        if (s == 0.0) s = 1.0;\n        for (i = 0; i < n; i++) {\n            r[i] 
*= 1.0 / (s * 4.685);\n            w[i] = fabs(r[i]) < 1 ? sqr(1.0 - sqr(r[i])) : 0.0;\n        }\n        m0 = m;\n        if (matlab_wfit0(n, y, x, w, &m)) error(\"Error while running wfit0\\n\");\n        iter++;\n    } while (iter < 50 && fabs(m - m0) > 1e-6 * (double)fmaxf((float)fabs(m), (float)fabs(m0)));\n\n    free(r);\n    free(w);\n    *out_m = (float)m;\n}\n\n// roughly equivalent to MATLAB robustfit(x,y,'bisquare',4.685,'on')\n// http://www.mathworks.com/help/stats/robustfit.html\n// stats/private/statrobustfit.m\n// stats/private/statrobustwfun.m\nstatic void matlab_robustfit1(int n, const float *x, const float *y, double (*madsigma)(int, const double *, int),\n                              float *out_b, float *out_m) {\n#ifdef __GNUC__\n    if (n <= 0) __builtin_unreachable(); // to prevent an unnecessary \"may be used uninitialized\" warning\n#endif\n    int i;\n    double *adjfactor = (double *)malloc(n * sizeof(double));\n    double *r = (double *)malloc(n * sizeof(double));\n    double *w = (double *)malloc(n * sizeof(double));\n    double b, m, b0 = 0.0, m0 = 0.0;\n    if (matlab_linsolve1(n, x, y, &b, &m))\n        error(\n            \"Error while running linsolve1\\nFailed to normalize and gencall\\nThis typically happens when the wrong \"\n            \"manifest file is used\\n\");\n    // [Q,R] = qr([ones(n,1),x],0);\n    // R = [-sqrt(n), -sum(x)/sqrt(n); 0, sqrt(sum(x.^2)-sum(x)^2/n)]\n    // E = X/R = [-ones(n,1)/sqrt(n), (sum(x)/n-x)/sqrt(sum(x.^2)-sum(x)^2/n)]\n    // h = min(.9999, sum(E.*E,2)) = min(.9999, (n*x.^2 - 2*sum(x)*x + sum(x.^2))/(n*sum(x.^2) - sum(x)^2))\n    double sumx = 0.0;\n    double sumx2 = 0.0;\n    for (i = 0; i < n; i++) {\n        sumx += (double)x[i];\n        sumx2 += sqr((double)x[i]);\n    }\n    double denom = (double)n * sumx2 - sqr(sumx);\n    for (i = 0; i < n; i++) {\n        double h = fmin(.9999, ((double)n * sqr((double)x[i]) - 2.0 * sumx * (double)x[i] + sumx2) / denom);\n        adjfactor[i] = 1.0 
/ sqrt(1.0 - h);\n    }\n\n    int iter = 0;\n    do {\n        for (i = 0; i < n; i++) r[i] = ((double)y[i] - b - m * (double)x[i]) * adjfactor[i];\n        double s = madsigma(n, r, 2);\n        if (s == 0.0) s = 1.0;\n        for (i = 0; i < n; i++) {\n            r[i] *= 1.0 / (s * 4.685);\n            w[i] = fabs(r[i]) < 1 ? sqr(1.0 - sqr(r[i])) : 0.0;\n        }\n        b0 = b;\n        m0 = m;\n        if (matlab_wfit1(n, y, x, w, &b, &m)) error(\"Error while running wfit1\\n\");\n        iter++;\n    } while (iter < 50\n             && (fabs(b - b0) > 1e-6 * (double)fmaxf((float)fabs(b), (float)fabs(b0))\n                 || fabs(m - m0) > 1e-6 * (double)fmaxf((float)fabs(m), (float)fabs(m0))));\n\n    free(adjfactor);\n    free(r);\n    free(w);\n    *out_b = (float)b;\n    *out_m = (float)m;\n}\n\n/****************************************\n * NEAREST NEIGHBOR ROUTINES            *\n ****************************************/\n\n// a separate implementation from Illumina of these functions in GenCall can be found in file Utils.cs\n// It seems like Illumina at first used a function with O(n^2) complexity for the same task and then when they switched\n// from GoldenGate to larger Infinium arrays this solution did not scale anymore. This led to a reimplementation in C as\n// the C# version was not fast enough. 
For this reason AutoConvert, an almost entirely C# executable, requires this\n// specific function as unmanaged C code while IAAP and ACLI have their equivalent version in C#, maybe because by then\n// computers had become fast enough\n\nint elementsInBin[12];\nint *binData[12];\nint elementsInShiftedBin[11];\nint *binDataShifted[11];\n\n// a separate implementation from Illumina of this function can be found in function ClosestPointsB\nint findClosestSitesToPointsAlongAxis(int n_raw, float *raw_x, float *raw_y, int n_axis, float *axis_x, float *axis_y,\n                                      int *ret) {\n    int i;\n    float *raw_a = NULL;\n    float *raw_b = NULL;\n    float *axis_a = NULL;\n    float axis_max_val;\n    float bin_width;\n    int bin_idx;\n    float quotient;\n    float reminder;\n    int *curr_bin_data;\n    int curr_bin_size;\n    float curr_axis_x;\n    float curr_axis_y;\n    float x_dist;\n    float y_dist;\n    double best_val;\n    int best_idx;\n    int j;\n    int curr_idx;\n    double sq_dist;\n    double axis_max_dist;\n    int use_y = 1;\n    int use_x = 1;\n\n    for (i = 0; i < n_axis; i++) {\n        if (axis_x[i] > 0.0001) {\n            use_y = 0;\n            break;\n        }\n    }\n\n    for (i = 0; i < n_axis; i++) {\n        if (axis_y[i] > 0.0001) {\n            use_x = 0;\n            break;\n        }\n    }\n\n    if (use_y) {\n        raw_a = raw_y;\n        raw_b = raw_x;\n        axis_a = axis_y;\n    } else if (use_x) {\n        raw_a = raw_x;\n        raw_b = raw_y;\n        axis_a = axis_x;\n    } else {\n        return -1;\n    }\n\n    axis_max_val = axis_a[n_axis - 1];\n    bin_width = axis_max_val / 12.0f;\n    axis_max_dist = (double)bin_width;\n\n    for (i = 0; i < n_raw; i++) {\n        if ((double)raw_b[i] > axis_max_dist) continue;\n        bin_idx = (int)(raw_a[i] / bin_width);\n        if (bin_idx < 0) bin_idx = 0;\n        if (bin_idx > 11) bin_idx = 11;\n        elementsInBin[bin_idx]++;\n        
bin_idx = (int)(raw_a[i] / bin_width - 0.5f);\n        if (bin_idx < 0) bin_idx = 0;\n        if (bin_idx > 10) bin_idx = 10;\n        elementsInShiftedBin[bin_idx]++;\n    }\n\n    for (i = 0; i <= 11; i++) {\n        binData[i] = (int *)malloc((size_t)elementsInBin[i] * sizeof(int));\n        elementsInBin[i] = 0;\n        if (i == 11) continue;\n        binDataShifted[i] = (int *)malloc((size_t)elementsInShiftedBin[i] * sizeof(int));\n        elementsInShiftedBin[i] = 0;\n    }\n\n    for (i = 0; i < n_raw; i++) {\n        if ((double)raw_b[i] > axis_max_dist) continue;\n        bin_idx = (int)(raw_a[i] / bin_width);\n        if (bin_idx < 0) bin_idx = 0;\n        if (bin_idx > 11) bin_idx = 11;\n        binData[bin_idx][elementsInBin[bin_idx]] = i;\n        elementsInBin[bin_idx]++;\n        bin_idx = (int)(raw_a[i] / bin_width - 0.5f);\n        if (bin_idx < 0) bin_idx = 0;\n        if (bin_idx > 10) bin_idx = 10;\n        binDataShifted[bin_idx][elementsInShiftedBin[bin_idx]] = i;\n        elementsInShiftedBin[bin_idx]++;\n    }\n\n    for (i = 0; i < n_axis; i++) {\n        quotient = axis_a[i] / bin_width;\n        bin_idx = (int)quotient;\n        reminder = quotient - (float)bin_idx;\n        curr_bin_data = NULL;\n        curr_bin_size = 0;\n        if (bin_idx < 0) bin_idx = 0;\n        if (bin_idx > 11) bin_idx = 11;\n\n        if (0.25f <= reminder && reminder <= 0.75f) {\n            curr_bin_data = binData[bin_idx];\n            curr_bin_size = elementsInBin[bin_idx];\n        } else {\n            if (reminder < 0.25f) {\n                if (bin_idx == 0) {\n                    curr_bin_data = binData[bin_idx];\n                    curr_bin_size = elementsInBin[bin_idx];\n                } else {\n                    curr_bin_data = binDataShifted[bin_idx - 1];\n                    curr_bin_size = elementsInShiftedBin[bin_idx - 1];\n                }\n            } else if (bin_idx == 11) {\n                curr_bin_data = binData[bin_idx];\n       
         curr_bin_size = elementsInBin[bin_idx];\n            } else {\n                curr_bin_data = binDataShifted[bin_idx];\n                curr_bin_size = elementsInShiftedBin[bin_idx];\n            }\n        }\n\n        curr_axis_x = axis_x[i];\n        curr_axis_y = axis_y[i];\n        best_val = 1e20;\n        best_idx = -1;\n\n        for (j = 0; j < curr_bin_size; j++) {\n            curr_idx = curr_bin_data[j];\n            x_dist = raw_x[curr_idx] - curr_axis_x;\n            y_dist = raw_y[curr_idx] - curr_axis_y;\n            sq_dist = (double)(x_dist * x_dist + y_dist * y_dist);\n            if (sq_dist < best_val) {\n                best_val = sq_dist;\n                best_idx = curr_idx;\n            }\n        }\n\n        ret[i] = best_idx;\n    }\n\n    for (i = 0; i <= 11; i++) {\n        free((void *)binData[i]);\n        elementsInBin[i] = 0;\n        if (i > 10) continue;\n        free((void *)binDataShifted[i]);\n        elementsInShiftedBin[i] = 0;\n    }\n\n    return 0;\n}\n\n// a separate implementation from Illumina of this function can be found in function ClosestPointsSlow\n// as explained in the patent, this approach is slow as it runs in O(n^2)\nstatic int *closest_points_slow(int nref, const float *xref, const float *yref, int n, float *x, float *y) {\n    int i, j, *closest_sites = (int *)malloc(n * sizeof(int));\n    for (i = 0; i < n; i++) {\n        float xv = x[i];\n        float yv = y[i];\n        double mindist = (xv - xref[0]) * (xv - xref[0]) + (yv - yref[0]) * (yv - yref[0]);\n        int mini = 0;\n        for (j = 1; j < nref; j++) {\n            double dist = (xv - xref[j]) * (xv - xref[j]) + (yv - yref[j]) * (yv - yref[j]);\n            if (dist < mindist) {\n                mindist = dist;\n                mini = j;\n            }\n        }\n        closest_sites[i] = mini;\n    }\n    return closest_sites;\n}\n\n#define SAMPLE 2000 // mentioned in file Utils.cs\n\n// a separate implementation from Illumina of 
this function can be found in function ClosestPoints\nstatic int *closest_points(int nref, float *xref, float *yref, int n, float *x, float *y) {\n    if (nref < SAMPLE) return closest_points_slow(nref, xref, yref, n, x, y);\n    int *closest_sites = (int *)malloc(n * sizeof(int));\n    findClosestSitesToPointsAlongAxis(nref, xref, yref, n, x, y, closest_sites);\n    return closest_sites;\n}\n\n/****************************************\n * MATLAB UTILS ROUTINES                *\n ****************************************/\n\n// a separate implementation from Illumina of these functions in GenCall can be found in file Utils.cs\n\n// the input array does need to be sorted\nstatic float percentile(int n, const float *vals, int percentile) {\n    if (n == 0) return NAN;\n    int i1 = n * percentile / 100;\n    float f = (float)(n * percentile) / 100.0f - (float)i1;\n    if (f < 0.5f) {\n        i1--;\n    }\n    if (i1 < 0) {\n        return vals[0];\n    }\n    if (i1 >= n - 1) {\n        return vals[n - 1];\n    }\n\n    float x1 = 100.0f * ((float)i1 + 0.5f) / (float)n;\n    float x2 = 100.0f * ((float)(i1 + 1) + 0.5f) / (float)n;\n    float y1 = (float)vals[i1];\n    float y2 = (float)vals[i1 + 1];\n    float m = (y2 - y1) / (x2 - x1);\n    return y1 + m * ((float)percentile - x1);\n}\n\n// http://www.mathworks.com/help/matlab/ref/iqr.html\nstatic float matlab_iqr(int n, const float *vals) {\n    int i;\n    float *vs = (float *)malloc(n * sizeof(float));\n    for (i = 0; i < n; i++) vs[i] = vals[i];\n    ks_introsort_float((size_t)n, vs);\n    float iqr = percentile(n, vs, 75) - percentile(n, vs, 25);\n    free(vs);\n    return iqr;\n}\n\n// http://www.mathworks.com/help/stats/trimmean.html\nstatic float matlab_trimmean(int n, float *vals, int percent) {\n    ks_introsort_float((size_t)n, vals);\n    float high = percentile(n, vals, 100 - percent / 2);\n    float low = percentile(n, vals, percent / 2);\n    double sum = 0.0;\n    int i, count = 0;\n    for (i = 0; 
i < n; i++) {\n        if (vals[i] >= low && vals[i] <= high) {\n            sum += (double)vals[i];\n            count++;\n        }\n    }\n    return (float)sum / (float)count;\n}\n\n// http://www.mathworks.com/help/matlab/ref/linspace.html\nstatic float *matlab_linspace(int n, float minv, float maxv) {\n    int i;\n    float *vals = (float *)malloc(n * sizeof(float));\n    for (i = 0; i < n; i++) vals[i] = minv + (maxv - minv) * (float)i / (float)(n - 1);\n    return vals;\n}\n\n// the input array does not need to be sorted\n// http://www.mathworks.com/help/matlab/ref/unique.html\nstatic int matlab_unique(int n, int *indices) {\n    int i, j;\n    ks_introsort_int((size_t)n, indices);\n    for (i = 0; indices[i] == -1; i++);\n    indices[0] = indices[i++];\n    for (j = 1; i < n; i++)\n        if (indices[i] != indices[i - 1]) indices[j++] = indices[i];\n    return j;\n}\n\n// the input arrays do not need to be sorted\n// http://www.mathworks.com/help/matlab/ref/union.html\nstatic int *matlab_union(int na, const int *a, int nb, const int *b, int *n) {\n    int i, *c = (int *)malloc((na + nb) * sizeof(int));\n    for (i = 0; i < na; i++) c[i] = a[i];\n    for (i = 0; i < nb; i++) c[i + na] = b[i];\n    *n = matlab_unique(na + nb, c);\n    return c;\n}\n\n// http://www.mathworks.com/help/matlab/ref/min.html\nstatic float matlab_min(int n, const float *vals) {\n    int i;\n    float minval = FLT_MAX;\n    for (i = 0; i < n; i++) {\n        if (isnan(vals[i])) continue;\n        if (vals[i] < minval) minval = vals[i];\n    }\n    return minval;\n}\n\n// http://www.mathworks.com/help/matlab/ref/max.html\nstatic float matlab_max(int n, const float *vals) {\n    int i;\n    float maxval = -FLT_MAX;\n    for (i = 0; i < n; i++) {\n        if (isnan(vals[i])) continue;\n        if (vals[i] > maxval) maxval = vals[i];\n    }\n    return maxval;\n}\n\n/****************************************\n * NORMALIZATION ROUTINES               *\n 
****************************************/\n\n// a thorough explanation of the normalization steps can be found in the following documents:\n// Kermani, B. G. Artificial intelligence and global normalization methods for genotyping. U.S. Patent No. 7,035,740\n// (2005-09-29) http://patents.google.com/patent/US7035740\n// Peiffer, D. A. et al. High-resolution genomic profiling of chromosomal aberrations using Infinium whole-genome\n// genotyping. Genome Res., 16, 1136–1148 (2006-08-09) http://doi.org/10.1101/gr.5402306\n// Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010 (2006-09-26)\n// http://dnatech.genomecenter.ucdavis.edu/wp-content/uploads/2013/06/illumina_gt_normalization.pdf\n// Illumina, Inc. Improved Genotype Clustering with GenTrain 3.0. Pub. No. 370-2016-015-A (2016)\n// http://emea.illumina.com/content/dam/illumina-marketing/documents/products/technotes/gentrain3-technical-note-370-2016-015.pdf\n// http://www.illumina.com/content/dam/illumina/gcs/assembled-assets/marketing-literature/gentrain-tech-note-m-gl-01258/gentrain-tech-note-m-gl-01258.pdf\n// a separate implementation from Illumina of these functions in GenCall can be found in file NormalizationInfinium.cs\n// http://support.illumina.com/downloads/gencall_software.html\n\n// Illumina software includes three normalization protocols for Infinium arrays:\n// 1.1.0 Normalization10 not used\n// 1.1.2 Normalization111+Normalization10 used in AutoConvert and IAAP Genotyping CLI with option --gentrain-id 2\n// 1.2.0 NormalizationDragonfish+Normalization111_Dragonfish+Normalization10_Dragonfish used in AutoConvert 2.0, IAAP\n// Genotyping CLI, and Array Analysis CLI\n// we implement versions 1.1.2 and 1.2.0 for interoperability purposes with existing Illumina cluster files\n\n// Peiffer, D. A. et al. High-resolution genomic profiling of chromosomal aberrations using Infinium whole-genome\n// genotyping. 
Genome Res., 16, 1136–1148 (2006-08-09) The data for each BeadChip is self-normalized using information\n// contained within the array. This normalization algorithm removes outliers, adjusts for channel-dependent\n// background and global intensity differences, and also scales the data.\n// The X and Y color channels undergo an affine coordinate\n// transformation to make the data appear as canonical as possible with\n// the homozygotes lying along the transformed x- and y-axes. The\n// following five steps are applied: (1) outlier removal; (2) a\n// translation correction in which the asymptotes are fitted to candidate\n// AA and BB homozygotes; the intersection of these fit lines\n// defines the translated origin; (3) rotational correction: the angle of\n// the AA homozygote asymptote with respect to the translated\n// X-axis is used to define the rotational correction; (4) shear\n// correction: the angle of the BB homozygote asymptote with respect\n// to the translated and rotated y-axis is used to define the shear\n// correction; (5) scaling correction: statistical centroids are\n// computed for the candidate AA homozygotes to define an x-axis\n// scaling parameter, and for candidate BB homozygotes to define a\n// y-axis scaling parameter. The translated, rotated, shear-corrected\n// data are normalized to a scale of ∼1 using the scaling parameters\n\n#define SAMPLING 400 // mentioned in Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010 (2006)\n#define ROBUST_THRESHOLD                                                                                               \\\n    192 // mentioned in Improved Genotype Clustering with GenTrain 3.0. Pub. No. 370-2016-015-A (2016)\n\n// a separate implementation from Illumina can be found in functions RemoveOutliers from classes\n// Normalization10_Dragonfish and Normalization10\n// Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub.\n// No. 
970-2006-010 (2006-09-26) Outlier SNPs are removed from consideration during normalization parameter estimation.\n// These SNPs are only considered outliers during the normalization process and are not excluded from downstream\n// analysis. A SNP is considered an outlier if its intensity meets any of the following criteria:\n// - Its value of x, y, or x/(x+y) is smaller than either the 5th smallest or the 1st percentile (whichever is smaller)\n// of those values across all SNPs.\n// - Its value of x, y, or x/(x+y) is larger than either the 5th largest or the 99 th percentile (whichever is larger)\n// of those values across all SNPs.\nstatic void remove_outliers(int *n, float *x, float *y) {\n    if (*n < SAMPLING) return;\n    int i, j;\n    float *xs = (float *)malloc(*n * sizeof(float));\n    float *ys = (float *)malloc(*n * sizeof(float));\n    float *ts = (float *)malloc(*n * sizeof(float));\n    for (i = 0; i < *n; i++) {\n        xs[i] = x[i];\n        ys[i] = y[i];\n        ts[i] = y[i] / (FLT_MIN * FLT_EPSILON + x[i] + y[i]);\n    }\n    ks_introsort_float((size_t)(*n), xs);\n    ks_introsort_float((size_t)(*n), ys);\n    ks_introsort_float((size_t)(*n), ts);\n\n    int M = 5;\n    int Nb = 1;\n\n    float tcut1a = ts[M - 1];\n    float tcut2a = ts[*n - M + 1 - 1];\n\n    float xcut1a = xs[M - 1];\n    float xcut2a = xs[*n - M + 1 - 1];\n\n    float ycut1a = ys[M - 1];\n    float ycut2a = ys[*n - M + 1 - 1];\n\n    float tcut1b = percentile(*n, ts, Nb);\n    float tcut2b = percentile(*n, ts, 100 - Nb);\n\n    float xcut1b = percentile(*n, xs, Nb);\n    float xcut2b = percentile(*n, xs, 100 - Nb);\n\n    float ycut1b = percentile(*n, ys, Nb);\n    float ycut2b = percentile(*n, ys, 100 - Nb);\n\n    float tcut1 = fminf(tcut1a, tcut1b);\n    float tcut2 = fmaxf(tcut2a, tcut2b);\n\n    float xcut1 = fminf(xcut1a, xcut1b);\n    float xcut2 = fmaxf(xcut2a, xcut2b);\n\n    float ycut1 = fminf(ycut1a, ycut1b);\n    float ycut2 = fmaxf(ycut2a, ycut2b);\n\n    for (i 
= 0, j = 0; i < *n; i++) {\n        if (y[i] <= ycut1 || x[i] <= xcut1 || y[i] >= ycut2 || x[i] >= xcut2) {\n            continue;\n        }\n        double t = y[i] / (double)(y[i] + x[i]);\n        if (t <= tcut1 || t >= tcut2) {\n            continue;\n        }\n        x[j] = x[i];\n        y[j] = y[i];\n        j++;\n    }\n    *n = j;\n\n    free(xs);\n    free(ys);\n    free(ts);\n}\n\n// a separate implementation from Illumina can be found in function RemoveOffset from class Normalization10_Dragonfish\n// and Normalization10 Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010\n// (2006-09-26) a. An x-sweep is performed by sampling 400 points along the x-axis, from the smallest x value to the\n// largest. The closest SNP to each sampled point along the axis is added to the set of candidate homozygote As. b. The\n// same analysis is performed along the y-axis to find the candidate homozygote Bs. c. A straight line is fit into\n// candidate homozygote A alleles. d. A straight line is fit into candidate homozygote B alleles. e. 
The intercept of\n// the two lines is computed, and this coordinate corresponds to offset_x and offset_y.\nstatic void remove_offset(int n, float *x, float *y, int *naa, int **iaa, int *nbb, int **ibb,\n                          double (*madsigma)(int, const double *, int), float *offset_x, float *offset_y) {\n    if (n < ROBUST_THRESHOLD) {\n        *offset_x = 0.0f;\n        *offset_y = 0.0f;\n        return;\n    }\n\n    int i;\n    float mx = matlab_min(n, x);\n    float my = matlab_min(n, y);\n    float *xt = (float *)malloc(n * sizeof(float));\n    float *yt = (float *)malloc(n * sizeof(float));\n\n    for (i = 0; i < n; i++) {\n        xt[i] = x[i] - mx;\n        yt[i] = y[i] - my;\n    }\n    float *xsweep = matlab_linspace(SAMPLING, 0.0f, matlab_max(n, xt));\n    float *ysweep = matlab_linspace(SAMPLING, 0.0f, matlab_max(n, yt));\n    float *zeros = (float *)calloc(SAMPLING, sizeof(float));\n\n    *iaa = closest_points(n, xt, yt, SAMPLING, xsweep, zeros);\n    *ibb = closest_points(n, xt, yt, SAMPLING, zeros, ysweep);\n    *naa = matlab_unique(SAMPLING, *iaa);\n    *nbb = matlab_unique(SAMPLING, *ibb);\n\n    float *xaa = (float *)malloc(*naa * sizeof(float));\n    float *yaa = (float *)malloc(*naa * sizeof(float));\n    for (i = 0; i < *naa; i++) {\n        xaa[i] = xt[(*iaa)[i]];\n        yaa[i] = yt[(*iaa)[i]];\n    }\n\n    float *xbb = (float *)malloc(*nbb * sizeof(float));\n    float *ybb = (float *)malloc(*nbb * sizeof(float));\n    for (i = 0; i < *nbb; i++) {\n        xbb[i] = xt[(*ibb)[i]];\n        ybb[i] = yt[(*ibb)[i]];\n    }\n\n    float baa, maa;\n    float bbb, mbb;\n    matlab_robustfit1(*naa, xaa, yaa, madsigma, &baa, &maa);\n    matlab_robustfit1(*nbb, ybb, xbb, madsigma, &bbb, &mbb);\n\n    float ox = (bbb + mbb * baa) / (1.0f - mbb * maa);\n    float oy = baa + maa * ox;\n\n    *offset_x = ox + mx;\n    *offset_y = oy + my;\n\n    for (i = 0; i < n; i++) {\n        x[i] -= *offset_x;\n        y[i] -= *offset_y;\n    }\n\n    
free(xt);\n    free(yt);\n    free(xsweep);\n    free(ysweep);\n    free(zeros);\n    free(xaa);\n    free(yaa);\n    free(xbb);\n    free(ybb);\n}\n\n// a separate implementation from Illumina can be found in function HandleRotation from class Normalization10_Dragonfish\n// and Normalization10 Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010\n// (2006-09-26) a. The points are corrected for translation and another x-sweep is performed to determine a set of\n// control points. b. A straight line is fit into the control points. The angle between this line and the x-axis defines\n// the amount of rotation in the data. This angle corresponds to the theta parameter.\nstatic void handle_rotation(int n, float *x, float *y, int *naa, int **iaa, int *nbb, int **ibb,\n                            double (*madsigma)(int, const double *, int), float *theta) {\n    if (n < ROBUST_THRESHOLD) {\n        *theta = 0.0f;\n        return;\n    }\n\n    int i;\n    float *xsweep = matlab_linspace(SAMPLING, matlab_min(n, x), matlab_max(n, x));\n    float *ysweep = matlab_linspace(SAMPLING, matlab_min(n, y), matlab_max(n, y));\n    float *zeros = (float *)calloc(SAMPLING, sizeof(float));\n    int *tiaa = closest_points(n, x, y, SAMPLING, xsweep, zeros);\n    int *tibb = closest_points(n, x, y, SAMPLING, zeros, ysweep);\n\n    int naa_in = *naa, *iaa_in = *iaa;\n    int nbb_in = *nbb, *ibb_in = *ibb;\n    *iaa = matlab_union(naa_in, iaa_in, SAMPLING, tiaa, naa);\n    *ibb = matlab_union(nbb_in, ibb_in, SAMPLING, tibb, nbb);\n\n    float *tx = (float *)malloc(*naa * sizeof(float));\n    float *ty = (float *)malloc(*naa * sizeof(float));\n    for (i = 0; i < *naa; i++) {\n        tx[i] = x[(*iaa)[i]];\n        ty[i] = y[(*iaa)[i]];\n    }\n\n    float m;\n    matlab_robustfit0(*naa, tx, ty, madsigma, &m);\n\n    double taa = atan((double)m);\n    double ct = cos(taa);\n    double st = sin(taa);\n\n    for (i = 0; i < n; i++) {\n        float tmp = 
x[i];\n        x[i] = (float)(ct * (double)x[i] + st * (double)y[i]);\n        y[i] = (float)((0.0 - st) * (double)tmp + ct * (double)y[i]);\n    }\n\n    *theta = (float)taa;\n\n    free(xsweep);\n    free(ysweep);\n    free(zeros);\n    free(iaa_in);\n    free(ibb_in);\n    free(tiaa);\n    free(tibb);\n    free(tx);\n    free(ty);\n}\n\n// a separate implementation from Illumina can be found in function HandleShear from class Normalization10_Dragonfish\n// and Normalization10 Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010\n// (2006-09-26) a. The points are corrected for rotation and another y-sweep is performed to determine a set of control\n// points. b. A straight line is fit to these control points. The angle of this line identifies the shear parameter\nstatic void handle_shear(int n, float *x, float *y, int *nbb, int **ibb, double (*madsigma)(int, const double *, int),\n                         float *shear) {\n    if (n < ROBUST_THRESHOLD) {\n        *shear = 0.0f;\n        return;\n    }\n\n    int i;\n    float *ysweep = matlab_linspace(SAMPLING, 0.0f, matlab_max(n, y));\n    float *zeros = (float *)calloc(SAMPLING, sizeof(float));\n    int *tibb = closest_points(n, x, y, SAMPLING, zeros, ysweep);\n    int nbb_in = *nbb, *ibb_in = *ibb;\n    *ibb = matlab_union(nbb_in, ibb_in, SAMPLING, tibb, nbb);\n\n    float *tx = (float *)malloc(*nbb * sizeof(float));\n    float *ty = (float *)malloc(*nbb * sizeof(float));\n    for (i = 0; i < *nbb; i++) {\n        tx[i] = x[(*ibb)[i]];\n        ty[i] = y[(*ibb)[i]];\n    }\n\n    float m;\n    matlab_robustfit0(*nbb, ty, tx, madsigma, &m);\n\n    double tbb = atan((double)m);\n    double shy = tan(tbb);\n\n    for (i = 0; i < n; i++) x[i] = (float)((double)x[i] - shy * (double)y[i]);\n\n    *shear = (float)shy;\n\n    free(ibb_in);\n    free(ysweep);\n    free(zeros);\n    free(tibb);\n    free(tx);\n    free(ty);\n}\n\n// a separate implementation from Illumina can be found in 
function HandleScale from classes Normalization10_Dragonfish\n// and Normalization10 0.7413 ~ 1/(2*qnorm(0.75))\nstatic void base_handle_scale(int n, float *x, float *y, int gentrain_version, float *scale_x, float *scale_y) {\n    int i, naa, nbb, *iaa, *ibb;\n    // this should never happen\n    for (i = 0; i < n; i++) {\n        if (x[i] < 0.0f) x[i] = 0.0f;\n        if (y[i] < 0.0f) y[i] = 0.0f;\n    }\n\n    if (n < ROBUST_THRESHOLD) {\n        float *t = (float *)malloc(n * sizeof(float));\n        // for GenTrain 2.0 we replicate the bug by allowing failed probes as AA points\n        for (i = 0; i < n; i++)\n            t[i] = x[i] > 0.0f || y[i] > 0.0f || gentrain_version == 2\n                       ? (float)(180.0 * M_1_PI * atan2((double)y[i], (double)x[i]))\n                       : NAN;\n\n        naa = 0;\n        nbb = 0;\n        for (i = 0; i < n; i++) {\n            if (t[i] < 10.0f) naa++;\n            if (t[i] > 80.0f) nbb++;\n        }\n        iaa = (int *)malloc(naa * sizeof(int));\n        ibb = (int *)malloc(nbb * sizeof(int));\n        naa = 0;\n        nbb = 0;\n        for (i = 0; i < n; i++) {\n            if (t[i] < 10.0f) iaa[naa++] = i;\n            if (t[i] > 80.0f) ibb[nbb++] = i;\n        }\n\n        free(t);\n    } else {\n        float *xsweep = matlab_linspace(SAMPLING, 0.0f, matlab_max(n, x));\n        float *ysweep = matlab_linspace(SAMPLING, 0.0f, matlab_max(n, y));\n        float *zeros = (float *)calloc(SAMPLING, sizeof(float));\n\n        iaa = closest_points(n, x, y, SAMPLING, xsweep, zeros);\n        ibb = closest_points(n, x, y, SAMPLING, zeros, ysweep);\n        naa = matlab_unique(SAMPLING, iaa);\n        nbb = matlab_unique(SAMPLING, ibb);\n\n        free(xsweep);\n        free(ysweep);\n        free(zeros);\n    }\n\n    float *xaa = (float *)malloc(naa * sizeof(float));\n    float *ybb = (float *)malloc(nbb * sizeof(float));\n    for (i = 0; i < naa; i++) xaa[i] = x[iaa[i]];\n    for (i = 0; i < nbb; i++) ybb[i] 
= y[ibb[i]];\n\n    if (n < ROBUST_THRESHOLD) {\n        *scale_x = matlab_trimmean(naa, xaa, 20);\n        *scale_y = matlab_trimmean(nbb, ybb, 20);\n    } else {\n        *scale_x = 0.5f * matlab_trimmean(naa, xaa, 50) + 0.7413f * matlab_iqr(naa, xaa);\n        *scale_y = 0.5f * matlab_trimmean(nbb, ybb, 50) + 0.7413f * matlab_iqr(nbb, ybb);\n    }\n\n    for (i = 0; i < n; i++) {\n        x[i] /= *scale_x;\n        y[i] /= *scale_y;\n    }\n\n    free(iaa);\n    free(ibb);\n    free(xaa);\n    free(ybb);\n}\n\n// a separate implementation from Illumina can be found in function HandleScale from class Normalization111_Dragonfish\n// and Normalization111 Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010\n// (2006-09-26) a. The points are corrected for shear, and another x-sweep is performed to identify a set of virtual\n// points. b. A statistical robust measure of the mean of these control points is used to determine scale_x. c. A\n// Y-sweep is done, and some virtual points are identified via triangulation. 
A statistical robust measure of the mean\n// of these control points is used to determine scale_y.\nstatic void handle_scale(int n, float *x, float *y, int gentrain_version, float *scale_x, float *scale_y) {\n    if (n < ROBUST_THRESHOLD) {\n        base_handle_scale(n, x, y, gentrain_version, scale_x, scale_y);\n        return;\n    }\n\n    int i;\n    int naa = 0;\n    int nbb = 0;\n    float xthrsh = 0.1f * percentile(n, x, 99);\n    float ythrsh = 0.1f * percentile(n, y, 99);\n    for (i = 0; i < n; i++) {\n        if (x[i] > 5.0f * y[i] && x[i] > xthrsh) naa++;\n        if (y[i] > 5.0f * x[i] && y[i] > ythrsh) nbb++;\n    }\n    int *iaa = (int *)malloc(naa * sizeof(int));\n    int *ibb = (int *)malloc(nbb * sizeof(int));\n\n    naa = 0;\n    nbb = 0;\n    for (i = 0; i < n; i++) {\n        if (x[i] > 5.0f * y[i] && x[i] > xthrsh) iaa[naa++] = i;\n        if (y[i] > 5.0f * x[i] && y[i] > ythrsh) ibb[nbb++] = i;\n    }\n\n    float *xaa = (float *)malloc(naa * sizeof(float));\n    float *ybb = (float *)malloc(nbb * sizeof(float));\n    for (i = 0; i < naa; i++) xaa[i] = x[iaa[i]];\n    for (i = 0; i < nbb; i++) ybb[i] = y[ibb[i]];\n\n    float xscale = matlab_trimmean(naa, xaa, 50);\n    float yscale = matlab_trimmean(nbb, ybb, 50);\n\n    for (i = 0; i < n; i++) {\n        x[i] /= xscale;\n        y[i] /= yscale;\n    }\n\n    *scale_x = (float)xscale;\n    *scale_y = (float)yscale;\n\n    free(iaa);\n    free(ibb);\n    free(xaa);\n    free(ybb);\n}\n\nstatic void get_nn12_rr12(int n, const float *x, const float *y, float *nn12, float *rr12) {\n    int i, j;\n    float *xs = (float *)malloc(n * sizeof(float));\n    float *ys = (float *)malloc(n * sizeof(float));\n    for (i = 0; i < n; i++) {\n        xs[i] = x[i];\n        ys[i] = y[i];\n    }\n    ks_introsort_float((size_t)n, xs);\n    ks_introsort_float((size_t)n, ys);\n    float xthrsh = 0.2f * percentile(n, xs, 95);\n    float ythrsh = 0.2f * percentile(n, ys, 95);\n    int count = 0;\n    for (i = 0; 
i < n; i++)\n        if (x[i] < xthrsh && y[i] < ythrsh) count++;\n    *nn12 = (float)count / (float)n;\n    if (count) {\n        float *xy = (float *)malloc(n * sizeof(float));\n        for (i = 0; i < n; i++) xy[i] = x[i] + y[i];\n        float *xy12 = (float *)malloc(count * sizeof(float));\n        for (i = 0, j = 0; i < n; i++)\n            if (x[i] < xthrsh && y[i] < ythrsh) xy12[j++] = xy[i];\n        float mean_xy12 = matlab_nanmean(count, xy12);\n        float mean_xy = matlab_nanmean(n, xy);\n        *rr12 = mean_xy12 / mean_xy;\n        free(xy);\n        free(xy12);\n    } else {\n        *rr12 = 1.0f;\n    }\n    free(xs);\n    free(ys);\n}\n\n// a separate implementation from Illumina can be found in function NormalizeSingleBin from class\n// Normalization111_Dragonfish\nstatic void normalize_single_bin(int n, float *x, float *y, int gentrain_version, XForm *xform) {\n    int naa, *iaa = NULL, nbb, *ibb = NULL;\n    double (*madsigma)(int, const double *, int) = gentrain_version == 3 ? 
matlab_madsigma_new : matlab_madsigma_old;\n    xform->version = 1;\n    remove_outliers(&n, x, y);\n    remove_offset(n, x, y, &naa, &iaa, &nbb, &ibb, madsigma, &xform->offset_x, &xform->offset_y);\n    get_nn12_rr12(n, x, y, &xform->nn12, &xform->rr12);\n    handle_rotation(n, x, y, &naa, &iaa, &nbb, &ibb, madsigma, &xform->theta);\n    free(iaa);\n    handle_shear(n, x, y, &nbb, &ibb, madsigma, &xform->shear);\n    free(ibb);\n    handle_scale(n, x, y, gentrain_version, &xform->scale_x, &xform->scale_y);\n\n    xform->taa = (float)((double)(xform->theta * 180.0f) * M_1_PI);\n    xform->tbb = (float)((atan((double)xform->shear) - (double)xform->theta) * 180.0 * M_1_PI);\n}\n\n// a separate implementation from Illumina can be found in function MirrorData from class NormalizationDragonfish\nstatic void mirror_data(int n, float *x, float *y) {\n    int i;\n    for (i = 0; i < n; i++) {\n        if (y[i] > x[i]) {\n            float tmp = x[i];\n            x[i] = y[i];\n            y[i] = tmp;\n        }\n    }\n}\n\n// a separate implementation from Illumina can be found in function GetAA_Values from class NormalizationDragonfish\nstatic int *get_aa_values(int n, const float *r, const float *t, int *naa) {\n    int i, j;\n    int *iaa = (int *)malloc(n * sizeof(int));\n    for (i = 0, j = 0; i < n; i++)\n        if (t[i] < 0.1f && !isnan(r[i]) && r[i] != FLT_MIN * FLT_EPSILON) iaa[j++] = i;\n    *naa = j;\n    return iaa;\n}\n\n// a separate implementation from Illumina can be found in functions RectToPolar from classes\n// Normalization111_Dragonfish and Normalization111\nstatic void rect_to_polar(int n, float *x, float *y) {\n    int i;\n    float *r = x;\n    float *t = y;\n    for (i = 0; i < n; i++) {\n        if (x[i] == 0.0f && y[i] == 0.0f) {\n            r[i] = NAN;\n            t[i] = NAN;\n            continue;\n        }\n        float tmp = x[i];\n        r[i] = x[i] < 0.0f && y[i] < 0.0f ? 
FLT_MIN * FLT_EPSILON : fabsf(x[i]) + fabsf(y[i]);\n        t[i] = (float)(atan2((double)y[i], (double)tmp) * M_2_PI);\n    }\n}\n\n// a separate implementation from Illumina can be found in function NormalizeSingleBinSingleChannel from class\n// NormalizationDragonfish Illumina, Inc. Improved Genotype Clustering with GenTrain 3.0. Pub. No. 370-2016-015-A (2016)\n// In the sample intensity normalization process, specific\n// groups of loci are normalized together in “normalization\n// bins.” Due to differences in probe design, Infinium I loci\n// (two probes per locus) and Infinium II loci (one probe per\n// locus) are normalized in separate bins. If the number of loci\n// in a normalization bin is small (< 192 loci), the normalization\n// process can be negatively impacted. With the low bead\n// pool complexity supported on the Infinium XT platform,\n// the occurrence of small normalization bins may be more\n// prevalent, especially with normalization bins consisting\n// of Infinium I loci. With the GenTrain 2.0 algorithm, small\n// normalization bin size negatively impacts the normalization\n// of intensity data for the given locus (Figure 1A).\n// The GenTrain 3.0 algorithm improves the normalization\n// of small bins by taking advantage of the special nature\n// of Infinium I assay data, where the signal intensity for\n// both alleles originates in the same color channel. This\n// affords the possibility to fit a normalization model with\n// only two free parameters, instead of six. When applied to\n// the same data mishandled by GenTrain 2.0, GenTrain 3.0\n// improves the performance of the intensity normalization\n// and generates tight clusters (Figure 1B). 
The GenTrain 3.0\n// algorithm applies the improved normalization model for any\n// normalization bin containing fewer than 192 Infinium I loci.\nstatic void normalize_single_bin_single_channel(int n, float *x, float *y, XForm *xform) {\n    int i, j, k;\n    mirror_data(n, x, y);\n    float *aux = (float *)malloc(n * sizeof(float));\n    for (i = 0; i < n; i++) aux[i] = y[i];\n    ks_introsort_float((size_t)n, aux);\n    float ythrsh = percentile(n, aux, 2);\n    for (i = 0; i < n; i++) {\n        x[i] -= ythrsh;\n        y[i] -= ythrsh;\n        aux[i] = y[i];\n    }\n    int naa;\n    rect_to_polar(n, x, y);\n    float *r = x;\n    float *t = y;\n    int *iaa = get_aa_values(n, r, t, &naa);\n    for (i = 0; i < naa; i++) aux[i] = aux[iaa[i]];\n    float ymean = matlab_trimmean(naa, aux, 20) - 50.0f;\n    // here we replicate the bug in the normalization protocol\n    for (i = 0, j = 0; i < n; i++)\n        if (!isnan(r[i])) j++;\n    for (i = 0, k = 0; i < j; i++)\n        if (!isnan(r[i])) r[k++] = r[i];\n    for (i = k; i < j; i++) r[i] = 0.0f;\n    float rmean = matlab_trimmean(j, r, 20) - 2.0f * ymean;\n    ythrsh += ymean;\n\n    xform->version = 1;\n    xform->offset_x = ythrsh;\n    xform->offset_y = ythrsh;\n    xform->theta = 0.0f;\n    xform->shear = 0.0f;\n    xform->scale_x = rmean;\n    xform->scale_y = rmean;\n    xform->rr12 = 1.0f;\n    free(iaa);\n    free(aux);\n}\n\n// a separate implementation from Illumina can be found in function Normalize from class NormalizationDragonfish\nstatic XForm *normalize(int n, const uint16_t *xin, const uint16_t *yin, const uint8_t *norm_ids, int gentrain_version,\n                        size_t *n_xforms) {\n    int i, j, max_count = 0, counts[256];\n    *n_xforms = 0;\n    memset(counts, 0, 256 * sizeof(int));\n    int *aux = (int32_t *)malloc(n * sizeof(int));\n\n    // count size of sub-bead pool bins and sort coordinates by bin\n    for (i = 0; i < n; i++) {\n        aux[i] = (norm_ids[i] << 23) + i;\n      
  counts[norm_ids[i]]++;\n    }\n    ks_introsort_int((size_t)n, aux);\n\n    // compute number of sub-bead pool bins and size of the largest bin\n    for (i = 0, j = 0; i < 256; i++) {\n        if (counts[i]) {\n            if (counts[i] > max_count) max_count = counts[i];\n            counts[j++] = counts[i];\n        }\n    }\n    *n_xforms = j;\n    XForm *xform = (XForm *)calloc(*n_xforms, sizeof(XForm));\n    float *x = (float *)malloc(max_count * sizeof(float));\n    float *y = (float *)malloc(max_count * sizeof(float));\n\n    // compute the normalization transform one sub-bead pool bin at a time\n    int k = 0;\n    for (i = 0; i < *n_xforms; i++) {\n        if (counts[i] < 10) error(\"Error in normalization. Not enough good loci. Found %d\\n\", counts[i]);\n        uint8_t norm_id = aux[k] >> 23;\n        for (j = 0; j < counts[i]; j++) {\n            int idx = aux[k] & 0x7FFFFF;\n            x[j] = (float)xin[idx];\n            y[j] = (float)yin[idx];\n            k++;\n        }\n        if (gentrain_version == 3 && norm_id >= 100 && counts[i] < ROBUST_THRESHOLD) { // GenTrain 3.0 and Infinium I\n            normalize_single_bin_single_channel(counts[i], x, y, &xform[i]);\n        } else { // GenTrain 2.0 or Infinium II\n            normalize_single_bin(counts[i], x, y, gentrain_version, &xform[i]);\n        }\n    }\n\n    free(aux);\n    free(x);\n    free(y);\n    return xform;\n}\n\n/****************************************\n * MATH ROUTINES                        *\n ****************************************/\n\n// a separate implementation from Illumina of these functions in GenCall can be found in file Utils.cs\n\n// http://www.mathworks.com/help/fuzzy/zmf.html\nstatic float matlab_zmf(float x, float a, float b) {\n    if (a >= b) error(\"Invalid arguments for zmf (a >= b)\");\n    if (x <= a) return 1;\n    if (a < x && x <= (a + b) / 2.0f) return 1.0f - 2.0f * sqrf((x - a) / (b - a));\n    if ((a + b) / 2.0f < x && x <= b) return 2.0f * sqrf((x 
- b) / (b - a));\n    return 0;\n}\n\n// http://www.mathworks.com/help/fuzzy/smf.html\nstatic float matlab_smf(float x, float a, float b) {\n    if (a >= b) return x >= (a + b) / 2.0f ? 1.0f : 0.0f;\n    if (x <= a) return 0;\n    if (a < x && x <= (a + b) / 2.0f) return 2.0f * sqrf((x - a) / (b - a));\n    if ((a + b) / 2.0f < x && x <= b) return 1.0f - 2.0f * sqrf((x - b) / (b - a));\n    return 1;\n}\n\n// http://www.mathworks.com/help/stats/normpdf.html\nstatic double matlab_normpdf_vleft(float x, float mu, float sigma) {\n    if (sigma <= 0.0f) return NAN;\n    if (x < mu) return 0.5 * M_2_SQRTPI * M_SQRT1_2 / (double)sigma;\n    return exp(-0.5 * sqr((double)((x - mu) / sigma))) * 0.5 * M_2_SQRTPI * M_SQRT1_2 / (double)sigma;\n}\n\n// http://www.mathworks.com/help/stats/normpdf.html\nstatic double matlab_normpdf_vright(float x, float mu, float sigma) {\n    if (sigma <= 0.0f) return NAN;\n    if (x > mu) return 0.5 * M_2_SQRTPI * M_SQRT1_2 / (double)sigma;\n    return exp(-0.5 * sqr((double)((x - mu) / sigma))) * 0.5 * M_2_SQRTPI * M_SQRT1_2 / (double)sigma;\n}\n\n// http://www.mathworks.com/help/stats/normpdf.html\nstatic double matlab_normpdf(float x, float mu, float sigma) {\n    if (sigma <= 0.0f) return NAN;\n    return exp(-0.5 * sqr((double)((x - mu) / sigma))) * 0.5 * M_2_SQRTPI * M_SQRT1_2 / (double)sigma;\n}\n\n/****************************************\n * GENOTYPE CALLING ROUTINES            *\n ****************************************/\n\n// compute normalized intensities (http://github.com/Illumina/BeadArrayFiles/blob/develop/docs/GTC_File_Format_v5.pdf)\n// a separate implementation from Illumina can be found in function Transform from class NormalizationTransform\nstatic inline void raw_x_y2norm_x_y(uint16_t raw_x, uint16_t raw_y, float offset_x, float offset_y, float cos_theta,\n                                    float sin_theta, float shear, float scale_x, float scale_y, float *norm_x,\n                                    float *norm_y) {\n  
  float temp_x = (float)raw_x - offset_x;\n    float temp_y = (float)raw_y - offset_y;\n    float temp_x2 = cos_theta * temp_x + sin_theta * temp_y;\n    float temp_y2 = -sin_theta * temp_x + cos_theta * temp_y;\n    float temp_x3 = temp_x2 - shear * temp_y2;\n    *norm_x = temp_x3 < 0.0f ? 0.0f : temp_x3 / scale_x;\n    *norm_y = temp_y2 < 0.0f ? 0.0f : temp_y2 / scale_y;\n}\n\n// compute Theta and R from raw intensities\nstatic inline void norm_x_y2ilmn_theta_r(float norm_x, float norm_y, float *ilmn_theta, float *ilmn_r) {\n    if (norm_x == 0.0f && norm_y == 0.0f) {\n        *ilmn_theta = (float)NAN;\n        *ilmn_r = (float)NAN;\n        return;\n    }\n    *ilmn_theta = (float)(atan2((double)norm_y, (double)norm_x) * M_2_PI);\n    if (norm_x < 0.0f && norm_y < 0.0f) {\n        *ilmn_r = FLT_MIN * FLT_EPSILON;\n    } else {\n        *ilmn_r = fabsf(norm_x) + fabsf(norm_y);\n    }\n}\n\n// http://stackoverflow.com/questions/23392321/most-efficient-way-to-find-median-of-three-integers\nstatic inline float median3(float a, float b, float c) { return fmaxf(fminf(a, b), fminf(fmaxf(a, b), c)); }\n\n// a separate implementation from Illumina can be found in function gen_std_flair from class GenTrain62 or in file\n// GenTrain60.cs\nstatic ClusterRecord *gen_std_flair(const ClusterRecord *cluster_record) {\n    ClusterRecord *out_cluster = (ClusterRecord *)malloc(sizeof(ClusterRecord));\n    memcpy(out_cluster, cluster_record, sizeof(ClusterRecord));\n\n    int Mtight = 3;\n    int Mloose = 3;\n\n    float z1 = 0.5f * (cluster_record->ab_cluster_stats.theta_mean - cluster_record->aa_cluster_stats.theta_mean)\n               / (cluster_record->ab_cluster_stats.theta_dev + cluster_record->aa_cluster_stats.theta_dev);\n    float z2 = 0.5f * (cluster_record->bb_cluster_stats.theta_mean - cluster_record->ab_cluster_stats.theta_mean)\n               / (cluster_record->bb_cluster_stats.theta_dev + cluster_record->ab_cluster_stats.theta_dev);\n    float mz = fminf(z1, z2);\n 
   float alpha = fmaxf((1.0f / (float)Mtight) * mz, 1.0f);\n    float beta = fminf((1.0f / (float)Mloose) * mz, 1.0f);\n    float eta = alpha * beta;\n\n    out_cluster->aa_cluster_stats.theta_dev *= eta;\n    out_cluster->ab_cluster_stats.theta_dev *= eta;\n    out_cluster->bb_cluster_stats.theta_dev *= eta;\n\n    float min_dispersion_t = 0.02f;\n    if (out_cluster->aa_cluster_stats.theta_dev < min_dispersion_t)\n        out_cluster->aa_cluster_stats.theta_dev = min_dispersion_t;\n    if (out_cluster->ab_cluster_stats.theta_dev < min_dispersion_t)\n        out_cluster->ab_cluster_stats.theta_dev = min_dispersion_t;\n    if (out_cluster->bb_cluster_stats.theta_dev < min_dispersion_t)\n        out_cluster->bb_cluster_stats.theta_dev = min_dispersion_t;\n\n    float min_dispersion_r_cte = 0.2f;\n    int M = 7;\n    float min_dispersion_r = min_dispersion_r_cte;\n    // compute median of the three values\n    float med = median3(cluster_record->aa_cluster_stats.theta_dev, cluster_record->ab_cluster_stats.theta_dev,\n                        cluster_record->bb_cluster_stats.theta_dev);\n    if (min_dispersion_r < med) min_dispersion_r = med;\n\n    float min_dispersion_r_aa = cluster_record->aa_cluster_stats.r_mean / (float)M;\n    float min_dispersion_r_ab = cluster_record->ab_cluster_stats.r_mean / (float)M;\n    float min_dispersion_r_bb = cluster_record->bb_cluster_stats.r_mean / (float)M;\n    if (min_dispersion_r_aa < min_dispersion_r) min_dispersion_r_aa = min_dispersion_r;\n    if (min_dispersion_r_ab < min_dispersion_r) min_dispersion_r_ab = min_dispersion_r;\n    if (min_dispersion_r_bb < min_dispersion_r) min_dispersion_r_bb = min_dispersion_r;\n\n    if (out_cluster->aa_cluster_stats.r_dev < min_dispersion_r_aa)\n        out_cluster->aa_cluster_stats.r_dev = min_dispersion_r_aa;\n    if (out_cluster->ab_cluster_stats.r_dev < min_dispersion_r_ab)\n        out_cluster->ab_cluster_stats.r_dev = min_dispersion_r_ab;\n    if (out_cluster->bb_cluster_stats.r_dev 
< min_dispersion_r_bb)\n        out_cluster->bb_cluster_stats.r_dev = min_dispersion_r_bb;\n\n    return out_cluster;\n}\n\n// a separate implementation from Illumina can be found in function modilik from class GenTrain62 or in file\n// GenTrain60.cs this function computes the likelihood for each cluster\nstatic void modilik(ClusterRecord *c, float t, float r, double *Laa, double *Lab, double *Lbb) {\n    double alpha = 100.0; // what is the relevance of this?\n    *Laa = 0.0;\n    *Lab = 0.0;\n    *Lbb = 0.0;\n\n    // computes the Mahalanobis distance\n    double daa =\n        alpha * (double)fabsf(t - c->aa_cluster_stats.theta_mean) + (double)fabsf(r - c->aa_cluster_stats.r_mean);\n    double dab =\n        alpha * (double)fabsf(t - c->ab_cluster_stats.theta_mean) + (double)fabsf(r - c->ab_cluster_stats.r_mean);\n    double dbb =\n        alpha * (double)fabsf(t - c->bb_cluster_stats.theta_mean) + (double)fabsf(r - c->bb_cluster_stats.r_mean);\n\n    int bCovered = 0;\n    if (daa <= dbb && dab <= dbb && c->aa_cluster_stats.r_mean <= c->ab_cluster_stats.r_mean && !isnan(t)) {\n        *Laa = matlab_normpdf_vleft(t, c->aa_cluster_stats.theta_mean, c->aa_cluster_stats.theta_dev)\n               * matlab_normpdf_vleft(r, c->aa_cluster_stats.r_mean, c->aa_cluster_stats.r_dev);\n        *Lab = matlab_normpdf_vright(t, c->ab_cluster_stats.theta_mean, c->ab_cluster_stats.theta_dev)\n               * matlab_normpdf_vright(r, c->ab_cluster_stats.r_mean, c->ab_cluster_stats.r_dev);\n        bCovered = 1;\n    }\n    if (daa <= dab && dbb <= dab && c->aa_cluster_stats.r_mean <= c->bb_cluster_stats.r_mean && !isnan(t)) {\n        *Laa = matlab_normpdf_vleft(t, c->aa_cluster_stats.theta_mean, c->aa_cluster_stats.theta_dev)\n               * matlab_normpdf_vleft(r, c->aa_cluster_stats.r_mean, c->aa_cluster_stats.r_dev);\n        *Lbb = matlab_normpdf_vright(t, c->bb_cluster_stats.theta_mean, c->bb_cluster_stats.theta_dev)\n               * matlab_normpdf_vright(r, 
c->bb_cluster_stats.r_mean, c->bb_cluster_stats.r_dev);\n        bCovered = 1;\n    }\n    if (dab <= daa && dbb <= daa && c->ab_cluster_stats.r_mean <= c->bb_cluster_stats.r_mean && !isnan(t)) {\n        *Lab = matlab_normpdf_vleft(t, c->ab_cluster_stats.theta_mean, c->ab_cluster_stats.theta_dev)\n               * matlab_normpdf_vleft(r, c->ab_cluster_stats.r_mean, c->ab_cluster_stats.r_dev);\n        *Lbb = matlab_normpdf_vright(t, c->bb_cluster_stats.theta_mean, c->bb_cluster_stats.theta_dev)\n               * matlab_normpdf_vright(r, c->bb_cluster_stats.r_mean, c->bb_cluster_stats.r_dev);\n        bCovered = 1;\n    }\n    if (daa <= dbb && dab <= dbb && c->aa_cluster_stats.r_mean > c->ab_cluster_stats.r_mean && !isnan(t)) {\n        *Laa = matlab_normpdf_vleft(t, c->aa_cluster_stats.theta_mean, c->aa_cluster_stats.theta_dev)\n               * matlab_normpdf_vright(r, c->aa_cluster_stats.r_mean, c->aa_cluster_stats.r_dev);\n        *Lab = matlab_normpdf_vright(t, c->ab_cluster_stats.theta_mean, c->ab_cluster_stats.theta_dev)\n               * matlab_normpdf_vleft(r, c->ab_cluster_stats.r_mean, c->ab_cluster_stats.r_dev);\n        bCovered = 1;\n    }\n    if (dab <= daa && dbb <= daa && c->ab_cluster_stats.r_mean > c->bb_cluster_stats.r_mean && !isnan(t)) {\n        *Lab = matlab_normpdf_vleft(t, c->ab_cluster_stats.theta_mean, c->ab_cluster_stats.theta_dev)\n               * matlab_normpdf_vright(r, c->ab_cluster_stats.r_mean, c->ab_cluster_stats.r_dev);\n        *Lbb = matlab_normpdf_vright(t, c->bb_cluster_stats.theta_mean, c->bb_cluster_stats.theta_dev)\n               * matlab_normpdf_vleft(r, c->bb_cluster_stats.r_mean, c->bb_cluster_stats.r_dev);\n        bCovered = 1;\n    }\n    if (daa <= dab && dbb <= dab && c->aa_cluster_stats.r_mean > c->bb_cluster_stats.r_mean && !isnan(t)) {\n        *Laa = matlab_normpdf_vleft(t, c->aa_cluster_stats.theta_mean, c->aa_cluster_stats.theta_dev)\n               * matlab_normpdf_vright(r, c->aa_cluster_stats.r_mean, 
c->aa_cluster_stats.r_dev);\n        *Lbb = matlab_normpdf_vright(t, c->bb_cluster_stats.theta_mean, c->bb_cluster_stats.theta_dev)\n               * matlab_normpdf_vleft(r, c->bb_cluster_stats.r_mean, c->bb_cluster_stats.r_dev);\n        bCovered = 1;\n    }\n\n    if (!bCovered) {\n        *Laa = matlab_normpdf(t, c->aa_cluster_stats.theta_mean, c->aa_cluster_stats.theta_dev)\n               * matlab_normpdf(r, c->aa_cluster_stats.r_mean, c->aa_cluster_stats.r_dev);\n        *Lab = matlab_normpdf(t, c->ab_cluster_stats.theta_mean, c->ab_cluster_stats.theta_dev)\n               * matlab_normpdf(r, c->ab_cluster_stats.r_mean, c->ab_cluster_stats.r_dev);\n        *Lbb = matlab_normpdf(t, c->bb_cluster_stats.theta_mean, c->bb_cluster_stats.theta_dev)\n               * matlab_normpdf(r, c->bb_cluster_stats.r_mean, c->bb_cluster_stats.r_dev);\n    }\n}\n\n// a separate implementation from Illumina can be found in function computeScoreCallPrelim from class GenTrain62 or in\n// file GenTrain60.cs Illumina, Inc. Illumina GenCall Data Analysis Software. Pub. No. 370-2004-009 (2004) To call\n// genotypes for an individual’s DNA, the calling algorithm takes the DNA’s intensity values and the information\n// generated by the clustering algorithm; subsequently, it then identifies to which cluster the data for any specific\n// locus (of the DNA of interest) corresponds. The DNA data is first normalized (using the same procedure as for the\n// clustering algorithm). The calling operation (classification) is performed using a Bayesian model. The score for each\n// call (GenCall Score) is the product of the GenTrain Score and a data-to-model fit score. After scoring all the loci\n// in the DNA of interest, the application computes a composite score for that DNA (DNA Score). Subsequently, the\n// GenCall score of each locus for this DNA is further penalized by the DNA Score. Shen,R. et al. High-throughput SNP\n// genotyping on universal bead arrays. 
Mutat Res, 573, 70–82 (2005-06-03) A quality score, the GenCall score, is\n// calculated for each SNP call, reflecting the degree of separation between homozygote and heterozygote clusters for\n// that SNP and the placement of the individual call within a cluster. To make a genotype call, the software looks at\n// many factors but one of the first is the distribution of beads of the same type and in this way outliers are rejected\n// to ensure genotyping accuracy. The GenCall score is composed of various sub-scores, of which the most important one\n// is the clustering score. This score is a locus-specific score, and is computed by a fuzzy logic inference system. It\n// varies from 0.0 to 1.0, and correlates with accuracy of the genotype call. GenCall scores have been shown to\n// correlate with the accuracy of the genotyping call.\nstatic float compute_score_call_prelim(float r, float t, const ClusterRecord *cluster_record, uint8_t *iAPmax) {\n    if (r < cluster_record->intensity_threshold) {\n        *iAPmax = (uint8_t)0;\n        return (float)NAN;\n    }\n    double omega = 1.0;\n    double Den = 1.0 + 3.0 * omega;\n\n    ClusterRecord *c = gen_std_flair(cluster_record);\n\n    // likelihoods\n    double Laa;\n    double Lab;\n    double Lbb;\n    if (isnan(t)) {\n        Laa = Lab = Lbb = NAN;\n    } else {\n        modilik(c, t, r, &Laa, &Lab, &Lbb);\n        if (isnan(Laa)) Laa = 0.0;\n        if (isnan(Lab)) Lab = 0.0;\n        if (isnan(Lbb)) Lbb = 0.0;\n    }\n\n    int N = c->aa_cluster_stats.N + c->ab_cluster_stats.N + c->bb_cluster_stats.N;\n\n    // priors\n    double Paa = ((double)((float)c->aa_cluster_stats.N / (float)N) + omega) / Den;\n    double Pab = ((double)((float)c->ab_cluster_stats.N / (float)N) + omega) / Den;\n    double Pbb = ((double)((float)c->bb_cluster_stats.N / (float)N) + omega) / Den;\n\n    double Evidence = Laa * Paa + Lab * Pab + Lbb * Pbb;\n\n    // posteriors\n    double APaa = Laa * Paa / Evidence;\n    double APab = Lab 
* Pab / Evidence;\n    double APbb = Lbb * Pbb / Evidence;\n\n    //\n    if (APaa >= APab && APaa >= APbb)\n        *iAPmax = (uint8_t)1; // AA\n    else if (APab >= APaa && APab >= APbb)\n        *iAPmax = (uint8_t)2; // AB\n    else if (APbb >= APaa && APbb >= APab)\n        *iAPmax = (uint8_t)3; // BB\n    else\n        *iAPmax = (uint8_t)0; // NC\n\n    double mx = 0.0;\n    double scndmx = 0.0;\n    if (APaa > APab) {\n        mx = APaa;\n        scndmx = APab;\n    } else {\n        mx = APab;\n        scndmx = APaa;\n    }\n    if (APbb > mx) {\n        scndmx = mx;\n        mx = APbb;\n    } else if (APbb > scndmx) {\n        scndmx = APbb;\n    }\n\n    double ap_ratio = mx / (DBL_MIN * DBL_EPSILON + scndmx);\n    double ap_lod = log10(ap_ratio);\n\n    double score_ap = matlab_smf((float)ap_lod, 0.0f, 2.0f);\n\n    float score_r1 = matlab_smf(r, 0.0f, 0.1f);\n    float score_r2 = 0.0f;\n    float score_r3 = 0.0f;\n    float score_r4 = 0.0f;\n\n    int numClusters = 0;\n    if (c->aa_cluster_stats.N > 0) numClusters++;\n    if (c->ab_cluster_stats.N > 0) numClusters++;\n    if (c->bb_cluster_stats.N > 0) numClusters++;\n    float score_misclust;\n    if (numClusters == 1 && c->ab_cluster_stats.N == 0)\n        score_misclust = 0.7f;\n    else if (numClusters != 3)\n        score_misclust = 0.95f;\n    else\n        score_misclust = 1.0f;\n\n    float score_t = 1.0f;\n\n    float RdropBegin = 6.0f;\n    float RdropEnd = 12.0f;\n    switch (*iAPmax) {\n    case 1: // AA\n        score_t = matlab_zmf(t, c->aa_cluster_stats.theta_mean + 2.0f * c->aa_cluster_stats.theta_dev,\n                             c->aa_cluster_stats.theta_mean + 6.0f * c->aa_cluster_stats.theta_dev);\n        score_r2 = matlab_smf(r, 0.0f, c->aa_cluster_stats.r_mean / 10.0f);\n        score_r3 = matlab_smf(r, c->aa_cluster_stats.r_mean - 6.0f * c->aa_cluster_stats.r_dev,\n                              c->aa_cluster_stats.r_mean - 2.0f * c->aa_cluster_stats.r_dev);\n        score_r4 = 
matlab_zmf(r, c->aa_cluster_stats.r_mean + RdropBegin * c->aa_cluster_stats.r_dev,\n                              c->aa_cluster_stats.r_mean + RdropEnd * c->aa_cluster_stats.r_dev);\n        break;\n    case 2: // AB\n        score_t = matlab_zmf(fabsf((t - c->ab_cluster_stats.theta_mean) / c->ab_cluster_stats.theta_dev), 2.0f, 6.0f);\n        score_r2 = matlab_smf(r, 0.0f, c->ab_cluster_stats.r_mean / 10.0f);\n        score_r3 = matlab_smf(r, c->ab_cluster_stats.r_mean - 6.0f * c->ab_cluster_stats.r_dev,\n                              c->ab_cluster_stats.r_mean - 2.0f * c->ab_cluster_stats.r_dev);\n        score_r4 = matlab_zmf(r, c->ab_cluster_stats.r_mean + RdropBegin * c->ab_cluster_stats.r_dev,\n                              c->ab_cluster_stats.r_mean + RdropEnd * c->ab_cluster_stats.r_dev);\n        break;\n    case 3: // BB\n        score_t = matlab_smf(t, c->bb_cluster_stats.theta_mean - 6.0f * c->bb_cluster_stats.theta_dev,\n                             c->bb_cluster_stats.theta_mean - 2.0f * c->bb_cluster_stats.theta_dev);\n        score_r2 = matlab_smf(r, 0.0f, c->bb_cluster_stats.r_mean / 10.0f);\n        score_r3 = matlab_smf(r, c->bb_cluster_stats.r_mean - 6.0f * c->bb_cluster_stats.r_dev,\n                              c->bb_cluster_stats.r_mean - 2.0f * c->bb_cluster_stats.r_dev);\n        score_r4 = matlab_zmf(r, c->bb_cluster_stats.r_mean + RdropBegin * c->bb_cluster_stats.r_dev,\n                              c->bb_cluster_stats.r_mean + RdropEnd * c->bb_cluster_stats.r_dev);\n        break;\n    }\n    float score_r = score_r1 * score_r2 * score_r3 * score_r4;\n\n    float score_n = 1.0f;\n\n    if (isnan(cluster_record->cluster_score.total_score)) score_t = score_r = (float)NAN;\n    if (isnan(t)) score_t = score_r = score_n = (float)NAN;\n\n    float score_call_prelim =\n        (float)score_ap * cluster_record->cluster_score.total_score * score_t * score_r * score_n * score_misclust;\n\n    free(c);\n    return score_call_prelim;\n}\n\n// 
http://www.mathworks.com/help/fuzzy/gbellmf.html\nstatic double matlab_gbellmf(double x, double a, double b, double c) {\n    double tmp = sqr((x - c) / a);\n    if (tmp == 0.0 && b == 0.0) return 0.5;\n    return 1.0 / (1.0 + pow(tmp, b));\n}\n\n// a separate implementation from Illumina can be found in function gencall_score_map from class GenTrain62 or in file\n// GenTrain60.cs 0.35 = 0.5 * 0.7 0.504 = 0.8 × 0.7 × 0.9 1.71 = 0.9 * 1.9 1.08 = 0.9 * 1.2\nstatic double gencall_score_map(double x) { return pow(x, 0.35) * matlab_gbellmf(x, 0.504, 1.71, 1.08); }\n\nstatic inline char rev_allele(char allele) {\n    static const char allele_complement[128] = {\n        0, 0,   0, 0,   0,   0, 0, 0,   0, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n        0, 0,   0, 0,   0,   0, 0, 0,   0, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n        0, 'T', 0, 'G', 'D', 0, 0, 'C', 0, 'I', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'A', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n    };\n    if (allele > 95) return 0;\n    return allele_complement[(int)allele];\n}\n\n// a separate implementation from Illumina can be found in function GetBaseCall from class AutoCallPollerThread\nstatic void get_base_call(const char *snp, const char *ilmn_strand, uint8_t genotype, BaseCall *base_call) {\n    char a = toupper(ilmn_strand[0]) == 'T' ? snp[1] : rev_allele(snp[1]);\n    char b = toupper(ilmn_strand[0]) == 'T' ? 
snp[3] : rev_allele(snp[3]);\n    switch (genotype) {\n    case 1:\n        (*base_call)[0] = a;\n        (*base_call)[1] = a;\n        return;\n    case 2:\n        (*base_call)[0] = a;\n        (*base_call)[1] = b;\n        return;\n    case 3:\n        (*base_call)[0] = b;\n        (*base_call)[1] = b;\n        return;\n    }\n    (*base_call)[0] = '-';\n    (*base_call)[1] = '-';\n}\n\n// a separate implementation from Illumina can be found in function MakeCalls from class AutoCallPollerThread\nstatic void make_calls(gtc_t *gtc, const bpm_t *bpm, const egt_t *egt, float gencall_cutoff,\n                       int allow_missing_clusters) {\n    int i, n = bpm->num_loci;\n    gtc->sample_data.num_calls = 0;\n    gtc->sample_data.num_intensity_only = 0;\n    for (i = 0; i < n; i++) {\n        LocusEntry *locus_entry = &bpm->locus_entries[i];\n        uint16_t raw_x = gtc->raw_x[i];\n        uint16_t raw_y = gtc->raw_y[i];\n        float norm_x = -NAN;\n        float norm_y = -NAN;\n        float ilmn_theta = -NAN;\n        float ilmn_r = -NAN;\n        if (raw_x || raw_y) {\n            int norm_id = bpm->norm_lookups[bpm->norm_ids[i]];\n            XForm *xform = &gtc->normalization_transforms[norm_id];\n            raw_x_y2norm_x_y(gtc->raw_x[i], gtc->raw_y[i], xform->offset_x, xform->offset_y, gtc->cos_theta[norm_id],\n                             gtc->sin_theta[norm_id], xform->shear, xform->scale_x, xform->scale_y, &norm_x, &norm_y);\n            norm_x_y2ilmn_theta_r(norm_x, norm_y, &ilmn_theta, &ilmn_r);\n\n            int idx;\n            int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx);\n            if (ret < 0) {\n                if (allow_missing_clusters) {\n                    fprintf(stderr, \"Warning: Illumina probe %s not found in cluster file\\n\", locus_entry->name);\n                    gtc->genotype_scores[i] = 0.0f;\n                    gtc->genotypes[i] = 0;\n                } else {\n                    
error(\"Illumina probe %s not found in cluster file\\nUse --allow-missing-clusters to allow this\\n\",\n                          locus_entry->name);\n                }\n            } else {\n                ClusterRecord *cluster_record = &egt->cluster_records[idx];\n                float min_dispersion_r = 0.1f;\n                if (cluster_record->aa_cluster_stats.r_dev < min_dispersion_r)\n                    cluster_record->aa_cluster_stats.r_dev = min_dispersion_r;\n                if (cluster_record->ab_cluster_stats.r_dev < min_dispersion_r)\n                    cluster_record->ab_cluster_stats.r_dev = min_dispersion_r;\n                if (cluster_record->bb_cluster_stats.r_dev < min_dispersion_r)\n                    cluster_record->bb_cluster_stats.r_dev = min_dispersion_r;\n                uint8_t genotype;\n                float score_call_prelim = compute_score_call_prelim(ilmn_r, ilmn_theta, cluster_record, &genotype);\n                float score_call = (float)gencall_score_map(score_call_prelim);\n                gtc->genotype_scores[i] = isnan(score_call) ? 
0.0f : score_call;\n                gtc->genotypes[i] = genotype;\n            }\n        } else {\n            gtc->genotype_scores[i] = 0.0f;\n            gtc->genotypes[i] = 0;\n        }\n\n        if (gtc->genotype_scores[i] < gencall_cutoff) gtc->genotypes[i] = 0;\n        if (locus_entry->intensity_only) {\n            gtc->genotypes[i] = 0;\n            gtc->sample_data.num_intensity_only++;\n        }\n        if (gtc->genotypes[i]) gtc->sample_data.num_calls++;\n        get_base_call(locus_entry->snp, locus_entry->ilmn_strand, gtc->genotypes[i], &gtc->base_calls[i]);\n    }\n\n    gtc->sample_data.num_no_calls = gtc->num_snps - gtc->sample_data.num_intensity_only - gtc->sample_data.num_calls;\n    gtc->call_rate = (float)gtc->sample_data.num_calls\n                     / ((float)gtc->num_snps - (float)gtc->sample_data.num_intensity_only + FLT_MIN * FLT_EPSILON);\n}\n\ntypedef struct {\n    int version;\n    int min_loci;\n    int max_loci;\n    int min_x_loci;\n    int min_y_loci;\n    float call_rate_threshold;\n    float y_threshold;\n    float x_threshold;\n    float x_het_rate_threshold;\n} gender_t;\n\n// a separate implementation from Illumina can be found in function EstimateGender from class AutoCallPollerThread\n// TODO what happened here to gender->max_loci?\nstatic void estimate_gender(gtc_t *gtc, const bpm_t *bpm, const egt_t *egt, const gender_t *gender) {\n    int i, n = bpm->num_loci;\n    int x_count = 0;\n    int x_hets_count = 0;\n    int x_non_missing_count = 0;\n    int y_count = 0;\n    int auto_count = 0;\n    int auto_non_missing_count = 0;\n    float *r_x = (float *)malloc(n * sizeof(float));\n    float *r_y = (float *)malloc(n * sizeof(float));\n    for (i = 0; i < n; i++) {\n        LocusEntry *locus_entry = &bpm->locus_entries[i];\n        const char *chrom =\n            strncasecmp(locus_entry->chrom, \"CHR\", 3) == 0 ? 
locus_entry->chrom + 3 : locus_entry->chrom;\n        int idx;\n        int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx);\n        if (ret < 0) continue;\n        ClusterRecord *cluster_record = &egt->cluster_records[idx];\n        int norm_id = bpm->norm_lookups[bpm->norm_ids[i]];\n        XForm *xform = &gtc->normalization_transforms[norm_id];\n        float norm_x, norm_y, t;\n        if (cluster_record->cluster_score.total_score != 0.0f) {\n            if (strcmp(chrom, \"X\") == 0) {\n                raw_x_y2norm_x_y(gtc->raw_x[i], gtc->raw_y[i], xform->offset_x, xform->offset_y,\n                                 gtc->cos_theta[norm_id], gtc->sin_theta[norm_id], xform->shear, xform->scale_x,\n                                 xform->scale_y, &norm_x, &norm_y);\n                norm_x_y2ilmn_theta_r(norm_x, norm_y, &t, &r_x[x_count]);\n                if (gtc->genotypes[i] == 2) x_hets_count++;\n                if (gtc->genotypes[i]) x_non_missing_count++;\n                x_count++;\n            } else if (strcmp(chrom, \"Y\") == 0) {\n                raw_x_y2norm_x_y(gtc->raw_x[i], gtc->raw_y[i], xform->offset_x, xform->offset_y,\n                                 gtc->cos_theta[norm_id], gtc->sin_theta[norm_id], xform->shear, xform->scale_x,\n                                 xform->scale_y, &norm_x, &norm_y);\n                norm_x_y2ilmn_theta_r(norm_x, norm_y, &t, &r_y[y_count]);\n                y_count++;\n            } else if (strcmp(chrom, \"XY\") != 0 && strcmp(chrom, \"MT\") != 0) {\n                auto_count++;\n                if (gtc->genotypes[i]) auto_non_missing_count++;\n            }\n        }\n    }\n\n    gtc->gender = 'U';\n    if (gender->version == 1 || y_count < gender->min_y_loci || auto_count < gender->min_loci) {\n        if (x_non_missing_count > gender->min_x_loci) {\n            gtc->gender = !((double)((float)x_hets_count / (float)x_non_missing_count) > gender->x_het_rate_threshold)\n                       
       ? 'M'\n                              : 'F';\n        }\n    } else if (auto_count > 0 && (double)auto_non_missing_count / (double)auto_count > gender->call_rate_threshold) {\n        for (i = 0; i < y_count; i++)\n            if (isnan(r_y[i]) || isinf(r_y[i])) r_y[i] = 0.0f;\n        float y_med = matlab_median(y_count, r_y);\n        if ((double)y_med > gender->y_threshold) {\n            gtc->gender = 'M';\n        } else if (x_count < gender->min_x_loci) {\n            gtc->gender = 'F';\n        } else {\n            for (i = 0; i < x_count; i++)\n                if (isnan(r_x[i]) || isinf(r_x[i])) r_x[i] = 0.0f;\n            float x_med = matlab_median(x_count, r_x);\n            gtc->gender = (double)x_med > gender->x_threshold ? 'F' : 'U';\n        }\n    }\n    free(r_x);\n    free(r_y);\n}\n\n// compute BAF and LRR from Theta and R as explained in Peiffer, D. A. et al. High-resolution genomic profiling of\n// chromosomal aberrations using Infinium whole-genome genotyping. Genome Res. 16, 1136–1148 (2006)\n// Peiffer, D. A. et al. High-resolution genomic profiling of chromosomal aberrations using Infinium whole-genome\n// genotyping. 
Genome Res., 16, 1136–1148 (2006-08-09)\nstatic inline void get_baf_lrr(float ilmn_theta, float ilmn_r, float aa_theta, float ab_theta, float bb_theta,\n                               float aa_r, float ab_r, float bb_r, float r_mean, float *baf, float *lrr) {\n    float r_ref;\n    if (ilmn_theta == ab_theta) {\n        r_ref = ab_r;\n        *baf = 0.5f;\n    } else if (ilmn_theta < ab_theta) {\n        r_ref = aa_r + (ilmn_theta - aa_theta) * (aa_r - ab_r) / (aa_theta - ab_theta);\n        *baf = (ilmn_theta - aa_theta) / (ab_theta - aa_theta) * 0.5f;\n    } else if (ilmn_theta > ab_theta) {\n        r_ref = ab_r + (ilmn_theta - ab_theta) * (bb_r - ab_r) / (bb_theta - ab_theta);\n        *baf = 0.5f + (ilmn_theta - ab_theta) / (bb_theta - ab_theta) * 0.5f;\n    } else {\n        *lrr = -NAN;\n        *baf = -NAN;\n        return;\n    }\n    *lrr = ilmn_r != 0.0f ? (float)log2(ilmn_r / (isnan(r_mean) ? r_ref : r_mean)) : -FLT_MAX;\n}\n\n// a separate implementation from Illumina can be found in functions CalculateLogRDev and CalculateBAlleleFreq from\n// class AutoCallPollerThread\nstatic void calculate_baf_lrr(gtc_t *gtc, const bpm_t *bpm, const egt_t *egt) {\n    int i, count = 0, n = bpm->num_loci;\n    double sum = 0.0;\n    double sum2 = 0.0;\n    for (i = 0; i < n; i++) {\n        LocusEntry *locus_entry = &bpm->locus_entries[i];\n        int idx;\n        int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx);\n        if (ret < 0) continue;\n        ClusterRecord *c = &egt->cluster_records[idx];\n        float baf = -NAN;\n        float lrr = -NAN;\n        if ((gtc->raw_x[i] || gtc->raw_y[i]) && c) {\n            int norm_id = bpm->norm_lookups[bpm->norm_ids[i]];\n            XForm *xform = &gtc->normalization_transforms[norm_id];\n            float norm_x, norm_y, t, r;\n\n            raw_x_y2norm_x_y(gtc->raw_x[i], gtc->raw_y[i], xform->offset_x, xform->offset_y, gtc->cos_theta[norm_id],\n                             
gtc->sin_theta[norm_id], xform->shear, xform->scale_x, xform->scale_y, &norm_x, &norm_y);\n            norm_x_y2ilmn_theta_r(norm_x, norm_y, &t, &r);\n            get_baf_lrr(t, r, c->aa_cluster_stats.theta_mean, c->ab_cluster_stats.theta_mean,\n                        c->bb_cluster_stats.theta_mean, c->aa_cluster_stats.r_mean, c->ab_cluster_stats.r_mean,\n                        c->bb_cluster_stats.r_mean, locus_entry->intensity_only ? c->r_mean : NAN, &baf, &lrr);\n        }\n        gtc->b_allele_freqs[i] = baf < 0.0 ? 0.0f : baf > 1.0 ? 1.0f : (float)baf;\n        gtc->logr_ratios[i] = (float)lrr;\n\n        char start_chrom = strncasecmp(locus_entry->chrom, \"CHR\", 3) == 0 ? toupper(locus_entry->chrom[3])\n                                                                          : toupper(locus_entry->chrom[0]);\n        if (!locus_entry->intensity_only && (start_chrom != 'X' && start_chrom != 'Y' && start_chrom != 'M')\n            && !isinf(lrr) && !isnan(lrr)) {\n            sum += (double)lrr;\n            sum2 += sqr((double)lrr);\n            count++;\n        }\n    }\n    gtc->logr_dev = (float)sqrt(sum2 / (double)count - sqr(sum / (double)count));\n}\n\n// a separate implementation from Illumina can be found in function CalculateIntensityPercentiles from class\n// AutoCallPollerThread\nstatic void calculate_intensity_percentiles(gtc_t *gtc) {\n    int i, n = gtc->num_snps;\n    float *xs = (float *)malloc(n * sizeof(float));\n    float *ys = (float *)malloc(n * sizeof(float));\n    for (i = 0; i < n; i++) {\n        xs[i] = (float)gtc->raw_x[i];\n        ys[i] = (float)gtc->raw_y[i];\n    }\n    ks_introsort_float((size_t)n, xs);\n    ks_introsort_float((size_t)n, ys);\n    gtc->percentiles_x[0] = (uint16_t)percentile(n, xs, 5);\n    gtc->percentiles_x[1] = (uint16_t)percentile(n, xs, 50);\n    gtc->percentiles_x[2] = (uint16_t)percentile(n, xs, 95);\n    gtc->percentiles_y[0] = (uint16_t)percentile(n, ys, 5);\n    gtc->percentiles_y[1] = 
(uint16_t)percentile(n, ys, 50);\n    gtc->percentiles_y[2] = (uint16_t)percentile(n, ys, 95);\n    free(xs);\n    free(ys);\n}\n\n// a separate implementation from Illumina can be found in function ComputeSampleStats from class AutoCallPollerThread\n// Illumina, Inc. Illumina GenCall Data Analysis Software. Pub. No. 370-2004-009 (2004)\n// GenCall Scores may be averaged among DNAs and\n// among loci for purposes of evaluating the quality of the\n// genotyping within a particular DNA or locus. For example,\n// we often evaluate “GC10” and “GC50” scores that are calcu-\n// lated by taking the 10th percentile and the 50th percentile\n// (median) of the GenCall Scores for a certain locus, respec-\n// tively. Using GC10 and GC50 Scores, a user may choose\n// to fail particularly poor performing loci, for instance,\n// by discarding loci with GC10 of 0.1 or lower. Also, a series\n// of aggregate statistics (i.e., average) of the GC10 or GC50\n// scores for each DNA can be used to identify low-quality\n// DNAs (for instance, a user may discard DNA samples with\n// average GC10 scores of 0.2 or lower). The GenCall Score\n// can also be used in situations where users have a mini-\n// mum required call rate. This rate translates to making\n// calls on a certain percentile of the data. 
Users can sort\n// all their genotypes based on the GenCall Score, and then\n// choose the top (Nth) percentile of interest for their study.\nstatic void compute_sample_stats(gtc_t *gtc, const bpm_t *bpm, float gencall_cutoff) {\n    int i, j, n = gtc->num_snps;\n    float *gs = (float *)malloc(n * sizeof(float));\n    for (i = 0, j = 0; i < n; i++) {\n        LocusEntry *locus_entry = &bpm->locus_entries[i];\n        if (gtc->genotype_scores[i] > gencall_cutoff && !locus_entry->intensity_only)\n            gs[j++] = (float)gtc->genotype_scores[i];\n    }\n    ks_introsort_float((size_t)j, gs);\n    gtc->p10gc = percentile(j, gs, 10);\n    gtc->sample_data.p50gc = percentile(j, gs, 50);\n    free(gs);\n}\n\n/****************************************\n * CREATE NEW GTC STRUCTURE             *\n ****************************************/\n\n// a separate implementation from Illumina can be found in class MD5ChecksumFile of the Array Analysis CLI\nstatic char *basename(const char *fn, const char unsigned *md5_buf) {\n    const char str[] = \"(MD5Checksum=\";\n    char *ptr = strrchr(fn, '/');\n    if (ptr)\n        ptr++;\n    else\n        ptr = (char *)fn;\n    char *ret;\n    if (md5_buf) {\n        int len = strlen(ptr);\n        ret = (char *)malloc((len + 47) * sizeof(char));\n        memcpy((void *)ret, (void *)ptr, (size_t)len);\n        ptr = ret + len;\n        memcpy((void *)ptr, &str, sizeof(str) - 1);\n        ptr += sizeof(str) - 1;\n        hts_md5_hex(ptr, md5_buf);\n        ptr += 32;\n        *(ptr++) = ')';\n        *ptr = 0;\n    } else {\n        ret = strdup(ptr);\n    }\n    return ret;\n}\n\n// TODO this should be done once only for the BPM structure\nstatic int32_t *get_control_addresses(const char *str, int *n_addresses) {\n    int i, j;\n    int moff = 0, *off = NULL;\n    int moff2 = 0, *off2 = NULL;\n    int32_t *addresses = NULL;\n    int m_addresses = 0;\n    *n_addresses = 0;\n\n    char *s = strdup(str);\n    int noff = ksplit_core(s, 
'\\n', &moff, &off);\n    for (i = 0; i < noff; i++) {\n        char *ptr = strchr(&s[off[i]], ',');\n        *ptr = '\\0';\n        int noff2 = ksplit_core(&s[off[i]], ':', &moff2, &off2);\n        hts_expand(int32_t, *n_addresses + noff2, m_addresses, addresses);\n        for (j = 0; j < noff2; j++) {\n            char *endptr;\n            addresses[*n_addresses + j] = (int32_t)strtol(&s[off[i] + off2[j]], &endptr, 10);\n        }\n        *n_addresses += noff2;\n    }\n    free(s);\n    free(off);\n    free(off2);\n    return addresses;\n}\n\nstatic char *get_string_parameter(const char *str, const char *id) {\n    const char *ptr = strstr(str, id);\n    if (!ptr) return NULL;\n    ptr += strlen(id);\n    if (*ptr != '=') return NULL;\n    ptr++;\n    const char *ptr2 = strchr(ptr, '|');\n    return strndup(ptr, ptr2 ? ptr2 - ptr : strlen(ptr));\n}\n\nstatic int32_t get_int32_parameter(const char *str, const char *id) {\n    const char *ptr = strstr(str, id);\n    if (!ptr) return 0;\n    ptr += strlen(id);\n    if (*ptr != '=') return 0;\n    ptr++;\n    char *endptr;\n    return (int32_t)strtol(ptr, &endptr, 10);\n}\n\n// a separate implementation from Illumina can be found in function LoadSampleSection from class SampleData\n// AutoConvert used the creation time of the IDAT file for the imaging date field of the GTC file:\n// imagingDate = fileInfo.CreationTime.ToLongDateString() + \" \" + fileInfo.CreationTime.ToLongTimeString();\n// this was later updated to instead use the imaging date field of the last Scan entry in the IDAT file\nstatic void load_sample_section(gtc_t *gtc, const idat_t *idat, int imaging_date) {\n    int i;\n    RunInfo *run_info = NULL;\n    for (i = 0; i < idat->m_run_infos; i++)\n        if (strcmp(idat->run_infos[i].block_type, \"Scan\") == 0) run_info = &idat->run_infos[i];\n    if (run_info) {\n        gtc->imaging_date = imaging_date ? 
strdup(run_info->run_time) : NULL;\n        gtc->scanner_data.scanner_name = get_string_parameter(run_info->block_pars, \"sherlockID\");\n        gtc->scanner_data.pmt_green = get_int32_parameter(run_info->block_pars, \"PMTGainCY3\");\n        gtc->scanner_data.pmt_red = get_int32_parameter(run_info->block_pars, \"PMTGainCY5\");\n        gtc->scanner_data.scanner_version = strdup(run_info->code_version);\n        gtc->scanner_data.imaging_user = get_string_parameter(run_info->block_pars, \"Username\");\n    }\n}\n\nstatic int32_t get32_index(void *dict, int32_t key) {\n    khash_t(32) *hash = (khash_t(32) *)dict;\n    khiter_t k = kh_get(32, hash, key);\n    if (k == kh_end(hash)) return -1;\n    return kh_val(hash, k);\n}\n\n// a separate implementation from Illumina can be found in function fillArray from class SampleData\nstatic void fill_array(const idat_t *grn_idat, const idat_t *red_idat, const bpm_t *bpm, gtc_t *gtc) {\n    int i;\n    int32_t idx1, idx2;\n    for (i = 0; i < bpm->num_loci; i++) {\n        LocusEntry *locus_entry = &bpm->locus_entries[i];\n        if (locus_entry->assay_type == 0) { // 0 - Infinium II probes\n            idx1 = get32_index(red_idat->ilmn_id2index, locus_entry->address_a);\n            idx2 = get32_index(grn_idat->ilmn_id2index, locus_entry->address_a);\n            if (idx1 == -1 || idx2 == -1) continue; // warning?\n            if (red_idat->nbeads[idx1] >= 2 && grn_idat->nbeads[idx2] >= 2) {\n                gtc->raw_x[i] = red_idat->trimmed_mean[idx1];\n                gtc->raw_y[i] = grn_idat->trimmed_mean[idx2];\n            }\n        } else if (locus_entry->assay_type == 1) { // 1 - Infinium I (A/T) probes\n            idx1 = get32_index(red_idat->ilmn_id2index, locus_entry->address_a);\n            idx2 = get32_index(red_idat->ilmn_id2index, locus_entry->address_b);\n            if (idx1 == -1 || idx2 == -1) continue; // warning?\n            if (red_idat->nbeads[idx1] >= 2 && red_idat->nbeads[idx2] >= 2) {\n         
       gtc->raw_x[i] = red_idat->trimmed_mean[idx1];\n                gtc->raw_y[i] = red_idat->trimmed_mean[idx2];\n            }\n        } else if (locus_entry->assay_type == 2) { // 2 - Infinium I (G/C) probes\n            idx1 = get32_index(grn_idat->ilmn_id2index, locus_entry->address_a);\n            idx2 = get32_index(grn_idat->ilmn_id2index, locus_entry->address_b);\n            if (idx1 == -1 || idx2 == -1) continue; // warning?\n            if (grn_idat->nbeads[idx1] >= 2 && grn_idat->nbeads[idx2] >= 2) {\n                gtc->raw_x[i] = grn_idat->trimmed_mean[idx1];\n                gtc->raw_y[i] = grn_idat->trimmed_mean[idx2];\n            }\n        } else {\n            error(\"Assay type %d for probe %s not valid\\n\", locus_entry->assay_type, locus_entry->ilmn_id);\n        }\n    }\n}\n\n// a separate implementation from Illumina can be found in function fillControlsArray from class SampleData\nstatic void fill_controls_array(const idat_t *grn_idat, const idat_t *red_idat, const bpm_t *bpm, gtc_t *gtc) {\n    int i, n_controls;\n    int32_t idx1, idx2;\n    int *control_addresses = get_control_addresses(bpm->control_config, &n_controls);\n    gtc->m_controls_x = n_controls;\n    gtc->m_controls_y = n_controls;\n    gtc->controls_x = (uint16_t *)calloc(n_controls, sizeof(uint16_t));\n    gtc->controls_y = (uint16_t *)calloc(n_controls, sizeof(uint16_t));\n    for (i = 0; i < n_controls; i++) {\n        idx1 = get32_index(red_idat->ilmn_id2index, control_addresses[i]);\n        idx2 = get32_index(grn_idat->ilmn_id2index, control_addresses[i]);\n        if (idx1 == -1 || idx2 == -1) continue; // warning?\n        gtc->controls_x[i] = red_idat->trimmed_mean[idx1];\n        gtc->controls_y[i] = grn_idat->trimmed_mean[idx2];\n    }\n    free(control_addresses);\n}\n\n// a separate implementation from Illumina can be found in function Process from class AutoCallPollerThread\nstatic gtc_t *gtc_init(const idat_t *grn_idat, const idat_t *red_idat, const 
bpm_t *bpm, const egt_t *egt,\n                       int gentrain_version, int gtc_file_version, float gencall_cutoff, int sample_name, int checksums,\n                       int imaging_date, const char *autocall_date_format, const char *autocall_version,\n                       int allow_missing_clusters, const gender_t *gender) {\n    if (!grn_idat || !red_idat || !bpm) return NULL;\n\n    gtc_t *gtc = (gtc_t *)calloc(1, sizeof(gtc_t));\n    gtc->version = gtc_file_version;\n    gtc->ploidy = 2;\n    gtc->ploidy_type = 1;\n    gtc->sample_name = red_idat->sample_name ? strdup(red_idat->sample_name) : NULL;\n    char *ptr, *ptr2;\n    if (!gtc->sample_name && sample_name) {\n        ptr = strrchr(grn_idat->fn, '/');\n        if (ptr)\n            ptr++;\n        else\n            ptr = grn_idat->fn;\n        ptr2 = strstr(ptr, \"_Grn.idat\");\n        gtc->sample_name = strndup(ptr, ptr2 - ptr);\n    }\n    gtc->sample_plate = red_idat->sample_plate ? strdup(red_idat->sample_plate) : NULL;\n    gtc->sample_well = red_idat->sample_well ? strdup(red_idat->sample_well) : NULL;\n    gtc->sentrix_id = red_idat->sentrix_barcode ? strdup(red_idat->sentrix_barcode) : NULL;\n    if (egt) gtc->cluster_file = basename(egt->fn, checksums && egt ? egt->md5_buf : NULL);\n    gtc->snp_manifest = basename(bpm->fn, checksums ? 
bpm->md5_buf : NULL);\n\n    time_t timer;\n    char buffer[26];\n    struct tm *tm_info;\n    timer = time(NULL);\n    tm_info = localtime(&timer);\n    strftime(buffer, 26, autocall_date_format, tm_info);\n    gtc->autocall_date = strdup(buffer);\n    gtc->autocall_version = strdup(autocall_version);\n\n    gtc->num_snps = bpm->num_loci;\n    gtc->raw_x = (uint16_t *)calloc(gtc->num_snps, sizeof(uint16_t));\n    gtc->raw_y = (uint16_t *)calloc(gtc->num_snps, sizeof(uint16_t));\n    gtc->genotypes = (uint8_t *)calloc(gtc->num_snps, sizeof(uint8_t));\n    gtc->base_calls = (BaseCall *)malloc(gtc->num_snps * sizeof(BaseCall));\n    memset(gtc->base_calls, '-', gtc->num_snps * sizeof(BaseCall));\n    gtc->genotype_scores = (float *)calloc(gtc->num_snps, sizeof(float));\n    gtc->b_allele_freqs = (float *)calloc(gtc->num_snps, sizeof(float));\n    gtc->logr_ratios = (float *)calloc(gtc->num_snps, sizeof(float));\n\n    fill_array(grn_idat, red_idat, bpm, gtc);\n\n    fprintf(stderr, \"Normalizing...\\n\");\n    gtc->normalization_transforms = normalize(gtc->num_snps, gtc->raw_x, gtc->raw_y, bpm->norm_ids, gentrain_version,\n                                              &gtc->m_normalization_transforms);\n\n    gtc->sin_theta = (float *)malloc(gtc->m_normalization_transforms * sizeof(float));\n    gtc->cos_theta = (float *)malloc(gtc->m_normalization_transforms * sizeof(float));\n    int i;\n    for (i = 0; i < gtc->m_normalization_transforms; i++) {\n        gtc->sin_theta[i] = (float)sin((double)gtc->normalization_transforms[i].theta);\n        gtc->cos_theta[i] = (float)cos((double)gtc->normalization_transforms[i].theta);\n    }\n\n    if (egt) {\n        fprintf(stderr, \"Calling...\\n\");\n        make_calls(gtc, bpm, egt, gencall_cutoff, allow_missing_clusters);\n        fprintf(stderr, \"Call rate: %.7f\\n\", gtc->call_rate);\n        estimate_gender(gtc, bpm, egt, gender);\n        fprintf(stderr, \"Gender: %s\\n\", gtc->gender == 'M' ? 
\"Male\" : gtc->gender == 'F' ? \"Female\" : \"Unknown\");\n        calculate_baf_lrr(gtc, bpm, egt);\n        compute_sample_stats(gtc, bpm, gencall_cutoff);\n    }\n\n    calculate_intensity_percentiles(gtc);\n\n    fill_controls_array(grn_idat, red_idat, bpm, gtc);\n\n    load_sample_section(gtc, red_idat, imaging_date);\n\n    return gtc;\n}\n\n/****************************************\n * PLUGIN                               *\n ****************************************/\n\nconst char *about(void) { return \"Convert Illumina IDAT files for Infinium arrays to GTC files.\\n\"; }\n\nstatic const char *usage_text(void) {\n    return \"\\n\"\n           \"About: convert Illumina IDAT files for Infinium arrays to GTC files.\\n\"\n           \"(version \" IDAT2GTC_VERSION\n           \" http://github.com/freeseek/idat2vcf)\\n\"\n           \"[ Kermani, B. G. Artificial intelligence and global normalization methods for\\n\"\n           \"  genotyping. U.S. Patents No. 7,035,740 (2005-09-29) and 7,467,117 (2006-10-05) ]\\n\"\n           \"[ Peiffer, D. A. et al. High-resolution genomic profiling of chromosomal aberrations\\n\"\n           \"  using Infinium whole-genome genotyping. Genome Res., 16, 1136–1148 (2006-08-09) ]\\n\"\n           \"[ Illumina, Inc. Illumina GenCall Data Analysis Software. Pub. No. 370-2004-009 (2004) ]\\n\"\n           \"[ Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010 (2006-09-26) ]\\n\"\n           \"[ Illumina, Inc. Improved Cluster Generation with Gentrain2. Pub. No. 037-2009-015 (2009-01-26)]\\n\"\n           \"[ Illumina, Inc. Improved Genotype Clustering with GenTrain 3.0. Pub. No. 
370-2016-015-A (2016) ]\\n\"\n           \"Usage: bcftools +idat2gtc --bpm <file> [options]\\n\"\n           \"\\n\"\n           \"Plugin options:\\n\"\n           \"    -b, --bpm <file>                    BPM manifest file\\n\"\n           \"    -e, --egt <file>                    EGT cluster file\\n\"\n           \"    -i, --idats <dir>                   IDAT files from directory\\n\"\n           \"    -g, --grn-idats <file>              file with list of green IDATs\\n\"\n           \"    -r, --red-idats <file>              file with list of red IDATs\\n\"\n           \"    -o, --output <dir>                  write output to a directory\\n\"\n           \"    -v, --gentrain-version <int>        whether to use GenTrain 2.0 (2) or GenTrain 3.0 (3) for \"\n           \"normalization [3]\\n\"\n           \"    -c, --gencall-cutoff <int>          cutoff score for GenCall algorithm [0.15]\\n\"\n           \"        --snp-map <file>                create SNP map file\\n\"\n           \"        --do-not-check-eof              do not check whether the BPM and EGT readers reach the end of the \"\n           \"file\\n\"\n           \"        --preset <int>                  Illumina AutoCall software to emulate [4]\\n\"\n           \"                                        AutoConvert (1), AutoConvert 2.0 (2), IAAP CLI (3), Array Analysis \"\n           \"CLI (4)\\n\"\n           \"GTC output files options:\\n\"\n           \"        --gtc-version <int>             whether use the old (3) or the new (5) GTC file format [5]\\n\"\n           \"        --no-sample-name                leave sample name empty if missing from IDAT files\\n\"\n           \"        --no-checksums                  do not include cluster and manifest files checksums\\n\"\n           \"        --no-imaging-date               do not include imaging date\\n\"\n           \"        --autocall-date <string>        AutoCall date format to use [\" AUTOCALL_DATE_FORMAT_DFLT\n           \"]\\n\"\n           
\"        --autocall-version <string>     AutoCall version label to use [\" AUTOCALL_VERSION_DFLT\n           \"]\\n\"\n           \"        --allow-missing-clusters        BPM manifest file variants can be missing from the EGT cluster file\"\n           \"\\n\"\n           \"Gender estimation options:\\n\"\n           \"        --gender-version <int>          whether to only use heterozygosity (1) or also intensities (2) \"\n           \"[2]\\n\"\n           \"        --min-loci <int>                minimum number of autosomal loci for gender estimation [100]\\n\"\n           \"        --max-loci <int>                maximum number of autosomal loci for gender estimation [10000]\\n\"\n           \"        --min-x-loci <int>              minimum number of X loci for gender estimation [20]\\n\"\n           \"        --min-y-loci <int>              minimum number of Y loci for gender estimation [20]\\n\"\n           \"        --call-rate-threshold <float>   threshold for autosomal call rate for gender estimation [0.0]\\n\"\n           \"        --y-threshold <float>           threshold for Y intensity for gender estimation [0.3]\\n\"\n           \"        --x-threshold <float>           threshold for X intensity for gender estimation [0.9]\\n\"\n           \"        --x-het-rate-threshold <float>  threshold for X Het Rate for gender estimation [0.1]\\n\"\n           \"\\n\"\n           \"Examples:\\n\"\n           \"    bcftools +idat2gtc --bpm GSA-24v3-0_A1.bpm --egt GSA-24v3-0_A1_ClusterFile.egt \\\\\\n\"\n           \"      5434246082_R03C01_Grn.idat 5434246082_R03C01_Red.idat\\n\"\n           \"    bcftools +idat2gtc --bpm GSA-24v3-0_A1.bpm --egt GSA-24v3-0_A1_ClusterFile.egt --snp-map \"\n           \"GSA-24v3-0_A1.bpm.csv\\n\"\n           \"    bcftools +idat2gtc --bpm GSA-24v3-0_A1.bpm --egt GSA-24v3-0_A1_ClusterFile.egt --gentrain-version 2 \"\n           \"--gtc-version 3 \\\\\\n\"\n           \"      --no-sample-name --no-checksums --no-imaging-date 
--autocall-date \\\"\\\" --autocall-version 1.6.3.1 \"\n           \"--gender-version 1\\n\"\n           \"    bcftools +idat2gtc --bpm GSA-24v3-0_A1.bpm --egt GSA-24v3-0_A1_ClusterFile.egt --no-sample-name \\\\\\n\"\n           \"      --no-checksums --autocall-date \\\"\\\" --autocall-version 2.0.1.179 --min-loci 10 --max-loci 100\\n\"\n           \"    bcftools +idat2gtc --bpm GSA-24v3-0_A1.bpm --egt GSA-24v3-0_A1_ClusterFile.egt --no-sample-name \\\\\\n\"\n           \"      --no-checksums --autocall-date \\\"\\\"\\n\"\n           \"\\n\";\n}\n\nstatic inline FILE *get_file_handle(const char *str) {\n    if (!str) return NULL;\n    FILE *ret;\n    if (strcmp(str, \"-\") == 0) {\n        ret = stdout;\n    } else {\n        ret = fopen(str, \"w\");\n        if (!ret) error(\"Failed to open %s: %s\\n\", str, strerror(errno));\n    }\n    return ret;\n}\n\n// to recapitulate the .NET behavior of ToString() I need to add a small value\n// http://stackoverflow.com/questions/2085449\n// http://stackoverflow.com/questions/11085052\n// http://stackoverflow.com/questions/14325214\nstatic double round_adjust(double x) {\n    double y = 5e-8, z;\n    if (x > 1.0) {\n        z = 1.0;\n        while (x > z) {\n            y *= 10.0;\n            z *= 10.0;\n        }\n    } else {\n        z = 0.1;\n        while (x < z) {\n            y *= 0.1;\n            z *= 0.1;\n        }\n    }\n    return x + y;\n}\n\n// this is the same file that can be generated by AutoConvert, AutoConvert 2.0 or by Picard\n// BpmToNormalizationManifestCsv most likely this file was generated by AutoConvert to allow other software such as\n// Illuminus, GenoSNP, Birdseed, optiCall, zCall, and iCall, to normalize intensities across sub-bead pools\n// http://gatk.broadinstitute.org/hc/en-us/articles/360057440631-BpmToNormalizationManifestCsv-Picard\n// a separate implementation from Illumina can be found in function CreateSNPMapFile from class AutoCallPollerThread\nstatic void snp_map_write(const 
bpm_t *bpm, const egt_t *egt, const char *fn) {\n    int i;\n    FILE *out_txt = get_file_handle(fn);\n    fprintf(out_txt, \"Index,Name,Chromosome,Position,GenTrain Score,SNP,ILMN Strand,Customer Strand,NormID\\n\");\n    for (i = 0; i < bpm->num_loci; i++) {\n        LocusEntry *locus_entry = &bpm->locus_entries[i];\n        const char *chrom =\n            strncasecmp(locus_entry->chrom, \"CHR\", 3) == 0 ? locus_entry->chrom + 3 : locus_entry->chrom;\n        double gentrain_score = NAN;\n        if (egt) {\n            int idx;\n            int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx);\n            if (ret < 0) error(\"Illumina probe %s not found in cluster file\\n\", locus_entry->name);\n            ClusterRecord *cluster_record = &egt->cluster_records[idx];\n            gentrain_score = round_adjust(cluster_record->cluster_score.total_score);\n        }\n        fprintf(out_txt, \"%d,%s,%s,%s,%.4f,%s,%s,%s,%d\\n\", locus_entry->index, locus_entry->name, chrom,\n                locus_entry->map_info, gentrain_score, locus_entry->snp, locus_entry->ilmn_strand,\n                locus_entry->source_strand, locus_entry->norm_id);\n    }\n    if (out_txt != stdout && out_txt != stderr) fclose(out_txt);\n}\n\nvoid mkdir_p(const char *fmt, ...);\n\nint run(int argc, char *argv[]) {\n    const char *bpm_fname = NULL;\n    const char *egt_fname = NULL;\n    const char *snp_map_fname = NULL;\n    const char *idat_pathname = NULL;\n    const char *grn_idat_fname = NULL;\n    const char *red_idat_fname = NULL;\n    const char *output_pathname = \".\";\n    const char *autocall_date_format = AUTOCALL_DATE_FORMAT_DFLT;\n    const char *autocall_version = AUTOCALL_VERSION_DFLT;\n    char *tmp;\n    int gentrain_version = 3;\n    float gencall_cutoff = 0.15;\n    int eof_check = 1;\n    int preset = 4;\n    int gtc_file_version = 5;\n    int sample_name = 1;\n    int checksums = 1;\n    int imaging_date = 1;\n    int allow_missing_clusters = 0;\n    
gender_t gender;\n    gender.version = 2;                // 1 in AutoConvert\n    gender.min_loci = 100;             // version 2\n    gender.max_loci = 10000;           // version 2 for downsampling\n    gender.min_x_loci = 20;            // shared between version 1 and 2\n    gender.min_y_loci = 20;            // version 2\n    gender.call_rate_threshold = 0.0;  // changed from 0.97\n    gender.y_threshold = 0.3;          // version 2\n    gender.x_threshold = 0.9;          // version 2\n    gender.x_het_rate_threshold = 0.1; // version 1\n\n    static struct option loptions[] = {{\"bpm\", required_argument, NULL, 'b'},\n                                       {\"egt\", required_argument, NULL, 'e'},\n                                       {\"idats\", required_argument, NULL, 'i'},\n                                       {\"grn-idats\", required_argument, NULL, 'g'},\n                                       {\"red-idats\", required_argument, NULL, 'r'},\n                                       {\"output\", required_argument, NULL, 'o'},\n                                       {\"gentrain-version\", required_argument, NULL, 'v'},\n                                       {\"gencall-cutoff\", required_argument, NULL, 'c'},\n                                       {\"snp-map\", required_argument, NULL, 1},\n                                       {\"do-not-check-eof\", no_argument, NULL, 2},\n                                       {\"preset\", required_argument, NULL, 3},\n                                       {\"gtc-version\", required_argument, NULL, 4},\n                                       {\"no-sample-name\", no_argument, NULL, 5},\n                                       {\"no-cheksums\", no_argument, NULL, 6},\n                                       {\"no-imaging-date\", no_argument, NULL, 7},\n                                       {\"autocall-date\", required_argument, NULL, 8},\n                                       {\"autocall-version\", required_argument, 
NULL, 9},\n                                       {\"allow-missing-clusters\", no_argument, NULL, 10},\n                                       {\"gender-version\", no_argument, NULL, 11},\n                                       {\"min-loci\", no_argument, NULL, 12},\n                                       {\"max-loci\", no_argument, NULL, 13},\n                                       {\"min-x-loci\", no_argument, NULL, 14},\n                                       {\"min-y-loci\", no_argument, NULL, 15},\n                                       {\"call-rate-threshold\", no_argument, NULL, 16},\n                                       {\"y-threshold\", no_argument, NULL, 17},\n                                       {\"x-threshold\", no_argument, NULL, 18},\n                                       {\"x-het-rate-threshold\", no_argument, NULL, 19},\n                                       {NULL, 0, NULL, 0}};\n    int c;\n    while ((c = getopt_long(argc, argv, \"h?b:e:i:g:r:o:v:c:\", loptions, NULL)) >= 0) {\n        switch (c) {\n        case 'b':\n            bpm_fname = optarg;\n            break;\n        case 'e':\n            egt_fname = optarg;\n            break;\n        case 'i':\n            idat_pathname = optarg;\n            break;\n        case 'g':\n            grn_idat_fname = optarg;\n            break;\n        case 'r':\n            red_idat_fname = optarg;\n            break;\n        case 'o':\n            output_pathname = optarg;\n            break;\n        case 'v':\n            gentrain_version = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --gentrain-version %s\\n\", optarg);\n            if (gentrain_version != 2 && gentrain_version != 3)\n                error(\"The --gentrain-version option only allows values 2, and 3\\n%s\", usage_text());\n            break;\n        case 'c':\n            gencall_cutoff = strtof(optarg, &tmp);\n            if (*tmp) error(\"Could not parse: --gencall-cutoff %s\\n\", optarg);\n   
         break;\n        case 1:\n            snp_map_fname = optarg;\n            break;\n        case 2:\n            eof_check = 0;\n            break;\n        case 3:\n            preset = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --preset %s\\n\", optarg);\n            if (preset < 1 || preset > 4)\n                error(\"The --preset option only allows values 1, 2, 3, and 4\\n%s\", usage_text());\n            switch (preset) {\n            case 1:\n                gentrain_version = 2;\n                gender.version = 1;\n                gtc_file_version = 3;\n                sample_name = 0;\n                checksums = 0;\n                imaging_date = 0;\n                autocall_version = \"1.6.3.1\";\n                break;\n            case 2:\n                gentrain_version = 3;\n                gender.version = 2;\n                gender.call_rate_threshold = 0.97;\n                gtc_file_version = 5;\n                sample_name = 0;\n                checksums = 0;\n                imaging_date = 1;\n                autocall_version = \"2.0.1.179\";\n                break;\n            case 3:\n                gentrain_version = 3;\n                gender.version = 2;\n                // we did not reimplement the bug of estimating the autosomal call rate including loci with 0 cluster\n                // scores as missing\n                gender.call_rate_threshold = 0.97;\n                gtc_file_version = 5;\n                sample_name = 0;\n                checksums = 0;\n                imaging_date = 1;\n                autocall_version = \"3.0.0\";\n                break;\n            case 4:\n                gentrain_version = 3;\n                gender.version = 2;\n                gender.call_rate_threshold = 0.97;\n                gtc_file_version = 5;\n                sample_name = 1;\n                checksums = 1;\n                imaging_date = 1;\n                autocall_version = \"3.0.0\";\n  
              break;\n            }\n            break;\n        case 4:\n            gtc_file_version = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --gtc-version %s\\n\", optarg);\n            if (gtc_file_version != 3 && gtc_file_version != 5)\n                error(\"The --gtc-version option only allows values 3, and 5\\n%s\", usage_text());\n            break;\n        case 5:\n            sample_name = 0;\n            break;\n        case 6:\n            checksums = 0;\n            break;\n        case 7:\n            imaging_date = 0;\n            break;\n        case 8:\n            autocall_date_format = optarg;\n            break;\n        case 9:\n            autocall_version = optarg;\n            break;\n        case 10:\n            allow_missing_clusters = 1;\n            break;\n        case 11:\n            gender.version = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --gender-version %s\\n\", optarg);\n            if (gender.version != 1 && gender.version != 2)\n                error(\"The --gender-version option only allows values 1 and 2\\n%s\", usage_text());\n            break;\n        case 12:\n            gender.min_loci = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --min-loci %s\\n\", optarg);\n            break;\n        case 13:\n            gender.max_loci = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --max-loci %s\\n\", optarg);\n            break;\n        case 14:\n            gender.min_x_loci = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --min-x-loci %s\\n\", optarg);\n            break;\n        case 15:\n            gender.min_y_loci = strtol(optarg, &tmp, 0);\n            if (*tmp) error(\"Could not parse: --min-y-loci %s\\n\", optarg);\n            break;\n        case 16:\n            gender.call_rate_threshold = strtof(optarg, &tmp);\n            if (*tmp) error(\"Could not parse: 
--call-rate-threshold %s\\n\", optarg);\n            break;\n        case 17:\n            gender.y_threshold = strtof(optarg, &tmp);\n            if (*tmp) error(\"Could not parse: --y-threshold %s\\n\", optarg);\n            break;\n        case 18:\n            gender.x_threshold = strtof(optarg, &tmp);\n            if (*tmp) error(\"Could not parse: --x-threshold %s\\n\", optarg);\n            break;\n        case 19:\n            gender.x_het_rate_threshold = strtof(optarg, &tmp);\n            if (*tmp) error(\"Could not parse: --x-het-rate-threshold %s\\n\", optarg);\n            break;\n        case 'h':\n        case '?':\n        default:\n            error(\"%s\", usage_text());\n        }\n    }\n    if (bpm_fname == NULL) error(\"The --bpm option is required\\n%s\", usage_text());\n    if (idat_pathname != NULL && (grn_idat_fname != NULL || red_idat_fname != NULL))\n        error(\"Cannot use option --idats with either option --grn-idats or --red-idats\\n%s\", usage_text());\n    if (grn_idat_fname != NULL && red_idat_fname == NULL)\n        error(\"Option --grn-idats requires option --red-idats\\n%s\", usage_text());\n    if (grn_idat_fname == NULL && red_idat_fname != NULL)\n        error(\"Option --red-idats requires option --grn-idats\\n%s\", usage_text());\n    if (idat_pathname == NULL && grn_idat_fname == NULL && red_idat_fname == NULL) {\n        if (snp_map_fname == NULL && argc - optind == 0) error(\"No IDAT files provided as input\\n%s\", usage_text());\n        if (argc - optind % 2 == 1)\n            error(\n                \"If options --idats/--grn-idats/--red-idats are not used, input an alternating list of green and red \"\n                \"IDATs\\n%s\",\n                usage_text());\n    }\n\n    fprintf(stderr, \"idat2gtc \" IDAT2GTC_VERSION \" http://github.com/freeseek/gtc2vcf\\n\");\n    fprintf(stderr, \"Using normalization algorithm version %s\\n\", gentrain_version == 2 ? 
\"1.1.2\" : \"1.2.0\");\n\n    if (strcmp(output_pathname, \".\") != 0) mkdir_p(\"%s/\", output_pathname);\n\n    // read SNP manifest file\n    fprintf(stderr, \"Reading BPM file %s\\n\", bpm_fname);\n    bpm_t *bpm = bpm_init(bpm_fname, eof_check, 0, checksums);\n\n    // read cluster file\n    egt_t *egt = NULL;\n    if (egt_fname) {\n        fprintf(stderr, \"Reading EGT file %s\\n\", egt_fname);\n        egt = egt_init(egt_fname, eof_check, checksums);\n        if (!strcmp(egt->normalization_version, \"1.2.0\")) {\n            if (gentrain_version != 3)\n                fprintf(stderr, \"Normalization algorithm version %s for cluster file %s corresponds to GenTrain 3.0\\n\",\n                        egt->normalization_version, egt->fn);\n        } else if (!strcmp(egt->normalization_version, \"1.1.2\")) {\n            if (gentrain_version != 2)\n                fprintf(stderr, \"Normalization algorithm version %s for cluster file %s corresponds to GenTrain 2.0\\n\",\n                        egt->normalization_version, egt->fn);\n        } else if (!strcmp(egt->normalization_version, \"1.1.0\")) {\n            if (gentrain_version != 1)\n                fprintf(stderr, \"Normalization algorithm version %s for cluster file %s corresponds to GenTrain 1.0\\n\",\n                        egt->normalization_version, egt->fn);\n        } else {\n            fprintf(stderr, \"Normalization algorithm version %s for cluster file %s is not recognized\\n\",\n                    egt->normalization_version, egt->fn);\n        }\n    } else {\n        fprintf(stderr, \"No cluster file specified or forcing no cluster use\\n\");\n        if (!gentrain_version) gentrain_version = 3;\n    }\n\n    // write SNP map file if requested\n    if (snp_map_fname) snp_map_write(bpm, egt, snp_map_fname);\n\n    // generate lists of green and red IDATs to process\n    int i, n = 0;\n    char **grn_idats = NULL;\n    char **red_idats = NULL;\n    if (idat_pathname != NULL) {\n        // this 
code for now does not recursively looks for IDAT files\n        DIR *d = opendir(idat_pathname);\n        if (!d) error(\"Failed to open directory %s\\n\", idat_pathname);\n        struct dirent *dir;\n        int m_grn = 0;\n        int m_red = 0;\n        int p = strlen(idat_pathname);\n        grn_idats = NULL;\n        red_idats = NULL;\n        while ((dir = readdir(d))) {\n            char *ptr = strstr(dir->d_name, \"_Grn.idat\");\n            if (!ptr) continue;\n            hts_expand0(char *, n + 1, m_grn, grn_idats);\n            hts_expand0(char *, n + 1, m_red, red_idats);\n            int q = strlen(dir->d_name);\n            grn_idats[n] = (char *)malloc((p + q + 2) * sizeof(char));\n            memcpy(grn_idats[n], idat_pathname, p);\n            grn_idats[n][p] = '/';\n            memcpy(&grn_idats[n][p + 1], dir->d_name, q + 1);\n            dir->d_name[q - 8] = 'R';\n            dir->d_name[q - 7] = 'e';\n            dir->d_name[q - 6] = 'd';\n            red_idats[n] = (char *)malloc((p + q + 2) * sizeof(char));\n            memcpy(red_idats[n], idat_pathname, p);\n            red_idats[n][p] = '/';\n            memcpy(&red_idats[n][p + 1], dir->d_name, q + 1);\n            n++;\n        }\n        closedir(d);\n\n    } else if (grn_idat_fname != NULL && red_idat_fname != NULL) {\n        grn_idats = hts_readlines(grn_idat_fname, &n);\n        int n_check;\n        red_idats = hts_readlines(red_idat_fname, &n_check);\n        if (n != n_check)\n            error(\"File %s contains %d filenames while file %s contains %d filenames\\n\", grn_idat_fname, n,\n                  red_idat_fname, n_check);\n    } else if (argc > optind) {\n        n = (argc - optind) / 2;\n        grn_idats = (char **)malloc(n * sizeof(char *));\n        red_idats = (char **)malloc(n * sizeof(char *));\n        for (i = 0; i < n; i++) {\n            grn_idats[i] = argv[optind++];\n            red_idats[i] = argv[optind++];\n        }\n    }\n\n    if (n > 0) {\n        
if (egt) {\n            fprintf(stderr, \"Using genotyping algorithm version %s\\n\", gentrain_version == 2 ? \"6.3.0\" : \"7.0.0\");\n            fprintf(stderr, \"Gender estimation parameters\\n\");\n            fprintf(stderr, \"\\tVersion: %d\\n\", gender.version);\n            fprintf(stderr, \"\\tMinX_Loci: %d\\n\", gender.min_x_loci);\n            fprintf(stderr, \"\\tX_HetRateThreshold: %f\\n\", gender.x_het_rate_threshold);\n            fprintf(stderr, \"\\tMinAutosomalLoci: %d\\n\", gender.min_loci);\n            fprintf(stderr, \"\\tMaxAutosomalLoci: %d\\n\", gender.max_loci);\n            fprintf(stderr, \"\\tMinY_Loci: %d\\n\", gender.min_y_loci);\n            fprintf(stderr, \"\\tAutosomalCallRateThreshold: %f\\n\", gender.call_rate_threshold);\n            fprintf(stderr, \"\\tX_IntensityThreshold: %f\\n\", gender.x_threshold);\n            fprintf(stderr, \"\\tY_IntensityThreshold: %f\\n\", gender.y_threshold);\n        }\n        DIR *d = opendir(output_pathname);\n        if (!d) error(\"Failed to open directory %s\\n\", output_pathname);\n        kstring_t gtc_fname = {0, 0, NULL};\n        for (i = 0; i < n; i++) {\n            fprintf(stderr, \"Reading GRN IDAT file %s\\n\", grn_idats[i]);\n            idat_t *grn_idat = idat_init(grn_idats[i], 1);\n            fprintf(stderr, \"Reading RED IDAT file %s\\n\", red_idats[i]);\n            idat_t *red_idat = idat_init(red_idats[i], 1);\n            gtc_t *gtc = gtc_init(grn_idat, red_idat, bpm, egt, gentrain_version, gtc_file_version, gencall_cutoff,\n                                  sample_name, checksums, imaging_date, autocall_date_format, autocall_version,\n                                  allow_missing_clusters, &gender);\n            const char *ptr = strstr(grn_idats[i], \"_Grn.idat\");\n            if (!ptr) ptr = strstr(grn_idats[i], \".idat\");\n            const char *ptr2 = strrchr(grn_idats[i], '/');\n            if (ptr2)\n                ptr2++;\n            else\n                
ptr2 = grn_idats[i];\n            ksprintf(&gtc_fname, \"%s/%.*s.gtc\", output_pathname, (int)(ptr ? ptr - ptr2 : strlen(ptr2)), ptr2);\n            idat_destroy(grn_idat);\n            idat_destroy(red_idat);\n            fprintf(stderr, \"Writing GTC file %s\\n\", gtc_fname.s);\n            if (gtc_write(gtc, gtc_fname.s, gtc_file_version) < 0) error(\"Failed to write GTC file: %s\\n\", gtc_fname.s);\n            gtc_destroy(gtc);\n            gtc_fname.l = 0;\n        }\n        free(gtc_fname.s);\n        closedir(d);\n    }\n\n    if (idat_pathname != NULL || grn_idat_fname != NULL || red_idat_fname != NULL) {\n        for (i = 0; i < n; i++) {\n            free(grn_idats[i]);\n            free(red_idats[i]);\n        }\n    }\n    free(grn_idats);\n    free(red_idats);\n\n    bpm_destroy(bpm);\n    egt_destroy(egt);\n\n    return 0;\n}\n"
  },
  {
    "path": "nearest_neighbor.c",
    "content": "/* The MIT License\n\n   Copyright (c) 2018 Giulio Genovese\n\n   Author: Giulio Genovese <giulio.genovese@gmail.com>\n\n   Permission is hereby granted, free of charge, to any person obtaining a copy\n   of this software and associated documentation files (the \"Software\"), to deal\n   in the Software without restriction, including without limitation the rights\n   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n   copies of the Software, and to permit persons to whom the Software is\n   furnished to do so, subject to the following conditions:\n\n   The above copyright notice and this permission notice shall be included in\n   all copies or substantial portions of the Software.\n\n   THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n   THE SOFTWARE.\n\n */\n\n#include <stdlib.h>\n\nint elementsInBin[12];\nint *binData[12];\nint elementsInShiftedBin[11];\nint *binDataShifted[11];\n\nint findClosestSitesToPointsAlongAxis(int n_raw, float *raw_x, float *raw_y, int n_axis, float *axis_x, float *axis_y,\n                                      int *ret) {\n    int i;\n    float *raw_a = NULL;\n    float *raw_b = NULL;\n    float *axis_a = NULL;\n    float axis_max_val;\n    float bin_width;\n    int bin_idx;\n    float quotient;\n    float reminder;\n    int *curr_bin_data;\n    int curr_bin_size;\n    float curr_axis_x;\n    float curr_axis_y;\n    float x_dist;\n    float y_dist;\n    double best_val;\n    int best_idx;\n    int j;\n    int curr_idx;\n    double sq_dist;\n    double axis_max_dist;\n    int use_y = 
1;\n    int use_x = 1;\n\n    for (i = 0; i < n_axis; i++) {\n        if (axis_x[i] > 0.0001) {\n            use_y = 0;\n            break;\n        }\n    }\n\n    for (i = 0; i < n_axis; i++) {\n        if (axis_y[i] > 0.0001) {\n            use_x = 0;\n            break;\n        }\n    }\n\n    if (use_y) {\n        raw_a = raw_y;\n        raw_b = raw_x;\n        axis_a = axis_y;\n    } else if (use_x) {\n        raw_a = raw_x;\n        raw_b = raw_y;\n        axis_a = axis_x;\n    } else {\n        return -1;\n    }\n\n    axis_max_val = axis_a[n_axis - 1];\n    bin_width = axis_max_val / 12.0f;\n    axis_max_dist = (double)bin_width;\n\n    for (i = 0; i < n_raw; i++) {\n        if ((double)raw_b[i] > axis_max_dist) continue;\n        bin_idx = (int)(raw_a[i] / bin_width);\n        if (bin_idx < 0) bin_idx = 0;\n        if (bin_idx > 11) bin_idx = 11;\n        elementsInBin[bin_idx]++;\n        bin_idx = (int)(raw_a[i] / bin_width - 0.5f);\n        if (bin_idx < 0) bin_idx = 0;\n        if (bin_idx > 10) bin_idx = 10;\n        elementsInShiftedBin[bin_idx]++;\n    }\n\n    for (i = 0; i <= 11; i++) {\n        binData[i] = (int *)malloc((size_t)elementsInBin[i] * sizeof(int));\n        elementsInBin[i] = 0;\n        if (i == 11) continue;\n        binDataShifted[i] = (int *)malloc((size_t)elementsInShiftedBin[i] * sizeof(int));\n        elementsInShiftedBin[i] = 0;\n    }\n\n    for (i = 0; i < n_raw; i++) {\n        if ((double)raw_b[i] > axis_max_dist) continue;\n        bin_idx = (int)(raw_a[i] / bin_width);\n        if (bin_idx < 0) bin_idx = 0;\n        if (bin_idx > 11) bin_idx = 11;\n        binData[bin_idx][elementsInBin[bin_idx]] = i;\n        elementsInBin[bin_idx]++;\n        bin_idx = (int)(raw_a[i] / bin_width - 0.5f);\n        if (bin_idx < 0) bin_idx = 0;\n        if (bin_idx > 10) bin_idx = 10;\n        binDataShifted[bin_idx][elementsInShiftedBin[bin_idx]] = i;\n        elementsInShiftedBin[bin_idx]++;\n    }\n\n    for (i = 0; i < n_axis; 
i++) {\n        quotient = axis_a[i] / bin_width;\n        bin_idx = (int)quotient;\n        reminder = quotient - (float)bin_idx;\n        curr_bin_data = NULL;\n        curr_bin_size = 0;\n        if (bin_idx < 0) bin_idx = 0;\n        if (bin_idx > 11) bin_idx = 11;\n\n        if (0.25f <= reminder && reminder <= 0.75f) {\n            curr_bin_data = binData[bin_idx];\n            curr_bin_size = elementsInBin[bin_idx];\n        } else {\n            if (reminder < 0.25f) {\n                if (bin_idx == 0) {\n                    curr_bin_data = binData[bin_idx];\n                    curr_bin_size = elementsInBin[bin_idx];\n                } else {\n                    curr_bin_data = binDataShifted[bin_idx - 1];\n                    curr_bin_size = elementsInShiftedBin[bin_idx - 1];\n                }\n            } else if (bin_idx == 11) {\n                curr_bin_data = binData[bin_idx];\n                curr_bin_size = elementsInBin[bin_idx];\n            } else {\n                curr_bin_data = binDataShifted[bin_idx];\n                curr_bin_size = elementsInShiftedBin[bin_idx];\n            }\n        }\n\n        curr_axis_x = axis_x[i];\n        curr_axis_y = axis_y[i];\n        best_val = 1e20;\n        best_idx = -1;\n\n        for (j = 0; j < curr_bin_size; j++) {\n            curr_idx = curr_bin_data[j];\n            x_dist = raw_x[curr_idx] - curr_axis_x;\n            y_dist = raw_y[curr_idx] - curr_axis_y;\n            sq_dist = (double)(x_dist * x_dist + y_dist * y_dist);\n            if (sq_dist < best_val) {\n                best_val = sq_dist;\n                best_idx = curr_idx;\n            }\n        }\n\n        ret[i] = best_idx;\n    }\n\n    for (i = 0; i <= 11; i++) {\n        free((void *)binData[i]);\n        elementsInBin[i] = 0;\n        if (i > 10) continue;\n        free((void *)binDataShifted[i]);\n        elementsInShiftedBin[i] = 0;\n    }\n\n    return 0;\n}\n"
  }
]