Repository: freeseek/gtc2vcf Branch: master Commit: cc4898976c11 Files: 11 Total size: 607.0 KB Directory structure: gitextract_37oi3chf/ ├── BAFregress.c ├── HapMap.md ├── Illumina.md ├── LICENSE ├── README.md ├── affy2vcf.c ├── gtc2vcf.c ├── gtc2vcf.h ├── gtc2vcf_plot.R ├── idat2gtc.c └── nearest_neighbor.c ================================================ FILE CONTENTS ================================================ ================================================ FILE: BAFregress.c ================================================ /* The MIT License Copyright (C) 2024-2025 Giulio Genovese Author: Giulio Genovese Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include #include #include #include #include #include #include "bcftools.h" #define BAFREGRESS_VERSION "2025-08-19" #define GT_NC 0 #define GT_AA 1 #define GT_AB 2 #define GT_BB 3 KSORT_INIT_GENERIC(float) /****************************************** * PLUGIN * ******************************************/ inline static double sqr(double x) { return x * x; } const char *about(void) { return "Detects and estimates sample contamination using BAF intensity data.\n"; } static const char *usage_text(void) { return "\n" "About: Detects and estimates sample contamination. (version " BAFREGRESS_VERSION " http://github.com/freeseek/gtc2vcf)\n" "[ Jun, G. et al. Detecting and Estimating Contamination of Human DNA Samples in Sequencing\n" "and Array-Based Genotype Data. AJHG 91, 839-848 (2012) http://doi.org/10.1016/j.ajhg.2012.09.004 ]\n" "\n" "Usage: bcftools +BAFregress [options] \n" "\n" "Plugin options:\n" " --threshold minimum allele frequency for BAF regression [0.1]\n" " -a, --af file with allele frequency information\n" " --tag allele frequency INFO tag [AC/AN]\n" " --adjust-BAF minimum number of genotypes for a cluster to median adjust BAF (-1 for " "no adjustment) [5]\n" " --truncate-BAF truncates BAF values between 0 and 1 and turns off adjustment to " "recover original behavior\n" " --use-MAF uses minor allele frequency rather than A/B allele frequency to " "recover original behavior\n" " -e, --estimates write BAF regression estimates to a file [standard output]\n" " -o, --output write VCF output to a file\n" " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level " "[v]\n" " -r, --regions restrict to comma-separated list of regions\n" " -R, --regions-file restrict to regions listed in a file\n" " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant " "overlaps (2) [1]\n" " -t, --targets [^] similar to -r but streams rather than index-jumps. 
Exclude regions " "with \"^\" prefix\n" " -T, --targets-file [^] similar to -R but streams rather than index-jumps. Exclude regions " "with \"^\" prefix\n" " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant " "overlaps (2) [0]\n" " --threads number of extra output compression threads [0]\n" " -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" " "prefix)\n" " -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n" " --force-samples only warn about unknown subset samples\n" " -W, --write-index[=FMT] Automatically index the output files [off]\n" "\n" "Example:\n" " bcftools +BAFregress file.bcf\n" " bcftools +BAFregress --tag AF file.bcf\n" " bcftools +BAFregress --af 1kGP_high_coverage_Illumina.sites.bcf file.bcf\n" " bcftools +BAFregress --af 1kGP_high_coverage_Illumina.sites.bcf --truncate-BAF --use-MAF file.bcf\n" "\n"; } int run(int argc, char **argv) { float af_threshold = 0.1; char *af_fname = NULL; char *af_tag = NULL; int adj_baf = 5; int truncate_baf = 0; int use_maf = 0; char *estimate_fname = "-"; char *output_fname = NULL; int output_type = FT_VCF; int clevel = -1; int regions_overlap = 1; int targets_overlap = 0; int n_threads = 0; char *targets_list = NULL; int targets_is_file = 0; char *regions_list = NULL; int regions_is_file = 0; char *sample_names = NULL; int sample_is_file = 0; int force_samples = 0; int write_index = 0; char *index_fname; htsFile *out_fh = NULL; static struct option loptions[] = {{"threshold", required_argument, NULL, 1}, {"af", required_argument, NULL, 'a'}, {"tag", required_argument, NULL, 2}, {"adjust-BAF", required_argument, NULL, 3}, {"truncate-BAF", no_argument, NULL, 4}, {"use-MAF", no_argument, NULL, 5}, {"estimates", required_argument, NULL, 'e'}, {"output", required_argument, NULL, 'o'}, {"output-type", required_argument, NULL, 'O'}, {"threads", required_argument, NULL, 6}, {"regions", required_argument, NULL, 'r'}, 
{"regions-file", required_argument, NULL, 'R'}, {"regions-overlap", required_argument, NULL, 7}, {"targets", required_argument, NULL, 't'}, {"targets-file", required_argument, NULL, 'T'}, {"targets-overlap", required_argument, NULL, 8}, {"samples", required_argument, NULL, 's'}, {"samples-file", required_argument, NULL, 'S'}, {"force-samples", no_argument, NULL, 9}, {"write-index", optional_argument, NULL, 'W'}, {0, 0, 0, 0}}; int c; char *tmp; while ((c = getopt_long(argc, argv, "h?a:e:o:O:r:R:t:T:s:S:", loptions, NULL)) >= 0) { switch (c) { case 1: af_threshold = strtof(optarg, &tmp); if (*tmp) error("Could not parse: --threshold %s\n", optarg); if (af_threshold <= 0.0 || af_threshold >= 1.0) error("--threshold must input a value between 0 and 1\n"); break; case 'a': af_fname = optarg; break; case 2: af_tag = optarg; break; case 3: adj_baf = (int)strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --adjust-BAF %s\n", optarg); break; case 4: truncate_baf = 1; break; case 5: use_maf = 1; break; case 'e': estimate_fname = optarg; break; case 'o': output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': output_type = FT_BCF_GZ; break; case 'u': output_type = FT_BCF; break; case 'z': output_type = FT_VCF_GZ; break; case 'v': output_type = FT_VCF; break; default: { clevel = strtol(optarg, &tmp, 10); if (*tmp || clevel < 0 || clevel > 9) error("The output type \"%s\" not recognised\n", optarg); } }; if (optarg[1]) { clevel = strtol(optarg + 1, &tmp, 10); if (*tmp || clevel < 0 || clevel > 9) error("Could not parse argument: --compression-level %s\n", optarg + 1); } break; case 6: n_threads = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse argument: --threads %s\n", optarg); break; case 'r': regions_list = optarg; break; case 'R': regions_list = optarg; regions_is_file = 1; break; case 7: if (!strcasecmp(optarg, "0")) regions_overlap = 0; else if (!strcasecmp(optarg, "1")) regions_overlap = 1; else if (!strcasecmp(optarg, "2")) 
regions_overlap = 2; else error("Could not parse: --regions-overlap %s\n", optarg); break; case 't': targets_list = optarg; break; case 'T': targets_list = optarg; targets_is_file = 1; break; case 8: if (!strcasecmp(optarg, "0")) targets_overlap = 0; else if (!strcasecmp(optarg, "1")) targets_overlap = 1; else if (!strcasecmp(optarg, "2")) targets_overlap = 2; else error("Could not parse: --targets-overlap %s\n", optarg); break; case 's': sample_names = optarg; break; case 'S': sample_names = optarg; sample_is_file = 1; break; case 9: force_samples = 1; break; case 'W': if (!(write_index = write_index_parse(optarg))) error("Unsupported index format '%s'\n", optarg); break; case 'h': case '?': default: error("%s", usage_text()); break; } } if (truncate_baf) adj_baf = -1; char *input_fname = NULL; if (optind == argc) { if (!isatty(fileno((FILE *)stdin))) { input_fname = "-"; // reading from stdin } else { error("%s", usage_text()); } } else if (optind + 1 != argc) { error("%s", usage_text()); } else { input_fname = argv[optind]; } bcf_srs_t *srs = bcf_sr_init(); if (af_fname) { bcf_sr_set_opt(srs, BCF_SR_REQUIRE_IDX); bcf_sr_set_opt(srs, BCF_SR_PAIR_LOGIC, BCF_SR_PAIR_EXACT); } if (regions_list) { bcf_sr_set_opt(srs, BCF_SR_REGIONS_OVERLAP, regions_overlap); if (bcf_sr_set_regions(srs, regions_list, regions_is_file) < 0) error("Failed to read the regions: %s\n", regions_list); } if (targets_list) { bcf_sr_set_opt(srs, BCF_SR_TARGETS_OVERLAP, targets_overlap); if (bcf_sr_set_targets(srs, targets_list, targets_is_file, 0) < 0) error("Failed to read the targets: %s\n", targets_list); } if (bcf_sr_set_threads(srs, n_threads) < 0) error("Failed to create threads\n"); if (!bcf_sr_add_reader(srs, input_fname)) error("Failed to open %s: %s\n", input_fname, bcf_sr_strerror(srs->errnum)); if (af_fname && !bcf_sr_add_reader(srs, af_fname)) error("Failed to open %s: %s\n", af_fname, bcf_sr_strerror(srs->errnum)); bcf_hdr_t *hdr = bcf_sr_get_header(srs, 0); bcf_hdr_t *af_hdr = 
af_fname ? bcf_sr_get_header(srs, 1) : NULL; if (sample_names) { int ret = bcf_hdr_set_samples(hdr, sample_names, sample_is_file); if (ret < 0) error("Error parsing the list of samples: %s\n", sample_names); else if (force_samples && ret > 0) error("Sample name mismatch: sample #%d not found in the header\n", ret); } // get IDs for all VCF formats int gt_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GT"); if (gt_id < 0) error("Format GT was not found in the input header\n"); int baf_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "BAF"); if (baf_id < 0) error("Format BAF was not found in the input header\n"); int allele_a_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "ALLELE_A"); if (allele_a_id < 0) error("Format ALLELE_A was not found in the input header\n"); int allele_b_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "ALLELE_B"); if (allele_b_id < 0) error("Format ALLELE_B was not found in the input header\n"); int af_id = -1; if (af_tag) { af_id = bcf_hdr_id2int(af_hdr ? af_hdr : hdr, BCF_DT_ID, af_tag); if (af_id < 0) error("Format %s was not found in the allele frequency header\n", af_tag); } FILE *est_fh = strcmp("-", estimate_fname) ? fopen(estimate_fname, "w") : stdout; if (!est_fh) error("Error: cannot write to %s\n", estimate_fname); // output VCF if (output_fname) { char wmode[8]; set_wmode(wmode, output_type, output_fname, clevel); out_fh = hts_open(output_fname, wmode); if (out_fh == NULL) error("[%s] Error: cannot write to \"%s\": %s\n", __func__, output_fname, strerror(errno)); if (n_threads) hts_set_opt(out_fh, HTS_OPT_THREAD_POOL, srs->p); if (bcf_hdr_write(out_fh, hdr) < 0) error("Unable to write to output VCF file\n"); if (init_index2(out_fh, hdr, output_fname, &index_fname, write_index) < 0) error("Error: failed to initialise index for %s\n", output_fname); } int n_smpls = bcf_hdr_nsamples(hdr); if (!af_hdr && !af_tag && n_smpls < 30) fprintf( stderr, "Input VCF only includes %d samples. 
We recommend using a separate VCF to infer marker allele frequency\n", n_smpls); int *arr = NULL; int marr = 0; float *baf_arr = NULL; int nbaf_arr = 0; int8_t *gts = (int8_t *)calloc(n_smpls, sizeof(int8_t)); float *tmp_arr = (float *)calloc(n_smpls, sizeof(float)); float *sumx2 = (float *)calloc(n_smpls, sizeof(float)); float *sumxy = (float *)calloc(n_smpls, sizeof(float)); float *sumx = (float *)calloc(n_smpls, sizeof(float)); float *sumy = (float *)calloc(n_smpls, sizeof(float)); int *n = (int *)calloc(n_smpls, sizeof(int)); // run through each record present in both VCFs int i, j; while (bcf_sr_next_line(srs)) { bcf1_t *line = bcf_sr_get_line(srs, 0); if (!line) continue; if (out_fh && bcf_write1(out_fh, hdr, line) != 0) error("[%s] Error: cannot write to %s\n", __func__, output_fname); bcf1_t *af_line = af_hdr ? bcf_sr_get_line(srs, 1) : line; if (line->n_allele != 2 || !af_line || af_line->n_allele != 2) continue; // skip lines where the allele frequency is less than 0.01 (or greater than 0.99) double af; if (af_tag) { bcf_info_t *af_info = bcf_get_info_id(af_line, af_id); af = af_info ? (double)af_info->v1.f : NAN; } else { hts_expand(int, af_line->n_allele, marr, arr); int ret = bcf_calc_ac(af_hdr ? af_hdr : hdr, af_line, arr, BCF_UN_INFO | BCF_UN_FMT); if (ret <= 0) continue; int an = 0; for (i = 0; i < af_line->n_allele; i++) an += arr[i]; af = (double)arr[1] / (double)an; } if (isnan(af) || af < af_threshold || af > 1.0 - af_threshold) continue; if (use_maf && af > 0.5) af = 1.0 - af; // uses MAF instead of AF to avoid problems with flipped Illumina probes // skip lines where ALLELE_A and ALLELE_B refer to alleles missing from the record (it should not happen) bcf_info_t *allele_a_info = bcf_get_info_id(line, allele_a_id); int8_t allele_a = allele_a_info ? (int8_t)allele_a_info->v1.i : bcf_int8_missing; bcf_info_t *allele_b_info = bcf_get_info_id(line, allele_b_id); int8_t allele_b = allele_b_info ? 
(int8_t)allele_b_info->v1.i : bcf_int8_missing; if (allele_a < 0 || allele_a >= line->n_allele || allele_b < 0 || allele_b >= line->n_allele) continue; if (allele_b == 0) af = 1.0 - af; // flip the allele frequency if ALLELE_B is the reference // skip lines missing genotypes (e.g. intensity only sites) or with ploidy other than 2 int n_aa = 0, n_ab = 0, n_bb = 0; bcf_fmt_t *gt_fmt = bcf_get_fmt_id(line, gt_id); if (!gt_fmt || gt_fmt->n != 2) continue; #define BRANCH(type_t, bcf_type_vector_end) \ { \ type_t *p = (type_t *)gt_fmt->p; \ for (i = 0; i < n_smpls; i++, p += 2) { \ gts[i] = GT_NC; \ if (p[0] == bcf_type_vector_end || bcf_gt_is_missing(p[0]) || p[1] == bcf_type_vector_end \ || bcf_gt_is_missing(p[1])) \ continue; \ type_t allele_0 = bcf_gt_allele(p[0]); \ type_t allele_1 = bcf_gt_allele(p[1]); \ if (allele_0 == allele_a && allele_1 == allele_a) { \ gts[i] = GT_AA; \ n_aa++; \ } else if ((allele_0 == allele_a && allele_1 == allele_b) \ || (allele_0 == allele_b && allele_1 == allele_a)) { \ gts[i] = GT_AB; \ n_ab++; \ } else if (allele_0 == allele_b && allele_1 == allele_b) { \ gts[i] = GT_BB; \ n_bb++; \ } \ } \ } switch (gt_fmt->type) { case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; default: error("Unexpected type %d\n", gt_fmt->type); } #undef BRANCH int nbaf = bcf_get_format_float(hdr, line, "BAF", &baf_arr, &nbaf_arr); if (nbaf != n_smpls) continue; // wrong number of BAF values // adjust BAF float adj_baf_aa = 0.0; float adj_baf_bb = 0.0; if (adj_baf != -1) { j = 0; if (n_aa >= adj_baf) { for (i = 0; i < n_smpls; i++) if (gts[i] == GT_AA) tmp_arr[j++] = baf_arr[i]; adj_baf_aa = ks_ksmall_float((size_t)j, tmp_arr, (size_t)j / 2); if (j % 2 == 0) adj_baf_aa = (adj_baf_aa + tmp_arr[j / 2 - 1]) * 0.5f; } j = 0; if (n_bb >= adj_baf) { for (i = 0; i < n_smpls; i++) if (gts[i] == GT_BB) tmp_arr[j++] = baf_arr[i]; 
adj_baf_bb = ks_ksmall_float((size_t)j, tmp_arr, (size_t)j / 2); if (j % 2 == 0) adj_baf_bb = (adj_baf_bb + tmp_arr[j / 2 - 1]) * 0.5f; adj_baf_bb -= 1.0; } } else if (truncate_baf) { // truncates the BAF between 0.0 and 1.0 like Illumina does for (i = 0; i < n_smpls; i++) { if (baf_arr[i] < 0.0) baf_arr[i] = 0.0; else if (baf_arr[i] > 1.0) baf_arr[i] = 1.0; } } for (i = 0; i < n_smpls; i++) { double baf; if (gts[i] == GT_AA) { baf = (double)(baf_arr[i] - adj_baf_aa); sumx2[i] += sqr(af); sumxy[i] += af * baf; sumx[i] += af; sumy[i] += baf; } else if (gts[i] == GT_BB) { baf = (double)(baf_arr[i] - adj_baf_bb); sumx2[i] += sqr(1.0 - af); sumxy[i] += (1.0 - af) * (1.0 - baf); sumx[i] += 1.0 - af; sumy[i] += 1.0 - baf; } else continue; n[i]++; } } fprintf(est_fh, "sample_id\tbaf_regress\tNhom\n"); for (i = 0; i < n_smpls; i++) { double denom = (double)n[i] * sumx2[i] - sqr(sumx[i]); double m = denom ? (n[i] * sumxy[i] - sumx[i] * sumy[i]) / denom : NAN; // double b = denom ? (sumy[i] * sumx2[i] - sumx[i] * sumxy[i]) / denom : NAN; fprintf(est_fh, "%s\t%.4f\t%d\n", hdr->samples[i], m, n[i]); } if (est_fh != stdout && est_fh != stderr) fclose(est_fh); // close output VCF if (output_fname) { if (write_index) { if (bcf_idx_save(out_fh) < 0) { if (hts_close(out_fh) != 0) error("Close failed %s\n", strcmp(output_fname, "-") ? 
output_fname : "stdout"); error("Error: cannot write to index %s\n", index_fname); } free(index_fname); } hts_close(out_fh); } free(arr); free(baf_arr); free(gts); free(tmp_arr); free(sumx2); free(sumxy); free(sumx); free(sumy); free(n); bcf_sr_destroy(srs); return 0; } ================================================ FILE: HapMap.md ================================================ HapMap ====== A tutorial for how to convert HapMap data from Illumina and Affymetrix arrays to a GRCh38 VCF using gtc2vcf * [Download manifest files](#download-manifest-files) * [Download and unpack IDAT and CEL files](#download-and-unpack-idat-and-cel-files) * [Create sample maps](#create-sample-maps) * [Convert IDATs to GTCs](#convert-idats-to-gtcs) * [Convert GTCs to VCF](#convert-gtcs-to-vcf) * [Convert CELs to CHPs](#convert-cels-to-chps) * [Convert CHPs to VCF](#convert-chps-to-vcf) Download manifest files ======================= Download HumanCNV370v1 manifest and cluster files from [Illumina](http://support.illumina.com/downloads/humancnv370-duo_v10_product_files.html) and [GEO](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6986) ``` wget ftp://webdata:webdata@ftp.illumina.com/downloads/ProductFiles/HumanCNV370/HumanCNV370-Duo/humancnv370v1_c.bpm wget ftp://webdata2:webdata2@ftp.illumina.com/downloads/ProductFiles/HumanCNV370/HumanCNV370-Duo/HumanCNV370v1_C.egt wget http://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL6nnn/GPL6986/suppl/GPL6986_HumanCNV370v1_C.csv.gz gunzip GPL6986_HumanCNV370v1_C.csv.gz /bin/mv GPL6986_HumanCNV370v1_C.csv HumanCNV370v1_C.csv ``` Download HumanOmni2.5-4v1 manifest and cluster files from [Illumina](http://support.illumina.com/downloads/humanomni2-5-quad_product_files.html) ``` wget ftp://webdata2:webdata2@ftp.illumina.com/MyIllumina/94afb35e-7c11-45cc-8a65-d868af527c54/HumanOmni2.5-4v1_H.bpm wget ftp://webdata2:webdata2@ftp.illumina.com/MyIllumina/f003e017-1761-4348-958f-03997a30cf67/HumanOmni2.5-4v1_H.egt wget 
ftp://webdata2:webdata2@ftp.illumina.com/MyIllumina/d5578cf6-bb3b-4b4b-98d3-21edc5bcbd45/HumanOmni2.5-4v1_H.csv ``` Download HumanOmni25M-8v1-1 manifest and cluster files from [Illumina](ftp://webdata2:webdata2@ftp.illumina.com/downloads/productfiles/humanomni25) and [GEO](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL20641) ``` wget http://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL20nnn/GPL20641/suppl/GPL20641_HumanOmni2.5M-8v1-1_B.bpm.gz wget ftp://webdata2:webdata2@ftp.illumina.com/downloads/productfiles/humanomni25/humanomni2-5m-8v1-1_b.egt wget http://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL20nnn/GPL20641/suppl/GPL20641_HumanOmni25M-8v1-1_B.csv.gz gunzip GPL20641_HumanOmni2.5M-8v1-1_B.bpm.gz gunzip GPL20641_HumanOmni25M-8v1-1_B.csv.gz /bin/mv GPL20641_HumanOmni2.5M-8v1-1_B.bpm HumanOmni25M-8v1-1_B.bpm /bin/mv GPL20641_HumanOmni25M-8v1-1_B.csv HumanOmni25M-8v1-1_B.csv ``` Download GenomeWideEx_6 and GenomeWideSNP_6 library and annotation files from [Affymetrix](http://www.affymetrix.com/support/technical/byproduct.affx?product=genomewidesnp_6) ``` wget http://tools.thermofisher.com/content/sfs/supportfiles/genomewidesnp6_libraryfile.zip wget http://www.affymetrix.com/Auth/analysis/downloads/lf/genotyping/GenomeWideSNP_6/SNP6_supplemental_axiom_analysis_files.zip wget http://www.affymetrix.com/Auth/analysis/downloads/na35/genotyping/GenomeWideSNP_6.na35.annot.csv.zip unzip -oj genomewidesnp6_libraryfile.zip CD_GenomeWideSNP_6_rev3/Full/GenomeWideSNP_6/LibFiles/GenomeWideSNP_6.{cdf,chr{X,Y}probes,specialSNPs} unzip -o SNP6_supplemental_axiom_analysis_files.zip GenomeWideSNP_6.{generic_prior.txt,apt-probeset-genotype.AxiomGT1.xml,AxiomGT1.sketch} unzip -o GenomeWideSNP_6.na35.annot.csv.zip GenomeWideSNP_6.na35.annot.csv /bin/rm genomewidesnp6_libraryfile.zip SNP6_supplemental_axiom_analysis_files.zip GenomeWideSNP_6.na35.annot.csv.zip ``` Re-align flanking sequences to GRCh38 ``` for chip in HumanCNV370v1_C humanomni25m-8v1-1_b HumanOmni2.5-4v1_H; do bcftools 
+gtc2vcf --csv $chip.csv --fasta-flank | \ bwa mem -M $HOME/res/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna - | \ samtools view -bS -o $chip.bam done bcftools +affy2vcf --csv GenomeWideSNP_6.na35.annot.csv --fasta-flank | \ bwa mem -M $HOME/res/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna - | \ samtools view -bS -o GenomeWideSNP_6.na35.annot.bam ``` Download and unpack IDAT and CEL files ====================================== ``` wget http://bioconductor.org/packages/release/data/annotation/src/contrib/hapmap370k_1.0.1.tar.gz wget -nH --cut-dirs 2 -r ftp://ftp.ncbi.nlm.nih.gov/hapmap/raw_data/hapmap3_affy6.0/ wget -nH --cut-dirs 5 -r ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/hd_genotype_chip/ mkdir -p idats tar xzvf hapmap370k_1.0.1.tar.gz -C idats hapmap370k/inst/idatFiles tar xzvf hd_genotype_chip/broad_intensities/Omni25_idats_gtcs_2141_samples.tgz -C idats tar xzvf hd_genotype_chip/sanger_intensities/ALL.wgs.sanger_omni_2_5_8.20130805.snps.genotypes.idats.tar.gz -C idats mkdir -p cels for tgz in hapmap3_affy6.0/*.tgz; do tar xzvf $tgz -C cels; done tar xzvf hd_genotype_chip/coriell_affy6_intensities/Affy60_Coriell_CEL_files.tar.gz -C cels # one sample is mapped to HG03171 but should be mapped to HG01171, most likely a typo here /bin/mv "cels/affy6/1000 Genomes phase 1 and 2 cel files/NA18489 .CEL" "cels/affy6/1000 Genomes phase 1 and 2 cel files/NA18489.CEL" /bin/mv "cels/affy6/1000 Genomes phase 1 and 2 cel files/HG03616.CEL" "cels/affy6/1000 Genomes phase 1 and 2 cel files/HG03616-1.CEL" /bin/mv "cels/affy6/1000 Genomes phase 1 and 2 cel files/HG03660.CEL" "cels/affy6/1000 Genomes phase 1 and 2 cel files/HG03660-1.CEL" /bin/mv "cels/affy6/1000 Genomes phase 1 and 2 cel files/HG04149.CEL" "cels/affy6/1000 Genomes phase 1 and 2 cel files/HG04149-1.CEL" /bin/mv "cels/affy6/1000 Genomes phase 1 and 2 cel files/HG01171.CEL" "cels/affy6/1000 Genomes phase 1 and 2 cel files/HG01171-1.CEL" /bin/mv "cels/affy6/1000 Genomes phase 3 cel files/HG03616.CEL" 
"cels/affy6/1000 Genomes phase 3 cel files/HG03616-C1.CEL" /bin/mv "cels/affy6/1000 Genomes phase 3 cel files/HG03660.CEL" "cels/affy6/1000 Genomes phase 3 cel files/HG03660-C1.CEL" /bin/mv "cels/affy6/1000 Genomes phase 3 cel files/HG04149.CEL" "cels/affy6/1000 Genomes phase 3 cel files/HG04149-C1.CEL" /bin/mv "cels/affy6/1000 Genomes phase 3 cel files/HG03171.CEL" "cels/affy6/1000 Genomes phase 3 cel files/HG01171-C1.CEL" ``` Create sample maps ================== ``` awk -F, 'NR>1 {print $5"\t"$1".HumanCNV370v1"}' idats/hapmap370k/inst/idatFiles/samples370k.csv > HapMap.HumanCNV370v1.tsv awk -F, 'NR>15 {print $2"_"$3"\t"$6".HumanOmni2.5-4v1"}' idats/SampleSheet.csv > HapMap.HumanOmni2.5-4v1.tsv awk 'NR==FNR {x[$2]=$1} NR>FNR {print $2"\t"x[substr($1,12)]".HumanOmni25M-8v1-1"}' \ hd_genotype_chip/sanger_intensities/sanger_omni_chip.20130805.internal_to_coriell_id.map \ idats/omni2.5-8_otgeno_20130805.idats/log.txt > HapMap.HumanOmni25M-8v1-1.tsv # one sample is mapped to NA19787 but should be mapped to NA19730, most likely a sample swap # samples mapped to NA21742 and NA21743 are the same individual, most likely a collection issue cat hapmap3_affy6.0/{passing,excluded}_cels_sample_map.txt | sed 's/.CEL$//' | \ sed 's/NA19787\tCHEAP_p_HapMapP3Redo2_GenomeWideSNP_6_B09_235604.CEL/NA19730\tCHEAP_p_HapMapP3Redo2_GenomeWideSNP_6_B09_235604.CEL/' | \ awk '{sm=$1; if (sm in x) sm=sm"-"x[sm]; print $2"\t"sm".GenomeWideEx_6"; x[$1]++}' > HapMap.GenomeWideEx_6.tsv ls cels/affy6/1000\ Genomes\ phase\ {1\ and\ 2,3}\ cel\ files/*.CEL | sed 's/.CEL$//' | \ sed 's/.CEL$//' | awk -F/ '{print $4"\t"$4".GenomeWideSNP_6"}' > HapMap.GenomeWideSNP_6.tsv ``` Convert IDATs to GTCs ===================== ``` declare -A bpm=( ["HumanCNV370v1"]="humancnv370v1_c.bpm" ["HumanOmni2.5-4v1"]="HumanOmni2.5-4v1_H.bpm" ["HumanOmni25M-8v1-1"]="HumanOmni25M-8v1-1_B.bpm" ) declare -A egt=( ["HumanCNV370v1"]="HumanCNV370v1_C.egt" ["HumanOmni2.5-4v1"]="HumanOmni2.5-4v1_H.egt" 
["HumanOmni25M-8v1-1"]="humanomni2-5m-8v1-1_b.egt" ) bcftools +gtc2vcf -i $(find idats -iname *.idat) -o gtc2vcf.idat.tsv mkdir -p HumanCNV370v1 HumanOmni25M-8v1-1 HumanOmni2.5-4v1 for idat in $(cut -f1 gtc2vcf.idat.tsv | grep _Grn.idat$); do chip=$(grep ^$idat gtc2vcf.idat.tsv | cut -f16) mono $HOME/bin/autoconvert/AutoConvert.exe $(find idats -iname $idat) $chip ${bpm[$chip]} ${egt[$chip]} done bcftools +gtc2vcf {HumanCNV370v1,HumanOmni25M-8v1-1,HumanOmni2.5-4v1}/*.gtc -o gtc2vcf.gtc.tsv ``` Convert GTCs to VCF =================== ``` declare -A bpm=( ["HumanCNV370v1"]="humancnv370v1_c.bpm" ["HumanOmni2.5-4v1"]="HumanOmni2.5-4v1_H.bpm" ["HumanOmni25M-8v1-1"]="HumanOmni25M-8v1-1_B.bpm" ) declare -A egt=( ["HumanCNV370v1"]="HumanCNV370v1_C.egt" ["HumanOmni2.5-4v1"]="HumanOmni2.5-4v1_H.egt" ["HumanOmni25M-8v1-1"]="humanomni2-5m-8v1-1_b.egt" ) declare -A csv=( ["HumanCNV370v1"]="HumanCNV370v1_C.csv" ["HumanOmni2.5-4v1"]="HumanOmni2.5-4v1_H.csv" ["HumanOmni25M-8v1-1"]="humanomni25m-8v1-1_b.csv" ) declare -A sam=( ["HumanCNV370v1"]="HumanCNV370v1_C.bam" ["HumanOmni2.5-4v1"]="HumanOmni2.5-4v1_H.bam" ["HumanOmni25M-8v1-1"]="humanomni25m-8v1-1_b.bam" ) for chip in HumanCNV370v1 HumanOmni25M-8v1-1 HumanOmni2.5-4v1; do bcftools +gtc2vcf \ --no-version -Ou \ --fasta-ref $HOME/res/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \ --bpm ${bpm[$chip]} \ --egt ${egt[$chip]} \ --csv ${csv[$chip]} \ --sam ${sam[$chip]} \ --gtcs $chip \ --extra HapMap.$chip.sex \ --do-not-check-bpm | \ bcftools sort -Ou -T ./bcftools. 
| \ bcftools norm --no-version -Ob -o HapMap.$chip.bcf -c x -f $HOME/res/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna && \ bcftools index -f HapMap.$chip.bcf done ``` Convert CELs to CHPs ==================== ``` (echo cel_files; ls cels/{,Broad_hapmap3_r2_Affy6_cels_excluded/}*.CEL) > cels.GenomeWideEx_6.lst (echo cel_files; ls cels/affy6/1000\ Genomes\ phase\ {1\ and\ 2,3}\ cel\ files/*.CEL) > cels.GenomeWideSNP_6.lst for chip in GenomeWideEx_6 GenomeWideSNP_6; do mkdir -p $chip apt-probeset-genotype \ --out-dir $chip \ --special-snps GenomeWideSNP_6.specialSNPs \ --read-models-brlmmp GenomeWideSNP_6.generic_prior.txt \ --chip-type $chip \ --xml-file GenomeWideSNP_6.apt-probeset-genotype.AxiomGT1.xml \ --cel-files cels.$chip.lst \ --table-output false \ --cc-chp-output \ --cc-chp-out-dir $chip \ --write-models done ``` Convert CHPs to VCF =================== ``` for chip in GenomeWideEx_6 GenomeWideSNP_6; do bcftools +affy2vcf \ --no-version -Ou \ --fasta-ref $HOME/res/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \ --csv GenomeWideSNP_6.na35.annot.csv \ --sam GenomeWideSNP_6.na35.annot.bam \ --models $chip/AxiomGT1.snp-posteriors.txt \ --report $chip/AxiomGT1.report.txt \ --chps $chip \ --extra HapMap.$chip.sex | \ bcftools sort -Ou -T ./bcftools. 
| \ bcftools norm --no-version -Ob -o HapMap.$chip.bcf -c x -f $HOME/res/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna && \ bcftools index -f HapMap.$chip.bcf done ``` ================================================ FILE: Illumina.md ================================================ Archived Human Products ----------------------- | array | date | bpm | egt | csv | |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|-------------------------------------------|-------------------------------------------|-------------------------------------------| | [Human-1]() | 12/21/2004 | Exon-Centric_100K_(v1.2.1).bpm | Exon-Centric_100K_(v1.2.1).egt | NA | | [HumanHap240S]() | 03/13/2006 | BDCHP-1X10-HUMANHAP240S_11216501_B.bpm | BDCHP-1X10-HUMANHAP240S_11216501_B.egt | BDCHP-1X10-HUMANHAP240S_11216501_B.csv | | [HumanHap300_v1]() | 03/30/2006 | BDCHP-1x10-HUMANHAP300v1-1_11219278_C.bpm | BDCHP-1x10-HUMANHAP300v1-1_11219278_C.egt | BDCHP-1x10-HUMANHAP300v1-1_11219278_C.csv | | [Human1M]() | 4/24/2006 | Human1Mv1_C.bpm | Human1Mv1_C.egt | Human1Mv1_C.csv | | [HumanExon510S-2]() | 4/24/2006 | HumanExon510Sv1_D.bpm | HumanExon510Sv1_D.egt | Human510Sv1_A.csv | | [HumanHap550_v1]() | 05/01/2006 | BDCHP-1X10-HUMANHAP550_11218540_C.bpm | BDCHP-1X10-HUMANHAP550_11218540_C.egt | BDCHP-1X10-HUMANHAP550_11218540_C_csv | | [HumanNS-12]() | 11/7/2006 | HumanNS-12.bpm | HumanNS-12.egt | HumanNS-12.csv | | [HumanHap300-Duo_v2]() | 12/21/2006 | HumanHap300v2_A.bpm | HumanHap300v2_A.egt | HumanHap300v2_A.csv | | [HumanHap550-Duo_v3]() | 12/21/2006 | HumanHap550-2v3_B.bpm | HumanHap550-2v3_B.egt | HumanHap550-2v3_B.csv | | [HumanHap550_v3]() | 12/21/2006 | HumanHap550v3_A.bpm | HumanHap550v3_A.egt | HumanHap550v3_A.csv | | [HumanHap650Y_v3]() | 12/21/2006 | HumanHap650Y_v3.bpm | HumanHap650Yv3_A.egt | HumanHap650Yv3_A.csv | | [HumanCNV-12_v1]() | 5/15/2007 | HumanCNV12v1_C.bpm | HumanCNV12v1_C.egt | NA | | 
[HumanCNV370-Duo_v1]() | 5/15/2007 | HumanCNV370v1_C.bpm | HumanCNV370v1_C.egt | HumanCNV370v1_C.csv | | [HumanLinkage-12]() | 7/10/2007 | HumanLinkage-12 _E.bpm | HumanLinkage-12 _E.egt | NA | | [HumanCVDSNP55]() | 3/31/2008 | CVDSNP55v1_A.bpm | Human CVD.egt | HumanCVDv1_A.csv | | [HumanCNV370-Quad_v3]() | 3/17/2008 | HumanCNV370-Quadv3_C.bpm | HumanCNV370-Quadv3_C.egt | HumanCNV370-Quadv3_C.csv | | [HumanCNV-12_v2]() | 4/3/2008 | HumanCNV12v2_B.bpm | NA | NA | | [Human1M-Duo_v3]() | 4/4/2008 | Human1M-Duov3_B.bpm | NA | Human1M-Duov3_B.csv | | [HumanLinkage-24]() | 02/02/2010 | InfiniumLinkage-24_11419173_A.bpm | NA | InfiniumLinkage-24_11419173_A.csv | | [Human610-Quad_v1]() | 10/13/2010 | Human610-Quadv1_C.bpm | Human610-Quadv1_C.egt | Human610-Quadv1_C.csv | | [HumanOmniExpress-12v1]() | 10/14/2010 | HumanOmniExpress-12v1_C.bpm | HumanOmniExpress-12v1_C.egt | HumanOmniExpress-12v1_C.csv | | [Human660W-Quad_v1]() | 4/21/2011 | Human660W-Quad_v1_H.bpm | Human660W-Quad_v1_H.egt | Human660W-Quad_v1_H.csv | Archived_non-Human_Products --------------------------- | array | date | bpm | egt | csv | |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|----------------------|-------------------------------------|-------------------------------------| | [CanineSNP20]() | 7/10/2007 | CanineSNP20_A.bpm | CanineSNP20_A.egt | NA | | [BovineSNP50VERSION1]() | 8/10/2007 | BovineSNP50_B.bpm | BovineSNP50_A.egt/BovineSNP50_B.egt | BovineSNP50_A.csv/BovineSNP50_B.csv | | [EquineSNP50]() | 6/9/2008 | EquineSNP50_C.bpm | EquineSNP50_C.egt | EquineSNP50_C.csv | | [PorcineSNP60]() | 1/7/2009 | PorcineSNP60_B.bpm | PorcineSNP60_A.egt | PorcineSNP60_B.csv | | [OvineSNP50]() | 1/7/2009 | OvineSNP50_B.bpm | OvineSNP50_A.egt | OvineSNP50_B.csv | | [CanineHD]() | 9/2/2009 | CanineHD_A.bpm | CanineHD-A.egt | CanineHD_A.csv | | [Maize_SNP50]() | 2/3/2010 | 
MaizeSNP50_A.bpm | MaizeSNP50_B.egt | MaizeSNP50_A.csv | | [BovineSNP50VERSION2]() | 5/20/2010 | BovineSNP50_v2_C.bpm | BovineSNP50v2_A.egt | BovineSNP50_v2_C.csv | | [BOVINEHD]() | 6/18/2010 | BovineHD_B.bpm | BovineHD_A.egt | BovineHD_B.csv | Old Products ------------ | array | date | bpm | egt | csv | |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|---------------------------------------|-----------------------------------------|-------------------------------| | [HumanOmni5Exome-4v1]() | 2/10/2012 | HumanOmni5Exome-4v1_A.bpm | NA | NA | | [HumanOmniExpress-12v1-1]() | 10/30/2012 | HumanOmniExpress-12v1-1_A.bpm | NA | NA | | [OmniExpressExome-8v1-1_15036758]() | 12/17/2012 | OmniExpressExome-8v1-1_15036758_A.bpm | HumanOmniExpressExome-8v1-1_2012.12.egt | NA | | [HumanOmni25M-8v1-1]() | 2/13/2013 | HumanOmni25M-8v1-1_B.bpm | HumanOmni2-5M-8v1-1_B.egt | HumanOmni25M-8v1-1_B.csv | | [OmniExpressExome-8v1-1]() | 2/5/2013 | OmniExpressExome-8v1-1_B.bpm | HumanOmniExpressExome-8v1-1_B.egt | OmniExpressExome-8v1-1_B.csv | | [OmniExpressExome-8v1-1]() | 2/5/2013 | OmniExpressExome-8v1-1_B.bpm | HumanOmniExpressExome-8v1-1_B.egt | OmniExpressExome-8v1-1_B.csv | | [HumanCoreExome-12v1-0]() | 2/6/2013 | HumanCoreExome-12v1-0_A.bpm | HumanCoreExome-12v1-0_A.egt | HumanCoreExome-12v1-0_A.csv | | [HumanOmniExpress-12v1-1]() | 2/6/2013 | HumanOmniExpress-12v1-1_B.bpm | HumanOmniExpress-12v1-1_B.egt | HumanOmniExpress-12v1-1_B.csv | | [PsychChip_15048346]() | 10/23/2013 | PsychChip_15048346_A.bpm | NA | PsychChip_15048346_A.csv | Consortium Products ------------------- | array | date | bpm | egt | csv | |--------------------------------------------------------------------------|------------|------------------------------------------|-----|------------------------------------------| | [ASA-24v1-0-Consort_20022506]() | 1/23/2018 | 
ASA-24v1-0-Consort_20022506_A2.bpm | NA | ASA-24v1-0-Consort_20022506_A2.csv | | [CGCA-24v1-0_20034773]() | 5/13/2020 | CGCA-24v1-0_20034773_A1.bpm | NA | CGCA-24v1-0_20034773_A1.csv | | [DrugDevConsortium-24v1-2_20024394]() | 3/14/2018 | DrugDevConsortium-24v1-2_20024394_A1.bpm | NA | DrugDevConsortium-24v1-2_20024394_A1.csv | | [GDAConfluence_20032938X375356]() | 3/11/2021 | GDAConfluence_20032938X375356_A2.bpm | NA | GDAConfluence_20032938X375356_A2.csv | | [NeuroBooster_20042459]() | 7/16/2020 | NeuroBooster_20042459_A2.bpm | NA | NeuroBooster_20042459_A2.csv | | [H3Africa_2017_20021485_A2.csv]() | 10/27/2017 | H3Africa_2017_20021485_A2.bpm | NA | H3Africa_2017_20021485_A2.csv | ================================================ FILE: LICENSE ================================================ The MIT License Copyright (C) 2018-2025 Giulio Genovese Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ gtc2vcf ======= A set of tools to convert Illumina and Affymetrix DNA microarray intensity data files into VCF files without using Microsoft Windows. You can use the final output to run the pipeline to detect [mosaic chromosomal alterations](http://github.com/freeseek/mocha). If you use this tool in your publication, please cite this website. For any feedback or questions, contact the [author](mailto:giulio.genovese@gmail.com) ![](gtc2vcf.png) * [Usage](#usage) * [Installation](#installation) * [Software Installation](#software-installation) * [Identifying chip type for IDAT and CEL files](#identifying-chip-type-for-idat-and-cel-files) * [Convert Illumina IDAT files to GTC files](#convert-illumina-idat-files-to-gtc-files) * [Convert Illumina GTC files to VCF](#convert-illumina-gtc-files-to-vcf) * [Convert Affymetrix CEL files to CHP files](#convert-affymetrix-cel-files-to-chp-files) * [Convert Affymetrix CHP files to VCF](#convert-affymetrix-chp-files-to-vcf) * [Using an alternative genome reference](#using-an-alternative-genome-reference) * [Detect contamination](#detect-contamination) * [Plot variants](#plot-variants) * [Illumina GenCall](#illumina-gencall) * [Illumina AutoConvert](#illumina-autoconvert) * [Illumina AutoConvert 2.0](#illumina-autoconvert-2-0) * [Illumina Array Analysis Platform Genotyping Command Line Interface](#illumina-array-analysis-platform-genotyping-command-line-interface) * [Illumina Microarray Analytics Array Analysis Command Line Interface](#illumina-microarray-analytics-array-analysis-command-line-interface) * [Acknowledgements](#acknowledgements) Usage ===== Illumina data tool: ``` Usage: bcftools +gtc2vcf [options] [ ...] 
Plugin options: -l, --list-tags list available FORMAT tags with description for VCF output -t, --tags LIST list of output FORMAT tags [GT,GQ,IGC,BAF,LRR,NORMX,NORMY,R,THETA,X,Y] -b, --bpm BPM manifest file -c, --csv CSV manifest file (can be gzip compressed) -e, --egt EGT cluster file -f, --fasta-ref reference sequence in fasta format --set-cache-size select fasta cache size in bytes --gc-window-size window size in bp used to compute the GC content (-1 for no estimate) [200] -g, --gtcs GTC genotype files from directory or list from file -i, --idat input IDAT files rather than GTC files --capacity number of variants to read from intensity files per I/O operation [32768] --adjust-clusters adjust cluster centers in (Theta, R) space (requires --bpm and --egt) --use-gtc-sample-names use sample name in GTC files rather than GTC file name --do-not-check-bpm do not check whether BPM and GTC files match manifest file name --do-not-check-eof do not check whether the BPM and EGT readers reach the end of the file --genome-studio input a GenomeStudio final report file (in matrix format) --no-version do not append version and command line to the header -o, --output write output to a file [standard output] -O, --output-type u|b|v|z|t[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF t: GenomeStudio tab-delimited text output, 0-9: compression level [v] --threads number of extra output compression threads [0] -x, --extra write GTC metadata to a file -v, --verbose print verbose information -W, --write-index[=FMT] Automatically index the output files [off] Manifest options: --beadset-order output BeadSetID normalization order (requires --bpm and --csv) --fasta-flank output flank sequence in FASTA format (requires --csv) -s, --sam-flank input flank sequence alignment in SAM/BAM format (requires --csv) --genome-build genome build ID used to update the manifest file [GRCh38] Examples: bcftools +gtc2vcf -i 5434246082_R03C01_Grn.idat bcftools +gtc2vcf 5434246082_R03C01.gtc bcftools 
+gtc2vcf -b HumanOmni2.5-4v1_H.bpm -c HumanOmni2.5-4v1_H.csv bcftools +gtc2vcf -e HumanOmni2.5-4v1_H.egt bcftools +gtc2vcf -c GSA-24v3-0_A1.csv -e GSA-24v3-0_A1_ClusterFile.egt -f human_g1k_v37.fasta -o GSA-24v3-0_A1.vcf bcftools +gtc2vcf -c HumanOmni2.5-4v1_H.csv -f human_g1k_v37.fasta 5434246082_R03C01.gtc -o 5434246082_R03C01.vcf bcftools +gtc2vcf -f human_g1k_v37.fasta --genome-studio GenotypeReport.txt -o GenotypeReport.vcf Examples of manifest file options: bcftools +gtc2vcf -b GSA-24v3-0_A1.bpm -c GSA-24v3-0_A1.csv --beadset-order bcftools +gtc2vcf -c GSA-24v3-0_A1.csv --fasta-flank -o GSA-24v3-0_A1.fasta bwa mem -M GCA_000001405.15_GRCh38_no_alt_analysis_set.fna GSA-24v3-0_A1.fasta -o GSA-24v3-0_A1.sam bcftools +gtc2vcf -c GSA-24v3-0_A1.csv --sam-flank GSA-24v3-0_A1.sam -o GSA-24v3-0_A1.GRCh38.csv ``` Affymetrix data tool: ``` Usage: bcftools +affy2vcf [options] --csv --fasta-ref [ ...] Plugin options: -l, --list-tags list available FORMAT tags with description for VCF output -t, --tags LIST list of output FORMAT tags [GT,CONF,BAF,LRR,NORMX,NORMY,DELTA,SIZE] -c, --csv CSV manifest file (can be gzip compressed) -f, --fasta-ref reference sequence in fasta format --set-cache-size select fasta cache size in bytes --gc-window-size window size in bp used to compute the GC content (-1 for no estimate) [200] --probeset-ids tab delimited file with column 'probeset_id' specifying probesets to convert --calls apt-probeset-genotype calls output (can be gzip compressed) --confidences apt-probeset-genotype confidences output (can be gzip compressed) --summary apt-probeset-genotype summary output (can be gzip compressed) --snp apt-probeset-genotype SNP posteriors output (can be gzip compressed) --chps input CHP files rather than tab delimited files --cel input CEL files rather CHP files --adjust-clusters adjust cluster centers in (Contrast, Size) space (requires --snp) --no-version do not append version and command line to the header -o, --output write output to a file 
[standard output] -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v] --threads number of extra output compression threads [0] -x, --extra write CHP metadata to a file (requires CHP files) -v, --verbose print verbose information -W, --write-index[=FMT] Automatically index the output files [off] Manifest options: --fasta-flank output flank sequence in FASTA format (requires --csv) -s, --sam-flank input flank sequence alignment in SAM/BAM format (requires --csv) Examples: bcftools +affy2vcf \ --csv GenomeWideSNP_6.na35.annot.csv \ --fasta-ref human_g1k_v37.fasta \ --chps cc-chp/ \ --snp AxiomGT1.snp-posteriors.txt \ --output AxiomGT1.vcf \ --extra report.tsv bcftools +affy2vcf \ --csv GenomeWideSNP_6.na35.annot.csv \ --fasta-ref human_g1k_v37.fasta \ --calls AxiomGT1.calls.txt \ --confidences AxiomGT1.confidences.txt \ --summary AxiomGT1.summary.txt \ --snp AxiomGT1.snp-posteriors.txt \ --output AxiomGT1.vcf Examples of manifest file options: bcftools +affy2vcf -c GenomeWideSNP_6.na35.annot.csv --fasta-flank -o GenomeWideSNP_6.fasta bwa mem -M GCA_000001405.15_GRCh38_no_alt_analysis_set.fna GenomeWideSNP_6.fasta -o GenomeWideSNP_6.sam bcftools +affy2vcf -c GenomeWideSNP_6.na35.annot.csv -s GenomeWideSNP_6.sam -o GenomeWideSNP_6.na35.annot.GRCh38.csv ``` Installation ============ Install basic tools (Debian/Ubuntu specific if you have admin privileges) ``` sudo apt install wget unzip git g++ zlib1g-dev bwa unzip samtools msitools cabextract mono-devel libgdiplus icu-devtools bcftools ``` Optionally, you can install these libraries to activate further HTSlib features ``` sudo apt install libbz2-dev libssl-dev liblzma-dev libgsl0-dev ``` Preparation steps ``` mkdir -p $HOME/bin $HOME/GRCh3{7,8} && cd /tmp ``` We recommend compiling the source code but, wherever this is not possible, Linux x86_64 pre-compiled binaries are available for download [here](http://software.broadinstitute.org/software/gtc2vcf). 
However, notice that you will require BCFtools version 1.20 or newer. You can also download a previous version of the plugin through [bioconda](http://anaconda.org/bioconda/bcftools-gtc2vcf-plugin) Download latest version of [HTSlib](http://github.com/samtools/htslib) and [BCFtools](http://github.com/samtools/bcftools) (if not downloaded already) ``` wget http://github.com/samtools/bcftools/releases/download/1.20/bcftools-1.20.tar.bz2 tar xjvf bcftools-1.20.tar.bz2 ``` Download and compile plugins code (make sure you are using gcc version 5 or newer) ``` cd bcftools-1.20/ /bin/rm -f plugins/{idat2gtc.c,gtc2vcf.{c,h},affy2vcf.c} wget -P plugins http://raw.githubusercontent.com/freeseek/gtc2vcf/master/{idat2gtc.c,gtc2vcf.{c,h},affy2vcf.c,BAFregress.c} make /bin/cp bcftools plugins/{idat2gtc,gtc2vcf,affy2vcf,BAFregress}.so $HOME/bin/ ``` Make sure the directory with the plugins is available to BCFtools ``` export PATH="$HOME/bin:$PATH" export BCFTOOLS_PLUGINS="$HOME/bin" ``` Install the GRCh37 human genome reference ``` wget -O- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz | \ gzip -d > $HOME/GRCh37/human_g1k_v37.fasta samtools faidx $HOME/GRCh37/human_g1k_v37.fasta bwa index $HOME/GRCh37/human_g1k_v37.fasta ``` Install the GRCh38 human genome reference (following the suggestion from [Heng Li](http://lh3.github.io/2017/11/13/which-human-reference-genome-to-use)) ``` wget -O- ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | \ gzip -d > $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna samtools faidx $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna bwa index $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna ``` Affymetrix provides the [Analysis Power Tools 
(APT)](http://www.thermofisher.com/us/en/home/life-science/microarray-analysis/microarray-analysis-partners-programs/affymetrix-developers-network/affymetrix-power-tools.html) for free, which allow calling genotypes from raw intensity data using an algorithm derived from [BRLMM-P](http://tools.thermofisher.com/content/sfs/brochures/brlmmp_whitepaper.pdf) ``` mkdir -p $HOME/bin && cd /tmp wget http://downloads.thermofisher.com/APT/APT_2.11.8/apt_2.11.8_linux_64_x86_binaries.zip unzip -ojd $HOME/bin apt_2.11.8_linux_64_x86_binaries.zip apt_2.11.8_linux_64_x86_binaries/bin/apt-probeset-genotype chmod a+x $HOME/bin/apt-probeset-genotype ``` Identifying chip type for IDAT and CEL files ============================================ To convert a pair of green and red IDAT files with raw Illumina intensities into a GTC file with genotype calls you need to provide both a BPM manifest file with the location of the probes and an EGT cluster file with the expected intensities of each genotype cluster. It is important to provide the correct BPM and EGT files, otherwise the calling will fail, possibly generating a GTC file with meaningless calls. Unfortunately, newer IDAT files do not contain information about which BPM manifest file to use. The gtc2vcf bcftools plugin can be used to guess which files to use ``` path_to_idat_folder="..." bcftools +gtc2vcf \ -i -g $path_to_idat_folder ``` This will generate a spreadsheet table with information about each IDAT file including a guess for what manifest and cluster files you should use. If a guess is not provided, contact the [author](mailto:giulio.genovese@gmail.com) for troubleshooting Similarly, you can use the affy2vcf bcftools plugin to extract chip type information from CEL files ``` path_to_cel_folder="..." 
bcftools +affy2vcf \ --cel --chps $path_to_cel_folder ``` Convert Illumina IDAT files to GTC files ======================================== The idat2gtc bcftools plugin can be used to convert Illumina IDAT files to GTC files ``` bpm_manifest_file="..." egt_cluster_file="..." bcftools +idat2gtc \ --bpm $bpm_manifest_file \ --egt $egt_cluster_file \ --idats $path_to_idat_folder \ --output $path_to_gtc_folder ``` The output is equivalent to the output of the Illumina GenCall algorithm while being significantly faster If you do not have the manifest and cluster files for the Illumina IDAT files you are trying to convert, make sure to check the links [here](Illumina.md) If you run the command with the option `--autocall-date ""` then the output should be deterministic and using the `--preset` option you can generate output equivalent to the output you obtain with any of the following: * [Illumina AutoConvert](#illumina-autoconvert) * [Illumina AutoConvert 2.0](#illumina-autoconvert-2-0) * [Illumina Array Analysis Platform Genotyping Command Line Interface](#illumina-array-analysis-platform-genotyping-command-line-interface) * [Illumina Microarray Analytics Array Analysis Command Line Interface](#illumina-microarray-analytics-array-analysis-command-line-interface) If you similarly patch those tools to make them generate deterministic output, you should be able to verify that you get the same md5sum Convert Illumina GTC files to VCF ================================= Specifications for Illumina BPM, EGT, and GTC files were obtained through Illumina's [BeadArrayFiles](http://github.com/Illumina/BeadArrayFiles) library and [GTCtoVCF](http://github.com/Illumina/GTCtoVCF) script. Specifications for IDAT files were obtained through Henrik Bengtsson's [illuminaio](http://github.com/HenrikBengtsson/illuminaio) package ``` bpm_manifest_file="..." csv_manifest_file="..." egt_cluster_file="..." path_to_gtc_folder="..." ref="$HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" # or ref="$HOME/GRCh37/human_g1k_v37.fasta" out_prefix="..." 
bcftools +gtc2vcf \ --no-version -Ou \ --bpm $bpm_manifest_file \ --csv $csv_manifest_file \ --egt $egt_cluster_file \ --gtcs $path_to_gtc_folder \ --fasta-ref $ref \ --extra $out_prefix.tsv | \ bcftools sort -Ou -T ./bcftools. | \ bcftools norm --no-version -o $out_prefix.bcf -Ob -c x -f $ref --write-index ``` Heavy random access to the reference will be needed, so it is important that enough extra memory be available for the operating system to cache the reference or else the task can run excruciatingly slowly. Notice that the gtc2vcf bcftools plugin will drop unlocalized variants. The final VCF might contain duplicates. If this is an issue `bcftools norm -d exact` can be used to remove such variants. At least one of the BPM or the CSV manifest files has to be provided. Normalized intensities cannot be computed without the BPM manifest file. Indel alleles cannot be inferred and will be skipped without the CSV manifest file. Information about genotype cluster centers will be included in the VCF if the EGT cluster file is provided. You can use gtc2vcf to convert one GTC file at a time, but we strongly advise to convert multiple files at once as single sample VCF files will consume a lot of storage space. If you convert hundreds of GTC files at once, you can use the `--adjust-clusters` option which will recenter the genotype clusters rather than using those provided in the EGT cluster file and will compute less noisy LRR values. If you use the `--adjust-clusters` option and you are using the output for calling [mosaic chromosomal alterations](http://github.com/freeseek/mocha), then it is safe to turn the median BAF/LRR adjustments off during that step (i.e. 
use `--adjust-BAF-LRR -1`) Optionally, between the conversion and the sorting step you can include a `bcftools reheader --samples ` command to assign new names to the samples where `` contains `old_name new_name\n` pairs separated by whitespaces, each on a separate line, with `old_name` being the GTC file name without the `.gtc` extension in this case When running the conversion, the gtc2vcf plugin will double check that the SNP manifest metadata information in the GTC file matches the descriptor file name in the BPM file to make sure you are using the correct manifest file. Sometimes, due to discrepancies between the BPM file name provided by Illumina and the internal descriptor file name, this safety check fails. To turn off this feature in these cases, you can use option `--do-not-check-bpm` Convert Affymetrix CEL files to CHP files ========================================= Affymetrix provides a best practice workflow for genotyping data generated using [SNP6](http://www.affymetrix.com/support/developer/powertools/changelog/VIGNETTE-snp6-on-axiom.html) and [Axiom](http://www.affymetrix.com/support/developer/powertools/changelog/VIGNETTE-Axiom-probeset-genotype.html) arrays. As an example, the following command will run the genotyping for the Affymetrix SNP6 array: ``` path_to_output_folder="..." cel_list_file="..." apt-probeset-genotype \ --analysis-files-path . 
\ --xml-file GenomeWideSNP_6.apt-probeset-genotype.AxiomGT1.xml \ --out-dir $path_to_output_folder \ --cel-files $cel_list_file \ --special-snps GenomeWideSNP_6.specialSNPs \ --chip-type GenomeWideEx_6 \ --chip-type GenomeWideSNP_6 \ --table-output false \ --cc-chp-output \ --write-models \ --read-models-brlmmp GenomeWideSNP_6.generic_prior.txt ``` Affymetrix provides Library and NetAffx Annotation files for their arrays ([here](http://www.affymetrix.com/support/technical/byproduct.affx?cat=dnaarrays), [here](http://media.affymetrix.com/analysis/downloads/lf/genotyping), and [here](http://www.thermofisher.com/us/en/home/life-science/microarray-analysis/microarray-data-analysis/genechip-array-annotation-files.html)) As an example, the following commands will obtain the files necessary to run the genotyping for the Affymetrix SNP6 array: ``` wget http://tools.thermofisher.com/content/sfs/supportfiles/genomewidesnp6_libraryfile.zip wget http://tools.thermofisher.com/content/sfs/supportfiles/SNP6_supplemental_axiom_analysis_files.zip wget http://tools.thermofisher.com/content/sfs/supportfiles/GenomeWideSNP_6-na35-annot-csv.zip unzip -oj genomewidesnp6_libraryfile.zip CD_GenomeWideSNP_6_rev3/Full/GenomeWideSNP_6/LibFiles/GenomeWideSNP_6.{cdf,chrXprobes,chrYprobes,specialSNPs} unzip -o SNP6_supplemental_axiom_analysis_files.zip GenomeWideSNP_6.{generic_prior.txt,apt-probeset-genotype.AxiomGT1.xml,AxiomGT1.sketch} unzip -o GenomeWideSNP_6-na35-annot-csv.zip GenomeWideSNP_6.na35.annot.csv ``` Note: If the program exits due to different chip types or probe counts with error message such as `Wrong CEL ChipType: expecting: 'GenomeWideSNP_6' and #######.CEL is: 'GenomeWideEx_6'` then make sure you included the option `--chip-type GenomeWideEx_6 --chip-type GenomeWideSNP_6` or `--force` to the command line to solve the problem Convert Affymetrix CHP files to VCF =================================== The affy2vcf bcftools plugin can be used to convert Affymetrix CHP files to VCF 
``` csv_manifest_file="..." # for example csv_manifest_file="GenomeWideSNP_6.na35.annot.csv" ref="$HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" # or ref="$HOME/GRCh37/human_g1k_v37.fasta" path_to_chp_folder="cc-chp" path_to_txt_folder="..." out_prefix="..." bcftools +affy2vcf \ --no-version -Ou \ --csv $csv_manifest_file \ --fasta-ref $ref \ --chps $path_to_chp_folder \ --snp $path_to_txt_folder/AxiomGT1.snp-posteriors.txt \ --extra $out_prefix.tsv | \ bcftools sort -Ou -T ./bcftools. | \ bcftools norm --no-version -o $out_prefix.bcf -Ob -c x -f $ref --write-index ``` Heavy random access to the reference will be needed, so it is important that enough extra memory be available for the operating system to cache the reference or else the task can run excruciatingly slowly. The final VCF might contain duplicates. If this is an issue `bcftools norm -d exact` can be used to remove such variants. There is often no need to use the `--adjust-clusters` option for Affymetrix data as the cluster posteriors are already adjusted using the data processed by the genotype caller Optionally, between the conversion and the sorting step you can include a `bcftools reheader --samples ` command to assign new names to the samples where `` contains `old_name new_name\n` pairs separated by whitespaces, each on a separate line, with `old_name` being the CHP file name without the `.chp` extension Using an alternative genome reference ===================================== Illumina provides [GRCh38/hg38](http://support.illumina.com/bulletins/2017/04/infinium-human-genotyping-manifests-and-support-files--with-anno.html) manifests for many of its genotyping arrays. However, if your genotyping array is not supported for the newer reference by Illumina, you can use the `--fasta-flank` and `--sam-flank` options to realign the flank sequences from the manifest files you have and recompute the marker positions. 
This approach uses [flank sequence](http://support.illumina.com/bulletins/2016/05/infinium-genotyping-manifest-column-headings.html) and [strand](http://support.illumina.com/bulletins/2017/06/how-to-interpret-dna-strand-and-allele-information-for-infinium-.html) information to identify the marker [coordinates](http://support.illumina.com/bulletins/2016/06/-infinium-genotyping-array-manifest-files-what-does-chr-or-mapinfo---mean.html). It will need a sequence aligner such as `bwa` to realign the sequences and it seems to reproduce the coordinates provided by Illumina more than 99.9% of the time. Mapping information will follow the [implicit dbSNP standard](http://github.com/Illumina/GTCtoVCF#manifests). Occasionally the flank sequence provided by Illumina is incorrect and it is impossible to recover the correct marker coordinate from the flank sequence alone You first have to generate an alignment file for the flank sequences from a CSV manifest file ``` csv_manifest_file="..." ref="$HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" # or ref="$HOME/GRCh37/human_g1k_v37.fasta" bam_alignment_file="..." bcftools +gtc2vcf \ -c $csv_manifest_file \ --fasta-flank | \ bwa mem -M $ref - | \ samtools view -bS \ -o $bam_alignment_file ``` Notice that you need to use the `-M` option to mark shorter split hits as secondary and you should not sort the output BAM file as gtc2vcf expects it to have the sequences in the same order as in the CSV file. Then you load the alignment file while converting your GTC files to VCF including the `-s $bam_alignment_file` option Some older manifest files from Illumina have thousands of markers with incorrect RefStrand annotations that will lead to incorrect genotypes. While Illumina has not explained why this is the case, it still distributes incorrect manifests. 
If you are using one of the following manifests ``` Human1M-Duov3_H Human610-Quadv1_H Human660W-Quad_v1_H HumanCytoSNP-12v2-1_Anova HumanOmni1-Quad_v1-0-Multi_H HumanOmni1-Quad_v1-0_H ``` We advise either contacting Illumina to demand a fixed version or using gtc2vcf to realign the flank sequences Also, Illumina assigns chromosomal positions to indels by first left aligning the flank sequences in an incoherent way (see [here](http://github.com/Illumina/GTCtoVCF/blob/develop/BPMRecord.py)). Apparently this is incoherent enough that Illumina also cannot get the coordinates of homopolymer indels right. For example, chromosome 13 ClinVar indel [rs80359507](http://www.ncbi.nlm.nih.gov/clinvar/variation/37959) is assigned to position 32913838 in the manifest file for the GSA-24v2-0 array, but it is assigned to position 32913837 in the manifest file for GSA-24v3-0 array (GRCh37 coordinates). If you want to trust genotypes at homopolymer indels, we advise using gtc2vcf to realign the flank sequences We also found numerous examples of markers from Illumina manifest files that are mapped to the wrong chromosome, such as markers rs10465468, rs12401272, rs185597746, rs188145685 which are localized over XY in the Illumina manifest files for the GSA-24v2-0 array and the GSA-24v3-0 array but their flank sequences map to chromosome Y. If you trust the flank sequences better than the coordinates from the Illumina manifest files, we advise using gtc2vcf to realign the flank sequences The same functionality exists for the affy2vcf tool to convert Affymetrix data Detect contamination ==================== To detect contamination we use a model similar to that employed by [BAFRegress](http://genome.sph.umich.edu/wiki/BAFRegress) and described in [Jun et al. 2012](http://doi.org/10.1016/j.ajhg.2012.09.004) which estimates BAF deviations at homozygous sites towards reference population means. 
The model needs allele frequencies which can be inferred from the BCFtools/gtc2vcf output: ``` bcftools +BAFregress $out_prefix.bcf ``` or they can be inferred from a separate resource: ``` bcftools +BAFregress --af 1kGP_high_coverage_Illumina.sites.bcf --tag AF $out_prefix.bcf ``` Plot variants ============= Install basic tools (Debian/Ubuntu specific if you have admin privileges): ``` sudo apt install r-cran-optparse r-cran-ggplot2 r-cran-data.table r-cran-gridextra ``` Download R scripts ``` /bin/rm -f $HOME/bin/gtc2vcf_plot.R wget -P $HOME/bin http://raw.githubusercontent.com/freeseek/gtc2vcf/master/gtc2vcf_plot.R chmod a+x $HOME/bin/gtc2vcf_plot.R ``` Plot variant (for Illumina data) ``` gtc2vcf_plot.R \ --illumina \ --vcf input.vcf \ --chrom 11 \ --pos 66328095 \ --png rs1815739.png ``` ![](rs1815739.png) Plot variant (for Affymetrix data) ``` gtc2vcf_plot.R \ --affymetrix \ --vcf input.vcf \ --chrom 1 \ --pos 196642233 \ --png rs800292.png ``` ![](rs800292.png) Illumina GenCall ================ To genotype raw Illumina IDAT intensity files using Illumina GenCall algorithms, Illumina has provided over the years several command line interfaces written in the .NET language: - [AutoConvert](http://support.illumina.com/array/array_software/beeline/downloads.html) (2011) - [AutoConvert 2.0](http://support.illumina.com/array/array_software/beeline/downloads.html) (2017) - [IAAP CLI](http://support.illumina.com/array/array_software/illumina-array-analysis-platform.html) (2019) - [Array Analysis CLI](http://support.illumina.com/array/array_software/ima-array-analysis-cli/downloads.html) (2023) We provide instructions to install and run these interfaces. 
The `sed -i -e ':a' -e 'N' -e '$!ba'` installation commands are used to prevent the interfaces from timestamping the output GTC files by removing the [System.DateTime](http://learn.microsoft.com/en-us/dotnet/api/system.datetime) calls and accesses to the [CreationTime](http://learn.microsoft.com/en-us/dotnet/api/system.io.filesysteminfo.creationtime) property from the binaries, with the goal of making each execution completely reproducible. AutoConvert 2.0, IAAP-CLI, and Array Analysis CLI binaries will all perform version 1.2.0 of the normalization step and seem to produce the exact same results while AutoConvert will only perform version 1.1.2 of the normalization step yielding somewhat different results. If you want to run these binaries but fail to download them, contact the [author](mailto:giulio.genovese@gmail.com) for troubleshooting Illumina also provides the [Beeline](http://support.illumina.com/array/array_software/beeline.html) software for free and this includes the AutoConvert.exe command line executable which allows calling genotypes from raw intensity data using Illumina's proprietary GenCall algorithm. AutoConvert is almost entirely written in Mono/.Net language, except for one small mathematical function (findClosestSitesToPointsAlongAxis) which is included within a Windows PE32+ library (MathRoutines.dll). As this is [unmanaged code](http://www.mono-project.com/docs/advanced/embedding/), to be run on Linux with [Mono](http://www.mono-project.com/) it needs to be embedded in an equivalent Linux ELF64 library (libMathRoutines.dll.so) as shown below. This function is run as part of the [normalization](http://doi.org/10.1093/bioinformatics/btm443) of the raw intensities when sampling [400 candidate homozygotes](http://dnatech.genomecenter.ucdavis.edu/wp-content/uploads/2013/06/illumina_gt_normalization.pdf) before calling genotypes. 
Illumina AutoConvert -------------------- To run Illumina AutoConvert (version 1.6.3.1) you will need to fix the hardcoded Windows [backslashes](http://en.wikipedia.org/wiki/Backslash) into UNIX [slashes](http://en.wikipedia.org/wiki/Slash_(punctuation)), as shown below ``` mkdir -p $HOME/bin && cd /tmp wget http://support.illumina.com/content/dam/illumina-support/documents/downloads/software/beeline/autoconvert-software-v1-6-3-installer.zip wget http://raw.githubusercontent.com/freeseek/gtc2vcf/master/nearest_neighbor.c unzip -o autoconvert-software-v1-6-3-installer.zip msiextract -C Illumina/AutoConvert SetupAutoConvert64_1.6.3.1.msi msiextract -l SetupAutoConvert64_1.6.3.1.msi | grep DLL$ | while read dll; do mv Illumina/AutoConvert/$dll Illumina/AutoConvert/${dll%DLL}dll; done gcc -fPIC -shared -O2 -o Illumina/AutoConvert/libMathRoutines.dll.so nearest_neighbor.c sed -i 's/\x00\x03\\\x00/\x00\x03\/\x00/' Illumina/AutoConvert/AutoCallLib.dll sed -i 's/G\x00R\x00N\x00.\x00i\x00d\x00a\x00t\x00/G\x00r\x00n\x00.\x00i\x00d\x00a\x00t\x00/' Illumina/AutoConvert/AutoCallLib.dll sed -i 's/R\x00E\x00D\x00.\x00i\x00d\x00a\x00t\x00/R\x00e\x00d\x00.\x00i\x00d\x00a\x00t\x00/' Illumina/AutoConvert/AutoCallLib.dll sed -i 's/\\\x00M\x00o\x00d\x00u\x00l\x00e\x00s\x00\\\x00B\x00S\x00G\x00T\x00\\\x00C\x00l\x00u\x00s\x00t\x00e\x00r\x00A\x00l\x00g\x00o\x00r\x00i\x00t\x00h\x00m\x00s\x00\\\x00/\/\x00M\x00o\x00d\x00u\x00l\x00e\x00s\x00\/\x00B\x00S\x00G\x00T\x00\/\x00C\x00l\x00u\x00s\x00t\x00e\x00r\x00A\x00l\x00g\x00o\x00r\x00i\x00t\x00h\x00m\x00s\x00\/\x00/' Illumina/AutoConvert/AutoCallLib.dll sed -i 's/\\\x00M\x00o\x00d\x00u\x00l\x00e\x00s\x00\\\x00B\x00S\x00G\x00T\x00/\/\x00M\x00o\x00d\x00u\x00l\x00e\x00s\x00\/\x00B\x00S\x00G\x00T\x00/' Illumina/AutoConvert/Modules/BSGT/ClusterAlgorithms/{GoldenGate/GGCA,InfiniumII/I2CA,GenTrain/ILCA}.dll sed -i 's/\\\x00d\x00a\x00t\x00.\x00b\x00i\x00n\x00/\/\x00d\x00a\x00t\x00.\x00b\x00i\x00n\x00/' 
Illumina/AutoConvert/Modules/BSGT/ClusterAlgorithms/{GoldenGate/GGCA,InfiniumII/I2CA,GenTrain/ILCA}.dll sed -i -e ':a' -e 'N' -e '$!ba' -e 's/\x28\xa6\x00\x00\x0a\x13\x40\x12\x40\x28\xa7\x00\x00\x0a\x72\xad\x12\x00\x70\x28\xa6\x00\x00\x0a\x13\x40\x12\x40\x28\xa8\x00\x00\x0a\x28\x23\x00\x00\x0a/\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x7e\x16\x00\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00/' Illumina/AutoConvert/AutoCallLib.dll sed -i -e ':a' -e 'N' -e '$!ba' -e 's/\x11\x0e\x6f\xe5\x00\x00\x0a\x13\x11\x12\x11\x28\xe6\x00\x00\x0a\x72\xad\x12\x00\x70\x11\x0e\x6f\xe5\x00\x00\x0a\x13\x12\x12\x12\x28\xe7\x00\x00\x0a\x28\x23\x00\x00\x0a/\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x7e\x16\x00\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00/' Illumina/AutoConvert/AutoCallLib.dll rm autoconvert-software-v1-6-3-installer.zip SetupAutoConvert64_1.6.3.1.msi nearest_neighbor.c mv Illumina/AutoConvert $HOME/bin/ rmdir Illumina ``` You can run Illumina's proprietary GenCall algorithm on a single IDAT file pair ``` mono $HOME/bin/AutoConvert/AutoConvert.exe \ $idat_green_file \ $path_to_output_folder \ $bpm_manifest_file \ $egt_cluster_file ``` Make sure that the red IDAT file is in the same folder as the green IDAT file. 
Alternatively you can run on multiple IDAT file pairs ``` mono $HOME/bin/AutoConvert/AutoConvert.exe \ $path_to_idat_folder \ $path_to_output_folder \ $bpm_manifest_file \ $egt_cluster_file ``` Illumina AutoConvert 2.0 ------------------------ To run Illumina AutoConvert 2.0 (version 2.0.1.179) you will need to separately download an additional Mono/.Net library (Heatmap.dll) from [GenomeStudio](http://support.illumina.com/array/array_software/genomestudio.html) or the [polyploid clustering module](http://support.illumina.com/downloads/genomestudio_polyploid_clustering_module_v1-0_software.html) and include it in your binary directory, most likely due to differences in which Mono and .Net resolve library dependencies, as shown below ``` mkdir -p $HOME/bin && cd /tmp wget http://support.illumina.com/content/dam/illumina-support/documents/downloads/software/beeline/autoconvert-software-v2-0-1-installer.zip wget http://support.illumina.com/content/dam/illumina-support/documents/downloads/software/genomestudio/genomestudiopolyploidclusteringv1-0.msi wget http://raw.githubusercontent.com/freeseek/gtc2vcf/master/nearest_neighbor.c unzip -o autoconvert-software-v2-0-1-installer.zip msiextract AutoConvertInstaller.msi msiextract genomestudiopolyploidclusteringv1-0.msi mv Heatmap.DLL Illumina/AutoConvert\ 2.0/ gcc -fPIC -shared -O2 -o Illumina/AutoConvert\ 2.0/libMathRoutines.dll.so nearest_neighbor.c sed -i 's/^0.97$<\/AutosomalCallRateThreshold>\r$$/\10.0\2/' Illumina/AutoConvert\ 2.0/AutoCallConfig.xml sed -i 's/\\\x00d\x00a\x00t\x00.\x00b\x00i\x00n\x00/\/\x00d\x00a\x00t\x00.\x00b\x00i\x00n\x00/' Illumina/AutoConvert\ 2.0/{GGCA,I2CA,HDCA,ILCA,ILCA3}.dll sed -i -e ':a' -e 'N' -e '$!ba' -e 
's/\x28\xc7\x00\x00\x0a\x13\x3f\x12\x3f\x28\xc8\x00\x00\x0a\x72\xa8\x15\x00\x70\x28\xc7\x00\x00\x0a\x13\x3f\x12\x3f\x28\xc9\x00\x00\x0a\x28\x1f\x00\x00\x0a/\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x7e\x12\x00\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00/' Illumina/AutoConvert\ 2.0/AutoCallLib.dll msiextract -l genomestudiopolyploidclusteringv1-0.msi | grep -v Heatmap.DLL | xargs rm rmdir Modules/BSPC/clusteralgorithms/* rmdir -p Modules/BSPC/clusteralgorithms rm autoconvert-software-v2-0-1-installer.zip AutoConvertInstaller.msi genomestudiopolyploidclusteringv1-0.msi nearest_neighbor.c mv Illumina/AutoConvert\ 2.0 $HOME/bin/ rmdir Illumina ``` We change the autosomal call rate threshold to 0.0 to more aggressively call gender in lower quality samples. If you need to get the Heatmap.dll library from GenomeStudio instead, you can use the following code ``` wget ftp://webdata2:webdata2@ftp.illumina.com/downloads/software/genomestudio/genomestudio-software-v2-0-4-5-installer.zip unzip -oj genomestudio-software-v2-0-4-5-installer.zip cabextract GenomeStudioInstaller.exe msiextract a0 mv Illumina/GenomeStudio\ 2.0/Heatmap.dll Illumina/AutoConvert\ 2.0/ rm genomestudio-software-v2-0-4-5-installer.zip GenomeStudioInstaller.exe {,a}0 u{0..5} Illumina/GenomeStudio\ 2.0 -r ``` You can run Illumina's proprietary GenCall algorithm on a single IDAT file pair ``` mono $HOME/bin/AutoConvert\ 2.0/AutoConvert.exe \ $idat_green_file \ $path_to_output_folder \ $bpm_manifest_file \ $egt_cluster_file ``` Make sure that the red IDAT file is in the same folder as the green IDAT file. Alternatively you can run on multiple IDAT file pairs ``` mono $HOME/bin/AutoConvert\ 2.0/AutoConvert.exe \ $path_to_idat_folder \ $path_to_output_folder \ $bpm_manifest_file \ $egt_cluster_file ``` Make sure that the IDAT files have the same name prefix as the IDAT folder name. The software might require up to 8GB of RAM to run.
Illumina provides manifest (BPM) and cluster (EGT) files for their arrays [here](http://support.illumina.com/array/downloads.html). Notice that if you provide the wrong BPM file, you will get an error such as: `Normalization failed! Unable to normalize!` and if you provide the wrong EGT file, you will get an error such as `System.Exception: Unrecoverable Error...Exiting! Unable to find manifest entry ######## in the cluster file!` Illumina Array Analysis Platform Genotyping Command Line Interface ------------------------------------------------------------------ Illumina provides the [Illumina Array Analysis Platform Genotyping Command Line Interface](http://support.illumina.com/array/array_software/illumina-array-analysis-platform.html) software for free for research use and this includes the iaap-cli 1.1.0 which runs natively on Linux ``` mkdir -p $HOME/bin && cd /tmp wget ftp://webdata2:webdata2@ftp.illumina.com/downloads/software/iaap/iaap-cli-linux-x64-1.1.0.tar.gz tar xzvf iaap-cli-linux-x64-1.1.0.tar.gz -C $HOME/bin/ iaap-cli-linux-x64-1.1.0/iaap-cli --strip-components=1 sed -i -e ':a' -e 'N' -e '$!ba' -e 's/\x28\x17\x01\x00\x0a\x13\x07\x12\x07\x72\xdd\x23\x00\x70\x28\x18\x01\x00\x0a/\x00\x00\x00\x00\x00\x00\x00\x00\x00\x7e\x92\x00\x00\x0a\x00\x00\x00\x00\x00/' $HOME/bin/iaap-cli/ArrayAnalysis.NormToGenCall.Services.dll rm iaap-cli-linux-x64-1.1.0.tar.gz ``` Once iaap-cli is properly installed in your system, run Illumina's proprietary GenCall algorithm on multiple IDAT file pairs ``` CLR_ICU_VERSION_OVERRIDE="$(uconv -V | sed 's/.* //g')" LANG="en_US.UTF-8" $HOME/bin/iaap-cli/iaap-cli \ gencall \ $bpm_manifest_file \ $egt_cluster_file \ $path_to_output_folder \ --idat-folder $path_to_idat_folder \ --output-gtc \ --gender-estimate-call-rate-threshold 0.0 ``` It is important to set the `LANG` environment variable to `en_US.UTF-8`: if it is set to other values, a bug in `iaap-cli` causes malformed GTC files to be generated.
Due to another bug in `iaap-cli`, IDAT filenames cannot include more than two `_` characters and should be formatted as `BARCODE_POSITION_(Red|Grn).idat`. When using `iaap-cli` you cannot process old array manifest files with loci data encoded as version 5 or older, such as `HumanHap650Yv3_A.bpm`, as the corresponding code was not carried over and you will get the error `Error in reading file. Unknown Manifest version`. The AutoConvert command line tool can read older manifest files. We change the autosomal call rate threshold to 0.0 both to more aggressively call gender in lower quality samples and to deal with an implementation issue that causes loci with null cluster scores to be included in the determination of the autosomal call rate threshold. Illumina Microarray Analytics Array Analysis Command Line Interface ------------------------------------------------------------------- Illumina provides the [Illumina Microarray Analytics Array Analysis Command Line Interface](http://support.illumina.com/array/array_software/ima-array-analysis-cli/downloads.html) software for free for research use and this includes the array-analysis-cli 2.1.0 which runs natively on Linux ``` mkdir -p $HOME/bin && cd /tmp wget http://support.illumina.com/softwaredownload.html?assetId=72f8a34f-0933-4256-bad6-73d830436c74&assetDetails=IlluminaMicroarrayAnalyticsArrayAnalysisCLIv2.1LinuxInstaller-2.1-array-analysis-cli-linux-x64-v2.1.0.tar.gz tar xzvf array-analysis-cli-linux-x64-v2.1.0.tar.gz -C $HOME/bin/ --strip-components=1 sed -i -e ':a' -e 'N' -e '$!ba' -e 's/\x28\x89\x00\x00\x0a\x0A\x12\x00\x72\xa3\x15\x00\x70\x28\x8a\x00\x00\x0a/\x00\x00\x00\x00\x00\x00\x00\x00\x72\xfc\x0d\x00\x70\x00\x00\x00\x00\x00/' $HOME/bin/array-analysis-cli//ArrayAnalysis.Core.dll rm array-analysis-cli-linux-x64-v2.1.0.tar.gz ``` Once array-analysis-cli is properly installed in your system, run Illumina's proprietary GenCall algorithm on multiple IDAT file pairs ```
$HOME/bin/array-analysis-cli/array-analysis-cli \ genotype call \ --bpm-manifest $bpm_manifest_file \ --cluster-file $egt_cluster_file \ --idat-folder . ``` With this tool we cannot change the autosomal call rate threshold to 0.0 to more aggressively call gender in lower quality samples, as the default 0.97 value is hardcoded. Acknowledgements ================ This work is supported by NIH grant [R01 HG006855](http://grantome.com/grant/NIH/R01-HG006855), NIH grant [R01 MH104964](http://grantome.com/grant/NIH/R01-MH104964), NIH grant [R01MH123451](http://grantome.com/grant/NIH/R01-MH123451), US Department of Defense Breast Cancer Research Breakthrough Award W81XWH-16-1-0316 (project BC151244), and the Stanley Center for Psychiatric Research. ================================================ FILE: affy2vcf.c ================================================ /* The MIT License Copyright (c) 2018-2025 Giulio Genovese Author: Giulio Genovese Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ #include #include #include #include #include #include #include #include #include "bcftools.h" #include "gtc2vcf.h" #define AFFY2VCF_VERSION "2025-10-08" #define TAG_LIST_DFLT "GT,CONF,BAF,LRR,NORMX,NORMY,DELTA,SIZE" #define GC_WIN_DFLT "200" #define VERBOSE (1 << 0) #define LOAD_CEL (1 << 1) #define PROBESET_IDS_LOADED (1 << 2) #define CALLS_LOADED (1 << 3) #define CONFIDENCES_LOADED (1 << 4) #define SUMMARY_LOADED (1 << 5) #define SNP_LOADED (1 << 6) #define ADJUST_CLUSTERS (1 << 7) #define NO_INFO_GC (1 << 8) #define FORMAT_GT (1 << 9) #define FORMAT_CONF (1 << 10) #define FORMAT_BAF (1 << 11) #define FORMAT_LRR (1 << 12) #define FORMAT_NORMX (1 << 13) #define FORMAT_NORMY (1 << 14) #define FORMAT_DELTA (1 << 15) #define FORMAT_SIZE (1 << 16) // #%affymetrix-algorithm-param-apt-opt-use-copynumber-call-codes=0 // #%call-code-1=NoCall:-1:2 // #%call-code-2=AA:0:2 // #%call-code-3=AB:1:2 // #%call-code-4=BB:2:2 #define GT_NC -1 #define GT_AA 0 #define GT_AB 1 #define GT_BB 2 // #%max-alleles=4 // #%max-cn-states=2 // #%call-code-1=OTV_1:-4:1 // #%call-code-2=NoCall_1:-3:1 // #%call-code-3=OTV:-2:2 // #%call-code-4=NoCall:-1:2 // #%call-code-5=AA:0:2 // #%call-code-6=AB:1:2 // #%call-code-7=BB:2:2 // #%call-code-8=ZeroCN:3:0 // #%call-code-9=A:4:1 // #%call-code-10=B:5:1 // #%call-code-11=C:6:1 // #%call-code-12=AC:7:2 // #%call-code-13=BC:8:2 // #%call-code-14=CC:9:2 // #%call-code-15=D:10:1 // #%call-code-16=AD:11:2 // #%call-code-17=BD:12:2 // #%call-code-18=CD:13:2 // #%call-code-19=DD:14:2 // #%call-code-20=E:15:1 // #%call-code-21=AE:16:2 // #%call-code-22=BE:17:2 // #%call-code-23=CE:18:2 // #%call-code-24=DE:19:2 // #%call-code-25=EE:20:2 // #%call-code-26=F:21:1 // #%call-code-27=AF:22:2 // #%call-code-28=BF:23:2 // #%call-code-29=CF:24:2 // #%call-code-30=DF:25:2 // #%call-code-31=EF:26:2 // #%call-code-32=FF:27:2 static const int txt_gt[32] = {GT_NC, GT_NC, GT_NC, GT_NC, GT_AA, GT_AB, GT_BB, GT_NC, GT_AA, GT_BB, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, 
GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC, GT_NC}; static const int chp_gt[16] = {-1, -1, -1, -1, -1, -1, GT_AA, GT_BB, GT_AB, -1, -1, GT_NC, -1, -1, -1, -1}; /**************************************** * hFILE READING FUNCTIONS * ****************************************/ // read long in network order static inline uint32_t read_long(hFILE *hfile) { uint32_t value; read_bytes(hfile, (void *)&value, sizeof(uint32_t)); value = ntohl(value); return value; } // read float in network order static inline float read_float(hFILE *hfile) { union { uint32_t u; float f; } convert; read_bytes(hfile, (void *)&convert.u, sizeof(uint32_t)); convert.u = ntohl(convert.u); return convert.f; } // read string in network order static inline int32_t read_string8(hFILE *hfile, char **buffer) { int32_t len = (int32_t)read_long(hfile); if (len) { *buffer = (char *)malloc((1 + len) * sizeof(char)); read_bytes(hfile, (void *)*buffer, len * sizeof(char)); (*buffer)[len] = '\0'; } else { *buffer = NULL; } return len; } // read wide-character string in network order static inline int32_t read_string16(hFILE *hfile, wchar_t **buffer) { int32_t len = (int32_t)read_long(hfile); if (len) { *buffer = (wchar_t *)malloc((1 + len) * sizeof(wchar_t)); int i; for (i = 0; i < len; i++) { uint16_t cvalue; read_bytes(hfile, (void *)&cvalue, sizeof(unsigned short)); (*buffer)[i] = (wchar_t)ntohs(cvalue); } (*buffer)[len] = L'\0'; } else { *buffer = NULL; } return len; } /**************************************** * CEL FILE IMPLEMENTATION * ****************************************/ // http://www.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/index.html typedef struct { float mean __attribute__((packed)); float dev __attribute__((packed)); int16_t N; } Cell; typedef struct { int16_t x; int16_t y; } Entry; typedef struct { int32_t row; int32_t col; float upper_left_x; float upper_left_y; float upper_right_x; float upper_right_y; float 
lower_left_x; float lower_left_y; float lower_right_x; float lower_right_y; int32_t left_cell; int32_t top_cell; int32_t right_cell; int32_t bottom_cell; } SubGrid; typedef struct { char *fn; hFILE *hfile; int32_t version; int32_t num_rows; int32_t num_cols; int32_t num_cells; int32_t n_header; char *header; int32_t n_algorithm; char *algorithm; int32_t n_parameters; char *parameters; int32_t cell_margin; uint32_t num_outlier_cells; uint32_t num_masked_cells; int32_t num_sub_grids; Cell *cells; Entry *masked_entries; Entry *outlier_entries; SubGrid *sub_grids; } xda_cel_t; static xda_cel_t *xda_cel_init(const char *fn, hFILE *hfile, int flags) { xda_cel_t *xda_cel = (xda_cel_t *)calloc(1, sizeof(xda_cel_t)); xda_cel->fn = strdup(fn); xda_cel->hfile = hfile; int32_t magic; read_bytes(xda_cel->hfile, (void *)&magic, sizeof(int32_t)); if (magic != 64) error("XDA CEL file %s magic number is %d while it should be 64\n", xda_cel->fn, magic); read_bytes(xda_cel->hfile, (void *)&xda_cel->version, sizeof(int32_t)); if (xda_cel->version != 4) error("Cannot read XDA CEL file %s. 
Unsupported XDA CEL file format version: %d\n", xda_cel->fn, xda_cel->version); read_bytes(xda_cel->hfile, (void *)&xda_cel->num_rows, sizeof(int32_t)); read_bytes(xda_cel->hfile, (void *)&xda_cel->num_cols, sizeof(int32_t)); read_bytes(xda_cel->hfile, (void *)&xda_cel->num_cells, sizeof(int32_t)); read_bytes(xda_cel->hfile, (void *)&xda_cel->n_header, sizeof(int32_t)); xda_cel->header = (char *)malloc((1 + xda_cel->n_header) * sizeof(char)); read_bytes(xda_cel->hfile, (void *)xda_cel->header, xda_cel->n_header * sizeof(char)); xda_cel->header[xda_cel->n_header] = '\0'; read_bytes(xda_cel->hfile, (void *)&xda_cel->n_algorithm, sizeof(int32_t)); xda_cel->algorithm = (char *)malloc((1 + xda_cel->n_algorithm) * sizeof(char)); read_bytes(xda_cel->hfile, (void *)xda_cel->algorithm, xda_cel->n_algorithm * sizeof(char)); xda_cel->algorithm[xda_cel->n_algorithm] = '\0'; read_bytes(xda_cel->hfile, (void *)&xda_cel->n_parameters, sizeof(int32_t)); xda_cel->parameters = (char *)malloc((1 + xda_cel->n_parameters) * sizeof(char)); read_bytes(xda_cel->hfile, (void *)xda_cel->parameters, xda_cel->n_parameters * sizeof(char)); xda_cel->parameters[xda_cel->n_parameters] = '\0'; read_bytes(xda_cel->hfile, (void *)&xda_cel->cell_margin, sizeof(int32_t)); read_bytes(xda_cel->hfile, (void *)&xda_cel->num_outlier_cells, sizeof(uint32_t)); read_bytes(xda_cel->hfile, (void *)&xda_cel->num_masked_cells, sizeof(uint32_t)); read_bytes(xda_cel->hfile, (void *)&xda_cel->num_sub_grids, sizeof(int32_t)); if (flags) return xda_cel; xda_cel->cells = (Cell *)malloc(xda_cel->num_cells * sizeof(Cell)); read_bytes(xda_cel->hfile, (void *)xda_cel->cells, xda_cel->num_cells * sizeof(Cell)); xda_cel->masked_entries = (Entry *)malloc(xda_cel->num_masked_cells * sizeof(Entry)); read_bytes(xda_cel->hfile, (void *)xda_cel->masked_entries, xda_cel->num_masked_cells * sizeof(Entry)); xda_cel->outlier_entries = (Entry *)malloc(xda_cel->num_outlier_cells * sizeof(Entry)); read_bytes(xda_cel->hfile, (void 
*)xda_cel->outlier_entries, xda_cel->num_outlier_cells * sizeof(Entry)); xda_cel->sub_grids = (SubGrid *)malloc(xda_cel->num_sub_grids * sizeof(SubGrid)); read_bytes(xda_cel->hfile, (void *)xda_cel->sub_grids, xda_cel->num_sub_grids * sizeof(SubGrid)); if (!heof(xda_cel->hfile)) error("XDA CEL reader did not reach the end of file %s at position %ld\n", xda_cel->fn, htell(xda_cel->hfile)); return xda_cel; } static void xda_cel_destroy(xda_cel_t *xda_cel) { if (!xda_cel) return; free(xda_cel->fn); if (hclose(xda_cel->hfile) < 0) error("Error closing XDA CEL file\n"); free(xda_cel->header); free(xda_cel->algorithm); free(xda_cel->parameters); free(xda_cel->cells); free(xda_cel->masked_entries); free(xda_cel->outlier_entries); free(xda_cel->sub_grids); free(xda_cel); } static void xda_cel_print(const xda_cel_t *xda_cel, FILE *stream, int verbose) { fprintf(stream, "[CEL]\n"); fprintf(stream, "Version=3\n"); fprintf(stream, "\n[HEADER]\n"); fprintf(stream, "%s", xda_cel->header); fprintf(stream, "\n[INTENSITY]\n"); fprintf(stream, "NumberCells=%d\n", xda_cel->num_cells); fprintf(stream, "CellHeader=X\tY\tMEAN\tSTDV\tNPIXELS\n"); int i; if (!verbose) fprintf(stream, "... use --verbose to visualize Cell Entries ...\n"); else for (i = 0; i < xda_cel->num_cells; i++) fprintf(stream, "%3d\t%3d\t%.1f\t%.1f\t%3d\n", i % xda_cel->num_cols, i / xda_cel->num_cols, xda_cel->cells[i].mean, xda_cel->cells[i].dev, xda_cel->cells[i].N); fprintf(stream, "\n[MASKS]\n"); fprintf(stream, "NumberCells=%d\n", xda_cel->num_masked_cells); fprintf(stream, "CellHeader=X\tY\n"); if (!verbose) fprintf(stream, "... use --verbose to visualize Masked Entries ...\n"); else for (i = 0; i < xda_cel->num_masked_cells; i++) fprintf(stream, "%d\t%d\n", xda_cel->masked_entries[i].x, xda_cel->masked_entries[i].y); fprintf(stream, "\n[OUTLIERS]\n"); fprintf(stream, "NumberCells=%d\n", xda_cel->num_outlier_cells); fprintf(stream, "CellHeader=X\tY\n"); if (!verbose) fprintf(stream, "... 
use --verbose to visualize Outlier Entries ...\n"); else for (i = 0; i < xda_cel->num_outlier_cells; i++) fprintf(stream, "%d\t%d\n", xda_cel->outlier_entries[i].x, xda_cel->outlier_entries[i].y); fprintf(stream, "\n[MODIFIED]\n"); fprintf(stream, "NumberCells=0\n"); fprintf(stream, "CellHeader=X\tY\tORIGMEAN\n"); } /**************************************** * CHP FILE IMPLEMENTATION * ****************************************/ // http://www.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/index.html #define BYTE 0 #define UBYTE 1 #define SHORT 2 #define USHORT 3 #define INT 4 #define UINT 5 #define FLOAT 6 #define STRING 7 #define WSTRING 8 typedef struct { wchar_t *name; char *value; wchar_t *mime_type; int32_t n_value; int8_t type; } Parameter; typedef struct DataHeader DataHeader; struct DataHeader { char *data_type_identifier; char *guid; wchar_t *datetime; wchar_t *locale; int32_t n_parameters; Parameter *parameters; int32_t n_parents; DataHeader *parents; }; typedef struct { wchar_t *name; int8_t type; int32_t size; } ColHeader; typedef struct { uint32_t pos_first_element; uint32_t pos_next_data_set; wchar_t *name; int32_t n_parameters; Parameter *parameters; uint32_t n_cols; ColHeader *col_headers; uint32_t n_rows; hFILE *hfile; // this should not be destroyed uint32_t n_buffer; uint32_t *col_offsets; char *buffer; } DataSet; typedef struct { uint32_t pos_next_data_group; uint32_t pos_first_data_set; int32_t num_data_sets; wchar_t *name; DataSet *data_sets; } DataGroup; typedef struct { wchar_t *name; int8_t type; int32_t size; } ColumnHeader; typedef struct { char *fn; hFILE *hfile; uint8_t magic; uint8_t version; int32_t num_data_groups; uint32_t pos_first_data_group; DataHeader data_header; DataGroup *data_groups; off_t size; char *display_name; } agcc_t; static void agcc_read_parameters(Parameter *parameter, hFILE *hfile, int flags) { read_string16(hfile, ¶meter->name); parameter->n_value = read_string8(hfile, ¶meter->value); 
read_string16(hfile, ¶meter->mime_type); if (wcscmp(parameter->mime_type, L"text/x-calvin-integer-8") == 0) parameter->type = BYTE; else if (wcscmp(parameter->mime_type, L"text/x-calvin-unsigned-integer-8") == 0) parameter->type = UBYTE; else if (wcscmp(parameter->mime_type, L"text/x-calvin-integer-16") == 0) parameter->type = SHORT; else if (wcscmp(parameter->mime_type, L"text/x-calvin-unsigned-integer-16") == 0) parameter->type = USHORT; else if (wcscmp(parameter->mime_type, L"text/x-calvin-integer-32") == 0) parameter->type = INT; else if (wcscmp(parameter->mime_type, L"text/x-calvin-unsigned-integer-32") == 0) parameter->type = UINT; else if (wcscmp(parameter->mime_type, L"text/x-calvin-float") == 0) parameter->type = FLOAT; else if (wcscmp(parameter->mime_type, L"text/ascii") == 0) parameter->type = STRING; else if (wcscmp(parameter->mime_type, L"text/plain") == 0) parameter->type = WSTRING; else error("MIME type %ls not allowed\n", parameter->mime_type); // drop parameters that can increase the size of the header dramatically if (flags && wcsncmp(parameter->name, L"affymetrix-algorithm-param-apt-opt-cel", 38) == 0) { free(parameter->name); parameter->name = NULL; parameter->n_value = 0; free(parameter->value); parameter->value = NULL; free(parameter->mime_type); parameter->mime_type = NULL; } } static void agcc_read_data_header(DataHeader *data_header, hFILE *hfile, int flags) { int i; read_string8(hfile, &data_header->data_type_identifier); read_string8(hfile, &data_header->guid); read_string16(hfile, &data_header->datetime); read_string16(hfile, &data_header->locale); data_header->n_parameters = (int32_t)read_long(hfile); data_header->parameters = (Parameter *)malloc(data_header->n_parameters * sizeof(Parameter)); for (i = 0; i < data_header->n_parameters; i++) agcc_read_parameters(&data_header->parameters[i], hfile, flags); data_header->n_parents = (int32_t)read_long(hfile); data_header->parents = (DataHeader *)malloc(data_header->n_parents * 
sizeof(DataHeader)); for (i = 0; i < data_header->n_parents; i++) agcc_read_data_header(&data_header->parents[i], hfile, flags); } static void agcc_read_data_set(DataSet *data_set, hFILE *hfile, int flags) { int i; data_set->pos_first_element = read_long(hfile); data_set->pos_next_data_set = read_long(hfile); read_string16(hfile, &data_set->name); data_set->n_parameters = (int32_t)read_long(hfile); data_set->parameters = (Parameter *)malloc(data_set->n_parameters * sizeof(Parameter)); for (i = 0; i < data_set->n_parameters; i++) agcc_read_parameters(&data_set->parameters[i], hfile, flags); data_set->n_cols = read_long(hfile); data_set->col_headers = (ColHeader *)malloc(data_set->n_cols * sizeof(ColHeader)); for (i = 0; i < data_set->n_cols; i++) { read_string16(hfile, &data_set->col_headers[i].name); read_bytes(hfile, (void *)&data_set->col_headers[i].type, sizeof(int8_t)); data_set->col_headers[i].size = read_long(hfile); } data_set->n_rows = read_long(hfile); data_set->hfile = hfile; data_set->col_offsets = (uint32_t *)malloc(data_set->n_cols * sizeof(uint32_t *)); data_set->n_buffer = 0; for (i = 0; i < data_set->n_cols; i++) { data_set->col_offsets[i] = data_set->n_buffer; data_set->n_buffer += data_set->col_headers[i].size; } data_set->buffer = (char *)malloc(data_set->n_buffer * sizeof(char)); if (data_set->pos_next_data_set) if (hseek(hfile, data_set->pos_next_data_set, SEEK_SET) < 0) error("Fail to seek to position %d in AGCC file\n", data_set->pos_next_data_set); } static void agcc_read_data_group(DataGroup *data_group, hFILE *hfile, int flags) { int i; data_group->pos_next_data_group = read_long(hfile); data_group->pos_first_data_set = read_long(hfile); data_group->num_data_sets = read_long(hfile); read_string16(hfile, &data_group->name); if (hseek(hfile, data_group->pos_first_data_set, SEEK_SET) < 0) error("Fail to seek to position %d in AGCC file\n", data_group->pos_first_data_set); data_group->data_sets = (DataSet *)malloc(data_group->num_data_sets * 
sizeof(DataSet)); for (i = 0; i < data_group->num_data_sets; i++) agcc_read_data_set(&data_group->data_sets[i], hfile, flags); if (data_group->pos_next_data_group) if (hseek(hfile, data_group->pos_next_data_group, SEEK_SET) < 0) error("Fail to seek to position %d in AGCC file\n", data_group->pos_next_data_group); } static agcc_t *agcc_init(const char *fn, hFILE *hfile, int flags) { int i; agcc_t *agcc = (agcc_t *)calloc(1, sizeof(agcc_t)); agcc->fn = strdup(fn); agcc->hfile = hfile; // read File Header read_bytes(agcc->hfile, (void *)&agcc->magic, sizeof(uint8_t)); if (agcc->magic != 59) error("AGCC file %s magic number is %d while it should be 59\n", agcc->fn, agcc->magic); read_bytes(agcc->hfile, (void *)&agcc->version, sizeof(uint8_t)); if (agcc->version != 1) error("Cannot read AGCC file %s. Unsupported AGCC file format version: %d\n", agcc->fn, agcc->version); agcc->num_data_groups = (int32_t)read_long(agcc->hfile); agcc->pos_first_data_group = read_long(agcc->hfile); // read Generic Data Header agcc_read_data_header(&agcc->data_header, agcc->hfile, flags); // read Data Groups if (hseek(agcc->hfile, agcc->pos_first_data_group, SEEK_SET) < 0) error("Fail to seek to position %d in AGCC %s file\n", agcc->pos_first_data_group, agcc->fn); agcc->data_groups = (DataGroup *)malloc(agcc->num_data_groups * sizeof(DataGroup)); for (i = 0; i < agcc->num_data_groups; i++) agcc_read_data_group(&agcc->data_groups[i], agcc->hfile, flags); if (!heof(agcc->hfile)) error("AGCC reader did not reach the end of file %s at position %ld\n", agcc->fn, htell(agcc->hfile)); if (hseek(agcc->hfile, 0L, SEEK_END) < 0) error("Fail to seek to end of AGCC %s file\n", agcc->fn); agcc->size = htell(agcc->hfile); char *ptr = strrchr(agcc->fn, '/') ? 
strrchr(agcc->fn, '/') + 1 : agcc->fn; agcc->display_name = strdup(ptr); ptr = strrchr(agcc->display_name, '.'); if (ptr && strcmp(ptr + 1, "chp") == 0) { *ptr = '\0'; ptr = strrchr(agcc->display_name, '.'); if (ptr && (strcmp(ptr + 1, "AxiomGT1") == 0 || strcmp(ptr + 1, "birdseed-v2") == 0)) *ptr = '\0'; } return agcc; } static void agcc_destroy_parameters(Parameter *parameters, int32_t n_parameters) { int i; for (i = 0; i < n_parameters; i++) { free(parameters[i].name); free(parameters[i].value); free(parameters[i].mime_type); } free(parameters); } static void agcc_destroy_data_header(DataHeader *data_header) { int i; free(data_header->data_type_identifier); free(data_header->guid); free(data_header->datetime); free(data_header->locale); agcc_destroy_parameters(data_header->parameters, data_header->n_parameters); for (i = 0; i < data_header->n_parents; i++) agcc_destroy_data_header(&data_header->parents[i]); free(data_header->parents); } static void agcc_destroy_data_set(DataSet *data_set) { int i; free(data_set->name); agcc_destroy_parameters(data_set->parameters, data_set->n_parameters); for (i = 0; i < data_set->n_cols; i++) free(data_set->col_headers[i].name); free(data_set->col_headers); free(data_set->col_offsets); free(data_set->buffer); } static void agcc_destroy_data_group(DataGroup *data_group) { int i; free(data_group->name); for (i = 0; i < data_group->num_data_sets; i++) agcc_destroy_data_set(&data_group->data_sets[i]); free(data_group->data_sets); } static void agcc_destroy(agcc_t *agcc) { if (!agcc) return; int i; free(agcc->fn); if (hclose(agcc->hfile) < 0) error("Error closing AGCC file\n"); agcc_destroy_data_header(&agcc->data_header); for (i = 0; i < agcc->num_data_groups; i++) agcc_destroy_data_group(&agcc->data_groups[i]); free(agcc->data_groups); free(agcc->display_name); free(agcc); } static void buffer_string16(const uint16_t *value, int32_t n_value, size_t *m_buffer, wchar_t **buffer) { int i; hts_expand(wchar_t, n_value / 2 + 1, 
*m_buffer, *buffer); for (i = 0; i < n_value / 2; i++) (*buffer)[i] = (wchar_t)ntohs(value[i]); (*buffer)[n_value / 2] = L'\0'; } static void agcc_print_parameters(const Parameter *parameters, int32_t n_parameters, FILE *stream) { int i; union { uint32_t u; float f; } convert; wchar_t *buffer = NULL; size_t m_buffer = 0; for (i = 0; i < n_parameters; i++) { fprintf(stream, "#%%%ls=", parameters[i].name ? parameters[i].name : L""); switch (parameters[i].type) { case BYTE: fprintf(stream, "%d\n", (int8_t)ntohl(*(uint32_t *)parameters[i].value)); break; case UBYTE: fprintf(stream, "%u\n", (uint8_t)ntohl(*(uint32_t *)parameters[i].value)); break; case SHORT: fprintf(stream, "%d\n", (int16_t)ntohl(*(uint32_t *)parameters[i].value)); break; case USHORT: fprintf(stream, "%u\n", (uint16_t)ntohl(*(uint32_t *)parameters[i].value)); break; case INT: fprintf(stream, "%d\n", (int32_t)ntohl(*(uint32_t *)parameters[i].value)); break; case UINT: fprintf(stream, "%u\n", ntohl(*(uint32_t *)parameters[i].value)); break; case FLOAT: convert.u = ntohl(*(uint32_t *)parameters[i].value); fprintf(stream, "%f\n", convert.f); break; case STRING: fprintf(stream, "%s\n", parameters[i].value); break; case WSTRING: buffer_string16((uint16_t *)parameters[i].value, parameters[i].n_value, &m_buffer, &buffer); fprintf(stream, "%ls\n", buffer); break; default: break; } } free(buffer); } static void agcc_print_data_header(const DataHeader *data_header, FILE *stream) { int i; if (data_header->guid) fprintf(stream, "#%%FileIdentifier=%s\n", data_header->guid); fprintf(stream, "#%%FileTypeIdentifier=%s\n", data_header->data_type_identifier); fprintf(stream, "#%%FileLocale=%ls\n", data_header->locale); agcc_print_parameters(data_header->parameters, data_header->n_parameters, stream); for (i = 0; i < data_header->n_parents; i++) agcc_print_data_header(&data_header->parents[i], stream); } typedef void (*col_print_t)(const char *, FILE *stream); void agcc_print_probe_set_name(const char *s, FILE *stream) { 
uint32_t size = ntohl(*(uint32_t *)s); fwrite(s + 4, 1, size, stream); } void agcc_print_call(const char *s, FILE *stream) { static const char a[16] = "......ABA..N...."; static const char b[16] = "......ABB..C...."; int c = s[0] & 0x0F; fputc(a[c], stream); fputc(b[c], stream); } void agcc_print_float(const char *s, FILE *stream) { union { uint32_t u; float f; } convert; convert.u = ntohl(*(uint32_t *)s); fprintf(stream, "%g", convert.f); } static void agcc_print_data_set(const DataSet *data_set, FILE *stream, int verbose) { fprintf(stream, "#%%SetName=%ls\n", data_set->name); fprintf(stream, "#%%Columns=%d\n", data_set->n_cols); fprintf(stream, "#%%Rows=%d\n", data_set->n_rows); int i, j; agcc_print_parameters(data_set->parameters, data_set->n_parameters, stream); for (i = 0; i < data_set->n_cols; i++) fprintf(stream, "%ls%c", data_set->col_headers[i].name, i + 1 < data_set->n_cols ? '\t' : '\n'); if (data_set->n_rows == 0) return; if (!verbose) { fprintf(stream, "... use --verbose to visualize Data Set ...\n"); return; } if (wcscmp(data_set->name, L"Genotype") != 0) { fprintf(stream, "... can only visualize Genotype Data Set ...\n"); return; } char *col_ends = (char *)malloc(data_set->n_cols * sizeof(char *)); col_print_t *col_prints = (col_print_t *)malloc(data_set->n_cols * sizeof(col_print_t *)); for (i = 0; i < data_set->n_cols; i++) { col_ends[i] = i + 1 < data_set->n_cols ? 
'\t' : '\n'; if (wcscmp(data_set->col_headers[i].name, L"ProbeSetName") == 0) col_prints[i] = agcc_print_probe_set_name; else if (wcscmp(data_set->col_headers[i].name, L"Call") == 0) col_prints[i] = agcc_print_call; else if (wcscmp(data_set->col_headers[i].name, L"Confidence") == 0) col_prints[i] = agcc_print_float; else if (wcscmp(data_set->col_headers[i].name, L"Contrast") == 0) col_prints[i] = agcc_print_float; else if (wcscmp(data_set->col_headers[i].name, L"Log Ratio") == 0) col_prints[i] = agcc_print_float; else if (wcscmp(data_set->col_headers[i].name, L"Strength") == 0) col_prints[i] = agcc_print_float; else if (wcscmp(data_set->col_headers[i].name, L"Signal A") == 0) col_prints[i] = agcc_print_float; else if (wcscmp(data_set->col_headers[i].name, L"Signal B") == 0) col_prints[i] = agcc_print_float; else if (wcscmp(data_set->col_headers[i].name, L"Forced Call") == 0) col_prints[i] = agcc_print_call; else error("Unknown column type %ls in AGCC file with type %d\n", data_set->col_headers[i].name, data_set->col_headers[i].type); } if (hseek(data_set->hfile, data_set->pos_first_element, SEEK_SET) < 0) error("Fail to seek to position %d in AGCC file\n", data_set->pos_first_element); for (i = 0; i < data_set->n_rows; i++) { read_bytes(data_set->hfile, (void *)data_set->buffer, data_set->n_buffer); for (j = 0; j < data_set->n_cols; j++) { col_prints[j](data_set->buffer + data_set->col_offsets[j], stream); fputc(col_ends[j], stream); } } free(col_ends); free(col_prints); } static void agcc_print_data_group(const DataGroup *data_group, FILE *stream, int verbose) { fprintf(stream, "#%%GroupName=%ls\n", data_group->name); int i; for (i = 0; i < data_group->num_data_sets; i++) agcc_print_data_set(&data_group->data_sets[i], stream, verbose); } static void agcc_print(const agcc_t *agcc, FILE *stream, int verbose) { fprintf(stream, "#%%File=%s\n", agcc->fn); fprintf(stream, "#%%FileSize=%ld\n", agcc->size); fprintf(stream, "#%%Magic=%d\n", agcc->magic); fprintf(stream, 
"#%%Version=%d\n", agcc->version); int i; agcc_print_data_header(&agcc->data_header, stream); for (i = 0; i < agcc->num_data_groups; i++) agcc_print_data_group(&agcc->data_groups[i], stream, verbose); } static void chps_to_tsv(uint8_t *magic, agcc_t **agcc, int n, FILE *stream) { int i, j, k; // AxiomGT1 analysis has also cn-probe-chrXY-ratio_gender_meanX, // cn-probe-chrXY-ratio_gender_meanY, cn-probe-chrXY-ratio_gender_ratio, // cn-probe-chrXY-ratio_gender while BRLMM-P analysis has also em-cluster-chrX-het-contrast_gender // em-cluster-chrX-het-contrast_gender_chrX_het_rate // pm_mean static const wchar_t *chipsummary[] = {L"computed_gender", L"call_rate", L"total_call_rate", L"het_rate", L"total_het_rate", L"hom_rate", L"total_hom_rate", L"cluster_distance_mean", L"cluster_distance_stdev", L"allele_summarization_mean", L"allele_summarization_stdev", L"allele_deviation_mean", L"allele_deviation_stdev", L"allele_mad_residuals_mean", L"allele_mad_residuals_stdev"}; fputs("chp", stream); for (j = 0; j < 15; j++) fprintf(stream, "\t%ls", chipsummary[j]); fputc('\n', stream); for (i = 0; i < n; i++) { if (magic[i] != 59) continue; if (strcmp(agcc[i]->data_header.data_type_identifier, "affymetrix-multi-data-type-analysis") != 0) { if (strcmp(agcc[i]->data_header.data_type_identifier, "affymetrix-calvin-intensity") == 0 || strcmp(agcc[i]->data_header.data_type_identifier, "affymetrix-calvin-multi-intensity") == 0) error( "AGCC file %s contains calvin intensities rather multi data type analysis (use --cel to extract " "metadata)\n", agcc[i]->fn); else error("AGCC file %s does not contain multi data type analysis as data type identifier is %s\n", agcc[i]->fn, agcc[i]->data_header.data_type_identifier); } fputs(strrchr(agcc[i]->fn, '/') ? 
strrchr(agcc[i]->fn, '/') + 1 : agcc[i]->fn, stream); DataHeader *data_header = &agcc[i]->data_header; for (j = 0, k = 0; j < 15; j++) { fputc('\t', stream); while (!data_header->parameters[k].name || wcsncmp(data_header->parameters[k].name, L"affymetrix-chipsummary-", 23) != 0 || wcscmp(&data_header->parameters[k].name[23], chipsummary[j]) != 0) { k++; k %= data_header->n_parameters; } union { uint32_t u; float f; } convert; switch (data_header->parameters[k].type) { case FLOAT: convert.u = ntohl(*(uint32_t *)data_header->parameters[k].value); fprintf(stream, "%.5f", convert.f); break; case STRING: fputs(data_header->parameters[k].value, stream); break; default: error("Unable to print parameter of type %d from %s AGCC file\n", data_header->parameters[k].type, agcc[i]->fn); break; } } fputc('\n', stream); } } /**************************************** * PRINT CEL SUMMARY * ****************************************/ // this function returns // fusion-experiment-name // pixel-cols // pixel-rows // XIN // YIN // VE // temp // power // scan-date // scanner-id // scanner-type // array-type static void parse_dat_header(char *dat_header, char *str[12], int n_str[12]) { char *ss = strchr(dat_header, ' ') + 2; char *se = strchr(dat_header, '\0'); if (!se) goto fail; se = strchr(ss, ':'); if (!se) goto fail; str[0] = ss; n_str[0] = se - ss; ss = se + 5; for (se = ss + 4; isspace(*se) && se >= ss; se--); str[1] = ss; n_str[1] = se - ss + 1; ss = ss + 9; for (se = ss + 4; isspace(*se) && se >= ss; se--); str[2] = ss; n_str[2] = se - ss + 1; ss = ss + 9; for (se = ss + 2; isspace(*se) && se >= ss; se--); str[3] = ss; n_str[3] = se - ss + 1; ss = ss + 7; for (se = ss + 2; isspace(*se) && se >= ss; se--); str[4] = ss; n_str[4] = se - ss + 1; ss = ss + 6; for (se = ss + 2; isspace(*se) && se >= ss; se--); str[5] = ss; n_str[5] = se - ss + 1; ss = ss + 3; for (se = ss + 6; isspace(*se) && se >= ss; se--); str[6] = ss; n_str[6] = se - ss + 1; ss = ss + 7; for (se = ss + 3; 
isspace(*se) && se >= ss; se--); str[7] = ss; n_str[7] = se - ss + 1; ss = ss + 4; for (se = ss + 17; isspace(*se) && se >= ss; se--); str[8] = ss; n_str[8] = se - ss + 1; ss = ss + 18; se = strchr(ss, ' '); if (!se) goto fail; str[9] = ss; n_str[9] = se - ss; ss = se + 2; se = strstr(ss, "\x14 "); if (!se) goto fail; for (se--; isspace(*se) && se >= ss; se--); str[10] = ss; n_str[10] = se - ss + 1; se = strstr(ss, "\x14 "); if (!se) goto fail; ss = se + 2; se = strstr(ss, "\x14 "); if (!se) goto fail; ss = se + 2; se = strstr(ss, ".1sq"); if (!se) goto fail; str[11] = ss; n_str[11] = se - ss; return; fail: error("DAT header malformed\n"); } // http://github.com/HenrikBengtsson/affxparser/blob/master/R/parseDatHeaderString.R static void cels_to_tsv(uint8_t *magic, void **files, int n, FILE *stream) { int i, j; wchar_t *array_type = NULL; // affymetrix-array-type wchar_t *scanner_type = NULL; // affymetrix-scanner-type wchar_t *scanner_id = NULL; // affymetrix-scanner-id wchar_t *scan_date = NULL; // affymetrix-scan-date wchar_t *fusion_experiment_name = NULL; // affymetrix-fusion-experiment-name size_t m_array_type = 0, m_scanner_type = 0, m_scanner_id = 0, m_scan_date = 0, m_fusion_experiment_name = 0; int32_t pixel_rows = 0; // affymetrix-pixel-rows int32_t pixel_cols = 0; // affymetrix-pixel-cols char *str[12]; int n_str[12]; fprintf(stream, "cel\tarray_type\tscanner_type\tscanner_id\tscan_date\tfusion_experiment_name\tpixel_rows\tpixel_cols\n"); for (i = 0; i < n; i++) { char *ss, *se; agcc_t *agcc = (agcc_t *)files[i]; xda_cel_t *xda_cel = (xda_cel_t *)files[i]; switch (magic[i]) { case 59: if (strcmp(agcc->data_header.data_type_identifier, "affymetrix-calvin-intensity") != 0 && strcmp(agcc->data_header.data_type_identifier, "affymetrix-calvin-multi-intensity") != 0) error("AGCC file %s does not contain calvin intensities as data type identifier is %s\n", agcc->fn, agcc->data_header.data_type_identifier); if (agcc->data_header.n_parents == 0 || 
(strcmp(agcc->data_header.parents[0].data_type_identifier, "affymetrix-calvin-scan-acquisition") != 0 && strcmp(agcc->data_header.parents[0].data_type_identifier, "affymetrix-calvin-multi-scan-acquisition") != 0)) error("AGCC file %s is missing scan acquisition information as data type identifier is %s\n", agcc->fn, agcc->data_header.parents[0].data_type_identifier); const Parameter *parameter; for (j = 0; j < agcc->data_header.parents[0].n_parameters; j++) { parameter = &agcc->data_header.parents[0].parameters[j]; if (wcscmp(parameter->name, L"affymetrix-array-type") == 0 && parameter->type == WSTRING) buffer_string16((uint16_t *)parameter->value, parameter->n_value, &m_array_type, &array_type); else if (wcscmp(parameter->name, L"affymetrix-scanner-type") == 0 && parameter->type == WSTRING) buffer_string16((uint16_t *)parameter->value, parameter->n_value, &m_scanner_type, &scanner_type); else if (wcscmp(parameter->name, L"affymetrix-scanner-id") == 0 && parameter->type == WSTRING) buffer_string16((uint16_t *)parameter->value, parameter->n_value, &m_scanner_id, &scanner_id); else if (wcscmp(parameter->name, L"affymetrix-scan-date") == 0 && parameter->type == WSTRING) buffer_string16((uint16_t *)parameter->value, parameter->n_value, &m_scan_date, &scan_date); else if (wcscmp(parameter->name, L"affymetrix-fusion-experiment-name") == 0 && parameter->type == WSTRING) buffer_string16((uint16_t *)parameter->value, parameter->n_value, &m_fusion_experiment_name, &fusion_experiment_name); if (wcscmp(parameter->name, L"affymetrix-pixel-rows") == 0 && parameter->type == INT) pixel_rows = (int32_t)ntohl(*(uint32_t *)parameter->value); if (wcscmp(parameter->name, L"affymetrix-pixel-cols") == 0 && parameter->type == INT) pixel_cols = (int32_t)ntohl(*(uint32_t *)parameter->value); } fputs(strrchr(agcc->fn, '/') ? 
strrchr(agcc->fn, '/') + 1 : agcc->fn, stream); fputc('\t', stream); if (array_type) { fprintf(stream, "%ls", array_type); array_type[0] = L'\0'; } fputc('\t', stream); if (scanner_type) { fprintf(stream, "%ls", scanner_type); scanner_type[0] = L'\0'; } fputc('\t', stream); if (scanner_id) { fprintf(stream, "%ls", scanner_id); scanner_id[0] = L'\0'; } fputc('\t', stream); if (scan_date) { fprintf(stream, "%ls", scan_date); scan_date[0] = L'\0'; } fputc('\t', stream); if (fusion_experiment_name) { fprintf(stream, "%ls", fusion_experiment_name); fusion_experiment_name[0] = L'\0'; } fputc('\t', stream); if (pixel_rows) { fprintf(stream, "%d", pixel_rows); pixel_rows = 0; } fputc('\t', stream); if (pixel_cols) { fprintf(stream, "%d", pixel_cols); pixel_cols = 0; } fputc('\n', stream); break; case 64: ss = strstr(xda_cel->header, "\nDatHeader=["); if (!ss) error("XDA CEL file %s is missing DAT header\n", xda_cel->fn); ss = strchr(ss + 12, ']'); if (!ss) error("XDA CEL file %s is missing DAT header\n", xda_cel->fn); ss++; se = strchr(ss, '\n'); if (!se) error("XDA CEL file %s is missing DAT header\n", xda_cel->fn); *se = '\0'; parse_dat_header(ss, str, n_str); *se = '\n'; fprintf(stream, "%s\t%.*s\t%.*s\t%.*s\t%.*s\t%.*s\t%.*s\t%.*s\n", strrchr(xda_cel->fn, '/') ? 
strrchr(xda_cel->fn, '/') + 1 : xda_cel->fn, n_str[11], str[11], n_str[10], str[10], n_str[9], str[9], n_str[8], str[8], n_str[0], str[0], n_str[1], str[1], n_str[2], str[2]); break; default: break; } } free(array_type); free(scanner_type); free(scanner_id); free(scan_date); free(fusion_experiment_name); } /**************************************** * htsFILE READING FUNCTIONS * ****************************************/ static htsFile *unheader(const char *fn, kstring_t *str) { htsFile *fp = hts_open(fn, "r"); if (fp == NULL) error("Could not open %s: %s\n", fn, strerror(errno)); do // skip header if (hts_getline(fp, KS_SEP_LINE, str) <= 0) error("Empty file: %s\n", fn); while (str->s[0] == '#'); return fp; } /************************************************ * PROBEST IDS FILE IMPLEMENTATION * ************************************************/ static void *probeset_ids_init(const char *fn) { void *probeset_ids = khash_str2int_init(); kstring_t str = {0, 0, NULL}; htsFile *fp = unheader(fn, &str); int moff = 0, *off = NULL, ncols; ncols = ksplit_core(str.s, '\t', &moff, &off); if (ncols < 1 || strcmp(&str.s[off[0]], "probeset_id")) error("Malformed first line from probeset IDs file: %s\n%s\n", fn, str.s); while (hts_getline(fp, KS_SEP_LINE, &str) > 0) { ncols = ksplit_core(str.s, '\t', &moff, &off); if (khash_str2int_has_key(probeset_ids, &str.s[off[0]])) error("Probe Set %s present multiple times in file %s\n", &str.s[off[0]], fn); khash_str2int_inc(probeset_ids, strdup(&str.s[off[0]])); } free(off); free(str.s); hts_close(fp); return probeset_ids; } /************************************************ * SNP CLUSTER POSTERIORS FILE IMPLEMENTATION * ************************************************/ // http://www.affymetrix.com/support/developer/powertools/changelog/SnpModelConverter_8cpp_source.html typedef struct { float xm; // delta mean of cluster float xss; // delta variance of cluster float k; // strength of mean (pseudo-observations) float v; // strength of variance 
(pseudo-observations) float ym; // size mean of cluster in other dimension float yss; // size variance of cluster in other dimension float xyss; // covariance of cluster in both directions } cluster_t; typedef struct { char *probeset_id; int copynumber; cluster_t aa; cluster_t ab; cluster_t bb; } snp_t; typedef struct { int is_birdseed; void *probeset_id[2]; snp_t *snps[2]; int n_snps[2]; int m_snps[2]; } snp_models_t; static inline void brlmmp_cluster_init(const char *s, const int *off, cluster_t *cluster) { cluster->xm = strtof(&s[off[0]], NULL); cluster->xss = strtof(&s[off[1]], NULL); cluster->k = strtof(&s[off[2]], NULL); cluster->v = strtof(&s[off[3]], NULL); cluster->ym = strtof(&s[off[4]], NULL); cluster->yss = strtof(&s[off[5]], NULL); cluster->xyss = strtof(&s[off[6]], NULL); } static inline void birdseed_cluster_init(const char *s, const int *off, cluster_t *cluster) { cluster->xm = strtof(&s[off[0]], NULL); cluster->ym = strtof(&s[off[1]], NULL); cluster->xss = strtof(&s[off[2]], NULL); cluster->xyss = strtof(&s[off[3]], NULL); cluster->yss = strtof(&s[off[4]], NULL); cluster->k = strtof(&s[off[5]], NULL); cluster->v = strtof(&s[off[5]], NULL); } static snp_models_t *snp_models_init(const char *fn) { int i; snp_models_t *snp_models = (snp_models_t *)calloc(1, sizeof(snp_models_t)); for (i = 0; i < 2; i++) { snp_models->probeset_id[i] = khash_str2int_init(); } kstring_t str = {0, 0, NULL}; htsFile *fp = unheader(fn, &str); int sep1, sep2, sep3, exp_cols; if (strcmp(str.s, "id\tBB\tAB\tAA\tCV") == 0 || strcmp(str.s, "id\tBB\tAB\tAA\tCV\tOTV") == 0) { if (hts_getline(fp, KS_SEP_LINE, &str) <= 0) error("Missing information in SNP posteriors file: %s\n", fn); sep1 = '\t'; sep2 = ','; sep3 = ':'; exp_cols = 7; } else if (!strchr(str.s, '\t')) { snp_models->is_birdseed = 1; sep1 = ';'; sep2 = ' '; sep3 = '-'; exp_cols = 6; } else { error("Malformed header line in SNP model file %s:\n%s\n", fn, str.s); } snp_t *snp; int moff1 = 0, *off1 = NULL, ncols1; int 
moff2 = 0, *off2 = NULL, ncols2; do { ncols1 = ksplit_core(str.s, sep1, &moff1, &off1); char *col_str = &str.s[off1[0]]; int len = strlen(col_str); int copynumber; if (col_str[len - 2] == sep3) { char *tmp; copynumber = strtol(&col_str[len - 1], &tmp, 0); if (*tmp) error("Could not parse copynumber %s from file: %s\n", &col_str[len - 1], fn); len -= 2; col_str[len] = '\0'; } else { copynumber = 2; } int idx = copynumber == 2; hts_expand(snp_t, snp_models->n_snps[idx] + 1, snp_models->m_snps[idx], snp_models->snps[idx]); snp = &snp_models->snps[idx][snp_models->n_snps[idx]]; snp->probeset_id = strdup(&str.s[off1[0]]); snp->copynumber = copynumber; if (khash_str2int_has_key(snp_models->probeset_id[idx], snp->probeset_id)) error("Probe Set %s present multiple times in file %s\n", snp->probeset_id, fn); khash_str2int_inc(snp_models->probeset_id[idx], snp->probeset_id); if (ncols1 < 4 - (2 - copynumber) * snp_models->is_birdseed) error("Missing information for probeset %s in SNP posteriors file: %s\n", str.s, fn); col_str = &str.s[off1[1]]; ncols2 = ksplit_core(col_str, sep2, &moff2, &off2); if (ncols2 < exp_cols) error("Missing information for probeset %s in SNP posteriors file: %s\n", str.s, fn); if (snp_models->is_birdseed) birdseed_cluster_init(col_str, off2, &snp->aa); else brlmmp_cluster_init(col_str, off2, &snp->bb); col_str = &str.s[off1[2]]; if (snp_models->is_birdseed && copynumber == 1) { snp->ab.xm = NAN; snp->ab.xss = NAN; snp->ab.k = NAN; snp->ab.v = NAN; snp->ab.ym = NAN; snp->ab.yss = NAN; snp->ab.xyss = NAN; } else { ncols2 = ksplit_core(col_str, sep2, &moff2, &off2); if (ncols2 < exp_cols) error("Missing information for probeset %s in SNP posteriors file: %s\n", str.s, fn); if (snp_models->is_birdseed) birdseed_cluster_init(col_str, off2, &snp->ab); else brlmmp_cluster_init(col_str, off2, &snp->ab); col_str = &str.s[off1[3]]; } ncols2 = ksplit_core(col_str, sep2, &moff2, &off2); if (ncols2 < exp_cols) error("Missing information for probeset %s in SNP 
// Strips the surrounding double quotes from a CSV field, modifying it in place.
//
// str - NUL-terminated field, normally of the form "value"
//
// Returns NULL when the field is exactly "---" including the quotes (the
// missing value marker), otherwise a pointer inside str to the unquoted
// value. A field that does not start with a double quote, which includes the
// empty string, is returned untouched rather than skipping its first
// character, which for an empty field would point past the terminator.
static inline char *unquote(char *str) {
    if (strcmp(str, "\"---\"") == 0) return NULL;
    if (str[0] != '"') return str;
    // look for the closing quote after the opening one
    char *ptr = strrchr(str + 1, '"');
    if (ptr) *ptr = '\0';
    return str + 1;
}
null_strand = "---"; if (strcmp(str.s, "#%netaffx-annotation-tabular-format-version=1.5") == 0) null_strand = "+"; if (hts && out_txt) fprintf(out_txt, "%s\n", str.s); hts_getline(fp, KS_SEP_LINE, &str); } if (hts && out_txt) fprintf(out_txt, "%s\n", str.s); int probe_set_id_idx = -1; int affy_snp_id_idx = -1; int dbsnp_rs_id_idx = -1; int chromosome_idx = -1; int position_idx = -1; int position_end_idx = -1; int strand_idx = -1; int flank_idx = -1; int allele_a_idx = -1; int allele_b_idx = -1; int i, moff = 0, *off = NULL; int ncols = ksplit_core(str.s, ',', &moff, &off); for (i = 0; i < ncols; i++) { if (strcmp(&str.s[off[i]], "\"Probe Set ID\"") == 0) probe_set_id_idx = i; else if (strcmp(&str.s[off[i]], "\"Affy SNP ID\"") == 0) affy_snp_id_idx = i; else if (strcmp(&str.s[off[i]], "\"dbSNP RS ID\"") == 0) dbsnp_rs_id_idx = i; else if (strcmp(&str.s[off[i]], "\"Chromosome\"") == 0) chromosome_idx = i; else if (strcmp(&str.s[off[i]], "\"Physical Position\"") == 0) position_idx = i; else if (strcmp(&str.s[off[i]], "\"Position End\"") == 0) position_end_idx = i; else if (strcmp(&str.s[off[i]], "\"Strand\"") == 0) strand_idx = i; else if (strcmp(&str.s[off[i]], "\"Flank\"") == 0) flank_idx = i; else if (strcmp(&str.s[off[i]], "\"Allele A\"") == 0) allele_a_idx = i; else if (strcmp(&str.s[off[i]], "\"Allele B\"") == 0) allele_b_idx = i; } if (probe_set_id_idx != 0) error("Probe Set ID not the first column in file: %s\n", fn); if (flank_idx == -1) error("Flank missing from file: %s\n", fn); if (allele_a_idx == -1) error("Allele A missing from file: %s\n", fn); if (allele_b_idx == -1) error("Allele B missing from file: %s\n", fn); const char *probeset_id, *flank, *allele_a, *allele_b; if (!hts && out_txt) { while (hts_getline(fp, KS_SEP_LINE, &str) > 0) { ncols = ksplit_core(str.s, ',', &moff, &off); probeset_id = unquote(&str.s[off[probe_set_id_idx]]); flank = unquote(&str.s[off[flank_idx]]); if (flank) flank2fasta(probeset_id, flank, out_txt); } } else { if 
(dbsnp_rs_id_idx == -1) error("dbSNP RS ID missing from file: %s\n", fn); if (chromosome_idx == -1) error("Chromosome missing from file: %s\n", fn); if (position_idx == -1) error("Physical Position missing from file: %s\n", fn); if (strand_idx == -1) error("Strand missing from file: %s\n", fn); if (!out_txt) { annot = (annot_t *)calloc(1, sizeof(annot_t)); annot->probeset_id = khash_str2int_init(); } int n_total = 0, n_unmapped = 0; while (hts_getline(fp, KS_SEP_LINE, &str) > 0) { ncols = ksplit_core(str.s, ',', &moff, &off); probeset_id = unquote(&str.s[off[probe_set_id_idx]]); flank = unquote(&str.s[off[flank_idx]]); allele_a = unquote(&str.s[off[allele_a_idx]]); allele_b = unquote(&str.s[off[allele_b_idx]]); const char *chromosome = NULL; int strand = -1, position = 0, idx = -1; if (hts) { if (!flank) { if (flags & VERBOSE) fprintf(stderr, "Missing flank sequence for marker %s\n", probeset_id); n_unmapped++; } else { idx = get_position(hts, sam_hdr, b, probeset_id, flank, 0, &chromosome, &position, &strand); if (idx < 0) error("Reading from %s failed", sam_fn); else if (idx == 0) { if (flags & VERBOSE) fprintf(stderr, "Unable to determine position for marker %s\n", probeset_id); n_unmapped++; } } n_total++; } else { chromosome = unquote(&str.s[off[chromosome_idx]]); const char *ptr = unquote(&str.s[off[position_idx]]); char *tmp = NULL; if (ptr) { position = strtol(ptr, &tmp, 0); if (*tmp) error("Could not parse position %s from file: %s\n", ptr, fn); } else { position = 0; } ptr = unquote(&str.s[off[strand_idx]]); if (!ptr) strand = -1; else if (strcmp(ptr, "+") == 0) strand = 0; else if (strcmp(ptr, "-") == 0) strand = 1; else strand = -1; } if (out_txt) { // "Ref Allele" and "Alt Allele" will not be updated fprintf(out_txt, "\"%s\"", probeset_id); for (i = 1; i < ncols; i++) { if (i == flank_idx) { fprintf(out_txt, ",\"%s\"", flank); } else if (i == allele_a_idx) { fprintf(out_txt, ",\"%s\"", allele_a); } else if (i == allele_b_idx) { fprintf(out_txt, 
",\"%s\"", allele_b); } else if (i == chromosome_idx) { if (chromosome) fprintf(out_txt, ",\"%s\"", chromosome); else fprintf(out_txt, ",\"---\""); } else if (i == position_idx) { if (position) fprintf(out_txt, ",\"%d\"", position); else fprintf(out_txt, ",\"---\""); } else if (i == position_end_idx) { if (flank && position && idx > 0) { const char *left = strchr(flank, '['); const char *middle = strchr(flank, '/'); const char *right = strchr(flank, ']'); if (!left || !middle || !right) error("Flank sequence is malformed: %s\n", flank); fprintf(out_txt, ",\"%d\"", position + (int)(idx > 1 ? right - middle : middle - left + (*(left + 1) == '-')) - 2); } else { fprintf(out_txt, ",\"---\""); } } else if (i == strand_idx) { fprintf(out_txt, ",\"%s\"", strand == 0 ? "+" : (strand == 1 ? "-" : null_strand)); } else { fprintf(out_txt, ",%s", &str.s[off[i]]); } } fprintf(out_txt, "\n"); } else { hts_expand0(record_t, annot->n_records + 1, annot->m_records, annot->records); annot->records[annot->n_records].probeset_id = strdup(probeset_id); if (khash_str2int_has_key(annot->probeset_id, annot->records[annot->n_records].probeset_id)) error("Probe Set %s present multiple times in file %s\n", annot->records[annot->n_records].probeset_id, fn); khash_str2int_inc(annot->probeset_id, annot->records[annot->n_records].probeset_id); const char *dbsnp_rs_id = unquote(&str.s[off[dbsnp_rs_id_idx]]); if (dbsnp_rs_id) annot->records[annot->n_records].dbsnp_rs_id = strdup(dbsnp_rs_id); if (affy_snp_id_idx >= 0) { const char *affy_snp_id = unquote(&str.s[off[affy_snp_id_idx]]); if (affy_snp_id) annot->records[annot->n_records].affy_snp_id = strdup(affy_snp_id); } if (chromosome) annot->records[annot->n_records].chromosome = strdup(chromosome); annot->records[annot->n_records].position = position; if (flank) { annot->records[annot->n_records].flank = strdup(flank); // check whether alleles A and B need to be flipped in // the flank sequence (happens with T/C and T/G SNPs // only) char *left = 
strchr(annot->records[annot->n_records].flank, '['); char *middle = strchr(annot->records[annot->n_records].flank, '/'); char *right = strchr(annot->records[annot->n_records].flank, ']'); if (strncmp(left + 1, allele_b, middle - left - 1) == 0 && strncmp(middle + 1, allele_a, right - middle - 1) == 0) { memcpy(left + 1, allele_a, right - middle - 1); *(left + (right - middle)) = '/'; memcpy(left + (right - middle) + 1, allele_b, middle - left - 1); } } annot->records[annot->n_records].strand = strand; annot->n_records++; } } if (hts) fprintf(stderr, "Lines total/unmapped:\t%d/%d\n", n_total, n_unmapped); bam_destroy1(b); sam_hdr_destroy(sam_hdr); if (hts && hts_close(hts) < 0) error("closing \"%s\" failed", fn); } free(off); free(str.s); hts_close(fp); if (out_txt && out_txt != stdout && out_txt != stderr) fclose(out_txt); return annot; } static void annot_destroy(annot_t *annot) { int i; khash_str2int_destroy(annot->probeset_id); for (i = 0; i < annot->n_records; i++) { free(annot->records[i].probeset_id); free(annot->records[i].affy_snp_id); free(annot->records[i].dbsnp_rs_id); free(annot->records[i].chromosome); free(annot->records[i].flank); } free(annot->records); free(annot); } /**************************************** * READER ITERATORS * ****************************************/ #define MAX_LENGTH_PROBE_SET_ID 17 typedef struct { int nsmpl; DataSet **data_sets; int *nrows; int *is_brlmm_p; htsFile *calls_fp; htsFile *confidences_fp; htsFile *summary_fp; char probeset_id[MAX_LENGTH_PROBE_SET_ID + 1]; int *gts; float *conf_arr; float *norm_x_arr; float *norm_y_arr; float *delta_arr; float *size_arr; float *baf_arr; float *lrr_arr; } varitr_t; static void varitr_init_common(varitr_t *varitr) { varitr->gts = (int *)malloc(varitr->nsmpl * sizeof(int)); varitr->conf_arr = (float *)malloc(varitr->nsmpl * sizeof(float)); varitr->norm_x_arr = (float *)malloc(varitr->nsmpl * sizeof(float)); varitr->norm_y_arr = (float *)malloc(varitr->nsmpl * sizeof(float)); 
varitr->delta_arr = (float *)malloc(varitr->nsmpl * sizeof(float)); varitr->size_arr = (float *)malloc(varitr->nsmpl * sizeof(float)); varitr->baf_arr = (float *)malloc(varitr->nsmpl * sizeof(float)); varitr->lrr_arr = (float *)malloc(varitr->nsmpl * sizeof(float)); } static varitr_t *varitr_init_cc(bcf_hdr_t *hdr, agcc_t **agcc, int n) { int i; varitr_t *varitr = (varitr_t *)calloc(1, sizeof(varitr_t)); varitr->nsmpl = n; varitr->data_sets = (DataSet **)malloc(n * sizeof(DataSet *)); varitr->nrows = (int *)calloc(n, sizeof(int)); varitr->is_brlmm_p = (int *)malloc(n * sizeof(int)); for (i = 0; i < n; i++) { if (strcmp(agcc[i]->data_header.data_type_identifier, "affymetrix-multi-data-type-analysis") != 0) error("AGCC file %s does not contain multi data type analysis as \n", agcc[i]->fn); if (agcc[i]->num_data_groups == 0 || wcscmp(agcc[i]->data_groups[0].name, L"MultiData") != 0) error("AGCC file %s does not contain multi data\n", agcc[i]->fn); if (agcc[i]->data_groups[0].num_data_sets == 0 || wcscmp(agcc[i]->data_groups[0].data_sets[0].name, L"Genotype") != 0) error("AGCC file %s does not contain genotype data\n", agcc[i]->fn); DataSet *data_set = &agcc[i]->data_groups[0].data_sets[0]; if (wcscmp(data_set->col_headers[0].name, L"ProbeSetName") != 0 || wcscmp(data_set->col_headers[1].name, L"Call") != 0 || wcscmp(data_set->col_headers[2].name, L"Confidence") != 0 || wcscmp(data_set->col_headers[5].name, L"Forced Call") != 0) error("AGCC file %s does not contain genotype data in the expected format\n", agcc[i]->fn); if (wcscmp(data_set->col_headers[3].name, L"Contrast") == 0 || wcscmp(data_set->col_headers[3].name, L"Log Ratio") == 0 || wcscmp(data_set->col_headers[4].name, L"Strength") == 0) varitr->is_brlmm_p[i] = 1; // ProbeSetName / Call / Confidence / Contrast/Log Ratio // / Strength / Forced Call else if (wcscmp(data_set->col_headers[3].name, L"Signal A") == 0 || wcscmp(data_set->col_headers[4].name, L"Signal B") == 0) varitr->is_brlmm_p[i] = 0; // 
ProbeSetName / Call / Confidence / Signal A // / Signal B / Forced Call else error("AGCC file %s does not contain intensities data in the expected format\n", agcc[i]->fn); if (hseek(data_set->hfile, data_set->pos_first_element, SEEK_SET) < 0) error("Fail to seek to position %d in AGCC file\n", data_set->pos_first_element); bcf_hdr_add_sample(hdr, agcc[i]->display_name); varitr->data_sets[i] = data_set; } varitr_init_common(varitr); return varitr; } static varitr_t *varitr_init_txt(bcf_hdr_t *hdr, const char *calls_fn, const char *confidences_fn, const char *summary_fn) { varitr_t *varitr = (varitr_t *)calloc(1, sizeof(varitr_t)); kstring_t str = {0, 0, NULL}; int i, moff = 0, *off = NULL, ncols; if (calls_fn) { fprintf(stderr, "Reading genotype calls file %s\n", calls_fn); varitr->calls_fp = unheader(calls_fn, &str); ncols = ksplit_core(str.s, '\t', &moff, &off); if (strcmp(&str.s[off[0]], "probeset_id")) error("Malformed first line from calls file: %s\n%s\n", calls_fn, str.s); varitr->nsmpl = ncols - 1; for (i = 1; i < ncols; i++) { char *ptr = strrchr(&str.s[off[i]], '.'); if (ptr && strcmp(ptr + 1, "CEL") == 0) *ptr = '\0'; bcf_hdr_add_sample(hdr, &str.s[off[i]]); } } if (confidences_fn) { fprintf(stderr, "Reading genotype confidences file %s\n", confidences_fn); varitr->confidences_fp = unheader(confidences_fn, &str); ncols = ksplit_core(str.s, '\t', &moff, &off); if (strcmp(&str.s[off[0]], "probeset_id")) error("Malformed first line from confidences file: %s\n%s\n", confidences_fn, str.s); if (!varitr->calls_fp) { varitr->nsmpl = ncols - 1; for (i = 1; i < ncols; i++) { char *ptr = strrchr(&str.s[off[i]], '.'); if (ptr && strcmp(ptr + 1, "CEL") == 0) *ptr = '\0'; bcf_hdr_add_sample(hdr, &str.s[off[i]]); } } } if (summary_fn) { fprintf(stderr, "Reading allelic intensities file %s\n", summary_fn); varitr->summary_fp = unheader(summary_fn, &str); ncols = ksplit_core(str.s, '\t', &moff, &off); if (strcmp(&str.s[off[0]], "probeset_id")) error("Malformed first line 
from summary file: %s\n%s\n", summary_fn, str.s); if (!varitr->calls_fp && !varitr->confidences_fp) { varitr->nsmpl = ncols - 1; for (i = 1; i < ncols; i++) { char *ptr = strrchr(&str.s[off[i]], '.'); if (ptr && strcmp(ptr + 1, "CEL") == 0) *ptr = '\0'; bcf_hdr_add_sample(hdr, &str.s[off[i]]); } } } free(str.s); free(off); varitr_init_common(varitr); return varitr; } static inline void check_probe_set_id(char *dest, const char *src) { if (dest[0] == '\0') { if (strlen(src) > MAX_LENGTH_PROBE_SET_ID) error("Probe Set Name %s is too long\n", src); strcpy(dest, src); } else { if (strcmp(dest, src) != 0) error("Probe Set Name mismatch: %s %s\n", dest, src); } } static int varitr_loop(varitr_t *varitr, void *probeset_ids) { int i, ret = 0; varitr->probeset_id[0] = '\0'; if (varitr->data_sets) { for (i = 0; i < varitr->nsmpl; i++) { DataSet *data_set = varitr->data_sets[i]; uint32_t n; char probeset_id[MAX_LENGTH_PROBE_SET_ID + 1]; do { varitr->nrows[i]++; // check whether you have arrived at the last element if (varitr->nrows[i] > data_set->n_rows) return -1; read_bytes(data_set->hfile, (void *)data_set->buffer, data_set->n_buffer); n = ntohl(*(uint32_t *)&data_set->buffer[data_set->col_offsets[0]]); if (n > MAX_LENGTH_PROBE_SET_ID) error("Probe Set Name %.*s is too long\n", n, &data_set->buffer[data_set->col_offsets[0] + 4]); strncpy(probeset_id, &data_set->buffer[data_set->col_offsets[0] + 4], (size_t)n); probeset_id[n] = '\0'; } while (probeset_ids && !khash_str2int_has_key(probeset_ids, probeset_id)); check_probe_set_id(varitr->probeset_id, probeset_id); varitr->gts[i] = chp_gt[data_set->buffer[data_set->col_offsets[1]] & 0x0F]; union { uint32_t u; float f; } convert; convert.u = ntohl(*(uint32_t *)&data_set->buffer[data_set->col_offsets[2]]); varitr->conf_arr[i] = convert.f; if (varitr->is_brlmm_p[i]) { convert.u = ntohl(*(uint32_t *)&data_set->buffer[data_set->col_offsets[3]]); varitr->delta_arr[i] = convert.f; convert.u = ntohl(*(uint32_t 
*)&data_set->buffer[data_set->col_offsets[4]]); varitr->size_arr[i] = convert.f; varitr->norm_x_arr[i] = expf((varitr->size_arr[i] + varitr->delta_arr[i] * 0.5f) * (float)M_LN2); varitr->norm_y_arr[i] = expf((varitr->size_arr[i] - varitr->delta_arr[i] * 0.5f) * (float)M_LN2); } else { convert.u = ntohl(*(uint32_t *)&data_set->buffer[data_set->col_offsets[3]]); varitr->norm_x_arr[i] = convert.f; convert.u = ntohl(*(uint32_t *)&data_set->buffer[data_set->col_offsets[4]]); varitr->norm_y_arr[i] = convert.f; float log2x = logf(varitr->norm_x_arr[i]) * (float)M_LOG2E; float log2y = logf(varitr->norm_y_arr[i]) * (float)M_LOG2E; varitr->delta_arr[i] = log2x - log2y; varitr->size_arr[i] = (log2x + log2y) * 0.5f; } } } else { kstring_t str = {0, 0, NULL}; int moff = 0, *off = NULL, ncols, len; kstring_t str_b = {0, 0, NULL}; int moff_b = 0, *off_b = NULL, ncols_b, len_b; char *tmp; // read genotypes if (varitr->calls_fp) { do { if ((ret = hts_getline(varitr->calls_fp, KS_SEP_LINE, &str)) < 0) goto exit; ncols = ksplit_core(str.s, '\t', &moff, &off); if (ncols != 1 + varitr->nsmpl) error("Expected %d columns but %d columns found in the calls file\n", 1 + varitr->nsmpl, ncols); for (i = 1; i < 1 + varitr->nsmpl; i++) { int gt = strtol(&str.s[off[i]], &tmp, 0); if (*tmp || gt < -4 || gt > 27) error("Could not parse genotype %s found in the calls file\n", &str.s[off[i]]); varitr->gts[i - 1] = txt_gt[4 + gt]; } } while (probeset_ids && !khash_str2int_has_key(probeset_ids, &str.s[off[0]])); check_probe_set_id(varitr->probeset_id, &str.s[off[0]]); } // read confidences if (varitr->confidences_fp) { do { if ((ret = hts_getline(varitr->confidences_fp, KS_SEP_LINE, &str)) < 0) goto exit; ncols = ksplit_core(str.s, '\t', &moff, &off); if (ncols != 1 + varitr->nsmpl) error("Expected %d columns but %d columns found in the confidences file\n", 1 + varitr->nsmpl, ncols); for (i = 1; i < 1 + varitr->nsmpl; i++) varitr->conf_arr[i - 1] = strtof(&str.s[off[i]], &tmp); } while (probeset_ids 
&& !khash_str2int_has_key(probeset_ids, &str.s[off[0]])); check_probe_set_id(varitr->probeset_id, &str.s[off[0]]); } // read intensities if (varitr->summary_fp) { do { // skips -C/-D/-E/-F/-G summary statistics do { if ((ret = hts_getline(varitr->summary_fp, KS_SEP_LINE, &str)) < 0) goto exit; ncols = ksplit_core(str.s, '\t', &moff, &off); if (ncols != 1 + varitr->nsmpl) error("Expected %d columns but %d columns found in the summary file\n", 1 + varitr->nsmpl, ncols); len = strlen(&str.s[off[0]]); } while (str.s[off[0] + len - 2] != '-' && str.s[off[0] + len - 1] != 'A'); // skips probes with -A summary statistics only do { // check whether the next line contains the expected -B probeset_id if ((ret = hts_getline(varitr->summary_fp, KS_SEP_LINE, &str_b)) < 0) goto exit; ncols_b = ksplit_core(str_b.s, '\t', &moff_b, &off_b); if (ncols_b != 1 + varitr->nsmpl) error("Expected %d columns but %d columns found in the summary file\n", 1 + varitr->nsmpl, ncols_b); len_b = strlen(&str_b.s[off_b[0]]); if (str_b.s[off_b[0] + len_b - 2] == '-' && str_b.s[off_b[0] + len_b - 1] == 'B') break; kstring_t str_tmp = str; str = str_b; str_b = str_tmp; int len_tmp = len; len = len_b; len_b = len_tmp; int moff_tmp = moff; moff = moff_b; moff_b = moff_tmp; int *off_tmp = off; off = off_b; off_b = off_tmp; int ncols_tmp = ncols; ncols = ncols_b; ncols_b = ncols_tmp; } while (1); if (len != len_b || strncmp(&str.s[off[0]], &str_b.s[off_b[0]], len - 2) != 0) error("Mismatching %s and %s Probe Set IDs found in the summary file\n", &str.s[off[0]], &str_b.s[off_b[0]]); for (i = 1; i < 1 + varitr->nsmpl; i++) { varitr->norm_x_arr[i - 1] = strtof(&str.s[off[i]], &tmp); if (*tmp) error("Could not parse intensity value %s found in the summary file\n", &str.s[off[i]]); varitr->norm_y_arr[i - 1] = strtof(&str_b.s[off_b[i]], &tmp); if (*tmp) error("Could not parse intensity value %s found in the summary file\n", &str_b.s[off_b[i]]); float log2x = logf(varitr->norm_x_arr[i - 1]) * (float)M_LOG2E; 
float log2y = logf(varitr->norm_y_arr[i - 1]) * (float)M_LOG2E; varitr->delta_arr[i - 1] = log2x - log2y; varitr->size_arr[i - 1] = (log2x + log2y) * 0.5f; } str.s[off[0] + len - 2] = '\0'; } while (probeset_ids && !khash_str2int_has_key(probeset_ids, &str.s[off[0]])); check_probe_set_id(varitr->probeset_id, &str.s[off[0]]); } exit: free(str_b.s); free(off_b); free(str.s); free(off); } return ret; } static void varitr_destroy(varitr_t *varitr) { free(varitr->data_sets); free(varitr->nrows); free(varitr->is_brlmm_p); if (varitr->calls_fp) hts_close(varitr->calls_fp); if (varitr->confidences_fp) hts_close(varitr->confidences_fp); if (varitr->summary_fp) hts_close(varitr->summary_fp); free(varitr->gts); free(varitr->conf_arr); free(varitr->norm_x_arr); free(varitr->norm_y_arr); free(varitr->delta_arr); free(varitr->size_arr); free(varitr->baf_arr); free(varitr->lrr_arr); free(varitr); } /**************************************** * OUTPUT FUNCTIONS * ****************************************/ static bcf_hdr_t *hdr_init(const faidx_t *fai, int flags) { bcf_hdr_t *hdr = bcf_hdr_init("w"); int i, n = faidx_nseq(fai); for (i = 0; i < n; i++) { const char *seq = faidx_iseq(fai, i); int len = faidx_seq_len(fai, seq); bcf_hdr_printf(hdr, "##contig=", seq, len); } bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); if (flags & SNP_LOADED) { bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, 
"##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); } if (!(flags & NO_INFO_GC)) bcf_hdr_append(hdr, "##INFO="); if ((flags & CALLS_LOADED) && (flags & FORMAT_GT)) bcf_hdr_append(hdr, "##FORMAT="); if ((flags & CONFIDENCES_LOADED) && (flags & FORMAT_CONF)) bcf_hdr_append(hdr, "##FORMAT="); if (flags & SUMMARY_LOADED) { if (flags & FORMAT_NORMX) bcf_hdr_append(hdr, "##FORMAT="); if (flags & FORMAT_NORMY) bcf_hdr_append(hdr, "##FORMAT="); if (flags & FORMAT_DELTA) bcf_hdr_append(hdr, "##FORMAT="); if (flags & FORMAT_SIZE) bcf_hdr_append(hdr, "##FORMAT="); } if ((flags & SUMMARY_LOADED) && (flags & SNP_LOADED)) { if (flags & FORMAT_BAF) bcf_hdr_append(hdr, "##FORMAT="); if (flags & FORMAT_LRR) bcf_hdr_append(hdr, "##FORMAT="); } return hdr; } // adjust cluster centers (using apt-probeset-genotype posteriors as priors) // similar to // http://github.com/WGLab/PennCNV/blob/master/affy/bin/generate_affy_geno_cluster.pl static void adjust_clusters(const int *gts, const float *x, const float *y, int n, snp_t *snp) { snp->aa.xm *= 0.2f; snp->ab.xm *= 0.2f; snp->bb.xm *= 0.2f; snp->aa.ym *= 0.2f; snp->ab.ym *= 0.2f; snp->bb.ym *= 0.2f; snp->aa.k = 0.2f; snp->ab.k = 0.2f; snp->bb.k = 0.2f; int i; for (i = 0; i < n; i++) { switch (gts[i]) { case GT_AA: 
snp->aa.k++; snp->aa.xm += x[i]; snp->aa.ym += y[i]; break; case GT_AB: snp->ab.k++; snp->ab.xm += x[i]; snp->ab.ym += y[i]; break; case GT_BB: snp->bb.k++; snp->bb.xm += x[i]; snp->bb.ym += y[i]; break; default: break; } } snp->aa.xm /= snp->aa.k; snp->ab.xm /= snp->ab.k; snp->bb.xm /= snp->bb.k; snp->aa.ym /= snp->aa.k; snp->ab.ym /= snp->ab.k; snp->bb.ym /= snp->bb.k; } static void update_info_cluster(const bcf_hdr_t *hdr, bcf1_t *rec, const char **info_str, const snp_t *snp) { bcf_update_info_float(hdr, rec, info_str[0], &snp->aa.xm, 1); bcf_update_info_float(hdr, rec, info_str[1], &snp->ab.xm, 1); bcf_update_info_float(hdr, rec, info_str[2], &snp->bb.xm, 1); bcf_update_info_float(hdr, rec, info_str[3], &snp->aa.xss, 1); bcf_update_info_float(hdr, rec, info_str[4], &snp->ab.xss, 1); bcf_update_info_float(hdr, rec, info_str[5], &snp->bb.xss, 1); bcf_update_info_float(hdr, rec, info_str[6], &snp->aa.k, 1); bcf_update_info_float(hdr, rec, info_str[7], &snp->ab.k, 1); bcf_update_info_float(hdr, rec, info_str[8], &snp->bb.k, 1); bcf_update_info_float(hdr, rec, info_str[9], &snp->aa.v, 1); bcf_update_info_float(hdr, rec, info_str[10], &snp->ab.v, 1); bcf_update_info_float(hdr, rec, info_str[11], &snp->bb.v, 1); bcf_update_info_float(hdr, rec, info_str[12], &snp->aa.ym, 1); bcf_update_info_float(hdr, rec, info_str[13], &snp->ab.ym, 1); bcf_update_info_float(hdr, rec, info_str[14], &snp->bb.ym, 1); bcf_update_info_float(hdr, rec, info_str[15], &snp->aa.yss, 1); bcf_update_info_float(hdr, rec, info_str[16], &snp->ab.yss, 1); bcf_update_info_float(hdr, rec, info_str[17], &snp->bb.yss, 1); bcf_update_info_float(hdr, rec, info_str[18], &snp->aa.xyss, 1); bcf_update_info_float(hdr, rec, info_str[19], &snp->ab.xyss, 1); bcf_update_info_float(hdr, rec, info_str[20], &snp->bb.xyss, 1); } // compute LRR and BAF // similar to // http://github.com/WGLab/PennCNV/blob/master/affy/bin/normalize_affy_geno_cluster.pl static void compute_baf_lrr(const float *norm_x, const float 
*norm_y, int n, const snp_t *snp, int is_birdseed, float *baf, float *lrr) { float aa_theta, ab_theta, bb_theta, aa_r, ab_r, bb_r; if (is_birdseed) { aa_theta = atan2f(snp->aa.ym, snp->aa.xm) * (float)M_2_PI; ab_theta = atan2f(snp->ab.ym, snp->ab.xm) * (float)M_2_PI; bb_theta = atan2f(snp->bb.ym, snp->bb.xm) * (float)M_2_PI; aa_r = snp->aa.xm + snp->aa.ym; ab_r = snp->ab.xm + snp->ab.ym; bb_r = snp->bb.xm + snp->bb.ym; } else { aa_theta = atanf(expf(-snp->aa.xm * (float)M_LN2)) * (float)M_2_PI; ab_theta = atanf(expf(-snp->ab.xm * (float)M_LN2)) * (float)M_2_PI; bb_theta = atanf(expf(-snp->bb.xm * (float)M_LN2)) * (float)M_2_PI; aa_r = expf(snp->aa.ym * (float)M_LN2) * 2.0f * coshf(snp->aa.xm * 0.5f * (float)M_LN2); ab_r = expf(snp->ab.ym * (float)M_LN2) * 2.0f * coshf(snp->ab.xm * 0.5f * (float)M_LN2); bb_r = expf(snp->bb.ym * (float)M_LN2) * 2.0f * coshf(snp->bb.xm * 0.5f * (float)M_LN2); } // handles chromosome Y SNPs if (snp->copynumber == 1) { ab_theta = (aa_theta + bb_theta) * 0.5f; ab_r = (aa_r + bb_r) * 0.5f; } int i; for (i = 0; i < n; i++) { float ilmn_theta = atan2f(norm_y[i], norm_x[i]) * (float)M_2_PI; float ilmn_r = norm_x[i] + norm_y[i]; get_baf_lrr(ilmn_theta, ilmn_r, aa_theta, ab_theta, bb_theta, aa_r, ab_r, bb_r, NAN, &baf[i], &lrr[i]); } } static void process(faidx_t *fai, const annot_t *annot, void *probeset_ids, snp_models_t *snp_models, varitr_t *varitr, htsFile *out_fh, bcf_hdr_t *hdr, int flags, int gc_win) { int i, nsmpl = bcf_hdr_nsamples(hdr); if ((flags & ADJUST_CLUSTERS) && (nsmpl < 100)) fprintf(stderr, "Warning: adjusting clusters with %d sample(s) is not recommended\n", nsmpl); bcf1_t *rec = bcf_init(); char ref_base[] = {'\0', '\0'}; kstring_t allele_a = {0, 0, NULL}; kstring_t allele_b = {0, 0, NULL}; kstring_t flank = {0, 0, NULL}; int32_t *gt_arr = (int32_t *)malloc(nsmpl * 2 * sizeof(int32_t)); float *baf_arr = (float *)malloc(nsmpl * sizeof(float)); float *lrr_arr = (float *)malloc(nsmpl * sizeof(float)); int n_missing = 0, 
n_no_snp_models = 0, n_skipped = 0; for (i = 0; i < annot->n_records; i++) { // identify variants to use for next VCF record int idx; if (varitr) { if (varitr_loop(varitr, probeset_ids) < 0) break; int ret = khash_str2int_get(annot->probeset_id, varitr->probeset_id, &idx); if (ret < 0) error("Probe Set %s not found in manifest file\n", varitr->probeset_id); } else { if (probeset_ids && !khash_str2int_has_key(probeset_ids, annot->records[i].probeset_id)) { n_skipped++; continue; } idx = i; } record_t *record = &annot->records[idx]; bcf_clear(rec); rec->n_sample = nsmpl; rec->rid = bcf_hdr_name2id_flexible(hdr, record->chromosome); rec->pos = record->position - 1; if (rec->rid < 0 || rec->pos < 0 || record->strand < 0 || !record->flank) { if (flags & VERBOSE) fprintf(stderr, "Skipping unlocalized marker %s\n", record->probeset_id); n_skipped++; continue; } bcf_update_id(hdr, rec, record->probeset_id); flank.l = 0; kputs(record->flank, &flank); strupper(flank.s); if (record->strand) flank_reverse_complement(flank.s); int len, win = min(max(max(gc_win, strlen(flank.s)), 100), rec->pos); char *ref = faidx_fetch_seq(fai, bcf_seqname(hdr, rec), rec->pos - win, rec->pos + win, &len); if (!ref || len == 1) error("faidx_fetch_seq failed at %s:%" PRId64 " (are you using the correct reference genome?)\n", bcf_seqname(hdr, rec), rec->pos + 1); strupper(ref); if (!(flags & NO_INFO_GC)) { float gc_ratio = get_gc_ratio(&ref[max(win - gc_win, 0)], &ref[min(win + gc_win, len)]); bcf_update_info_float(hdr, rec, "GC", &gc_ratio, 1); } ref_base[0] = ref[win]; int32_t allele_b_idx; allele_a.l = allele_b.l = 0; if (strchr(flank.s, '-')) { kputc('D', &allele_a); kputc('I', &allele_b); int ref_is_del = get_indel_alleles(&allele_a, &allele_b, flank.s, ref, win, len, 0); if (ref_is_del < 0) { if (flags & VERBOSE) fprintf(stderr, "Unable to determine alleles for indel %s\n", record->probeset_id); n_missing++; } if (ref_is_del == 0) { rec->pos--; ref_base[0] = ref[win - 1]; } allele_b_idx = 
ref_is_del < 0 ? 1 : ref_is_del; } else { const char *left = strchr(flank.s, '['); const char *middle = strchr(flank.s, '/'); const char *right = strchr(flank.s, ']'); if (!left || !middle || !right) error("Flank sequence is malformed: %s\n", flank.s); kputsn(left + 1, middle - left - 1, &allele_a); kputsn(middle + 1, right - middle - 1, &allele_b); if (middle - left == 2 && right - middle == 2) { allele_b_idx = get_allele_b_idx(ref_base[0], allele_a.s, allele_b.s); } else { int allele_a_match = strncmp(left + 1, &ref[win], middle - left - 1) == 0; int allele_b_match = strncmp(middle + 1, &ref[win], right - middle - 1) == 0; if (allele_a_match && !allele_b_match) { allele_b_idx = 1; } else if (!allele_a_match && allele_b_match) { allele_b_idx = 0; } else if (allele_a_match && allele_b_match) { int allele_a_right = len_common_prefix(right + 1, &ref[win] + (middle - left) - 1, strlen(right + 1)); int allele_b_right = len_common_prefix(right + 1, &ref[win] + (right - middle) - 1, strlen(right + 1)); allele_b_idx = allele_a_right > allele_b_right; } else { allele_b_idx = -1; } } } free(ref); int32_t allele_a_idx = get_allele_a_idx(allele_b_idx); const char *alleles[3]; int nals = alleles_ab_to_vcf(alleles, ref_base, allele_a.s, allele_b.s, allele_b_idx); if (nals < 0) error("Unable to process Probe Set %s\n", record->probeset_id); bcf_update_alleles(hdr, rec, alleles, nals); bcf_update_info_int32(hdr, rec, "ALLELE_A", &allele_a_idx, 1); bcf_update_info_int32(hdr, rec, "ALLELE_B", &allele_b_idx, 1); if (record->dbsnp_rs_id) bcf_update_info_string(hdr, rec, "DBSNP_RS_ID", record->dbsnp_rs_id); if (record->affy_snp_id) bcf_update_info_string(hdr, rec, "AFFY_SNP_ID", record->affy_snp_id); if (varitr) { if ((varitr->data_sets || varitr->calls_fp) && flags & FORMAT_GT) { for (i = 0; i < nsmpl; i++) { switch (varitr->gts[i]) { case GT_AA: gt_arr[2 * i] = bcf_gt_unphased(allele_a_idx); gt_arr[2 * i + 1] = bcf_gt_unphased(allele_a_idx); break; case GT_AB: gt_arr[2 * i] = 
bcf_gt_unphased(min(allele_a_idx, allele_b_idx)); gt_arr[2 * i + 1] = bcf_gt_unphased(max(allele_a_idx, allele_b_idx)); break; case GT_BB: gt_arr[2 * i] = bcf_gt_unphased(allele_b_idx); gt_arr[2 * i + 1] = bcf_gt_unphased(allele_b_idx); break; case GT_NC: gt_arr[2 * i] = bcf_gt_missing; gt_arr[2 * i + 1] = bcf_gt_missing; break; default: error("Genotype for Probe Set ID %s is malformed: %d\n", record->probeset_id, varitr->gts[i]); break; } } bcf_update_genotypes(hdr, rec, gt_arr, nsmpl * 2); } if ((varitr->data_sets || varitr->confidences_fp) && flags & FORMAT_CONF) bcf_update_format_float(hdr, rec, "CONF", varitr->conf_arr, nsmpl); if (varitr->data_sets || varitr->summary_fp) { if (flags & FORMAT_NORMX) bcf_update_format_float(hdr, rec, "NORMX", varitr->norm_x_arr, nsmpl); if (flags & FORMAT_NORMY) bcf_update_format_float(hdr, rec, "NORMY", varitr->norm_y_arr, nsmpl); if (flags & FORMAT_DELTA) bcf_update_format_float(hdr, rec, "DELTA", varitr->delta_arr, nsmpl); if (flags & FORMAT_SIZE) bcf_update_format_float(hdr, rec, "SIZE", varitr->size_arr, nsmpl); } } if (snp_models) { int rets[2], idxs[2]; for (i = 0; i < 2; i++) { rets[i] = khash_str2int_get(snp_models->probeset_id[i], record->probeset_id, &idxs[i]); } static const char *hap_info_str[] = { "meanX_AA.1", "meanX_AB.1", "meanX_BB.1", "varX_AA.1", "varX_AB.1", "varX_BB.1", "nObsMean_AA.1", "nObsMean_AB.1", "nObsMean_BB.1", "nObsVar_AA.1", "nObsVar_AB.1", "nObsVar_BB.1", "meanY_AA.1", "meanY_AB.1", "meanY_BB.1", "varY_AA.1", "varY_AB.1", "varY_BB.1", "covarXY_AA.1", "covarXY_AB.1", "covarXY_BB.1"}; static const char *dip_info_str[] = { "meanX_AA", "meanX_AB", "meanX_BB", "varX_AA", "varX_AB", "varX_BB", "nObsMean_AA", "nObsMean_AB", "nObsMean_BB", "nObsVar_AA", "nObsVar_AB", "nObsVar_BB", "meanY_AA", "meanY_AB", "meanY_BB", "varY_AA", "varY_AB", "varY_BB", "covarXY_AA", "covarXY_AB", "covarXY_BB"}; if (rets[0] >= 0) update_info_cluster(hdr, rec, hap_info_str, &snp_models->snps[0][idxs[0]]); if (rets[1] >= 0) 
update_info_cluster(hdr, rec, dip_info_str, &snp_models->snps[1][idxs[1]]); snp_t *snp = rets[1] >= 0 ? &snp_models->snps[1][idxs[1]] : (rets[0] >= 0 ? &snp_models->snps[0][idxs[0]] : NULL); if (!snp) { n_no_snp_models++; if (flags & VERBOSE) fprintf(stderr, "Warning: SNP model for Probe Set ID %s was not found\n", record->probeset_id); } else { if (flags & ADJUST_CLUSTERS) adjust_clusters(varitr->gts, snp_models->is_birdseed ? varitr->norm_x_arr : varitr->delta_arr, snp_models->is_birdseed ? varitr->norm_y_arr : varitr->size_arr, nsmpl, snp); if (flags & SUMMARY_LOADED) { compute_baf_lrr(varitr->norm_x_arr, varitr->norm_y_arr, nsmpl, snp, snp_models->is_birdseed, baf_arr, lrr_arr); if (flags & FORMAT_BAF) bcf_update_format_float(hdr, rec, "BAF", baf_arr, nsmpl); if (flags & FORMAT_LRR) bcf_update_format_float(hdr, rec, "LRR", lrr_arr, nsmpl); } } } if (bcf_write(out_fh, hdr, rec) < 0) error("Unable to write to output VCF file\n"); } if (snp_models) fprintf(stderr, "Lines total/missing-reference/missing-snp-posteriors/skipped:\t%d/%d/%d/%d\n", i, n_missing, n_no_snp_models, n_skipped); else fprintf(stderr, "Lines total/missing-reference/skipped:\t%d/%d/%d\n", i, n_missing, n_skipped); free(gt_arr); free(baf_arr); free(lrr_arr); free(allele_a.s); free(allele_b.s); free(flank.s); bcf_destroy(rec); return; } /**************************************** * PLUGIN * ****************************************/ const char *about(void) { return "convert Affymetrix files to VCF.\n"; } static const char *usage_text(void) { return "\n" "About: convert Affymetrix apt-probeset-genotype output files to VCF. 
" "(version " AFFY2VCF_VERSION " http://github.com/freeseek/gtc2vcf)\n" "Usage: bcftools +affy2vcf [options] --csv --fasta-ref [ ...]\n" "\n" "Plugin options:\n" " -l, --list-tags list available FORMAT tags with description for VCF output\n" " -t, --tags LIST list of output FORMAT tags [" TAG_LIST_DFLT "]\n" " -c, --csv CSV manifest file (can be gzip compressed)\n" " -f, --fasta-ref reference sequence in fasta format\n" " --set-cache-size select fasta cache size in bytes\n" " --gc-window-size window size in bp used to compute the GC content (-1 for no estimate) " "[" GC_WIN_DFLT "]\n" " --probeset-ids tab delimited file with column 'probeset_id' specifying probesets to " "convert\n" " --calls apt-probeset-genotype calls output (can be gzip compressed)\n" " --confidences apt-probeset-genotype confidences output (can be gzip compressed)\n" " --summary apt-probeset-genotype summary output (can be gzip compressed)\n" " --snp apt-probeset-genotype SNP posteriors output (can be gzip compressed)\n" " --chps input CHP files rather than tab delimited files\n" " --cel input CEL files rather CHP files\n" " --adjust-clusters adjust cluster centers in (Contrast, Size) space (requires --snp)\n" " --no-version do not append version and command line to the header\n" " -o, --output write output to a file [standard output]\n" " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level " "[v]\n" " --threads number of extra output compression threads [0]\n" " -x, --extra write CHP metadata to a file (requires CHP files)\n" " -v, --verbose print verbose information\n" " -W, --write-index[=FMT] Automatically index the output files [off]\n" "\n" "Manifest options:\n" " --fasta-flank output flank sequence in FASTA format (requires --csv)\n" " -s, --sam-flank input flank sequence alignment in SAM/BAM format (requires --csv)\n" "\n" "Examples:\n" " bcftools +affy2vcf \\\n" " --csv GenomeWideSNP_6.na35.annot.csv \\\n" " --fasta-ref 
human_g1k_v37.fasta \\\n" " --chps cc-chp/ \\\n" " --snp AxiomGT1.snp-posteriors.txt \\\n" " --output AxiomGT1.vcf \\\n" " --extra report.tsv\n" " bcftools +affy2vcf \\\n" " --csv GenomeWideSNP_6.na35.annot.csv \\\n" " --fasta-ref human_g1k_v37.fasta \\\n" " --calls AxiomGT1.calls.txt \\\n" " --confidences AxiomGT1.confidences.txt \\\n" " --summary AxiomGT1.summary.txt \\\n" " --snp AxiomGT1.snp-posteriors.txt \\\n" " --output AxiomGT1.vcf\n" "\n" "Examples of manifest file options:\n" " bcftools +affy2vcf -c GenomeWideSNP_6.na35.annot.csv --fasta-flank -o GenomeWideSNP_6.fasta\n" " bwa mem -M Homo_sapiens_assembly38.fasta GenomeWideSNP_6.fasta -o " "GenomeWideSNP_6.sam\n" " bcftools +affy2vcf -c GenomeWideSNP_6.na35.annot.csv -s GenomeWideSNP_6.sam -o " "GenomeWideSNP_6.na35.annot.GRCh38.csv\n" "\n"; } static int parse_tags(const char *str) { int i, flags = 0, n; char **tags = hts_readlist(str, 0, &n); for (i = 0; i < n; i++) { if (!strcasecmp(tags[i], "GT")) flags |= FORMAT_GT; else if (!strcasecmp(tags[i], "CONF")) flags |= FORMAT_CONF; else if (!strcasecmp(tags[i], "NORMX")) flags |= FORMAT_NORMX; else if (!strcasecmp(tags[i], "NORMY")) flags |= FORMAT_NORMY; else if (!strcasecmp(tags[i], "DELTA")) flags |= FORMAT_DELTA; else if (!strcasecmp(tags[i], "SIZE")) flags |= FORMAT_SIZE; else if (!strcasecmp(tags[i], "LRR")) flags |= FORMAT_LRR; else if (!strcasecmp(tags[i], "BAF")) flags |= FORMAT_BAF; else error("Error parsing \"--tags %s\": the tag \"%s\" is not supported\n", str, tags[i]); free(tags[i]); } if (n) free(tags); return flags; } static void list_tags(void) { error( "FORMAT/GT Number:1 Type:String .. Genotype\n" "FORMAT/CONF Number:1 Type:Float .. Genotype confidence\n" "FORMAT/BAF Number:1 Type:Float .. B Allele Frequency\n" "FORMAT/LRR Number:1 Type:Float .. Log R Ratio\n" "FORMAT/NORMX Number:1 Type:Float .. Normalized X intensity\n" "FORMAT/NORMY Number:1 Type:Float .. Normalized Y intensity\n" "FORMAT/DELTA Number:1 Type:Float .. 
// print the FORMAT tags that can be requested with --tags, through error()
static void list_tags(void) {
    static const char tag_descriptions[] =
        "FORMAT/GT Number:1 Type:String .. Genotype\n"
        "FORMAT/CONF Number:1 Type:Float .. Genotype confidence\n"
        "FORMAT/BAF Number:1 Type:Float .. B Allele Frequency\n"
        "FORMAT/LRR Number:1 Type:Float .. Log R Ratio\n"
        "FORMAT/NORMX Number:1 Type:Float .. Normalized X intensity\n"
        "FORMAT/NORMY Number:1 Type:Float .. Normalized Y intensity\n"
        "FORMAT/DELTA Number:1 Type:Float .. Normalized Delta value\n"
        "FORMAT/SIZE Number:1 Type:Float .. Normalized Size value\n";
    error("%s", tag_descriptions);
}
case 'c': csv_fname = optarg; break; case 'f': ref_fname = optarg; break; case 1: cache_size = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --set-cache-size %s\n", optarg); break; case 2: gc_win = (int)strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --gc-window-size %s\n", optarg); if (gc_win <= 0) flags |= NO_INFO_GC; break; case 3: probeset_ids_fname = optarg; flags |= PROBESET_IDS_LOADED; break; case 4: calls_fname = optarg; flags |= CALLS_LOADED; break; case 5: confidences_fname = optarg; flags |= CONFIDENCES_LOADED; break; case 6: summary_fname = optarg; flags |= SUMMARY_LOADED; break; case 7: snp_fname = optarg; flags |= SNP_LOADED; break; case 11: pathname = optarg; break; case 12: flags |= LOAD_CEL; break; case 13: flags |= ADJUST_CLUSTERS; break; case 8: record_cmd_line = 0; break; case 'o': output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': output_type = FT_BCF_GZ; break; case 'u': output_type = FT_BCF; break; case 'z': output_type = FT_VCF_GZ; break; case 'v': output_type = FT_VCF; break; default: { clevel = strtol(optarg, &tmp, 10); if (*tmp || clevel < 0 || clevel > 9) error("The output type \"%s\" not recognised\n", optarg); } } if (optarg[1]) { clevel = strtol(optarg + 1, &tmp, 10); if (*tmp || clevel < 0 || clevel > 9) error("Could not parse argument: --compression-level %s\n", optarg + 1); } break; case 9: n_threads = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse argument: --threads %s\n", optarg); break; case 'x': extra_fname = optarg; break; case 'v': flags |= VERBOSE; break; case 14: fasta_flank = 1; break; case 's': sam_fname = optarg; break; case 'W': if (!(write_index = write_index_parse(optarg))) error("Unsupported index format '%s'\n", optarg); break; case 'h': case '?': default: error("%s", usage_text()); } } flags |= parse_tags(tag_list); int nfiles = 0; char **filenames = NULL; if (pathname) { filenames = get_file_list(pathname, flags & LOAD_CEL ? 
"CEL" : "chp", &nfiles); } else { nfiles = argc - optind; filenames = argv + optind; } uint8_t *magic = (uint8_t *)malloc(nfiles * sizeof(uint8_t *)); void **files = (void **)malloc(nfiles * sizeof(void *)); if (csv_fname) { if (flags & LOAD_CEL) error("Cannot use --csv with --cel as CEL files cannot be converted\n%s", usage_text()); if (fasta_flank && sam_fname) error("Only one of --fasta-flank or --sam-flank options can be used at once\n%s", usage_text()); if (!fasta_flank && !sam_fname && !ref_fname) error("Expected one of --fasta-flank or --sam-flank or --fasta-ref options\n%s", usage_text()); if ((flags & ADJUST_CLUSTERS) && (!summary_fname || !snp_fname)) error("Expected --summary and --snp options with --adjust-clusters option\n%s", usage_text()); if (nfiles == 0 && extra_fname) error("Expected CHP files with --extra option\n%s", usage_text()); if (nfiles > 0 && (calls_fname || confidences_fname || summary_fname)) error( "Cannot load tables --calls, --confidences, --summary if CHP files provided " "instead\n%s", usage_text()); } else if (nfiles == 0) { error("%s", usage_text()); } // beginning of plugin run fprintf(stderr, "affy2vcf " AFFY2VCF_VERSION " http://github.com/freeseek/gtc2vcf\n"); if (nfiles > 0 && !(flags & LOAD_CEL)) flags |= CALLS_LOADED | CONFIDENCES_LOADED | SUMMARY_LOADED; // make sure the process is allowed to open enough files struct rlimit lim; getrlimit(RLIMIT_NOFILE, &lim); if (nfiles + 7 > lim.rlim_max) error("On this system you cannot open more than %ld files at once while %d is required\n", lim.rlim_max, nfiles + 7); if (nfiles + 7 > lim.rlim_cur) { lim.rlim_cur = nfiles + 7; setrlimit(RLIMIT_NOFILE, &lim); } annot_t *annot = NULL; if (csv_fname) { fprintf(stderr, "Reading CSV file %s\n", csv_fname); if (sam_fname) fprintf(stderr, "Reading SAM file %s\n", sam_fname); annot = annot_init(csv_fname, sam_fname, ((sam_fname && !ref_fname) || fasta_flank) ? 
output_fname : NULL, flags); } for (i = 0; i < nfiles; i++) { hFILE *hfile = hopen(filenames[i], "rb"); if (hfile == NULL) error("Could not open %s: %s\n", filenames[i], strerror(errno)); if (hpeek(hfile, (void *)&magic[i], 1) < 1) { error("Failed to read from file %s\n", filenames[i]); } switch (magic[i]) { case 59: fprintf(stderr, "Reading AGCC file %s\n", filenames[i]); files[i] = (void *)agcc_init(filenames[i], hfile, nfiles > 1); break; case 64: fprintf(stderr, "Reading XDA CEL file %s\n", filenames[i]); files[i] = (void *)xda_cel_init(filenames[i], hfile, nfiles > 1); break; case 65: error("Currently unable to read XDA CHP format for file %s\n", filenames[i]); default: error("Expected magic numbers 59, 64 or 65 but found %d in file %s\n", magic[i], filenames[i]); } } if (annot) { if (extra_fname && !(flags & LOAD_CEL)) { out_txt = get_file_handle(extra_fname); chps_to_tsv(magic, (agcc_t **)files, nfiles, out_txt); } fai = fai_load(ref_fname); if (!fai) error("Could not load the reference %s\n", ref_fname); if (cache_size) fai_set_cache_size(fai, cache_size); if (probeset_ids_fname) fprintf(stderr, "Reading probeset IDs file %s\n", probeset_ids_fname); void *probeset_ids = probeset_ids_fname ? probeset_ids_init(probeset_ids_fname) : NULL; if (snp_fname) fprintf(stderr, "Reading SNP posteriors file %s\n", snp_fname); snp_models_t *snp_models = snp_fname ? snp_models_init(snp_fname) : NULL; fprintf(stderr, "Writing VCF file\n"); bcf_hdr_t *hdr = hdr_init(fai, flags); bcf_hdr_printf(hdr, "##CSV=%s", strrchr(csv_fname, '/') ? strrchr(csv_fname, '/') + 1 : csv_fname); if (sam_fname) bcf_hdr_printf(hdr, "##SAM=%s", strrchr(sam_fname, '/') ? strrchr(sam_fname, '/') + 1 : sam_fname); if (snp_fname) bcf_hdr_printf(hdr, "##SNP=%s", strrchr(snp_fname, '/') ? 
strrchr(snp_fname, '/') + 1 : snp_fname); if (record_cmd_line) bcf_hdr_append_version(hdr, argc, argv, "bcftools_affy2vcf"); char wmode[8]; set_wmode(wmode, output_type, (char *)output_fname, clevel); htsFile *out_fh = hts_open(output_fname, hts_bcf_wmode(output_type)); if (out_fh == NULL) error("[%s] Error: cannot write to \"%s\": %s\n", __func__, output_fname, strerror(errno)); if (n_threads) hts_set_threads(out_fh, n_threads); varitr_t *varitr = NULL; if (nfiles > 0) varitr = varitr_init_cc(hdr, (agcc_t **)files, nfiles); else if (calls_fname || confidences_fname || summary_fname) varitr = varitr_init_txt(hdr, calls_fname, confidences_fname, summary_fname); if (bcf_hdr_write(out_fh, hdr) < 0) error("Unable to write to output VCF file\n"); if (init_index2(out_fh, hdr, output_fname, &index_fname, write_index) < 0) error("Error: failed to initialise index for %s\n", output_fname); if (bcf_hdr_sync(hdr) < 0) error_errno("[%s] Failed to update header", __func__); // updates the number of samples process(fai, annot, probeset_ids, snp_models, varitr, out_fh, hdr, flags, gc_win); if (varitr) varitr_destroy(varitr); if (snp_models) snp_models_destroy(snp_models); if (probeset_ids) khash_str2int_destroy_free(probeset_ids); fai_destroy(fai); bcf_hdr_destroy(hdr); if (write_index) { if (bcf_idx_save(out_fh) < 0) { if (hts_close(out_fh) != 0) error("Close failed %s\n", strcmp(output_fname, "-") ? output_fname : "stdout"); error("Error: cannot write to index %s\n", index_fname); } free(index_fname); } if (hts_close(out_fh) != 0) error("Close failed %s\n", strcmp(output_fname, "-") ? 
output_fname : "stdout"); annot_destroy(annot); } if (!ref_fname && nfiles > 0) { out_txt = get_file_handle(output_fname); if (nfiles == 1) { switch (magic[0]) { case 59: agcc_print((agcc_t *)files[0], out_txt, flags & VERBOSE); break; case 64: xda_cel_print((xda_cel_t *)files[0], out_txt, flags & VERBOSE); break; default: error("Expected magic numbers 59 or 64 but found %d in file %s\n", magic[0], filenames[0]); } } else if (flags & LOAD_CEL) { cels_to_tsv(magic, files, nfiles, out_txt); } else { chps_to_tsv(magic, (agcc_t **)files, nfiles, out_txt); } } if (pathname) { for (i = 0; i < nfiles; i++) free(filenames[i]); free(filenames); } for (i = 0; i < nfiles; i++) { switch (magic[i]) { case 59: agcc_destroy((agcc_t *)files[i]); break; case 64: xda_cel_destroy((xda_cel_t *)files[i]); break; default: error("Expected magic numbers 59 or 64 but found %d in file %s\n", magic[i], filenames[i]); } } free(magic); free(files); if (out_txt && out_txt != stdout && out_txt != stderr) fclose(out_txt); return 0; } ================================================ FILE: gtc2vcf.c ================================================ /* The MIT License Copyright (c) 2018-2026 Giulio Genovese Author: Giulio Genovese Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include "bcftools.h" #include "tsv2vcf.h" #include "gtc2vcf.h" #define GTC2VCF_VERSION "2026-01-26" #define GT_NC 0 #define GT_AA 1 #define GT_AB 2 #define GT_BB 3 #define TAG_LIST_DFLT "GT,GQ,IGC,BAF,LRR,NORMX,NORMY,R,THETA,X,Y" #define GC_WIN_DFLT "200" #define CAPACITY_DFLT "32768" #define GENOME_BUILD_DFLT "GRCh38" #define VERBOSE (1 << 0) #define BPM_LOADED (1 << 1) #define CSV_LOADED (1 << 2) #define EGT_LOADED (1 << 3) #define LOAD_IDAT (1 << 4) #define ADJUST_CLUSTERS (1 << 5) #define GENOME_STUDIO (1 << 6) #define NO_INFO_GC (1 << 7) #define FORMAT_GT (1 << 8) #define FORMAT_GQ (1 << 9) #define FORMAT_IGC (1 << 10) #define FORMAT_BAF (1 << 11) #define FORMAT_LRR (1 << 12) #define FORMAT_NORMX (1 << 13) #define FORMAT_NORMY (1 << 14) #define FORMAT_R (1 << 15) #define FORMAT_THETA (1 << 16) #define FORMAT_X (1 << 17) #define FORMAT_Y (1 << 18) /**************************************** * hFILE READING FUNCTIONS * ****************************************/ // read or skip a fixed length array static void read_array(hFILE *hfile, void **arr, size_t *m_arr, size_t nmemb, size_t size, size_t term) { if (arr) { if (!m_arr) { *arr = malloc((nmemb + term) * size); if (!*arr) error("Failed to allocate memory for array\n"); } else if (*m_arr < nmemb + term) { void *tmp = realloc(*arr, (nmemb + term) * size); if (!tmp) error("Failed to allocate memory for array\n"); *arr = tmp; *m_arr = nmemb + term; } if (hread(hfile, *arr, nmemb * size) < nmemb * size) { error("Failed to read %ld bytes from stream\n", nmemb * size); } } else { int i, c = 0; for (i = 0; i < nmemb * size; i++) c = hgetc(hfile); if (c == EOF) error("Failed to reposition stream forward %ld 
bytes\n", nmemb * size); } } // read or skip a length-prefixed array static void read_pfx_array(hFILE *hfile, void **arr, size_t *m_arr, size_t item_size) { int32_t n; if (hread(hfile, (void *)&n, 4) < 4) { error("Failed to read 4 bytes from stream\n"); } read_array(hfile, arr, m_arr, n, item_size, 0); } // read or skip a length-prefixed string // http://en.wikipedia.org/wiki/LEB128#Decode_unsigned_integer static void read_pfx_string(hFILE *hfile, char **str, size_t *m_str) { uint8_t byte; size_t n = 0, shift = 0; while (1) { if (hread(hfile, (void *)&byte, 1) < 1) { error("Failed to read 1 byte from stream\n"); } n |= (size_t)(byte & 0x7F) << shift; if (!(byte & 0x80)) break; shift += 7; } if (n || m_str) { read_array(hfile, (void **)str, m_str, n, 1, 1); if (str) (*str)[n] = '\0'; } } // check whether file is compressed with gzip static int is_gzip(hFILE *hfile) { uint8_t buffer[2]; if (hpeek(hfile, (void *)buffer, 2) < 2) error("Failed to read 2 bytes from stream\n"); return (buffer[0] == 0x1f && buffer[1] == 0x8b); } /**************************************** * BUFFER ARRAY IMPLEMENTATION * ****************************************/ typedef struct { hFILE *hfile; off_t offset; int32_t item_num; int32_t item_offset; size_t item_capacity; size_t item_size; char *buffer; } buffer_array_t; static buffer_array_t *buffer_array_init(hFILE *hfile, size_t capacity, size_t item_size) { buffer_array_t *arr = (buffer_array_t *)malloc(1 * sizeof(buffer_array_t)); arr->hfile = hfile; read_bytes(hfile, (void *)&arr->item_num, sizeof(int32_t)); arr->offset = htell(arr->hfile); arr->item_offset = 0; arr->item_capacity = (capacity <= 0) ? (size_t)strtol(CAPACITY_DFLT, NULL, 0) : capacity; arr->item_size = item_size; arr->buffer = (char *)malloc(arr->item_capacity * item_size); read_bytes(hfile, (void *)arr->buffer, (arr->item_num < arr->item_capacity ? 
arr->item_num : arr->item_capacity) * item_size); return arr; } static int get_element(buffer_array_t *arr, void *dst, size_t item_idx) { if (!arr || item_idx >= arr->item_num) { return -1; } else if (item_idx - arr->item_offset < arr->item_capacity) { memcpy(dst, (void *)(arr->buffer + (item_idx - arr->item_offset) * arr->item_size), arr->item_size); return 0; } arr->item_offset = item_idx; if (hseek(arr->hfile, arr->offset + item_idx * arr->item_size, SEEK_SET) < 0) { error("Fail to seek to position %ld in file\n", arr->offset + item_idx * arr->item_size); } read_bytes(arr->hfile, (void *)arr->buffer, ((arr->item_num - arr->item_offset) < arr->item_capacity ? (arr->item_num - arr->item_offset) : arr->item_capacity) * arr->item_size); memcpy(dst, (void *)arr->buffer, arr->item_size); return 0; } static void buffer_array_destroy(buffer_array_t *arr) { if (!arr) return; free(arr->buffer); free(arr); } /**************************************** * BPM FILE IMPLEMENTATION * ****************************************/ // http://github.com/snewhouse/glu-genetics/blob/master/glu/lib/illumina.py // http://github.com/Illumina/BeadArrayFiles/blob/develop/module/BeadPoolManifest.py typedef struct { int32_t version; uint8_t norm_id; // Normalization lookups from manifest. 
This indexes into list of // normalization transforms read from GTC file char *ilmn_id; // IlmnID (probe identifier) of locus char *name; // Name (variant identifier) of locus int32_t index; char *ilmn_strand; // TOP BOT PLUS MINUS or Top Bot P M char *snp; // SNP value for locus (e.g., [A/C]) char *chrom; // Chromosome for the locus (e.g., XY) char *ploidy; char *species; char *map_info; // Mapping location of locus char *customer_strand; int32_t address_a; // AddressA ID of locus char *allele_a_probe_seq; // CSV files or BPM files with version 4 data block int32_t address_b; // AddressB ID of locus (0 if none) char *allele_b_probe_seq; // CSV files or BPM files with version 4 data block (empty if // none) char *genome_build; char *source; char *source_version; char *source_strand; char *source_seq; // CSV files or BPM files with version 4 data block char *top_genomic_seq; // CSV files or BPM files with version 4 data block int32_t beadset_id; // CSV files uint8_t exp_clusters; uint8_t intensity_only; uint8_t assay_type; // Identifies type of assay (0 - Infinium II, 1 - Infinium I (A/T), // 2 - Infinium I (G/C) uint8_t assay_type_csv; float frac_a; float frac_c; float frac_g; float frac_t; char *ref_strand; // RefStrand annotation } LocusEntry; // retrieve assay type following (allele_a_probe_seq, source_seq) -> assay_type map // (...W., ...W[./.]W...) -> 1 // (...S., ...S[./.]S...) -> 2 // (...S., ...S[./.]W...) -> 1 // (...S., ...W[./.]S...) -> 1 // (...W., ...S[./.]W...) -> 2 // (...W., ...W[./.]S...) 
-> 2 static uint8_t get_assay_type(const char *allele_a_probe_seq, const char *allele_b_probe_seq, const char *source_seq) { if (!allele_a_probe_seq || !source_seq) return 0xFF; if (!allele_b_probe_seq) return 0; const char *left = strchr(source_seq, '['); const char *right = strchr(source_seq, ']'); if (!left || !right) error("Source sequence is malformed: %s\n", source_seq); char trail_left = toupper(*(left - 1)); char trail_right = toupper(*(right + 1)); if ((trail_left == 'A' || trail_left == 'T') && (trail_right == 'A' || trail_right == 'T')) return 1; if ((trail_left == 'C' || trail_left == 'G') && (trail_right == 'C' || trail_right == 'G')) return 2; int i = 2; while (!(iupac2bitmask(allele_a_probe_seq[strlen(allele_a_probe_seq) - i]) & iupac2bitmask(allele_b_probe_seq[strlen(allele_b_probe_seq) - i]))) i++; char trail_a_probe_seq = toupper(allele_a_probe_seq[strlen(allele_a_probe_seq) - i]); if (trail_a_probe_seq == 'C' || trail_a_probe_seq == 'G' || trail_a_probe_seq == 'S') return 1; if (trail_a_probe_seq == 'A' || trail_a_probe_seq == 'T' || trail_a_probe_seq == 'W') return 2; // these weird rule were deduced from manifests for array GDA_PGx-8v1-0_20042614 if (trail_a_probe_seq == 'Y' && trail_right == 'G') return 1; if (trail_a_probe_seq == 'Y' && trail_right == 'T') return 1; if (trail_a_probe_seq == 'Y' && trail_right == 'A') return 2; if (trail_a_probe_seq == 'K' && trail_right == 'C') return 1; if (trail_a_probe_seq == 'K' && trail_right == 'A') return 2; if (trail_a_probe_seq == 'M' && trail_right == 'G') return 1; if (trail_a_probe_seq == 'M' && trail_right == 'T') return 2; if (trail_a_probe_seq == 'R' && trail_right == 'C') return 1; if (trail_a_probe_seq == 'R' && trail_right == 'T') return 2; fprintf(stderr, "Warning: Unable to retrieve assay type: %s %s %s\n", allele_a_probe_seq, allele_b_probe_seq, source_seq); return 0xFF; } static void locusentry_read(LocusEntry *locus_entry, hFILE *hfile) { locus_entry->norm_id = 0xFF; read_bytes(hfile, 
(void *)&locus_entry->version, sizeof(int32_t)); if (locus_entry->version < 4 || locus_entry->version == 5 || locus_entry->version > 8) error("Locus version %d in manifest file not supported\n", locus_entry->version); read_pfx_string(hfile, &locus_entry->ilmn_id, NULL); read_pfx_string(hfile, &locus_entry->name, NULL); read_pfx_string(hfile, NULL, NULL); // ASOA read_pfx_string(hfile, NULL, NULL); // ASOB read_pfx_string(hfile, NULL, NULL); // LSO read_bytes(hfile, (void *)&locus_entry->index, sizeof(int32_t)); read_pfx_string(hfile, NULL, NULL); // IllumicodeSeq read_pfx_string(hfile, &locus_entry->ilmn_strand, NULL); read_pfx_string(hfile, &locus_entry->snp, NULL); read_pfx_string(hfile, &locus_entry->chrom, NULL); read_pfx_string(hfile, &locus_entry->ploidy, NULL); read_pfx_string(hfile, &locus_entry->species, NULL); read_pfx_string(hfile, &locus_entry->map_info, NULL); read_pfx_string(hfile, &locus_entry->top_genomic_seq, NULL); // only version 4 read_pfx_string(hfile, &locus_entry->customer_strand, NULL); read_bytes(hfile, (void *)&locus_entry->address_a, sizeof(int32_t)); read_bytes(hfile, (void *)&locus_entry->address_b, sizeof(int32_t)); read_pfx_string(hfile, &locus_entry->allele_a_probe_seq, NULL); // only version 4 read_pfx_string(hfile, &locus_entry->allele_b_probe_seq, NULL); // only version 4 read_pfx_string(hfile, &locus_entry->genome_build, NULL); read_pfx_string(hfile, &locus_entry->source, NULL); read_pfx_string(hfile, &locus_entry->source_version, NULL); read_pfx_string(hfile, &locus_entry->source_strand, NULL); read_pfx_string(hfile, &locus_entry->source_seq, NULL); // only version 4 if (locus_entry->source_seq) { char *ptr = strchr(locus_entry->source_seq, '-'); if (ptr && *(ptr - 1) == '/') { *ptr = *(ptr - 2); *(ptr - 2) = '-'; } } if (locus_entry->version >= 6) { read_bytes(hfile, NULL, 1); // MarkerInCNVRegion read_bytes(hfile, (void *)&locus_entry->exp_clusters, sizeof(int8_t)); read_bytes(hfile, (void *)&locus_entry->intensity_only, 
sizeof(int8_t)); read_bytes(hfile, (void *)&locus_entry->assay_type, sizeof(uint8_t)); if (locus_entry->assay_type < 0 || locus_entry->assay_type > 2) error("Format error in reading assay type from locus entry\n"); if (locus_entry->address_b == 0 && locus_entry->assay_type != 0) error("Manifest format error: Assay type is inconsistent with address B\n"); if (locus_entry->address_b != 0 && locus_entry->assay_type == 0) error("Manifest format error: Assay type is inconsistent with address B\n"); } else { locus_entry->assay_type = get_assay_type(locus_entry->allele_a_probe_seq, locus_entry->allele_b_probe_seq, locus_entry->source_seq); } if (locus_entry->version >= 7) { read_bytes(hfile, &locus_entry->frac_a, sizeof(float)); read_bytes(hfile, &locus_entry->frac_c, sizeof(float)); read_bytes(hfile, &locus_entry->frac_t, sizeof(float)); read_bytes(hfile, &locus_entry->frac_g, sizeof(float)); } if (locus_entry->version >= 8) read_pfx_string(hfile, &locus_entry->ref_strand, NULL); } typedef struct { char *fn; hFILE *hfile; // bpm file htsFile *fp; // csv file int32_t version; char *manifest_name; // Name of manifest char *control_config; // Control description from manifest int32_t num_loci; // Number of loci in manifest int32_t *indexes; char **names; // Names of loci from manifest void *names2index; uint8_t *norm_ids; LocusEntry *locus_entries; uint8_t *norm_lookups; char **header; size_t m_header; } bpm_t; static uint8_t *bpm_norm_lookups(bpm_t *bpm) { int i; uint8_t sorted_norm_ids[256]; for (i = 0; i < 256; i++) sorted_norm_ids[i] = 0xFF; for (i = 0; i < bpm->num_loci; i++) { int norm_id = bpm->locus_entries[i].norm_id; sorted_norm_ids[norm_id] = norm_id; } int j = 0; for (i = 0; i < 256; i++) if (sorted_norm_ids[i] != 0xFF) sorted_norm_ids[j++] = sorted_norm_ids[i]; uint8_t *norm_lookups = (uint8_t *)malloc(256 * sizeof(uint8_t *)); memset((void *)norm_lookups, 0xFF, 256 * sizeof(uint8_t *)); for (i = 0; i < j; i++) norm_lookups[sorted_norm_ids[i]] = i; return 
norm_lookups; } static bpm_t *bpm_init(const char *fn, int eof_check, int make_dict) { bpm_t *bpm = (bpm_t *)calloc(1, sizeof(bpm_t)); bpm->fn = strdup(fn); bpm->hfile = hopen(bpm->fn, "rb"); if (bpm->hfile == NULL) error("Could not open %s: %s\n", bpm->fn, strerror(errno)); if (is_gzip(bpm->hfile)) error("File %s is gzip compressed and currently cannot be sought\n", bpm->fn); int i; uint8_t buffer[4]; if (hread(bpm->hfile, (void *)buffer, 4) < 4) error("Failed to read magic number from %s file\n", bpm->fn); if (memcmp(buffer, "BPM", 3) != 0) error("BPM file %s format identifier is bad\n", bpm->fn); if (buffer[3] != 1) error("BPM file %s version is unknown\n", bpm->fn); read_bytes(bpm->hfile, (void *)&bpm->version, sizeof(int32_t)); if (bpm->version & 0x1000) bpm->version ^= 0x1000; if (bpm->version > 5 || bpm->version < 3) error("BPM file %s version %d is unsupported\n", bpm->fn, bpm->version); read_pfx_string(bpm->hfile, &bpm->manifest_name, NULL); if (bpm->version > 1) read_pfx_string(bpm->hfile, &bpm->control_config, NULL); read_bytes(bpm->hfile, (void *)&bpm->num_loci, sizeof(int32_t)); read_array(bpm->hfile, (void **)&bpm->indexes, NULL, bpm->num_loci, sizeof(int32_t), 0); bpm->names = (char **)malloc(bpm->num_loci * sizeof(char *)); for (i = 0; i < bpm->num_loci; i++) read_pfx_string(bpm->hfile, &bpm->names[i], NULL); if (make_dict) { bpm->names2index = khash_str2int_init(); for (i = 0; i < bpm->num_loci; i++) { if (khash_str2int_has_key(bpm->names2index, bpm->names[i])) error("Illumina probe %s present multiple times in file %s\n", bpm->names[i], fn); khash_str2int_inc(bpm->names2index, bpm->names[i]); } } read_array(bpm->hfile, (void **)&bpm->norm_ids, NULL, bpm->num_loci, sizeof(uint8_t), 0); bpm->locus_entries = (LocusEntry *)malloc(bpm->num_loci * sizeof(LocusEntry)); LocusEntry locus_entry; for (i = 0; i < bpm->num_loci; i++) { memset(&locus_entry, 0, sizeof(LocusEntry)); locusentry_read(&locus_entry, bpm->hfile); int idx = locus_entry.index - 1; if 
(idx < 0 || idx >= bpm->num_loci) error("Locus entry index %d is out of boundaries\n", locus_entry.index); if (bpm->norm_ids[idx] > 100) error("Manifest format error: read invalid normalization ID %d\n", bpm->norm_ids[idx]); // To mimic the flawed byte-wrapping behavior from GenomeStudio, AutoCall, and // IAAP, this value is allowed to overflow beyond 255, which happens with some // probes in the Omni5 arrays bpm->norm_ids[idx] += 100 * locus_entry.assay_type; locus_entry.norm_id = bpm->norm_ids[idx]; memcpy(&bpm->locus_entries[idx], &locus_entry, sizeof(LocusEntry)); } bpm->norm_lookups = bpm_norm_lookups(bpm); for (i = 0; i < bpm->num_loci; i++) { if (i != bpm->locus_entries[i].index - 1) error("Manifest format error: read invalid number of assay entries\n"); } if (bpm->locus_entries[0].version < 8) fprintf(stderr, "Warning: RefStrand annotation missing from manifest file %s\n", bpm->fn); read_bytes(bpm->hfile, (void *)&bpm->m_header, sizeof(int32_t)); bpm->header = (char **)malloc(bpm->m_header * sizeof(char *)); for (i = 0; i < bpm->m_header; i++) read_pfx_string(bpm->hfile, &bpm->header[i], NULL); if (eof_check && !heof(bpm->hfile)) error( "BPM reader did not reach the end of file %s at position %ld\nUse --do-not-check-eof to suppress this " "check\n", bpm->fn, htell(bpm->hfile)); return bpm; } static void bpm_destroy(bpm_t *bpm) { if (!bpm) return; int i; if (bpm->hfile && hclose(bpm->hfile) < 0) error("Error closing BPM file %s\n", bpm->fn); free(bpm->fn); if (bpm->fp && hts_close(bpm->fp) < 0) error("Error closing CSV file %s\n", bpm->fp->fn); free(bpm->manifest_name); free(bpm->control_config); free(bpm->indexes); if (bpm->names) { for (i = 0; i < bpm->num_loci; i++) free(bpm->names[i]); free(bpm->names); } khash_str2int_destroy(bpm->names2index); free(bpm->norm_ids); for (i = 0; i < bpm->num_loci; i++) { LocusEntry *locus_entry = &bpm->locus_entries[i]; free(locus_entry->ilmn_id); free(locus_entry->name); free(locus_entry->ilmn_strand); 
free(locus_entry->snp); free(locus_entry->chrom); free(locus_entry->ploidy); free(locus_entry->species); free(locus_entry->map_info); free(locus_entry->customer_strand); free(locus_entry->allele_a_probe_seq); free(locus_entry->allele_b_probe_seq); free(locus_entry->genome_build); free(locus_entry->source); free(locus_entry->source_version); free(locus_entry->source_strand); free(locus_entry->source_seq); free(locus_entry->top_genomic_seq); free(locus_entry->ref_strand); } free(bpm->locus_entries); free(bpm->norm_lookups); for (i = 0; i < bpm->m_header; i++) free(bpm->header[i]); free(bpm->header); free(bpm); } static void bpm_to_csv(const bpm_t *bpm, FILE *stream, int flags) { int i; for (i = 0; i < bpm->m_header; i++) fprintf(stream, "%s\n", bpm->header[i]); if (flags & BPM_LOADED) { fprintf(stream, "Index,NormID,IlmnID,Name,IlmnStrand,SNP,AddressA_ID,AlleleA_ProbeSeq,AddressB_" "ID,AlleleB_ProbeSeq,GenomeBuild,Chr,MapInfo,Ploidy,Species,Source," "SourceVersion,SourceStrand,SourceSeq,TopGenomicSeq,BeadSetID,Exp_Clusters," "Intensity_Only,Assay_Type,Frac A,Frac C,Frac G,Frac T,RefStrand"); if (flags & CSV_LOADED) fprintf(stream, ",Assay_Type_CSV"); fputc('\n', stream); } else { fprintf(stream, "IlmnID,Name,IlmnStrand,SNP,AddressA_ID,AlleleA_ProbeSeq,AddressB_ID,AlleleB_" "ProbeSeq,GenomeBuild,Chr,MapInfo,Ploidy,Species,Source,SourceVersion," "SourceStrand,SourceSeq,TopGenomicSeq,BeadSetID,Exp_Clusters,RefStrand\n"); } if (flags & VERBOSE) { kstring_t address_b = {0, 0, NULL}; if (flags & BPM_LOADED) { for (i = 0; i < bpm->num_loci; i++) { LocusEntry *locus_entry = &bpm->locus_entries[i]; address_b.l = 0; ksprintf(&address_b, locus_entry->address_b ? 
"%010d" : "", locus_entry->address_b); fprintf(stream, "%d,%d,%s,%s,%s,%s,%010d,%-s,%s,%-s,%s,%s,%s,%s,%s,%s,%s,%s,%-s,%-s,%d," "%d,%d,%d,%f,%f,%f,%f,%s", locus_entry->index, locus_entry->norm_id, locus_entry->ilmn_id, locus_entry->name, locus_entry->ilmn_strand, locus_entry->snp, locus_entry->address_a, locus_entry->allele_a_probe_seq ? locus_entry->allele_a_probe_seq : "", address_b.s, locus_entry->allele_b_probe_seq ? locus_entry->allele_b_probe_seq : "", locus_entry->genome_build, locus_entry->chrom, locus_entry->map_info, locus_entry->ploidy, locus_entry->species, locus_entry->source, locus_entry->source_version, locus_entry->source_strand, locus_entry->source_seq ? locus_entry->source_seq : "", locus_entry->top_genomic_seq ? locus_entry->top_genomic_seq : "", locus_entry->beadset_id, locus_entry->exp_clusters, locus_entry->intensity_only, locus_entry->assay_type, locus_entry->frac_a, locus_entry->frac_c, locus_entry->frac_g, locus_entry->frac_t, locus_entry->ref_strand ? locus_entry->ref_strand : ""); if (flags & CSV_LOADED) fprintf(stream, ",%d", locus_entry->assay_type_csv); fputc('\n', stream); } } else { for (i = 0; i < bpm->num_loci; i++) { LocusEntry *locus_entry = &bpm->locus_entries[i]; address_b.l = 0; ksprintf(&address_b, locus_entry->address_b ? "%010d" : "", locus_entry->address_b); fprintf(stream, "%s,%s,%s,%s,%010d,%-s,%s,%-s,%s,%s,%s,%s,%s,%s,%s,%s,%-s,%-s,%d,%d,%s\n", locus_entry->ilmn_id, locus_entry->name, locus_entry->ilmn_strand, locus_entry->snp, locus_entry->address_a, locus_entry->allele_a_probe_seq, address_b.s, locus_entry->allele_b_probe_seq ? locus_entry->allele_b_probe_seq : "", locus_entry->genome_build, locus_entry->chrom, locus_entry->map_info, locus_entry->ploidy, locus_entry->species, locus_entry->source, locus_entry->source_version, locus_entry->source_strand, locus_entry->source_seq, locus_entry->top_genomic_seq, locus_entry->beadset_id, locus_entry->exp_clusters, locus_entry->ref_strand ? 
locus_entry->ref_strand : ""); } } free(address_b.s); } else { fprintf(stream, "... use --verbose to visualize Assay data ...\n"); } fprintf(stream, "[Controls]\n"); fprintf(stream, "%s", bpm->control_config); } /**************************************** * CSV FILE IMPLEMENTATION * ****************************************/ static int tsv_read_uint8(tsv_t *tsv, bcf1_t *rec, void *usr) { uint8_t *uint8 = (uint8_t *)usr; char tmp = *tsv->se; *tsv->se = 0; char *endptr; *uint8 = (uint8_t)strtol(tsv->ss, &endptr, 0); *tsv->se = tmp; return 0; } static int tsv_read_int32(tsv_t *tsv, bcf1_t *rec, void *usr) { int32_t *int32 = (int32_t *)usr; char tmp = *tsv->se; *tsv->se = 0; char *endptr; *int32 = (int32_t)strtol(tsv->ss, &endptr, 10); *tsv->se = tmp; return 0; } static int tsv_read_float(tsv_t *tsv, bcf1_t *rec, void *usr) { float *single = (float *)usr; char tmp = *tsv->se; *tsv->se = 0; char *endptr; *single = (float)strtof(tsv->ss, &endptr); *tsv->se = tmp; return 0; } static int tsv_read_string(tsv_t *tsv, bcf1_t *rec, void *usr) { char **str = (char **)usr; if (tsv->se == tsv->ss) { *str = NULL; } else { char tmp = *tsv->se; *tsv->se = 0; *str = strdup(tsv->ss); *tsv->se = tmp; } return 0; } // Petr Danecek's similar implementation in bcftools/tsv2vcf.c static int csv_parse(tsv_t *tsv, bcf1_t *rec, char *str) { int status = 0; tsv->icol = 0; tsv->ss = tsv->se = str; while (*tsv->ss && tsv->icol < tsv->ncols) { while (*tsv->se && *tsv->se != ',') tsv->se++; if (tsv->cols[tsv->icol].setter) { int ret = tsv->cols[tsv->icol].setter(tsv, rec, tsv->cols[tsv->icol].usr); if (ret < 0) return -1; status++; } if (*tsv->se) tsv->se++; tsv->ss = tsv->se; tsv->icol++; } return status ? 
0 : -1; } static void locus_merge(LocusEntry *dest, LocusEntry *src) { if (src->version) dest->version = src->version; if (src->norm_id != 0xFF) dest->norm_id = src->norm_id; if (strcmp(dest->ilmn_id, src->ilmn_id)) { error("BPM and CSV manifests have conflicting IDs: %s and %s\n", dest->ilmn_id, src->ilmn_id); } else { free(dest->ilmn_id); dest->ilmn_id = src->ilmn_id; } if (src->name) { free(dest->name); dest->name = src->name; } if (src->index != 0) dest->index = src->index; if (src->ilmn_strand) { free(dest->ilmn_strand); dest->ilmn_strand = src->ilmn_strand; } if (src->snp) { free(dest->snp); dest->snp = src->snp; } if (src->chrom) { free(dest->chrom); dest->chrom = src->chrom; } if (src->ploidy) { free(dest->ploidy); dest->ploidy = src->ploidy; } if (src->species) { free(dest->species); dest->species = src->species; } if (src->map_info) { free(dest->map_info); dest->map_info = src->map_info; } if (src->customer_strand) { free(dest->customer_strand); dest->customer_strand = src->customer_strand; } if (src->address_a != 0) dest->address_a = src->address_a; if (src->allele_a_probe_seq) { free(dest->allele_a_probe_seq); dest->allele_a_probe_seq = src->allele_a_probe_seq; } if (src->address_b != 0) dest->address_b = src->address_b; if (src->allele_b_probe_seq) { free(dest->allele_b_probe_seq); dest->allele_b_probe_seq = src->allele_b_probe_seq; } if (src->genome_build) { free(dest->genome_build); dest->genome_build = src->genome_build; } if (src->source) { free(dest->source); dest->source = src->source; } if (src->source_version) { free(dest->source_version); dest->source_version = src->source_version; } if (src->source_strand) { free(dest->source_strand); dest->source_strand = src->source_strand; } if (src->source_seq) { free(dest->source_seq); dest->source_seq = src->source_seq; } if (src->top_genomic_seq) { free(dest->top_genomic_seq); dest->top_genomic_seq = src->top_genomic_seq; } if (src->beadset_id) dest->beadset_id = src->beadset_id; if (src->exp_clusters) 
dest->exp_clusters = src->exp_clusters; if (src->intensity_only) dest->intensity_only = src->intensity_only; if (src->assay_type != 0xFF) dest->assay_type = src->assay_type; if (src->assay_type_csv != 0xFF) dest->assay_type_csv = src->assay_type_csv; if (src->frac_a) dest->frac_a = src->frac_a; if (src->frac_c) dest->frac_c = src->frac_c; if (src->frac_g) dest->frac_g = src->frac_g; if (src->frac_t) dest->frac_t = src->frac_t; if (src->ref_strand) { free(dest->ref_strand); dest->ref_strand = src->ref_strand; } } // this line will read a CSV file and if a BPM object is provided it will fill it rather than // create a new one static bpm_t *bpm_csv_init(const char *fn, bpm_t *bpm, int make_dict) { int bpm_available = bpm != NULL; if (!bpm_available) bpm = (bpm_t *)calloc(1, sizeof(bpm_t)); int bpm_prev_num_loci = bpm->num_loci; bpm->fp = hts_open(fn, "r"); if (bpm->fp == NULL) error("Could not open %s: %s\n", fn, strerror(errno)); kstring_t str = {0, 0, NULL}; kstring_t hdr = {0, 0, NULL}; if (hts_getline(bpm->fp, KS_SEP_LINE, &str) <= 0) error("Empty file: %s\n", fn); if (strncmp(str.s, "Illumina", 8) && strncmp(str.s, "\"Illumina", 9)) error("Header of file %s is incorrect: %s\n", fn, str.s); kputs(str.s, &hdr); kputc('\n', &hdr); char *tmp = NULL; size_t prev = 0; while (strncmp(str.s + prev, "[Assay]", 7)) { if (strncmp(str.s + prev, "Descriptor File Name,", 21) == 0) { free(bpm->manifest_name); bpm->manifest_name = strdup(str.s + prev + 21); char *ptr = strchr(bpm->manifest_name, ','); if (ptr) *ptr = '\0'; } else if (strncmp(str.s + prev, "Loci Count ,", 12) == 0) { bpm->num_loci = (int)strtol(str.s + prev + 12, &tmp, 0); } else if (strncmp(str.s + prev, "Loci Count,", 11) == 0) { bpm->num_loci = (int)strtol(str.s + prev + 11, &tmp, 0); } if (hts_getline(bpm->fp, KS_SEP_LINE, &str) <= 0) error("Error reading from file: %s\n", fn); kputs(str.s, &hdr); kputc('\n', &hdr); } if (bpm->num_loci == 0) error("Could not understand number of loci from header of manifest 
file %s\n", fn); else if (bpm_available && bpm_prev_num_loci != bpm->num_loci) error("BPM manifest file has %d loci while CSV manifest file %s has %d loci\n", bpm_prev_num_loci, fn, bpm->num_loci); int i, moff = 0, *off = NULL; for (i = 0; i < bpm->m_header; i++) free(bpm->header[i]); bpm->m_header = ksplit_core(hdr.s, '\n', &moff, &off); free(bpm->header); bpm->header = (char **)malloc(bpm->m_header * sizeof(char *)); for (i = 0; i < bpm->m_header; i++) bpm->header[i] = strdup(&hdr.s[off[i]]); free(off); free(hdr.s); if (hts_getline(bpm->fp, KS_SEP_LINE, &str) <= 0) error("Error reading from file: %s\n", fn); LocusEntry locus_entry; tsv_t *tsv = tsv_init(str.s); tsv_register(tsv, "Index", tsv_read_int32, &locus_entry.index); int norm_id = tsv_register(tsv, "NormID", tsv_read_uint8, &locus_entry.norm_id); tsv_register(tsv, "IlmnID", tsv_read_string, &locus_entry.ilmn_id); tsv_register(tsv, "Name", tsv_read_string, &locus_entry.name); tsv_register(tsv, "IlmnStrand", tsv_read_string, &locus_entry.ilmn_strand); tsv_register(tsv, "SNP", tsv_read_string, &locus_entry.snp); tsv_register(tsv, "AddressA_ID", tsv_read_int32, &locus_entry.address_a); tsv_register(tsv, "AlleleA_ProbeSeq", tsv_read_string, &locus_entry.allele_a_probe_seq); tsv_register(tsv, "AddressB_ID", tsv_read_int32, &locus_entry.address_b); tsv_register(tsv, "AlleleB_ProbeSeq", tsv_read_string, &locus_entry.allele_b_probe_seq); tsv_register(tsv, "GenomeBuild", tsv_read_string, &locus_entry.genome_build); tsv_register(tsv, "Chr", tsv_read_string, &locus_entry.chrom); tsv_register(tsv, "MapInfo", tsv_read_string, &locus_entry.map_info); tsv_register(tsv, "Ploidy", tsv_read_string, &locus_entry.ploidy); tsv_register(tsv, "Species", tsv_read_string, &locus_entry.species); tsv_register(tsv, "Source", tsv_read_string, &locus_entry.source); tsv_register(tsv, "SourceVersion", tsv_read_string, &locus_entry.source_version); tsv_register(tsv, "SourceStrand", tsv_read_string, &locus_entry.source_strand); 
tsv_register(tsv, "SourceSeq", tsv_read_string, &locus_entry.source_seq); tsv_register(tsv, "TopGenomicSeq", tsv_read_string, &locus_entry.top_genomic_seq); int beadset_id = tsv_register(tsv, "BeadSetID", tsv_read_int32, &locus_entry.beadset_id); tsv_register(tsv, "Exp_Clusters", tsv_read_uint8, &locus_entry.exp_clusters); tsv_register(tsv, "Intensity_Only", tsv_read_uint8, &locus_entry.intensity_only); tsv_register(tsv, "Frac A", tsv_read_float, &locus_entry.frac_a); tsv_register(tsv, "Frac C", tsv_read_float, &locus_entry.frac_c); tsv_register(tsv, "Frac G", tsv_read_float, &locus_entry.frac_g); tsv_register(tsv, "Frac T", tsv_read_float, &locus_entry.frac_t); int ref_strand = tsv_register(tsv, "RefStrand", tsv_read_string, &locus_entry.ref_strand); if (ref_strand < 0) fprintf(stderr, "Warning: RefStrand annotation missing from manifest file %s\n", fn); if (!bpm_available) bpm->locus_entries = (LocusEntry *)malloc(bpm->num_loci * sizeof(LocusEntry)); for (i = 0; i < bpm->num_loci; i++) { memset(&locus_entry, 0, sizeof(LocusEntry)); locus_entry.norm_id = 0xFF; locus_entry.assay_type = 0xFF; locus_entry.assay_type_csv = 0xFF; if (hts_getline(bpm->fp, KS_SEP_LINE, &str) <= 0) error("Error reading from file: %s\n", fn); if (csv_parse(tsv, NULL, str.s) < 0) error("Could not parse the manifest file: %s\n", str.s); if (beadset_id == 0 && locus_entry.beadset_id == 0) error("BeadSetID value 0 for probe %s is not allowed\n", locus_entry.ilmn_id); if (locus_entry.source_seq) { char *ptr = strchr(locus_entry.source_seq, '-'); if (ptr && *(ptr - 1) == '/') { *ptr = *(ptr - 2); *(ptr - 2) = '-'; } } locus_entry.assay_type_csv = get_assay_type(locus_entry.allele_a_probe_seq, locus_entry.allele_b_probe_seq, locus_entry.source_seq); if (locus_entry.index == 0) locus_entry.index = i + 1; int idx = locus_entry.index - 1; if (idx < 0 || idx >= bpm->num_loci) error("Locus entry index %d is out of boundaries\n", idx); if (!bpm_available) { memcpy(&bpm->locus_entries[idx], 
&locus_entry, sizeof(LocusEntry)); } else { locus_merge(&bpm->locus_entries[idx], &locus_entry); if (bpm->locus_entries[idx].assay_type != 0xff && bpm->locus_entries[idx].assay_type != bpm->locus_entries[idx].assay_type_csv) fprintf(stderr, "Warning: Failed to retrieve assay type %d: %s %s %s\n", bpm->locus_entries[idx].assay_type, bpm->locus_entries[idx].allele_a_probe_seq, bpm->locus_entries[idx].allele_b_probe_seq, bpm->locus_entries[idx].source_seq); } } tsv_destroy(tsv); if (hts_getline(bpm->fp, KS_SEP_LINE, &str) <= 0) error("Error reading from file: %s\n", fn); if (strncmp(str.s, "[Controls]", 10) != 0) error( "Missing [Controls] section from manifest file: %s\n" "Found the following line instead: %s\n", fn, str.s); while (hts_getline(bpm->fp, KS_SEP_LINE, &str) > 0) kputc('\n', &str); free(bpm->control_config); bpm->control_config = str.s; if (make_dict && !bpm->names2index) { bpm->names2index = khash_str2int_init(); for (i = 0; i < bpm->num_loci; i++) { if (khash_str2int_has_key(bpm->names2index, bpm->locus_entries[i].name)) error("Illumina probe %s present multiple times in file %s\n", bpm->locus_entries[i].name, fn); khash_str2int_inc(bpm->names2index, bpm->locus_entries[i].name); } } if (norm_id == 0) { free(bpm->norm_lookups); bpm->norm_lookups = bpm_norm_lookups(bpm); } return bpm; } /**************************************** * EGT FILE IMPLEMENTATION * ****************************************/ // http://github.com/broadinstitute/picard/blob/master/src/main/java/picard/arrays/illumina/InfiniumEGTFile.java // http://github.com/Illumina/BeadArrayFiles/blob/develop/module/ClusterFile.py typedef struct { int32_t N; // Number of samples assigned to cluster during training float r_dev; // R (intensity) std deviation value float r_mean; // R (intensity) mean value float theta_dev; // Theta std devation value float theta_mean; // Theta mean value } ClusterStats; typedef struct { float cluster_separation; // A score measure the separation between genotype 
clusters float total_score; // The GenTrain score float original_score; // The original score before editing this cluster uint8_t edited; // Whether this cluster has been manually manipulated } ClusterScore; typedef struct { ClusterStats aa_cluster_stats; // Describes AA genotype cluster ClusterStats ab_cluster_stats; // Describes AB genotype cluster ClusterStats bb_cluster_stats; // Describes BB genotype cluster float intensity_threshold; // Intensity threshold for no-call ClusterScore cluster_score; // Various scores for cluster int32_t address; // Bead type identifier for probe A float r_mean; // precomputed clusters mean } ClusterRecord; typedef struct { char *fn; hFILE *hfile; int32_t version; char *gencall_version; // The GenCall version char *cluster_version; // The clustering algorithm version char *call_version; // The genotyping algorithm version char *normalization_version; // The normalization algorithm version char *date_created; // The date the cluster file was created (e.g., 3/9/2017 2:18:30 PM) uint8_t is_wgt; int32_t data_block_version; char *opa; char *manifest_name; // The manifest name used to build this cluster file int32_t num_records; ClusterRecord *cluster_records; char **names; // Names of records from manifest void *names2index; } egt_t; static void clusterscore_read(ClusterScore *clusterscore, hFILE *hfile) { read_bytes(hfile, (void *)&clusterscore->cluster_separation, sizeof(float)); read_bytes(hfile, (void *)&clusterscore->total_score, sizeof(float)); read_bytes(hfile, (void *)&clusterscore->original_score, sizeof(float)); read_bytes(hfile, (void *)&clusterscore->edited, sizeof(uint8_t)); } static void clusterrecord_read(ClusterRecord *clusterrecord, hFILE *hfile, int32_t data_block_version) { read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.N, sizeof(int32_t)); read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.N, sizeof(int32_t)); read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.N, sizeof(int32_t)); 
read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.r_dev, sizeof(float)); read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.r_dev, sizeof(float)); read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.r_dev, sizeof(float)); read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.r_mean, sizeof(float)); read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.r_mean, sizeof(float)); read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.r_mean, sizeof(float)); read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.theta_dev, sizeof(float)); read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.theta_dev, sizeof(float)); read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.theta_dev, sizeof(float)); read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.theta_mean, sizeof(float)); read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.theta_mean, sizeof(float)); read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.theta_mean, sizeof(float)); if (data_block_version >= 7) { read_bytes(hfile, (void *)&clusterrecord->intensity_threshold, sizeof(float)); read_bytes(hfile, NULL, 14 * sizeof(float)); } else { clusterrecord->intensity_threshold = NAN; } } static egt_t *egt_init(const char *fn, int eof_check) { int i; egt_t *egt = (egt_t *)calloc(1, sizeof(egt_t)); egt->fn = strdup(fn); egt->hfile = hopen(egt->fn, "rb"); if (egt->hfile == NULL) error("Could not open %s: %s\n", egt->fn, strerror(errno)); if (is_gzip(egt->hfile)) error("File %s is gzip compressed and currently cannot be sought\n", egt->fn); read_bytes(egt->hfile, (void *)&egt->version, sizeof(int32_t)); if (egt->version != 3) error("EGT cluster file version %d not supported\n", egt->version); read_pfx_string(egt->hfile, &egt->gencall_version, NULL); read_pfx_string(egt->hfile, &egt->cluster_version, NULL); read_pfx_string(egt->hfile, &egt->call_version, NULL); read_pfx_string(egt->hfile, &egt->normalization_version, NULL); read_pfx_string(egt->hfile, 
&egt->date_created, NULL); read_bytes(egt->hfile, (void *)&egt->is_wgt, sizeof(uint8_t)); if (egt->is_wgt != 1) error("Only WGT cluster file version supported\n"); read_pfx_string(egt->hfile, &egt->manifest_name, NULL); read_bytes(egt->hfile, (void *)&egt->data_block_version, sizeof(int32_t)); if (egt->data_block_version < 5 || egt->data_block_version == 6 || egt->data_block_version > 9) error("Data block version %d in cluster file not supported\n", egt->data_block_version); read_pfx_string(egt->hfile, &egt->opa, NULL); read_bytes(egt->hfile, (void *)&egt->num_records, sizeof(int32_t)); egt->cluster_records = (ClusterRecord *)malloc(egt->num_records * sizeof(ClusterRecord)); for (i = 0; i < egt->num_records; i++) clusterrecord_read(&egt->cluster_records[i], egt->hfile, egt->data_block_version); for (i = 0; i < egt->num_records; i++) clusterscore_read(&egt->cluster_records[i].cluster_score, egt->hfile); // toss useless strings such as aa_ab_bb/aa_ab/aa_bb/ab_bb for (i = 0; i < egt->num_records; i++) read_pfx_string(egt->hfile, NULL, NULL); egt->names = (char **)malloc(egt->num_records * sizeof(char *)); egt->names2index = khash_str2int_init(); for (i = 0; i < egt->num_records; i++) { read_pfx_string(egt->hfile, &egt->names[i], NULL); if (khash_str2int_has_key(egt->names2index, egt->names[i])) error("Illumina probe %s present multiple times in file %s\n", egt->names[i], fn); khash_str2int_inc(egt->names2index, egt->names[i]); } for (i = 0; i < egt->num_records; i++) read_bytes(egt->hfile, (void *)&egt->cluster_records[i].address, sizeof(int32_t)); int32_t aa_n, ab_n, bb_n; for (i = 0; i < egt->num_records; i++) { read_bytes(egt->hfile, (void *)&aa_n, sizeof(int32_t)); read_bytes(egt->hfile, (void *)&ab_n, sizeof(int32_t)); read_bytes(egt->hfile, (void *)&bb_n, sizeof(int32_t)); if (egt->cluster_records[i].aa_cluster_stats.N != aa_n || egt->cluster_records[i].ab_cluster_stats.N != ab_n || egt->cluster_records[i].bb_cluster_stats.N != bb_n) error("Cluster counts don't 
match with EGT cluster file %s\n", egt->fn); } if (egt->data_block_version == 9) read_bytes(egt->hfile, NULL, egt->num_records * sizeof(float)); if (eof_check && !heof(egt->hfile)) error( "EGT reader did not reach the end of file %s at position %ld\nUse --do-not-check-eof to suppress this " "check\n", egt->fn, htell(egt->hfile)); for (i = 0; i < egt->num_records; i++) { ClusterStats *aa = &egt->cluster_records[i].aa_cluster_stats; ClusterStats *ab = &egt->cluster_records[i].ab_cluster_stats; ClusterStats *bb = &egt->cluster_records[i].bb_cluster_stats; egt->cluster_records[i].r_mean = (aa->N * aa->r_mean + ab->N * ab->r_mean + bb->N * bb->r_mean) / (aa->N + ab->N + bb->N); } return egt; } static void egt_destroy(egt_t *egt) { if (!egt) return; int i; if (hclose(egt->hfile) < 0) error("Error closing EGT file %s\n", egt->fn); free(egt->fn); free(egt->gencall_version); free(egt->cluster_version); free(egt->call_version); free(egt->normalization_version); free(egt->date_created); free(egt->opa); free(egt->manifest_name); free(egt->cluster_records); for (i = 0; i < egt->num_records; i++) free(egt->names[i]); free(egt->names); khash_str2int_destroy(egt->names2index); free(egt); } static void egt_to_csv(const egt_t *egt, FILE *stream, int verbose) { fprintf(stream, "Illumina, Inc.\n"); fprintf(stream, "[Heading]\n"); fprintf(stream, "Descriptor File Name,%s\n", strrchr(egt->fn, '/') ? strrchr(egt->fn, '/') + 1 : egt->fn); fprintf(stream, "GenCall version,%s\n", egt->gencall_version); fprintf(stream, "Clustering algorithm version,%s\n", egt->cluster_version); fprintf(stream, "Genotyping algorithm version,%s\n", egt->call_version); fprintf(stream, "Normalization algorithm version,%s\n", egt->normalization_version); fprintf(stream, "Date Manufactured,%s\n", egt->date_created); fprintf(stream, "Manifest name used to build this cluster file,%s\n", egt->manifest_name); fprintf(stream, "OPA,%s\n", egt->opa ? 
egt->opa : "");
    fprintf(stream, "Loci Count,%d\n", egt->num_records);
    fprintf(stream, "[Assay]\n");
    // column header: one column per value printed for each record below
    fprintf(stream,
            "Name,AA.N,AA.R_dev,AA.R_mean,AA.Theta_dev,AA.Theta_mean,AB.N,AB.R_dev,AB.R_mean,AB."
            "Theta_dev,AB.Theta_mean,BB.N,BB.R_dev,BB.R_mean,BB.Theta_dev,BB.Theta_mean,Intensity "
            "Threshold,Cluster Separation,GenTrain Score,Original Score,Edited,Address\n");
    if (verbose) {
        int i;
        for (i = 0; i < egt->num_records; i++) {
            ClusterRecord *cluster_record = &egt->cluster_records[i];
            fprintf(stream, "%s,%d,%f,%f,%f,%f,%d,%f,%f,%f,%f,%d,%f,%f,%f,%f,%f,%f,%f,%f,%d,%d\n", egt->names[i],
                    cluster_record->aa_cluster_stats.N, cluster_record->aa_cluster_stats.r_dev,
                    cluster_record->aa_cluster_stats.r_mean, cluster_record->aa_cluster_stats.theta_dev,
                    cluster_record->aa_cluster_stats.theta_mean, cluster_record->ab_cluster_stats.N,
                    cluster_record->ab_cluster_stats.r_dev, cluster_record->ab_cluster_stats.r_mean,
                    cluster_record->ab_cluster_stats.theta_dev, cluster_record->ab_cluster_stats.theta_mean,
                    cluster_record->bb_cluster_stats.N, cluster_record->bb_cluster_stats.r_dev,
                    cluster_record->bb_cluster_stats.r_mean, cluster_record->bb_cluster_stats.theta_dev,
                    cluster_record->bb_cluster_stats.theta_mean, cluster_record->intensity_threshold,
                    cluster_record->cluster_score.cluster_separation, cluster_record->cluster_score.total_score,
                    cluster_record->cluster_score.original_score, cluster_record->cluster_score.edited,
                    cluster_record->address);
        }
    } else {
        fprintf(stream, "... use --verbose to visualize Assay data ...\n");
    }
}

/****************************************
 * IDAT FILE IMPLEMENTATION             *
 ****************************************/

// http://github.com/snewhouse/glu-genetics/blob/master/glu/lib/illumina.py
// http://github.com/HenrikBengtsson/illuminaio/blob/master/R/readIDAT.R

// identifiers of the IDAT table of contents entries; each one is handled by a case of idat_read()
#define NUM_SNPS_READ 1000 // ID_N_CORES
// #define ... 100 // ID_BACKGROUNDS - not used
// #define ... 101 // ID_BACKGROUND_DEVS - not used
#define ILLUMINA_ID 102 // ID_BEAD_TYPES
#define SD 103          // ID_DEVS
#define MEAN 104        // ID_MEANS
// #define ... 105 // ID_MEDIANS - not used
// #define ... 106 // ID_N_BEADS - not used
#define NBEADS 107 // ID_N_GOOD_BEADS
// #define ... 108 // ID_TRIMMED_MEANS - not used
#define MID_BLOCK 200         // ID_ILLUMICODES
#define RUN_INFO 300          // ID_PROCESS_HISTORY
#define RED_GREEN 400         // ID_TENTH_PERCENTILE
#define IDAT_SNP_MANIFEST 401 // ID_SAMPLE_BEADSET
#define SENTRIX_BARCODE 402   // ID_BARCODE
#define CHIP_TYPE 403         // ID_SENTRIX_FORMAT
#define SENTRIX_POSITION 404  // ID_SECTION_LABEL
#define BEADSET 405           // ID_BEADSET
#define IDAT_SAMPLE_NAME 406  // ID_DNA
#define DESCRIPTION 407       // ID_OPA
#define IDAT_SAMPLE_PLATE 408 // ID_DNA_PLATE
#define IDAT_SAMPLE_WELL 409  // ID_WELL
#define IDAT_SAMPLE_COUNT 410 // ID_SAMPLE_COUNT
// #define ... 411 // ID_DX - not used
#define IDAT_VLN 510 // ID_VLN

// One row of the chip type lookup table: idat_init() compares the first three fields with the values
// found in an IDAT file and, when all of them match, reports chip_type_guess
typedef struct {
    const char *chip_type;       // compared with the CHIP_TYPE string of the IDAT file
    int num_snps;                // compared with the number of probes of the IDAT file
    int num_mid_blocks;          // compared with the number of mid blocks of the IDAT file
    const char *chip_type_guess; // name reported when the three fields above match
} chip_type_t;

// idat_init() scans this table until it reaches a row whose chip_type is NULL
static chip_type_t chip_types[] = {
    {"1-95um_multi-swath_for_4x5M", 4568350, 4568350, "HumanOmni5-4-v1-0"},
    {"1-95um_multi-swath_for_4x5M", 4640213, 4640213, "HumanOmni5-4v1-1"},
    {"1-95um_multi-swath_for_4x5M", 4685673, 4685673, "InfiniumOmni5-4v1-2"},
    {"1-95um_multi-swath_for_4x5M", 4696316, 4696316, "HumanOmni5-4-v1-0"},
    {"1-95um_multi-swath_for_8x2-5M", 2266191, 2266191, "Multi-EthnicGlobal"},
    {"1-95um_multi-swath_for_8x2-5M", 2266367, 2266367, "Multi-EthnicGlobal"},
    {"1-95um_multi-swath_for_8x2-5M", 2266404, 2266404, "Multi-EthnicGlobal"},
    {"1-95um_multi-swath_for_8x2-5M", 2266406, 2266406, "Multi-EthnicGlobal"},
    {"1-95um_multi-swath_for_8x2-5M", 2268676, 2268676, "MEGAEx_BioVU_15075710"},
    {"1-95um_multi-swath_for_8x2-5M", 2315574, 2315574, "Multi-EthnicGlobal"},
    {"1-95um_multi-swath_for_8x2-5M", 2389000, 2389000, "CCPMBiobankMEGA2_20002558X345183"},
    {"1-95um_multi-swath_for_8x2-5M", 2508689, 2508689, "GDA-8v1-0"},
    {"1-95um_multi-swath_for_8x2-5M", 2550870, 2550870, "HumanOmni2.5-8v1"},
    {"1-95um_multi-swath_for_8x2-5M", 2563064, 2563064, "HumanOmni25M-8v1-1"},
    {"1-95um_multi-swath_for_8x2-5M", 2575219, 2575219, "HumanOmni2.5-8v1"},
    {"1-95um_multi-swath_for_8x2-5M", 2605775, 2605775, "HumanOmni25M-8v1-1"},
    {"BeadChip 12x1", 55300, 55300, "humanmethylation27_270596_v1-2 ???"},
    {"BeadChip 12x1Q", 191668, 191668, "CanineHD"},
    {"BeadChip 12x1Q", 299260, 299260, "HumanCytoSNP-12v2-1"},
    {"BeadChip 12x8", 301084, 301084, "HumanCore-12v1-0"},
    {"BeadChip 12x8", 304138, 304138, "HumanExome-12v1-1"},
    {"BeadChip 12x8", 567727, 567727, "HumanCoreExome-12-v1-0"},
    {"BeadChip 12x8", 569060, 569060, "HumanCoreExome-12-v1-0"},
    {"BeadChip 12x8", 573012, 573012, "HumanCoreExome-12-v1-1"},
    {"BeadChip 12x8", 576769, 576769, "HumanCoreExome-12-v1-1"},
    {"BeadChip 12x8", 622399, 622399, "humanmethylation450_15017482_v-1-2 ???"},
    {"BeadChip 12x8", 722405, 722405, "HumanOmniExpress-12-v1-1"},
    {"BeadChip 12x8", 734889, 734889, "HumanOmniExpress-12-v1-0"},
    {"BeadChip 12x8", 736136, 736136, "HumanOmniExpress-12-v1-0"},
    {"BeadChip 1x12", 577085, 8627, "HumanHap550v3"},
    {"BeadChip 1x12", 661182, 49163, "HumanHap650Yv3"},
    {"BeadChip 1x40", 1129736, 57373, "Human1Mv1"},
    {"BeadChip 1x40 66", 1078890, 52497, "Human1Mv1"},
    {"BeadChip 24x1x4", 306776, 306776, "InfiniumCore-24v1-2"},
    {"BeadChip 24x1x4", 527136, 527136, "OncoArray-500K"},
    {"BeadChip 24x1x4", 577781, 577781, "HumanCoreExome-24v1-0"},
    {"BeadChip 24x1x4", 581261, 581261, "HumanCoreExome-24v1-2"},
    {"BeadChip 24x1x4", 582684, 582684, "HumanCoreExome-24v1-1"},
    {"BeadChip 24x1x4", 611866, 611866, "HumanCoreExome-24v1-4"},
    {"BeadChip 24x1x4", 623302, 623302, "PsychChip_15048346"},
    {"BeadChip 24x1x4", 623513, 623513, "InfiniumPsychArray-24v1-1"},
    {"BeadChip 24x1x4", 638714, 638714, "PsychChip_v1-1_15073391"},
    {"BeadChip 24x1x4", 647864, 647864, "InfiniumPsychArray-24v1-3"},
    {"BeadChip 24x1x4", 663209, 663209, "GSA-24v1-0"},
    {"BeadChip 24x1x4", 704215,
704215, "GSA-24v3-0"}, {"BeadChip 24x1x4", 708013, 708013, "DeCodeGenetics_V1_20012591"}, {"BeadChip 24x1x4", 710576, 710576, "GSAMD-24v1-0_20011747"}, {"BeadChip 24x1x4", 710606, 710606, "GSAMD-24v1-0_20011747"}, {"BeadChip 24x1x4", 710608, 710608, "GSAMD-24v1-0_20011747"}, {"BeadChip 24x1x4", 715653, 715653, "HumanOmniExpress-24v1-1"}, {"BeadChip 24x1x4", 716279, 716279, "InfiniumOmniExpress-24v1-2"}, {"BeadChip 24x1x4", 718963, 718963, "HumanOmniExpress-24-v1-0"}, {"BeadChip 24x1x4", 719234, 719234, "HumanOmniExpress-24-v1-0"}, {"BeadChip 24x1x4", 729110, 729110, "ASA-24v1-0"}, {"BeadChip 24x1x4", 733354, 733354, "GSA-24v2-0"}, {"BeadChip 24x1x4", 749019, 749019, "DeCodeGenetics_V3_20032937X331991"}, {"BeadChip 24x1x4", 751614, 751614, "GSAMD-24v3-0-EA_20034606"}, {"BeadChip 24x1x4", 766804, 766804, "JSA-24v1-0"}, {"BeadChip 24x1x4", 776509, 776509, "ASA-24v1-0"}, {"BeadChip 24x1x4", 780343, 780343, "GSAMD-24v2-0_20024620"}, {"BeadChip 24x1x4", 780509, 780509, "GSAMD-24v2-0_20024620"}, {"BeadChip 24x1x4", 818205, 818205, "GSA-24v2-0"}, {"BeadChip 2x10", 321354, 37161, "HumanHap300v2"}, {"BeadChip 2x12", 381079, 29275, "HumanCNV370v1"}, {"BeadChip 2x20", 561686, 54936, "HumanHap550v3"}, {"BeadChip 2x6Q", 1224000, 180026, "Human1M-Duov3"}, {"BeadChip 2x6Q", 1224629, 180026, "Human1M-Duov3"}, {"BeadChip 48x4", 730546, 730546, "GSA-MD-48v4-0_20098041"}, {"BeadChip 4x10", 2623923, 1300482, "HumanOmni2.5-4v1"}, {"BeadChip 4x10", 2623923, 1323441, "HumanOmni2.5-4v1"}, {"BeadChip 4x10", 2624666, 1300941, "HumanOmni2.5-4v1"}, {"BeadChip 4x10", 2624666, 1323725, "HumanOmni2.5-4v1"}, {"BeadChip 4x10", 2624671, 1323726, "HumanOmni2.5-4v1"}, {"BeadChip 4x10", 2655594, 1354653, "HumanOmni2.5-4v1"}, {"BeadChip 4X1X14", 1186430, 1186430, "HumanOmni1-Quad_v1-0"}, {"BeadChip 4x2Q", 376216, 186490, "HumanCNV370-Quadv3"}, {"BeadChip 4x3Q", 626122, 208778, "Human610-Quadv1"}, {"BeadChip 4x3Q", 667447, 208778, "Human660W-Quad_v1"}, {"BeadChip 8x5", 1052641, 1052641, 
"infinium-methylationepic-v-1-0 ???"}, {"BeadChip 8x5", 867478, 867478, "CytoSNP-850K"}, {"BeadChip 8x5", 988240, 988240, "HumanOmniExpressExome-8-v1-1"}, {"BeadChip 8x5", 989536, 989536, "HumanOmniExpressExome-8-v1-1"}, {"BeadChip 8x5", 992824, 992824, "HumanOmniExpressExome-8-v1-4"}, {"BeadChip 8x5", 996003, 996003, "HumanOmniExpressExome-8-v1-2"}, {"BeadChip 8x5", 996055, 996055, "HumanOmniExpressExome-8-v1-2"}, {"SLIDE.15028542.24x1x3", 307984, 307984, "HumanCore-24v1-0"}, {"SLIDE.15028542.24x1x3", 311460, 311460, "HumanCore-24v1-0"}, {NULL, 0, 0, NULL}}; typedef struct { char *run_time; char *block_type; char *block_pars; char *block_code; char *code_version; } RunInfo; typedef struct { char *fn; hFILE *hfile; int64_t version; int32_t number_toc_entries; uint16_t *id; int64_t *toc; int32_t num_snps; int32_t num_mid_blocks; int32_t *ilmn_id; uint16_t *sd; uint16_t *mean; uint8_t *nbeads; const uint16_t *trimmed_mean; // only used for historical purposes uint8_t *mid_block; uint8_t red_green[4]; char *snp_manifest; char *sentrix_barcode; char *chip_type; char *sentrix_position; char *beadset; char *sample_name; char *description; char *sample_plate; char *sample_well; int32_t sample_count; char *vln; RunInfo *run_infos; int32_t m_run_infos; const char *chip_type_guess; const char *imaging_date; const char *scanner_data; void *ilmn_id2index; } idat_t; KHASH_MAP_INIT_INT(32, int32_t) static int idat_read(idat_t *idat, uint16_t id) { int i; for (i = 0; i < idat->number_toc_entries && id != idat->id[i]; i++); if (i == idat->number_toc_entries) return -1; if (hseek(idat->hfile, idat->toc[i], SEEK_SET) < 0) error("Fail to seek to position %ld in IDAT %s file\n", idat->toc[i], idat->fn); switch (id) { case NUM_SNPS_READ: read_bytes(idat->hfile, (void *)&idat->num_snps, sizeof(int32_t)); break; case ILLUMINA_ID: idat->ilmn_id = (int32_t *)malloc(idat->num_snps * sizeof(int32_t)); read_bytes(idat->hfile, (void *)idat->ilmn_id, idat->num_snps * sizeof(int32_t)); int ret; 
idat->ilmn_id2index = kh_init(32); khash_t(32) *hash = (khash_t(32) *)idat->ilmn_id2index; for (i = 0; i < idat->num_snps; i++) { khiter_t k = kh_put(32, hash, idat->ilmn_id[i], &ret); if (ret < 0) error("Unable to insert Illumina ID %d in hash table\n", idat->ilmn_id[i]); if (ret > 0) kh_val(hash, k) = kh_size(hash) - 1; else error("Duplicate Illumina ID %d in hash table\n", idat->ilmn_id[i]); } break; case SD: idat->sd = (uint16_t *)malloc(idat->num_snps * sizeof(uint16_t)); read_bytes(idat->hfile, (void *)idat->sd, idat->num_snps * sizeof(uint16_t)); break; case MEAN: idat->mean = (uint16_t *)malloc(idat->num_snps * sizeof(uint16_t)); read_bytes(idat->hfile, (void *)idat->mean, idat->num_snps * sizeof(uint16_t)); idat->trimmed_mean = idat->mean; break; case NBEADS: idat->nbeads = (uint8_t *)malloc(idat->num_snps * sizeof(uint8_t)); read_bytes(idat->hfile, (void *)idat->nbeads, idat->num_snps * sizeof(uint8_t)); break; case MID_BLOCK: read_bytes(idat->hfile, (void *)&idat->num_mid_blocks, sizeof(int32_t)); idat->mid_block = (uint8_t *)malloc(idat->num_mid_blocks * sizeof(uint8_t)); read_bytes(idat->hfile, (void *)idat->mid_block, idat->num_mid_blocks * sizeof(uint8_t)); break; case RED_GREEN: read_bytes(idat->hfile, (void *)&idat->red_green, 4 * sizeof(uint8_t)); break; case IDAT_SNP_MANIFEST: read_pfx_string(idat->hfile, &idat->snp_manifest, NULL); break; case SENTRIX_BARCODE: read_pfx_string(idat->hfile, &idat->sentrix_barcode, NULL); break; case CHIP_TYPE: read_pfx_string(idat->hfile, &idat->chip_type, NULL); break; case SENTRIX_POSITION: read_pfx_string(idat->hfile, &idat->sentrix_position, NULL); break; case BEADSET: read_pfx_string(idat->hfile, &idat->beadset, NULL); break; case IDAT_SAMPLE_NAME: read_pfx_string(idat->hfile, &idat->sample_name, NULL); break; case DESCRIPTION: read_pfx_string(idat->hfile, &idat->description, NULL); break; case IDAT_SAMPLE_PLATE: read_pfx_string(idat->hfile, &idat->sample_plate, NULL); break; case IDAT_SAMPLE_WELL: 
read_pfx_string(idat->hfile, &idat->sample_well, NULL); break; case IDAT_SAMPLE_COUNT: read_bytes(idat->hfile, (void *)&idat->sample_count, sizeof(int32_t)); break; case IDAT_VLN: read_pfx_string(idat->hfile, &idat->vln, NULL); break; case RUN_INFO: read_bytes(idat->hfile, (void *)&idat->m_run_infos, sizeof(int32_t)); idat->run_infos = (RunInfo *)calloc(idat->m_run_infos, sizeof(RunInfo)); for (i = 0; i < idat->m_run_infos; i++) { read_pfx_string(idat->hfile, &idat->run_infos[i].run_time, NULL); read_pfx_string(idat->hfile, &idat->run_infos[i].block_type, NULL); read_pfx_string(idat->hfile, &idat->run_infos[i].block_pars, NULL); read_pfx_string(idat->hfile, &idat->run_infos[i].block_code, NULL); read_pfx_string(idat->hfile, &idat->run_infos[i].code_version, NULL); } break; default: error("IDAT file format does not support TOC entry %d\n", id); break; } return 0; } static idat_t *idat_init(const char *fn, int load_arrays) { idat_t *idat = (idat_t *)calloc(1, sizeof(idat_t)); idat->fn = strdup(fn); idat->hfile = hopen(idat->fn, "rb"); if (idat->hfile == NULL) error("Could not open %s: %s\n", idat->fn, strerror(errno)); if (is_gzip(idat->hfile)) error("File %s is gzip compressed and currently cannot be sought\n", idat->fn); int i; uint8_t buffer[4]; if (hread(idat->hfile, (void *)buffer, 4) < 4) error("Failed to read magic number from %s file\n", idat->fn); if (memcmp(buffer, "IDAT", 4) != 0) error("IDAT file %s format identifier is bad\n", idat->fn); read_bytes(idat->hfile, (void *)&idat->version, sizeof(int64_t)); if (idat->version < 3) error("Cannot read IDAT file %s. 
Unsupported IDAT file format version: %ld\n", idat->fn, idat->version); read_bytes(idat->hfile, (void *)&idat->number_toc_entries, sizeof(int32_t)); idat->id = (uint16_t *)malloc(idat->number_toc_entries * sizeof(uint16_t)); idat->toc = (int64_t *)malloc(idat->number_toc_entries * sizeof(int64_t)); for (i = 0; i < idat->number_toc_entries; i++) { read_bytes(idat->hfile, (void *)&idat->id[i], sizeof(uint16_t)); read_bytes(idat->hfile, (void *)&idat->toc[i], sizeof(int64_t)); } for (i = 0; i < idat->number_toc_entries; i++) { if (!load_arrays && idat->id[i] <= MID_BLOCK) { if (idat->id[i] == MID_BLOCK) { if (hseek(idat->hfile, idat->toc[i], SEEK_SET) < 0) error("Fail to seek to position %ld in IDAT %s file\n", idat->toc[i], idat->fn); read_bytes(idat->hfile, (void *)&idat->num_mid_blocks, sizeof(int32_t)); } continue; } idat_read(idat, idat->id[i]); } if (idat->chip_type) { const chip_type_t *ptr; for (ptr = chip_types; ptr->chip_type; ptr++) { if (strcmp(idat->chip_type, ptr->chip_type) == 0 && ptr->num_snps == idat->num_snps && ptr->num_mid_blocks == idat->num_mid_blocks) idat->chip_type_guess = ptr->chip_type_guess; } } for (i = 0; i < idat->m_run_infos; i++) { if (strcmp(idat->run_infos[i].block_type, "Scan") != 0) continue; idat->imaging_date = idat->run_infos[i].run_time; idat->scanner_data = idat->run_infos[i].block_pars; } return idat; } static void idat_destroy(idat_t *idat) { if (!idat) return; if (hclose(idat->hfile) < 0) error("Error closing IDAT file %s\n", idat->fn); free(idat->fn); free(idat->id); free(idat->toc); free(idat->snp_manifest); free(idat->sentrix_barcode); free(idat->chip_type); free(idat->sentrix_position); free(idat->beadset); free(idat->sample_name); free(idat->description); free(idat->sample_plate); free(idat->sample_well); free(idat->vln); int i; for (i = 0; i < idat->m_run_infos; i++) { free(idat->run_infos[i].run_time); free(idat->run_infos[i].block_type); free(idat->run_infos[i].block_pars); free(idat->run_infos[i].block_code); 
free(idat->run_infos[i].code_version); } free(idat->run_infos); free(idat->ilmn_id); free(idat->sd); free(idat->mean); free(idat->nbeads); free(idat->mid_block); if (idat->ilmn_id2index) kh_destroy(32, idat->ilmn_id2index); free(idat); } static void idat_to_csv(const idat_t *idat, FILE *stream, int verbose) { int i; fprintf(stream, "Illumina, Inc.\n"); fprintf(stream, "[Heading]\n"); fprintf(stream, "Descriptor File Name,%s\n", strrchr(idat->fn, '/') ? strrchr(idat->fn, '/') + 1 : idat->fn); fprintf(stream, "IDAT file version,%ld\n", idat->version); fprintf(stream, "Number of TOC entries,%d\n", idat->number_toc_entries); fprintf(stream, "Probes Count,%d\n", idat->num_snps); fprintf(stream, "Mid Blocks Count,%d\n", idat->num_mid_blocks); fprintf(stream, "Red Green,%02x %02x %02x %02x\n", idat->red_green[0], idat->red_green[1], idat->red_green[2], idat->red_green[3]); fprintf(stream, "SNP Manifest,%s\n", idat->snp_manifest ? idat->snp_manifest : ""); fprintf(stream, "Sentrix Barcode,%s\n", idat->sentrix_barcode); fprintf(stream, "Chip Type,%s\n", idat->chip_type); fprintf(stream, "Sentrix Position,%s\n", idat->sentrix_position); fprintf(stream, "BeadSet,%s\n", idat->beadset ? idat->beadset : ""); fprintf(stream, "Sample Name,%s\n", idat->sample_name ? idat->sample_name : ""); fprintf(stream, "Description,%s\n", idat->description ? idat->description : ""); fprintf(stream, "Sample Plate,%s\n", idat->sample_plate ? idat->sample_plate : ""); fprintf(stream, "Sample Well,%s\n", idat->sample_well ? idat->sample_well : ""); fprintf(stream, "Sample Count,%d\n", idat->sample_count); fprintf(stream, "Vln,%s\n", idat->vln ? idat->vln : ""); fprintf(stream, "Chip Prefix (Guess),%s\n", idat->chip_type_guess ? 
idat->chip_type_guess : "Unknown"); fprintf(stream, "[Assay]\n"); fprintf(stream, "IlmnID,Sd,Mean,Nbeads\n"); if (verbose) { for (i = 0; i < idat->num_snps; i++) fprintf(stream, "%d,%d,%d,%d\n", idat->ilmn_id[i], idat->sd[i], idat->mean[i], idat->nbeads[i]); fprintf(stream, "[Mid Blocks]\n"); for (i = 0; i < idat->num_mid_blocks; i++) fprintf(stream, "%d\n", idat->mid_block[i]); } else { fprintf(stream, "... use --verbose to visualize Assay data ...\n"); fprintf(stream, "[Mid Blocks]\n"); fprintf(stream, "... use --verbose to visualize Mid Blocks data ...\n"); } fprintf(stream, "[Run Infos]\n"); for (i = 0; i < idat->m_run_infos; i++) { fprintf(stream, "%s\t%s\t%s\t%s\t%s\n", idat->run_infos[i].run_time, idat->run_infos[i].block_type, idat->run_infos[i].block_pars, idat->run_infos[i].block_code, idat->run_infos[i].code_version); } } static void idats_to_tsv(idat_t **idats, int n, FILE *stream) { fprintf(stream, "idat\tnumber_probes\tnumber_mid_blocks\tred_green\tmanifest_file\tsentrix_" "barcode\tchip_type\t" "sentrix_position\tbeadset\tsample_name\tdescription\tsample_plate\tsample_" "well\tsample_count\tvln\t" "chip_type_guess\tscan_date\tscanner_data\n"); int i; for (i = 0; i < n; i++) { idat_t *idat = idats[i]; fprintf(stream, "%s\t%d\t%d\t%02x %02x %02x " "%02x\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\n", strrchr(idat->fn, '/') ? strrchr(idat->fn, '/') + 1 : idat->fn, idat->num_snps, idat->num_mid_blocks, idat->red_green[0], idat->red_green[1], idat->red_green[2], idat->red_green[3], idat->snp_manifest ? idat->snp_manifest : "", idat->sentrix_barcode, idat->chip_type, idat->sentrix_position, idat->beadset ? idat->beadset : "", idat->sample_name ? idat->sample_name : "", idat->description ? idat->description : "", idat->sample_plate ? idat->sample_plate : "", idat->sample_well ? idat->sample_well : "", idat->sample_count, idat->vln ? idat->vln : "", idat->chip_type_guess ? idat->chip_type_guess : "Unknown", idat->imaging_date ? 
idat->imaging_date : "", idat->scanner_data ? idat->scanner_data : ""); } } /**************************************** * GTC FILE IMPLEMENTATION * ****************************************/ // http://github.com/broadinstitute/picard/blob/master/src/main/java/picard/arrays/illumina/InfiniumGTCFile.java // http://github.com/Illumina/BeadArrayFiles/blob/develop/docs/GTC_File_Format_v5.pdf // http://github.com/Illumina/BeadArrayFiles/blob/develop/module/GenotypeCalls.py #define NUM_SNPS 1 #define PLOIDY 2 // AutoConvert 2.0 #define PLOIDY_TYPE 3 // AutoConvert 2.0 #define GTC_SAMPLE_NAME 10 #define GTC_SAMPLE_PLATE 11 #define GTC_SAMPLE_WELL 12 #define CLUSTER_FILE 100 #define GTC_SNP_MANIFEST 101 #define IMAGING_DATE 200 #define AUTOCALL_DATE 201 #define AUTOCALL_VERSION 300 #define NORMALIZATION_TRANSFORMS 400 #define CONTROLS_X 500 #define CONTROLS_Y 501 #define RAW_X 1000 #define RAW_Y 1001 #define GENOTYPES 1002 #define BASE_CALLS 1003 #define GENOTYPE_SCORES 1004 #define SCANNER_DATA 1005 #define CALL_RATE 1006 #define GENDER 1007 #define LOGR_DEV 1008 #define GC10 1009 #define DX 1010 #define SAMPLE_DATA 1011 #define B_ALLELE_FREQS 1012 // AutoConvert 2.0 #define LOGR_RATIOS 1013 // AutoConvert 2.0 #define PERCENTILES_X 1014 // AutoConvert 2.0 #define PERCENTILES_Y 1015 // AutoConvert 2.0 #define SLIDE_IDENTIFIER 1016 // AutoConvert 2.0 static const char *code2genotype[] = { "NC", "AA", "AB", "BB", "NULL", "A", "B", "AAA", "AAB", "ABB", "BBB", "AAAA", "AAAB", "AABB", "ABBB", "BBBB", "AAAAA", "AAAAB", "AAABB", "AABBB", "ABBBB", "BBBBB", "AAAAAA", "AAAAAB", "AAAABB", "AAABBB", "AABBBB", "ABBBBB", "BBBBBB", "AAAAAAA", "AAAAAAB", "AAAAABB", "AAAABBB", "AAABBBB", "AABBBBB", "ABBBBBB", "BBBBBBB", "AAAAAAAA", "AAAAAAAB", "AAAAAABB", "AAAAABBB", "AAAABBBB", "AAABBBBB", "AABBBBBB", "ABBBBBBB", "BBBBBBBB"}; typedef struct { int32_t version; float offset_x; float offset_y; float scale_x; float scale_y; float shear; float theta; float cvx; float cvy; float nn12; float rr12; 
float taa; float tbb; } XForm; typedef char BaseCall[2]; typedef struct { char *scanner_name; int32_t pmt_green; int32_t pmt_red; char *scanner_version; char *imaging_user; } ScannerData; typedef struct { float p50gc; int32_t num_calls; int32_t num_no_calls; int32_t num_intensity_only; } SampleData; typedef uint16_t Percentiles[3]; typedef struct { char *fn; hFILE *hfile; int32_t version; int32_t number_toc_entries; uint16_t *id; int32_t *toc; int32_t num_snps; int32_t ploidy; int32_t ploidy_type; char *sample_name; char *sample_plate; char *sample_well; char *cluster_file; char *snp_manifest; char *imaging_date; char *autocall_date; char *autocall_version; XForm *normalization_transforms; size_t m_normalization_transforms; uint16_t *controls_x; size_t m_controls_x; uint16_t *controls_y; size_t m_controls_y; ScannerData scanner_data; float call_rate; char gender; float logr_dev; float p10gc; int32_t dx; SampleData sample_data; Percentiles percentiles_x; Percentiles percentiles_y; char *sentrix_id; char *display_name; float *sin_theta; // precomputed sine transforms float *cos_theta; // precomputed cosine transforms size_t capacity; buffer_array_t *raw_x; buffer_array_t *raw_y; buffer_array_t *genotypes; buffer_array_t *base_calls; buffer_array_t *genotype_scores; buffer_array_t *b_allele_freqs; buffer_array_t *logr_ratios; } gtc_t; static int gtc_read(gtc_t *gtc, uint16_t id) { int i; for (i = 0; i < gtc->number_toc_entries && id != gtc->id[i]; i++); if (i == gtc->number_toc_entries) return -1; if (id != NUM_SNPS && id != PLOIDY && id != PLOIDY_TYPE) { if (hseek(gtc->hfile, gtc->toc[i], SEEK_SET) < 0) error("Fail to seek to position %d in GTC %s file \n", gtc->toc[i], gtc->fn); } switch (id) { case NUM_SNPS: gtc->num_snps = gtc->toc[i]; break; case PLOIDY: gtc->ploidy = gtc->toc[i]; break; case PLOIDY_TYPE: gtc->ploidy_type = gtc->toc[i]; break; case GTC_SAMPLE_NAME: read_pfx_string(gtc->hfile, >c->sample_name, NULL); break; case GTC_SAMPLE_PLATE: 
read_pfx_string(gtc->hfile, >c->sample_plate, NULL); break; case GTC_SAMPLE_WELL: read_pfx_string(gtc->hfile, >c->sample_well, NULL); break; case CLUSTER_FILE: read_pfx_string(gtc->hfile, >c->cluster_file, NULL); break; case GTC_SNP_MANIFEST: read_pfx_string(gtc->hfile, >c->snp_manifest, NULL); break; case IMAGING_DATE: read_pfx_string(gtc->hfile, >c->imaging_date, NULL); break; case AUTOCALL_DATE: read_pfx_string(gtc->hfile, >c->autocall_date, NULL); break; case AUTOCALL_VERSION: read_pfx_string(gtc->hfile, >c->autocall_version, NULL); break; case NORMALIZATION_TRANSFORMS: read_pfx_array(gtc->hfile, (void **)>c->normalization_transforms, >c->m_normalization_transforms, sizeof(XForm)); break; case CONTROLS_X: read_pfx_array(gtc->hfile, (void **)>c->controls_x, >c->m_controls_x, sizeof(uint16_t)); break; case CONTROLS_Y: read_pfx_array(gtc->hfile, (void **)>c->controls_y, >c->m_controls_y, sizeof(uint16_t)); break; case RAW_X: gtc->raw_x = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(uint16_t)); break; case RAW_Y: gtc->raw_y = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(uint16_t)); break; case GENOTYPES: gtc->genotypes = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(uint8_t)); break; case BASE_CALLS: gtc->base_calls = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(BaseCall)); break; case GENOTYPE_SCORES: gtc->genotype_scores = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(float)); break; case SCANNER_DATA: read_pfx_string(gtc->hfile, >c->scanner_data.scanner_name, NULL); read_bytes(gtc->hfile, (void *)>c->scanner_data.pmt_green, sizeof(float)); read_bytes(gtc->hfile, (void *)>c->scanner_data.pmt_red, sizeof(float)); read_pfx_string(gtc->hfile, >c->scanner_data.scanner_version, NULL); read_pfx_string(gtc->hfile, >c->scanner_data.imaging_user, NULL); break; case CALL_RATE: read_bytes(gtc->hfile, (void *)>c->call_rate, sizeof(float)); break; case GENDER: read_bytes(gtc->hfile, (void *)>c->gender, sizeof(char)); break; case LOGR_DEV: 
read_bytes(gtc->hfile, (void *)>c->logr_dev, sizeof(float)); break; case GC10: read_bytes(gtc->hfile, (void *)>c->p10gc, sizeof(float)); break; case DX: read_bytes(gtc->hfile, (void *)>c->dx, sizeof(int32_t)); break; case SAMPLE_DATA: read_bytes(gtc->hfile, (void *)>c->sample_data, sizeof(SampleData)); break; case B_ALLELE_FREQS: gtc->b_allele_freqs = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(float)); break; case LOGR_RATIOS: gtc->logr_ratios = buffer_array_init(gtc->hfile, gtc->capacity, sizeof(float)); break; case PERCENTILES_X: read_bytes(gtc->hfile, (void *)>c->percentiles_x, sizeof(Percentiles)); break; case PERCENTILES_Y: read_bytes(gtc->hfile, (void *)>c->percentiles_y, sizeof(Percentiles)); break; case SLIDE_IDENTIFIER: read_pfx_string(gtc->hfile, >c->sentrix_id, NULL); break; default: error("GTC file format does not support TOC entry %d\n", id); break; } return 0; } static gtc_t *gtc_init(const char *fn, size_t capacity) { gtc_t *gtc = (gtc_t *)calloc(1, sizeof(gtc_t)); gtc->fn = strdup(fn); gtc->hfile = hopen(gtc->fn, "rb"); if (gtc->hfile == NULL) error("Could not open %s: %s\n", gtc->fn, strerror(errno)); if (is_gzip(gtc->hfile)) error("File %s is gzip compressed and currently cannot be sought\n", gtc->fn); int i; uint8_t buffer[4]; if (hread(gtc->hfile, (void *)buffer, 4) < 4) error("Failed to read magic number from %s file\n", gtc->fn); if (memcmp(buffer, "gtc", 3) != 0) error("GTC file %s format identifier is bad\n", gtc->fn); if (buffer[3] > 5 && buffer[3] < 3) error("GTC file %s version %d is unsupported\n", gtc->fn, buffer[3]); gtc->version = (int32_t)buffer[3]; read_bytes(gtc->hfile, (void *)>c->number_toc_entries, sizeof(int32_t)); gtc->id = (uint16_t *)malloc(gtc->number_toc_entries * sizeof(uint16_t)); gtc->toc = (int32_t *)malloc(gtc->number_toc_entries * sizeof(int32_t)); for (i = 0; i < gtc->number_toc_entries; i++) { read_bytes(gtc->hfile, (void *)>c->id[i], sizeof(uint16_t)); read_bytes(gtc->hfile, (void *)>c->toc[i], 
sizeof(int32_t)); } gtc->capacity = capacity; for (i = 0; i < gtc->number_toc_entries; i++) gtc_read(gtc, gtc->id[i]); const char *ptr = strrchr(gtc->fn, '/') ? strrchr(gtc->fn, '/') + 1 : gtc->fn; gtc->display_name = strndup(ptr, strlen(ptr) - 4); gtc->sin_theta = (float *)malloc(gtc->m_normalization_transforms * sizeof(float)); gtc->cos_theta = (float *)malloc(gtc->m_normalization_transforms * sizeof(float)); for (i = 0; i < gtc->m_normalization_transforms; i++) { gtc->sin_theta[i] = (float)sin((double)gtc->normalization_transforms[i].theta); gtc->cos_theta[i] = (float)cos((double)gtc->normalization_transforms[i].theta); } return gtc; } static void gtc_destroy(gtc_t *gtc) { if (!gtc) return; if (hclose(gtc->hfile) < 0) error("Error closing GTC file %s\n", gtc->fn); free(gtc->fn); free(gtc->id); free(gtc->toc); free(gtc->sample_name); free(gtc->sample_plate); free(gtc->sample_well); free(gtc->cluster_file); free(gtc->snp_manifest); free(gtc->imaging_date); free(gtc->autocall_date); free(gtc->autocall_version); free(gtc->normalization_transforms); free(gtc->controls_x); free(gtc->controls_y); free(gtc->scanner_data.scanner_name); free(gtc->scanner_data.scanner_version); free(gtc->scanner_data.imaging_user); free(gtc->sentrix_id); free(gtc->display_name); free(gtc->sin_theta); free(gtc->cos_theta); buffer_array_destroy(gtc->raw_x); buffer_array_destroy(gtc->raw_y); buffer_array_destroy(gtc->genotypes); buffer_array_destroy(gtc->base_calls); buffer_array_destroy(gtc->genotype_scores); buffer_array_destroy(gtc->b_allele_freqs); buffer_array_destroy(gtc->logr_ratios); free(gtc); } static void gtc_to_csv(const gtc_t *gtc, FILE *stream, int verbose) { fprintf(stream, "Illumina, Inc.\n"); fprintf(stream, "[Heading]\n"); fprintf(stream, "Descriptor File Name,%s\n", strrchr(gtc->fn, '/') ? 
strrchr(gtc->fn, '/') + 1 : gtc->fn); fprintf(stream, "GTC genotype file version,%d\n", gtc->version); fprintf(stream, "Number of TOC entries,%d\n", gtc->number_toc_entries); fprintf(stream, "Number of SNPs,%d\n", gtc->num_snps); fprintf(stream, "Ploidy,%d\n", gtc->ploidy); fprintf(stream, "Ploidy Type,%d\n", gtc->ploidy_type); fprintf(stream, "Sample name,%s\n", gtc->sample_name ? gtc->sample_name : ""); fprintf(stream, "Sample plate,%s\n", gtc->sample_plate ? gtc->sample_plate : ""); fprintf(stream, "Sample well,%s\n", gtc->sample_well ? gtc->sample_well : ""); fprintf(stream, "Cluster file,%s\n", gtc->cluster_file ? gtc->cluster_file : ""); fprintf(stream, "SNP manifest,%s\n", gtc->snp_manifest ? gtc->snp_manifest : ""); fprintf(stream, "Imaging date,%s\n", gtc->imaging_date ? gtc->imaging_date : ""); fprintf(stream, "AutoCall date,%s\n", gtc->autocall_date ? gtc->autocall_date : ""); fprintf(stream, "AutoCall version,%s\n", gtc->autocall_version); fprintf(stream, "Number of normalization transforms,%ld\n", gtc->m_normalization_transforms); fprintf(stream, "Number of controls X,%ld\n", gtc->m_controls_x); fprintf(stream, "Number of controls Y,%ld\n", gtc->m_controls_y); fprintf(stream, "Name of the scanner,%s\n", gtc->scanner_data.scanner_name ? gtc->scanner_data.scanner_name : ""); fprintf(stream, "Pmt Green,%d\n", gtc->scanner_data.pmt_green); fprintf(stream, "Pmt Red,%d\n", gtc->scanner_data.pmt_red); fprintf(stream, "Version of the scanner software used,%s\n", gtc->scanner_data.scanner_version ? gtc->scanner_data.scanner_version : ""); fprintf(stream, "Name of the scanner user,%s\n", gtc->scanner_data.imaging_user ? 
gtc->scanner_data.imaging_user : ""); fprintf(stream, "Call Rate,%f\n", gtc->call_rate); fprintf(stream, "Computed Gender,%c\n", gtc->gender); fprintf(stream, "LogR deviation,%f\n", gtc->logr_dev); fprintf(stream, "GenCall score - 10th percentile,%f\n", gtc->p10gc); fprintf(stream, "DX,%d\n", gtc->dx); fprintf(stream, "GenCall score - 50th percentile,%f\n", gtc->sample_data.p50gc); fprintf(stream, "Number of valid calls,%d\n", gtc->sample_data.num_calls); fprintf(stream, "Number of invalid calls,%d\n", gtc->sample_data.num_no_calls); fprintf(stream, "Number of loci that are \"Intensity Only\" or \"Zeroed\",%d\n", gtc->sample_data.num_intensity_only); fprintf(stream, "P05 X,%d\n", gtc->percentiles_x[0]); fprintf(stream, "P50 X,%d\n", gtc->percentiles_x[1]); fprintf(stream, "P95 X,%d\n", gtc->percentiles_x[2]); fprintf(stream, "P05 Y,%d\n", gtc->percentiles_y[0]); fprintf(stream, "P50 Y,%d\n", gtc->percentiles_y[1]); fprintf(stream, "P95 Y,%d\n", gtc->percentiles_y[2]); fprintf(stream, "Sentrix identifier for the slide,%s\n", gtc->sentrix_id ? gtc->sentrix_id : ""); fprintf(stream, "[Assay]\n"); fprintf(stream, "Raw X,Raw Y,GType,Top Alleles,Score,B Allele Freq,Log R Ratio\n"); int i; if (verbose) { for (i = 0; i < gtc->num_snps; i++) { uint16_t raw_x = 0, raw_y = 0; get_element(gtc->raw_x, (void *)&raw_x, i); get_element(gtc->raw_y, (void *)&raw_y, i); uint8_t genotype = 0; get_element(gtc->genotypes, (void *)&genotype, i); BaseCall base_call = {'-', '-'}; get_element(gtc->base_calls, (void *)&base_call, i); float genotype_score = NAN, b_allele_freq = NAN, logr_ratio = NAN; get_element(gtc->genotype_scores, (void *)&genotype_score, i); get_element(gtc->b_allele_freqs, (void *)&b_allele_freq, i); get_element(gtc->logr_ratios, (void *)&logr_ratio, i); fprintf(stream, "%d,%d,%s,%c%c,%.10f,%.10f,%.10f\n", raw_x, raw_y, code2genotype[genotype], base_call[0], base_call[1], genotype_score, b_allele_freq, logr_ratio); } } else { fprintf(stream, "... 
use --verbose to visualize assay data ...\n"); } fprintf(stream, "[Normalization Transforms]\n"); fprintf(stream, "Version,Offset X,Offset Y,Scale X,Scale Y,Shear,Theta,CVX,CVY,NN12,RR12,TAA,TBB\n"); if (verbose) { for (i = 0; i < gtc->m_normalization_transforms; i++) { fprintf(stream, "%d,%.10f,%.10f,%.10f,%.10f,%.10f,%.10f,", gtc->normalization_transforms[i].version, gtc->normalization_transforms[i].offset_x, gtc->normalization_transforms[i].offset_y, gtc->normalization_transforms[i].scale_x, gtc->normalization_transforms[i].scale_y, gtc->normalization_transforms[i].shear, gtc->normalization_transforms[i].theta); fprintf(stream, "%.10f,%.10f,%.10f,%.10f,%.10f,%.10f\n", gtc->normalization_transforms[i].cvx, gtc->normalization_transforms[i].cvy, gtc->normalization_transforms[i].nn12, gtc->normalization_transforms[i].rr12, gtc->normalization_transforms[i].taa, gtc->normalization_transforms[i].tbb); } } else { fprintf(stream, "... use --verbose to visualize assay data ...\n"); } // fprintf(stream, "[Controls]\n"); // fprintf(stream, "Raw X,Raw Y\n"); // if (verbose) { // for (i = 0; i < gtc->m_controls_x; i++) // fprintf(stream, "%d,%d\n", gtc->controls_x[i], gtc->controls_y[i]); // } else { // fprintf(stream, "... 
use --verbose to visualize controls data ...\n"); // } } static void gtcs_to_tsv(gtc_t **gtcs, int n, FILE *stream) { fprintf(stream, "gtc\tnumber_snps\tploidy\tploidy_type\tsample_name\tsample_plate\tsample_" "well\tcluster_file\tsnp_manifest\t" "scan_date\tautocall_date\tautocall_version\tnumber_normalization_" "transforms\tnumber_x_controls\t" "number_y_controls\tscanner_name\tpmt_green\tpmt_red\tscanner_software_" "version\tscanner_username\tcall_rate\t" "computed_gender\tlogr_deviation\tgencall_score_10_percentile\tdx\tgencall_score_" "50_percentile\t" "number_valid_calls\tnumber_invalid_calls\tnumber_intensity_only_or_zeroed_" "loci\tp05_x\tp50_x\tp95_x\tp05_y\t" "p50_y\tp95_y\tsentrix_barcode\n"); int i; for (i = 0; i < n; i++) { gtc_t *gtc = gtcs[i]; fprintf(stream, "%s\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%ld\t%ld\t%ld\t%s\t%d\t%d\t%" "s\t%s\t%f\t%c\t%f\t%f\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\n", strrchr(gtc->fn, '/') ? strrchr(gtc->fn, '/') + 1 : gtc->fn, gtc->num_snps, gtc->ploidy, gtc->ploidy_type, gtc->sample_name ? gtc->sample_name : "", gtc->sample_plate ? gtc->sample_plate : "", gtc->sample_well ? gtc->sample_well : "", gtc->cluster_file ? gtc->cluster_file : "", gtc->snp_manifest ? gtc->snp_manifest : "", gtc->imaging_date ? gtc->imaging_date : "", gtc->autocall_date ? gtc->autocall_date : "", gtc->autocall_version ? gtc->autocall_version : "", gtc->m_normalization_transforms, gtc->m_controls_x, gtc->m_controls_y, gtc->scanner_data.scanner_name ? gtc->scanner_data.scanner_name : "", gtc->scanner_data.pmt_green, gtc->scanner_data.pmt_red, gtc->scanner_data.scanner_version ? gtc->scanner_data.scanner_version : "", gtc->scanner_data.imaging_user ? 
gtc->scanner_data.imaging_user : "", gtc->call_rate, gtc->gender, gtc->logr_dev, gtc->p10gc, gtc->dx, gtc->sample_data.p50gc, gtc->sample_data.num_calls, gtc->sample_data.num_no_calls, gtc->sample_data.num_intensity_only, gtc->percentiles_x[0], gtc->percentiles_x[1], gtc->percentiles_x[2], gtc->percentiles_y[0], gtc->percentiles_y[1], gtc->percentiles_y[2], gtc->sentrix_id ? gtc->sentrix_id : ""); } } /**************************************** * SAM FILE IMPLEMENTATION * ****************************************/ static bpm_t *sam_csv_init(const char *fn, bpm_t *bpm, const char *genome_build, int flags) { htsFile *hts = hts_open(fn, "r"); if (hts == NULL || hts_get_format(hts)->category != sequence_data) error("File %s does not contain sequence data\n", fn); sam_hdr_t *sam_hdr = sam_hdr_read(hts); if (sam_hdr == NULL) error("Reading header from \"%s\" failed", fn); bam1_t *b = bam_init1(); if (b == NULL) error("Cannot create SAM record\n"); kstring_t str = {0, 0, NULL}; const char *chromosome = NULL; int i, strand = -1, position = 0, n_unmapped = 0; for (i = 0; i < bpm->num_loci; i++) { LocusEntry *locus_entry = &bpm->locus_entries[i]; int idx = get_position(hts, sam_hdr, b, locus_entry->ilmn_id, locus_entry->source_seq, 1, &chromosome, &position, &strand); if (idx < 0) { error("Reading from %s failed", fn); } else if (idx == 0) { if (flags & VERBOSE) fprintf(stderr, "Unable to determine position for marker %s\n", locus_entry->ilmn_id); n_unmapped++; } free(locus_entry->genome_build); locus_entry->genome_build = strdup(genome_build); free(locus_entry->chrom); locus_entry->chrom = strdup(chromosome ? chromosome : "0"); free(locus_entry->map_info); str.l = 0; kputw(position, &str); locus_entry->map_info = strdup(str.s); free(locus_entry->ref_strand); locus_entry->ref_strand = ((strand < 0) || ((strcasecmp(locus_entry->ilmn_strand, locus_entry->source_strand) != 0) == strand)) ? 
strdup("+") : strdup("-"); } fprintf(stderr, "Lines total/unmapped:\t%d/%d\n", bpm->num_loci, n_unmapped); free(str.s); bam_destroy1(b); sam_hdr_destroy(sam_hdr); if (hts_close(hts) < 0) error("closing \"%s\" failed", fn); return bpm; } /**************************************** * INTENSITIES COMPUTATIONS * ****************************************/ // compute normalized intensities (http://github.com/Illumina/BeadArrayFiles/blob/develop/docs/GTC_File_Format_v5.pdf) static inline void raw_x_y2norm_x_y(uint16_t raw_x, uint16_t raw_y, float offset_x, float offset_y, float cos_theta, float sin_theta, float shear, float scale_x, float scale_y, float *norm_x, float *norm_y) { float temp_x = (float)raw_x - offset_x; float temp_y = (float)raw_y - offset_y; float temp_x2 = cos_theta * temp_x + sin_theta * temp_y; float temp_y2 = -sin_theta * temp_x + cos_theta * temp_y; float temp_x3 = temp_x2 - shear * temp_y2; *norm_x = temp_x3 < 0.0f ? 0.0f : temp_x3 / scale_x; *norm_y = temp_y2 < 0.0f ? 0.0f : temp_y2 / scale_y; } // compute Theta and R from raw intensities static inline void norm_x_y2ilmn_theta_r(float norm_x, float norm_y, float *ilmn_theta, float *ilmn_r) { *ilmn_theta = (float)(atan2((double)norm_y, (double)norm_x) * M_2_PI); *ilmn_r = norm_x + norm_y; } static void adjust_clusters(const uint8_t *gts, const float *ilmn_theta, const float *ilmn_r, int n, ClusterRecord *cluster_record) { cluster_record->aa_cluster_stats.N = 0; cluster_record->ab_cluster_stats.N = 0; cluster_record->bb_cluster_stats.N = 0; cluster_record->aa_cluster_stats.theta_mean *= 0.2f; cluster_record->ab_cluster_stats.theta_mean *= 0.2f; cluster_record->bb_cluster_stats.theta_mean *= 0.2f; cluster_record->aa_cluster_stats.r_mean *= 0.2f; cluster_record->ab_cluster_stats.r_mean *= 0.2f; cluster_record->bb_cluster_stats.r_mean *= 0.2f; int i; for (i = 0; i < n; i++) { switch (gts[i]) { case GT_AA: cluster_record->aa_cluster_stats.N++; cluster_record->aa_cluster_stats.theta_mean += ilmn_theta[i]; 
cluster_record->aa_cluster_stats.r_mean += ilmn_r[i]; break; case GT_AB: cluster_record->ab_cluster_stats.N++; cluster_record->ab_cluster_stats.theta_mean += ilmn_theta[i]; cluster_record->ab_cluster_stats.r_mean += ilmn_r[i]; break; case GT_BB: cluster_record->bb_cluster_stats.N++; cluster_record->bb_cluster_stats.theta_mean += ilmn_theta[i]; cluster_record->bb_cluster_stats.r_mean += ilmn_r[i]; break; default: break; } } cluster_record->aa_cluster_stats.theta_mean /= ((float)cluster_record->aa_cluster_stats.N + 0.2f); cluster_record->ab_cluster_stats.theta_mean /= ((float)cluster_record->ab_cluster_stats.N + 0.2f); cluster_record->bb_cluster_stats.theta_mean /= ((float)cluster_record->bb_cluster_stats.N + 0.2f); cluster_record->aa_cluster_stats.r_mean /= ((float)cluster_record->aa_cluster_stats.N + 0.2f); cluster_record->ab_cluster_stats.r_mean /= ((float)cluster_record->ab_cluster_stats.N + 0.2f); cluster_record->bb_cluster_stats.r_mean /= ((float)cluster_record->bb_cluster_stats.N + 0.2f); } /**************************************** * CONVERSION UTILITIES * ****************************************/ static inline char rev_allele(char allele) { static const char allele_complement[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'T', 0, 'G', 'D', 0, 0, 'C', 0, 'I', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'A', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; if (allele > 95) return 0; return allele_complement[(int)allele]; } static void gtcs_to_gs(gtc_t **gtc, int n, const bpm_t *bpm, const egt_t *egt, FILE *stream, int flags) { int i, j; // print header fputs("Index\tName\tAddress\tChr\tPosition", stream); if (flags & EGT_LOADED) fputs("\tGenTrain Score", stream); if (flags & BPM_LOADED) fputs("\tFrac A\tFrac C\tFrac G\tFrac T", stream); for (i = 0; i < n; i++) { if (flags & FORMAT_GT) fprintf(stream, "\t%s.GType", 
gtc[i]->display_name); if (flags & FORMAT_IGC) fprintf(stream, "\t%s.Score", gtc[i]->display_name); if ((flags & BPM_LOADED) && (flags & FORMAT_THETA)) fprintf(stream, "\t%s.Theta", gtc[i]->display_name); if ((flags & BPM_LOADED) && (flags & FORMAT_R)) fprintf(stream, "\t%s.R", gtc[i]->display_name); if (flags & FORMAT_BAF) fprintf(stream, "\t%s.B Allele Freq", gtc[i]->display_name); if (flags & FORMAT_LRR) fprintf(stream, "\t%s.Log R Ratio", gtc[i]->display_name); if (flags & FORMAT_X) fprintf(stream, "\t%s.X Raw", gtc[i]->display_name); if (flags & FORMAT_Y) fprintf(stream, "\t%s.Y Raw", gtc[i]->display_name); if ((flags & BPM_LOADED) && (flags & FORMAT_NORMX)) fprintf(stream, "\t%s.X", gtc[i]->display_name); if ((flags & BPM_LOADED) && (flags & FORMAT_NORMY)) fprintf(stream, "\t%s.Y", gtc[i]->display_name); if (flags & FORMAT_GT) fprintf(stream, "\t%s.Top Alleles\t%s.Plus/Minus Alleles", gtc[i]->display_name, gtc[i]->display_name); } fputc('\n', stream); // print loci for (j = 0; j < bpm->num_loci; j++) { LocusEntry *locus_entry = &bpm->locus_entries[j]; int norm_id = locus_entry && bpm->norm_lookups && bpm->locus_entries[j].norm_id != 0xFF ? bpm->norm_lookups[bpm->locus_entries[j].norm_id] : -1; ClusterRecord *cluster_record = NULL; if (flags & EGT_LOADED) { int idx; int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx); if (ret < 0) error("Illumina probe %s not found in cluster file\n", locus_entry->name); cluster_record = &egt->cluster_records[idx]; } int strand = !locus_entry->ref_strand ? -1 : (strcmp(locus_entry->ref_strand, "+") == 0 ? 0 : (strcmp(locus_entry->ref_strand, "-") == 0 ? 1 : -1)); if (strand < 0) error("Unable to process reference strand %s\n", locus_entry->ref_strand); fprintf(stream, "%d\t%s\t%d\t%s\t%s", bpm->indexes ? 
bpm->indexes[j] : j, locus_entry->name, locus_entry->address_a, locus_entry->chrom, locus_entry->map_info); if (cluster_record) fprintf(stream, "\t%f", cluster_record->cluster_score.total_score); if (flags & BPM_LOADED) fprintf(stream, "\t%f\t%f\t%f\t%f", locus_entry->frac_a, locus_entry->frac_c, locus_entry->frac_g, locus_entry->frac_t); uint16_t raw_x, raw_y; float norm_x, norm_y, ilmn_r, ilmn_theta, baf, lrr; for (i = 0; i < n; i++) { uint8_t genotype; get_element(gtc[i]->genotypes, (void *)&genotype, j); float genotype_score; get_element(gtc[i]->genotype_scores, (void *)&genotype_score, j); BaseCall base_call; get_element(gtc[i]->base_calls, (void *)&base_call, j); get_element(gtc[i]->raw_x, (void *)&raw_x, j); get_element(gtc[i]->raw_y, (void *)&raw_y, j); norm_x = -NAN; norm_y = -NAN; ilmn_theta = -NAN; ilmn_r = -NAN; baf = -NAN; lrr = -NAN; if ((raw_x || raw_y) && norm_id >= 0) { XForm *xform = >c[i]->normalization_transforms[norm_id]; raw_x_y2norm_x_y(raw_x, raw_y, xform->offset_x, xform->offset_y, gtc[i]->cos_theta[norm_id], gtc[i]->sin_theta[norm_id], xform->shear, xform->scale_x, xform->scale_y, &norm_x, &norm_y); norm_x_y2ilmn_theta_r(norm_x, norm_y, &ilmn_theta, &ilmn_r); if (cluster_record) get_baf_lrr(ilmn_theta, ilmn_r, cluster_record->aa_cluster_stats.theta_mean, cluster_record->ab_cluster_stats.theta_mean, cluster_record->bb_cluster_stats.theta_mean, cluster_record->aa_cluster_stats.r_mean, cluster_record->ab_cluster_stats.r_mean, cluster_record->bb_cluster_stats.r_mean, locus_entry->intensity_only ? cluster_record->r_mean : NAN, &baf, &lrr); } if (isnan(baf)) get_element(gtc[i]->b_allele_freqs, (void *)&baf, j); if (isnan(lrr)) get_element(gtc[i]->logr_ratios, (void *)&lrr, j); char allele_a = strand ? rev_allele(locus_entry->snp[1]) : locus_entry->snp[1]; char allele_b = strand ? 
rev_allele(locus_entry->snp[3]) : locus_entry->snp[3]; BaseCall ref_call; switch (genotype) { case GT_NC: ref_call[0] = '-'; ref_call[1] = '-'; break; case GT_AA: ref_call[0] = allele_a; ref_call[1] = allele_a; break; case GT_AB: ref_call[0] = allele_a; ref_call[1] = allele_b; break; case GT_BB: ref_call[0] = allele_b; ref_call[1] = allele_b; break; default: error("Unable to process marker %s\n", locus_entry->name); break; } if (flags & FORMAT_GT) fprintf(stream, "\t%s", code2genotype[genotype]); if (flags & FORMAT_IGC) fprintf(stream, "\t%f", genotype_score); if ((flags & BPM_LOADED) && (flags & FORMAT_THETA)) fprintf(stream, "\t%f", ilmn_theta); if ((flags & BPM_LOADED) && (flags & FORMAT_R)) fprintf(stream, "\t%f", ilmn_r); if (flags & FORMAT_BAF) fprintf(stream, "\t%f", baf); if (flags & FORMAT_LRR) fprintf(stream, "\t%f", lrr); if (flags & FORMAT_X) fprintf(stream, "\t%u", raw_x); if (flags & FORMAT_Y) fprintf(stream, "\t%u", raw_y); if ((flags & BPM_LOADED) && (flags & FORMAT_NORMX)) fprintf(stream, "\t%f", norm_x); if ((flags & BPM_LOADED) && (flags & FORMAT_NORMY)) fprintf(stream, "\t%f", norm_y); if (flags & FORMAT_GT) fprintf(stream, "\t%c%c\t%c%c", base_call[0], base_call[1], ref_call[0], ref_call[1]); } fputc('\n', stream); } } static bcf_hdr_t *hdr_init(const faidx_t *fai, int flags) { bcf_hdr_t *hdr = bcf_hdr_init("w"); int i, n = faidx_nseq(fai); for (i = 0; i < n; i++) { const char *seq = faidx_iseq(fai, i); int len = faidx_seq_len(fai, seq); bcf_hdr_printf(hdr, "##contig=", seq, len); } bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); if (flags & BPM_LOADED) { bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); } if (flags & CSV_LOADED) { bcf_hdr_append(hdr, "##INFO="); } if ((flags & BPM_LOADED) | (flags & CSV_LOADED)) { bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); } if (flags & EGT_LOADED) { 
bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); } if (!(flags & NO_INFO_GC)) bcf_hdr_append(hdr, "##INFO="); if (flags & FORMAT_GT) bcf_hdr_append(hdr, "##FORMAT="); if (flags & FORMAT_GQ) bcf_hdr_append(hdr, "##FORMAT="); if (flags & FORMAT_IGC) bcf_hdr_append(hdr, "##FORMAT="); if (flags & FORMAT_BAF) bcf_hdr_append(hdr, "##FORMAT="); if (flags & FORMAT_LRR) bcf_hdr_append(hdr, "##FORMAT="); if ((flags & BPM_LOADED) | (flags & GENOME_STUDIO)) { if (flags & FORMAT_NORMX) bcf_hdr_append(hdr, "##FORMAT="); if (flags & FORMAT_NORMY) bcf_hdr_append(hdr, "##FORMAT="); if (flags & FORMAT_R) bcf_hdr_append(hdr, "##FORMAT="); if (flags & FORMAT_THETA) bcf_hdr_append(hdr, "##FORMAT="); } if (flags & FORMAT_X) bcf_hdr_append(hdr, "##FORMAT="); if (flags & FORMAT_Y) bcf_hdr_append(hdr, "##FORMAT="); return hdr; } static int gts_to_gt_arr(int32_t *gt_arr, const uint8_t *gts, int n, int allele_a_idx, int allele_b_idx) { int i; for (i = 0; i < n; i++) { switch (gts[i]) { case GT_NC: gt_arr[2 * i] = bcf_gt_missing; gt_arr[2 * i + 1] = bcf_gt_missing; break; case GT_AA: gt_arr[2 * i] = bcf_gt_unphased(allele_a_idx); gt_arr[2 * i + 1] = bcf_gt_unphased(allele_a_idx); break; case GT_AB: gt_arr[2 * i] = bcf_gt_unphased(min(allele_a_idx, allele_b_idx)); gt_arr[2 * i + 1] = bcf_gt_unphased(max(allele_a_idx, allele_b_idx)); break; case GT_BB: gt_arr[2 * i] = bcf_gt_unphased(allele_b_idx); gt_arr[2 * i + 
1] = bcf_gt_unphased(allele_b_idx); break; default: return -1; } } return 0; } static int locus2bcf(const LocusEntry *locus_entry, const ClusterRecord *cluster_record, const bcf_hdr_t *hdr, const faidx_t *fai, int gc_win, int flags, kstring_t *allele_a, kstring_t *allele_b, kstring_t *flank, int32_t *allele_a_idx, int32_t *allele_b_idx, bcf1_t *rec) { rec->rid = bcf_hdr_name2id_flexible(hdr, locus_entry->chrom); char *endptr; rec->pos = strtol(locus_entry->map_info, &endptr, 0) - 1; if (locus_entry->map_info == endptr) error("Map info %s for marker %s is not understood\n", locus_entry->map_info, locus_entry->ilmn_id); int strand = !locus_entry->ref_strand ? -1 : (strcmp(locus_entry->ref_strand, "+") == 0 ? 0 : (strcmp(locus_entry->ref_strand, "-") == 0 ? 1 : -1)); if (rec->rid < 0 || rec->pos < 0) { if (flags & VERBOSE) fprintf(stderr, "Skipping unlocalized marker %s\n", locus_entry->ilmn_id); return -1; } bcf_update_id(hdr, rec, locus_entry->name); int len, win = min(max(100, locus_entry->source_seq ? 
max(gc_win, strlen(locus_entry->source_seq)) : gc_win), rec->pos); char *ref = faidx_fetch_seq(fai, bcf_seqname(hdr, rec), rec->pos - win, rec->pos + win, &len); if (!ref || len == 1) error("faidx_fetch_seq failed at %s:%" PRId64 " (are you using the correct reference genome?)\n", bcf_seqname(hdr, rec), rec->pos + 1); strupper(ref); if (!(flags & NO_INFO_GC)) { float gc_ratio = get_gc_ratio(&ref[max(win - gc_win, 0)], &ref[min(win + gc_win, len)]); bcf_update_info_float(hdr, rec, "GC", &gc_ratio, 1); } char ref_base[] = {'\0', '\0'}; ref_base[0] = ref[win]; allele_a->l = allele_b->l = 0; kputc(locus_entry->snp[1], allele_a); kputc(locus_entry->snp[3], allele_b); int is_indel = allele_a->s[0] == 'D' || allele_a->s[0] == 'I' || allele_b->s[0] == 'D' || allele_b->s[0] == 'I'; int ref_is_del = -1; if (is_indel && strand >= 0 && locus_entry->source_seq && strchr(locus_entry->source_seq, '-')) { flank->l = 0; kputs(locus_entry->source_seq, flank); strupper(flank->s); if ((strcasecmp(locus_entry->ilmn_strand, locus_entry->source_strand) != 0) != strand) flank_reverse_complement(flank->s); int shift = flank_left_shift(flank->s); ref_is_del = get_indel_alleles(allele_a, allele_b, flank->s, ref, win, len, shift); if (ref_is_del == 0) { rec->pos--; ref_base[0] = ref[win - 1]; } *allele_b_idx = ref_is_del < 0 ? 
1 : ref_is_del ^ (locus_entry->snp[3] == 'D'); } else { if (allele_a->s[0] == 'N' && allele_b->s[0] == 'A') { allele_a->s[0] = '.'; allele_b->s[0] = '.'; } else if (is_indel) { ref_base[0] = allele_a->s[0]; } else { if (strand < 0) { if (strcmp(locus_entry->ilmn_strand, "BOT") == 0 || strcmp(locus_entry->ilmn_strand, "Bot") == 0) { allele_a->s[0] = rev_nt(allele_a->s[0]); allele_b->s[0] = rev_nt(allele_b->s[0]); } strand = get_strand_from_top_alleles(allele_a->s, allele_b->s, ref, win, len); if (strand < 0) { if (flags & VERBOSE) fprintf(stderr, "Unable to determine reference strand for SNP %s\n", locus_entry->ilmn_id); allele_a->s[0] = '.'; allele_b->s[0] = '.'; } } if (strand == 1) { allele_a->s[0] = rev_nt(allele_a->s[0]); allele_b->s[0] = rev_nt(allele_b->s[0]); } } *allele_b_idx = get_allele_b_idx(ref_base[0], allele_a->s, allele_b->s); } free(ref); *allele_a_idx = get_allele_a_idx(*allele_b_idx); const char *alleles[3]; int nals = alleles_ab_to_vcf(alleles, ref_base, allele_a->s, allele_b->s, *allele_b_idx); if (nals < 0) error("Unable to process marker %s\n", locus_entry->ilmn_id); bcf_update_alleles(hdr, rec, alleles, nals); bcf_update_info_int32(hdr, rec, "ALLELE_A", allele_a_idx, 1); bcf_update_info_int32(hdr, rec, "ALLELE_B", allele_b_idx, 1); if (flags & BPM_LOADED) { bcf_update_info_float(hdr, rec, "FRAC_A", &locus_entry->frac_a, 1); bcf_update_info_float(hdr, rec, "FRAC_C", &locus_entry->frac_c, 1); bcf_update_info_float(hdr, rec, "FRAC_G", &locus_entry->frac_g, 1); bcf_update_info_float(hdr, rec, "FRAC_T", &locus_entry->frac_t, 1); bcf_update_info_int32(hdr, rec, "NORM_ID", &locus_entry->norm_id, 1); } if (flags & CSV_LOADED) { bcf_update_info_int32(hdr, rec, "BEADSET_ID", &locus_entry->beadset_id, 1); } if ((flags & BPM_LOADED) | (flags & CSV_LOADED)) { int32_t assay_type = (int32_t)(flags & BPM_LOADED ? 
locus_entry->assay_type : locus_entry->assay_type_csv); bcf_update_info_flag(hdr, rec, "INTENSITY_ONLY", NULL, locus_entry->intensity_only); bcf_update_info_int32(hdr, rec, "ASSAY_TYPE", &assay_type, 1); } if (flags & EGT_LOADED) { bcf_update_info_float(hdr, rec, "GenTrain_Score", &cluster_record->cluster_score.total_score, 1); bcf_update_info_float(hdr, rec, "Orig_Score", &cluster_record->cluster_score.original_score, 1); if (cluster_record->cluster_score.edited) bcf_update_info_flag(hdr, rec, "Edited", NULL, 1); bcf_update_info_float(hdr, rec, "Cluster_Sep", &cluster_record->cluster_score.cluster_separation, 1); bcf_update_info_int32(hdr, rec, "N_AA", &cluster_record->aa_cluster_stats.N, 1); bcf_update_info_int32(hdr, rec, "N_AB", &cluster_record->ab_cluster_stats.N, 1); bcf_update_info_int32(hdr, rec, "N_BB", &cluster_record->bb_cluster_stats.N, 1); bcf_update_info_float(hdr, rec, "devR_AA", &cluster_record->aa_cluster_stats.r_dev, 1); bcf_update_info_float(hdr, rec, "devR_AB", &cluster_record->ab_cluster_stats.r_dev, 1); bcf_update_info_float(hdr, rec, "devR_BB", &cluster_record->bb_cluster_stats.r_dev, 1); bcf_update_info_float(hdr, rec, "devTHETA_AA", &cluster_record->aa_cluster_stats.theta_dev, 1); bcf_update_info_float(hdr, rec, "devTHETA_AB", &cluster_record->ab_cluster_stats.theta_dev, 1); bcf_update_info_float(hdr, rec, "devTHETA_BB", &cluster_record->bb_cluster_stats.theta_dev, 1); bcf_update_info_float(hdr, rec, "meanR_AA", &cluster_record->aa_cluster_stats.r_mean, 1); bcf_update_info_float(hdr, rec, "meanR_AB", &cluster_record->ab_cluster_stats.r_mean, 1); bcf_update_info_float(hdr, rec, "meanR_BB", &cluster_record->bb_cluster_stats.r_mean, 1); bcf_update_info_float(hdr, rec, "meanTHETA_AA", &cluster_record->aa_cluster_stats.theta_mean, 1); bcf_update_info_float(hdr, rec, "meanTHETA_AB", &cluster_record->ab_cluster_stats.theta_mean, 1); bcf_update_info_float(hdr, rec, "meanTHETA_BB", &cluster_record->bb_cluster_stats.theta_mean, 1); 
bcf_update_info_float(hdr, rec, "Intensity_Threshold", &cluster_record->intensity_threshold, 1); } if (is_indel && ref_is_del < 0) { if (flags & VERBOSE) fprintf(stderr, "Unable to determine alleles for indel %s\n", locus_entry->ilmn_id); return 1; } return 0; } static void gtcs_to_vcf(faidx_t *fai, const bpm_t *bpm, const egt_t *egt, gtc_t **gtc, int n, htsFile *out_fh, bcf_hdr_t *hdr, int flags, int gc_win) { int i, j; uint8_t *gts = (uint8_t *)malloc(n * sizeof(uint8_t)); int32_t *gt_arr = (int32_t *)malloc(n * 2 * sizeof(int32_t)); int32_t *gq_arr = (int32_t *)malloc(n * sizeof(int32_t)); float *igc_arr = (float *)malloc(n * sizeof(float)); float *baf_arr = (float *)malloc(n * sizeof(float)); float *lrr_arr = (float *)malloc(n * sizeof(float)); float *norm_x_arr = (float *)malloc(n * sizeof(float)); float *norm_y_arr = (float *)malloc(n * sizeof(float)); float *ilmn_r_arr = (float *)malloc(n * sizeof(float)); float *ilmn_theta_arr = (float *)malloc(n * sizeof(float)); int32_t *raw_x_arr = (int32_t *)malloc(n * sizeof(int32_t)); int32_t *raw_y_arr = (int32_t *)malloc(n * sizeof(int32_t)); bcf1_t *rec = bcf_init(); kstring_t allele_a = {0, 0, NULL}; kstring_t allele_b = {0, 0, NULL}; kstring_t flank = {0, 0, NULL}; int32_t allele_a_idx, allele_b_idx; int n_missing = 0, n_skipped = 0; for (j = 0; j < bpm->num_loci; j++) { bcf_clear(rec); LocusEntry *locus_entry = &bpm->locus_entries[j]; int norm_id = bpm->norm_lookups && bpm->locus_entries[j].norm_id != 0xFF ? 
bpm->norm_lookups[bpm->locus_entries[j].norm_id] : -1; ClusterRecord *cluster_record = NULL; if (flags & EGT_LOADED) { int idx; int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx); if (ret < 0) error("Illumina probe %s not found in cluster file\n", locus_entry->name); cluster_record = &egt->cluster_records[idx]; } switch (locus2bcf(locus_entry, cluster_record, hdr, fai, gc_win, flags, &allele_a, &allele_b, &flank, &allele_a_idx, &allele_b_idx, rec)) { case -1: n_skipped++; continue; case 1: n_missing++; break; } uint16_t raw_x, raw_y; rec->n_sample = n; for (i = 0; i < n; i++) { get_element(gtc[i]->genotypes, (void *)>s[i], j); get_element(gtc[i]->genotype_scores, (void *)&igc_arr[i], j); gq_arr[i] = (int)(-10 * log10(1 - igc_arr[i]) + .5); if (gq_arr[i] < 0) gq_arr[i] = 0; if (gq_arr[i] > 50) gq_arr[i] = 50; get_element(gtc[i]->raw_x, (void *)&raw_x, j); get_element(gtc[i]->raw_y, (void *)&raw_y, j); raw_x_arr[i] = (int32_t)raw_x; raw_y_arr[i] = (int32_t)raw_y; norm_x_arr[i] = -NAN; norm_y_arr[i] = -NAN; ilmn_r_arr[i] = -NAN; ilmn_theta_arr[i] = -NAN; baf_arr[i] = -NAN; lrr_arr[i] = -NAN; if ((raw_x || raw_y) && norm_id >= 0) { XForm *xform = >c[i]->normalization_transforms[norm_id]; raw_x_y2norm_x_y(raw_x, raw_y, xform->offset_x, xform->offset_y, gtc[i]->cos_theta[norm_id], gtc[i]->sin_theta[norm_id], xform->shear, xform->scale_x, xform->scale_y, &norm_x_arr[i], &norm_y_arr[i]); norm_x_y2ilmn_theta_r(norm_x_arr[i], norm_y_arr[i], &ilmn_theta_arr[i], &ilmn_r_arr[i]); if (cluster_record) get_baf_lrr(ilmn_theta_arr[i], ilmn_r_arr[i], cluster_record->aa_cluster_stats.theta_mean, cluster_record->ab_cluster_stats.theta_mean, cluster_record->bb_cluster_stats.theta_mean, cluster_record->aa_cluster_stats.r_mean, cluster_record->ab_cluster_stats.r_mean, cluster_record->bb_cluster_stats.r_mean, locus_entry->intensity_only ? 
cluster_record->r_mean : NAN, &baf_arr[i], &lrr_arr[i]); } if (isnan(baf_arr[i])) get_element(gtc[i]->b_allele_freqs, (void *)&baf_arr[i], j); if (isnan(lrr_arr[i])) get_element(gtc[i]->logr_ratios, (void *)&lrr_arr[i], j); } if ((flags & ADJUST_CLUSTERS) && norm_id >= 0 && cluster_record && !bpm->locus_entries[j].intensity_only) { adjust_clusters(gts, ilmn_theta_arr, ilmn_r_arr, n, cluster_record); for (i = 0; i < n; i++) { if (!isnan(ilmn_theta_arr[i]) && !isnan(ilmn_r_arr[i])) get_baf_lrr(ilmn_theta_arr[i], ilmn_r_arr[i], cluster_record->aa_cluster_stats.theta_mean, cluster_record->ab_cluster_stats.theta_mean, cluster_record->bb_cluster_stats.theta_mean, cluster_record->aa_cluster_stats.r_mean, cluster_record->ab_cluster_stats.r_mean, cluster_record->bb_cluster_stats.r_mean, locus_entry->intensity_only ? cluster_record->r_mean : NAN, &baf_arr[i], &lrr_arr[i]); } } if (!bpm->locus_entries[j].intensity_only) { gts_to_gt_arr(gt_arr, gts, n, allele_a_idx, allele_b_idx); bcf_update_genotypes(hdr, rec, gt_arr, n * 2); bcf_update_format_int32(hdr, rec, "GQ", gq_arr, n); bcf_update_format_float(hdr, rec, "IGC", igc_arr, n); } bcf_update_format_float(hdr, rec, "BAF", baf_arr, n); bcf_update_format_float(hdr, rec, "LRR", lrr_arr, n); bcf_update_format_float(hdr, rec, "NORMX", norm_x_arr, n); bcf_update_format_float(hdr, rec, "NORMY", norm_y_arr, n); bcf_update_format_float(hdr, rec, "R", ilmn_r_arr, n); bcf_update_format_float(hdr, rec, "THETA", ilmn_theta_arr, n); bcf_update_format_int32(hdr, rec, "X", raw_x_arr, n); bcf_update_format_int32(hdr, rec, "Y", raw_y_arr, n); if (bcf_write(out_fh, hdr, rec) < 0) error("Unable to write to output VCF file\n"); } fprintf(stderr, "Lines total/missing-reference/skipped:\t%d/%d/%d\n", bpm->num_loci, n_missing, n_skipped); free(gts); free(gt_arr); free(gq_arr); free(igc_arr); free(baf_arr); free(lrr_arr); free(norm_x_arr); free(norm_y_arr); free(ilmn_r_arr); free(ilmn_theta_arr); free(raw_x_arr); free(raw_y_arr); free(allele_a.s); 
free(allele_b.s); free(flank.s); bcf_destroy(rec); bcf_hdr_destroy(hdr); } #define GS_GT 0 #define GS_TOP_STRAND 1 #define GS_REF_STRAND 2 #define GS_IGC 3 #define GS_BAF 4 #define GS_LRR 5 #define GS_NORMX 6 #define GS_NORMY 7 #define GS_R 8 #define GS_THETA 9 #define GS_X 10 #define GS_Y 11 typedef struct { int *col2sample; int type; void *ptr; } gs_col_t; static int tsv_setter_gs_col(tsv_t *tsv, bcf1_t *rec, void *usr) { gs_col_t *gs_col = (gs_col_t *)usr; uint8_t *gts; char *strand_alleles, *endptr; switch (gs_col->type) { case GS_GT: gts = (uint8_t *)gs_col->ptr + gs_col->col2sample[tsv->icol]; if ((tsv->ss[0] == 'N' && tsv->ss[1] == 'C') || (tsv->ss[0] == '-' && tsv->ss[1] == '-')) *gts = GT_NC; else if (tsv->ss[0] == 'A' && tsv->ss[1] == 'A') *gts = GT_AA; else if (tsv->ss[0] == 'A' && tsv->ss[1] == 'B') *gts = GT_AB; else if (tsv->ss[0] == 'B' && tsv->ss[1] == 'B') *gts = GT_BB; else return -1; break; case GS_TOP_STRAND: case GS_REF_STRAND: strand_alleles = (char *)gs_col->ptr + 2 * gs_col->col2sample[tsv->icol]; strand_alleles[0] = tsv->ss[0]; strand_alleles[1] = tsv->ss[1]; break; case GS_IGC: case GS_BAF: case GS_LRR: case GS_NORMX: case GS_NORMY: case GS_R: case GS_THETA: ((float *)gs_col->ptr + gs_col->col2sample[tsv->icol])[0] = strtof(tsv->ss, &endptr); if (tsv->ss == endptr) return -1; break; case GS_X: case GS_Y: ((int32_t *)gs_col->ptr + gs_col->col2sample[tsv->icol])[0] = strtol(tsv->ss, &endptr, 0); if (tsv->ss == endptr) return -1; break; default: return -1; } return 0; } static int tsv_setter_chrom_flexible(tsv_t *tsv, bcf1_t *rec, void *usr) { char tmp = *tsv->se; *tsv->se = 0; rec->rid = bcf_hdr_name2id_flexible((bcf_hdr_t *)usr, tsv->ss); *tsv->se = tmp; return rec->rid == -1 ? 
-1 : 0; } static int tsv_setter_ilmn_strand(tsv_t *tsv, bcf1_t *rec, void *usr) { char **strand = (char **)usr; *strand = tsv->ss; return 0; } static int tsv_setter_snp(tsv_t *tsv, bcf1_t *rec, void *usr) { char **snp = (char **)usr; if (strncmp(tsv->ss, "[N/A]", 5) == 0) *snp = NULL; else *snp = tsv->ss; return 0; } static int tsv_register_all(tsv_t *tsv, const char *id, tsv_setter_t setter, void *usr) { int i, n = 0; for (i = 0; i < tsv->ncols; i++) { if (!tsv->cols[i].name || strcasecmp(tsv->cols[i].name, id)) continue; tsv->cols[i].setter = setter; tsv->cols[i].usr = usr; n++; } return n ? 0 : -1; } // adapted from Petr Danecek's implementation of tsv_parse() in bcftools/tsv2vcf.c static int tsv_parse_delimiter(tsv_t *tsv, bcf1_t *rec, char *str, int delimiter) { int status = 0; tsv->icol = 0; tsv->ss = tsv->se = str; while (*tsv->ss && tsv->icol < tsv->ncols) { if (delimiter) while (*tsv->se && (*tsv->se) != delimiter) tsv->se++; else while (*tsv->se && !isspace(*tsv->se)) tsv->se++; if (tsv->cols[tsv->icol].setter) { int ret = tsv->cols[tsv->icol].setter(tsv, rec, tsv->cols[tsv->icol].usr); if (ret < 0) return -1; status++; } if (delimiter) tsv->se++; else while (*tsv->se && isspace(*tsv->se)) tsv->se++; tsv->ss = tsv->se; tsv->icol++; } return status ? 
0 : -1; } static void gs_to_vcf(faidx_t *fai, const bpm_t *bpm, const egt_t *egt, htsFile *gs_fh, htsFile *out_fh, bcf_hdr_t *hdr, const char *output_fname, char *index_fname, int write_index, int flags, int gc_win) { // read the header of the table kstring_t line = {0, 0, NULL}; if (hts_getline(gs_fh, KS_SEP_LINE, &line) <= 0) error("Empty file: %s\n", gs_fh->fn); int i, moff = 0, *off = NULL, ncols = ksplit_core(line.s, '\t', &moff, &off); kstring_t str = {0, 0, NULL}; int *col2sample = (int *)malloc(sizeof(int) * ncols); for (i = 0; i < ncols; i++) { char *ptr; if (i > 0) kputc(',', &str); if ((ptr = strrchr(&line.s[off[i]], '.'))) { *ptr++ = '\0'; if ((bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, &line.s[off[i]]) < 0)) bcf_hdr_add_sample(hdr, &line.s[off[i]]); if (strcmp(ptr, "GType") == 0) kputs("GT", &str); else if (strcmp(ptr, "Score") == 0 || strcmp(ptr, "GC Score") == 0) kputs("IGC", &str); else if (strcmp(ptr, "Theta") == 0 || strcmp(ptr, "Theta Illumina") == 0) kputs("THETA", &str); else if (strcmp(ptr, "R") == 0 || strcmp(ptr, "R Illumina") == 0) kputc('R', &str); else if (strcmp(ptr, "X Raw") == 0 || strcmp(ptr, "Raw X") == 0) kputc('X', &str); else if (strcmp(ptr, "Y Raw") == 0 || strcmp(ptr, "Raw Y") == 0) kputc('Y', &str); else if (strcmp(ptr, "X") == 0) kputs("NORMX", &str); else if (strcmp(ptr, "Y") == 0) kputs("NORMY", &str); else if (strcmp(ptr, "B Allele Freq") == 0) kputs("BAF", &str); else if (strcmp(ptr, "Log R Ratio") == 0) kputs("LRR", &str); else if (strcmp(ptr, "Top Alleles") == 0) kputs("TOP_STRAND", &str); else if (strcmp(ptr, "Plus/Minus Alleles") == 0) kputs("REF_STRAND", &str); else if (strcmp(ptr, "Import Calls") == 0) kputc('-', &str); else if (strcmp(ptr, "Concordance") == 0) kputc('-', &str); else if (strcmp(ptr, "Orig Call") == 0) kputc('-', &str); else if (strcmp(ptr, "CNV Value") == 0) kputc('-', &str); else if (strcmp(ptr, "CNV Confidence") == 0) kputc('-', &str); else error("Could not recognize FORMAT field: %s\n", ptr); 
col2sample[i] = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, &line.s[off[i]]); } else { ptr = &line.s[off[i]]; if (strcmp(ptr, "Index") == 0) kputc('-', &str); else if (strcmp(ptr, "Name") == 0 || strcmp(ptr, "SNP Name") == 0) kputs("ID", &str); else if (strcmp(ptr, "Address") == 0) kputc('-', &str); else if (strcmp(ptr, "Chr") == 0 || strcmp(ptr, "Chromosome") == 0) kputs("CHROM", &str); else if (strcmp(ptr, "Manifest") == 0) kputc('-', &str); else if (strcmp(ptr, "Position") == 0) kputs("POS", &str); else if (strcmp(ptr, "GenTrain Score") == 0) kputs("GENTRAIN_SCORE", &str); else if (strcmp(ptr, "Frac A") == 0) kputs("FRAC_A", &str); else if (strcmp(ptr, "Frac C") == 0) kputs("FRAC_C", &str); else if (strcmp(ptr, "Frac G") == 0) kputs("FRAC_G", &str); else if (strcmp(ptr, "Frac T") == 0) kputs("FRAC_T", &str); else if (strcmp(ptr, "IlmnStrand") == 0 || strcmp(ptr, "ILMN Strand") == 0) kputs("STRAND", &str); else if (strcmp(ptr, "SNP") == 0) kputs("SNP", &str); else error("Could not recognize INFO field: %s\n", ptr); col2sample[i] = -1; } } free(off); if (bcf_hdr_sync(hdr) < 0) error_errno("[%s] Failed to update header", __func__); // updates the number of samples int n = bcf_hdr_nsamples(hdr); tsv_t *tsv = tsv_init(str.s); if (tsv_register(tsv, "CHROM", tsv_setter_chrom_flexible, hdr) < 0) error("Expected Chr or Chromosome column\n"); if (tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0) error("Expected Position column\n"); if (tsv_register(tsv, "ID", tsv_setter_id, hdr) < 0 && bpm) error("Expected Name or SNP Name column when using --genome-studio with --bpm/--csv\n"); char *ilmn_strand = NULL; tsv_register(tsv, "STRAND", tsv_setter_ilmn_strand, &ilmn_strand); char *snp = NULL; tsv_register(tsv, "SNP", tsv_setter_snp, &snp); float total_score; int gentrain_score = tsv_register(tsv, "GENTRAIN_SCORE", tsv_read_float, &total_score); if (gentrain_score) bcf_hdr_append(hdr, "##INFO="); float frac[4]; int frac_a = tsv_register(tsv, "FRAC_A", tsv_read_float, &frac[0]); if 
(frac_a == 0) bcf_hdr_append(hdr, "##INFO="); int frac_c = tsv_register(tsv, "FRAC_C", tsv_read_float, &frac[1]); if (frac_c == 0) bcf_hdr_append(hdr, "##INFO="); int frac_g = tsv_register(tsv, "FRAC_G", tsv_read_float, &frac[2]); if (frac_g == 0) bcf_hdr_append(hdr, "##INFO="); int frac_t = tsv_register(tsv, "FRAC_T", tsv_read_float, &frac[3]); if (frac_t == 0) bcf_hdr_append(hdr, "##INFO="); if (bcf_hdr_write(out_fh, hdr) < 0) error("Unable to write to output VCF file\n"); if (init_index2(out_fh, hdr, output_fname, &index_fname, write_index) < 0) error("Error: failed to initialise index for %s\n", output_fname); uint8_t *gts = (uint8_t *)malloc(n * sizeof(uint8_t)); char *top_strand_alleles = (char *)malloc(n * 2 * sizeof(char)); const char *strand_alleles = top_strand_alleles; char *ref_strand_alleles = (char *)malloc(n * 2 * sizeof(char)); int32_t *gt_arr = (int32_t *)malloc(n * 2 * sizeof(int32_t)); int32_t *gq_arr = (int32_t *)malloc(n * sizeof(int32_t)); float *igc_arr = (float *)malloc(n * sizeof(float)); float *baf_arr = (float *)malloc(n * sizeof(float)); float *lrr_arr = (float *)malloc(n * sizeof(float)); float *norm_x_arr = (float *)malloc(n * sizeof(float)); float *norm_y_arr = (float *)malloc(n * sizeof(float)); float *ilmn_r_arr = (float *)malloc(n * sizeof(float)); float *ilmn_theta_arr = (float *)malloc(n * sizeof(float)); int32_t *raw_x_arr = (int32_t *)malloc(n * sizeof(int32_t)); int32_t *raw_y_arr = (int32_t *)malloc(n * sizeof(int32_t)); int gs_input[12], gs_output[12]; gs_col_t gs_gts = {col2sample, GS_GT, gts}; gs_input[GS_GT] = !tsv_register_all(tsv, "GT", tsv_setter_gs_col, &gs_gts); if (!gs_input[GS_GT]) error("Expected GType column\n"); gs_col_t gs_top_strand = {col2sample, GS_TOP_STRAND, top_strand_alleles}; gs_input[GS_TOP_STRAND] = !tsv_register_all(tsv, "TOP_STRAND", tsv_setter_gs_col, &gs_top_strand); gs_col_t gs_ref_strand = {col2sample, GS_REF_STRAND, ref_strand_alleles}; gs_input[GS_REF_STRAND] = !tsv_register_all(tsv, 
"REF_STRAND", tsv_setter_gs_col, &gs_ref_strand); if (gs_input[GS_REF_STRAND]) strand_alleles = ref_strand_alleles; gs_col_t gs_igc = {col2sample, GS_IGC, igc_arr}; gs_input[GS_IGC] = !tsv_register_all(tsv, "IGC", tsv_setter_gs_col, &gs_igc); gs_col_t gs_baf = {col2sample, GS_BAF, baf_arr}; gs_input[GS_BAF] = !tsv_register_all(tsv, "BAF", tsv_setter_gs_col, &gs_baf); gs_col_t gs_lrr = {col2sample, GS_LRR, lrr_arr}; gs_input[GS_LRR] = !tsv_register_all(tsv, "LRR", tsv_setter_gs_col, &gs_lrr); gs_col_t gs_norm_x = {col2sample, GS_NORMX, norm_x_arr}; gs_input[GS_NORMX] = !tsv_register_all(tsv, "NORMX", tsv_setter_gs_col, &gs_norm_x); gs_col_t gs_norm_y = {col2sample, GS_NORMY, norm_y_arr}; gs_input[GS_NORMY] = !tsv_register_all(tsv, "NORMY", tsv_setter_gs_col, &gs_norm_y); gs_col_t gs_ilmn_r = {col2sample, GS_R, ilmn_r_arr}; gs_input[GS_R] = !tsv_register_all(tsv, "R", tsv_setter_gs_col, &gs_ilmn_r); gs_col_t gs_ilmn_theta = {col2sample, GS_THETA, ilmn_theta_arr}; gs_input[GS_THETA] = !tsv_register_all(tsv, "THETA", tsv_setter_gs_col, &gs_ilmn_theta); gs_col_t gs_raw_x = {col2sample, GS_X, raw_x_arr}; gs_input[GS_X] = !tsv_register_all(tsv, "X", tsv_setter_gs_col, &gs_raw_x); gs_col_t gs_raw_y = {col2sample, GS_Y, raw_y_arr}; gs_input[GS_Y] = !tsv_register_all(tsv, "Y", tsv_setter_gs_col, &gs_raw_y); gs_output[GS_GT] = flags & FORMAT_GT; gs_output[GS_IGC] = (flags & FORMAT_IGC) && gs_input[GS_IGC]; gs_output[GS_X] = (flags & FORMAT_X) && gs_input[GS_X]; gs_output[GS_Y] = (flags & FORMAT_Y) && gs_input[GS_Y]; gs_output[GS_NORMX] = (flags & FORMAT_NORMX) && gs_input[GS_NORMX]; gs_output[GS_NORMY] = (flags & FORMAT_NORMY) && gs_input[GS_NORMY]; gs_output[GS_R] = (flags & FORMAT_R) && (gs_input[GS_R] || (gs_input[GS_NORMX] && gs_input[GS_NORMY])); gs_output[GS_THETA] = (flags & FORMAT_THETA) && (gs_input[GS_THETA] || (gs_input[GS_NORMX] && gs_input[GS_NORMY])); gs_output[GS_BAF] = (flags & FORMAT_BAF) && (gs_input[GS_BAF] || (egt && ((gs_input[GS_NORMX] && 
gs_input[GS_NORMY]) || (gs_input[GS_R] && gs_input[GS_THETA])))); gs_output[GS_LRR] = (flags & FORMAT_LRR) && (gs_input[GS_LRR] || (egt && ((gs_input[GS_NORMX] && gs_input[GS_NORMY]) || (gs_input[GS_R] && gs_input[GS_THETA])))); int compute_ilmn_theta_r = gs_input[GS_NORMX] && gs_input[GS_NORMY] && (gs_output[GS_R] || gs_output[GS_THETA] || (egt && (gs_output[GS_BAF] || gs_output[GS_LRR]))); int compute_baf_lrr = ((gs_input[GS_NORMX] && gs_input[GS_NORMY]) || (gs_input[GS_R] || gs_input[GS_THETA])) && egt && (gs_output[GS_BAF] || gs_output[GS_LRR]); bcf1_t *rec = bcf_init(); kstring_t allele_a = {0, 0, NULL}; kputc('.', &allele_a); kstring_t allele_b = {0, 0, NULL}; kputc('.', &allele_b); kstring_t flank = {0, 0, NULL}; int32_t allele_a_idx, allele_b_idx; int n_total = 0, n_missing = 0, n_skipped = 0; while (hts_getline(gs_fh, KS_SEP_LINE, &line) > 0) { if (line.s[0] == '#') continue; // skip comments n_total++; bcf_clear(rec); rec->n_sample = n; int intensity_only = 0; if (!tsv_parse_delimiter(tsv, rec, line.s, '\t')) { if (bpm) { int idx; int ret = khash_str2int_get(bpm->names2index, rec->d.id, &idx); if (ret < 0) error("Illumina probe %s not found in manifest file\n", rec->d.id); LocusEntry *locus_entry = &bpm->locus_entries[idx]; intensity_only = locus_entry->intensity_only; ClusterRecord *cluster_record = NULL; if (flags & EGT_LOADED) { int idx; int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx); if (ret < 0) error("Illumina probe %s not found in cluster file\n", locus_entry->name); cluster_record = &egt->cluster_records[idx]; } switch (locus2bcf(locus_entry, cluster_record, hdr, fai, gc_win, flags, &allele_a, &allele_b, &flank, &allele_a_idx, &allele_b_idx, rec)) { case -1: n_skipped++; continue; case 1: n_missing++; break; } if (compute_ilmn_theta_r) for (i = 0; i < n; i++) norm_x_y2ilmn_theta_r(norm_x_arr[i], norm_y_arr[i], &ilmn_theta_arr[i], &ilmn_r_arr[i]); if (compute_baf_lrr) { if ((flags & ADJUST_CLUSTERS) && 
!locus_entry->intensity_only) adjust_clusters(gts, ilmn_theta_arr, ilmn_r_arr, n, cluster_record); for (i = 0; i < n; i++) { if (!isnan(ilmn_theta_arr[i]) && !isnan(ilmn_r_arr[i])) { get_baf_lrr( ilmn_theta_arr[i], ilmn_r_arr[i], cluster_record->aa_cluster_stats.theta_mean, cluster_record->ab_cluster_stats.theta_mean, cluster_record->bb_cluster_stats.theta_mean, cluster_record->aa_cluster_stats.r_mean, cluster_record->ab_cluster_stats.r_mean, cluster_record->bb_cluster_stats.r_mean, locus_entry->intensity_only ? cluster_record->r_mean : NAN, &baf_arr[i], &lrr_arr[i]); } else { baf_arr[i] = -NAN; lrr_arr[i] = -NAN; } } } } else { if (rec->rid < 0 || rec->pos < 0) { if (flags & VERBOSE) fprintf(stderr, "Skipping unlocalized marker %s\n", rec->d.id); n_skipped++; continue; } // determine A and B alleles allele_a.s[0] = '.'; allele_b.s[0] = '.'; if (ilmn_strand && snp) { if (strncmp(ilmn_strand, "BOT", 3) == 0) { allele_a.s[0] = rev_nt(snp[1]); allele_b.s[0] = rev_nt(snp[3]); } else { allele_a.s[0] = snp[1]; allele_b.s[0] = snp[3]; } } else { for (i = 0; i < n; i++) { switch (gts[i]) { case GT_NC: break; case GT_AA: allele_a.s[0] = strand_alleles[2 * i]; break; case GT_AB: allele_a.s[0] = strand_alleles[2 * i]; allele_b.s[0] = strand_alleles[2 * i + 1]; break; case GT_BB: allele_b.s[0] = strand_alleles[2 * i]; break; default: error("Unable to process marker %s\n", rec->d.id); break; } } } int len, win = min(max(100, gc_win), rec->pos); char *ref = faidx_fetch_seq(fai, bcf_seqname(hdr, rec), rec->pos - win, rec->pos + win, &len); if (!ref || len == 1) error("faidx_fetch_seq failed at %s:%" PRId64 " (are you using the correct reference genome?)\n", bcf_seqname(hdr, rec), rec->pos + 1); strupper(ref); if (!(flags & NO_INFO_GC)) { float gc_ratio = get_gc_ratio(&ref[max(win - gc_win, 0)], &ref[min(win + gc_win, len)]); bcf_update_info_float(hdr, rec, "GC", &gc_ratio, 1); } char ref_base[] = {ref[win], '\0'}; int is_indel = allele_a.s[0] == 'D' || allele_a.s[0] == 'I' || 
allele_b.s[0] == 'D' || allele_b.s[0] == 'I'; if (is_indel) { if (allele_a.s[0] == '.') { allele_a.s[0] = allele_b.s[0] == 'D' ? 'I' : 'D'; } if (allele_b.s[0] == '.') { allele_b.s[0] = allele_a.s[0] == 'D' ? 'I' : 'D'; } ref_base[0] = allele_a.s[0]; n_missing++; } else if ((ilmn_strand && snp) || strand_alleles == top_strand_alleles) { if (allele_a.s[0] == '.' || allele_b.s[0] == '.') { allele_a.s[0] = '.'; allele_b.s[0] = '.'; } else { int strand = get_strand_from_top_alleles(allele_a.s, allele_b.s, ref, win, len); if (strand < 0) { if (flags & VERBOSE) fprintf(stderr, "Unable to determine reference strand for SNP %s\n", rec->d.id); allele_a.s[0] = '.'; allele_b.s[0] = '.'; } else if (strand == 1) { allele_a.s[0] = rev_nt(allele_a.s[0]); allele_b.s[0] = rev_nt(allele_b.s[0]); } } } free(ref); allele_b_idx = get_allele_b_idx(ref_base[0], allele_a.s, allele_b.s); allele_a_idx = get_allele_a_idx(allele_b_idx); const char *alleles[3]; int nals = alleles_ab_to_vcf(alleles, ref_base, allele_a.s, allele_b.s, allele_b_idx); if (nals < 0) error("Unable to process marker %s\n", rec->d.id); bcf_update_alleles(hdr, rec, alleles, nals); bcf_update_info_int32(hdr, rec, "ALLELE_A", &allele_a_idx, 1); bcf_update_info_int32(hdr, rec, "ALLELE_B", &allele_b_idx, 1); if (gentrain_score == 0) bcf_update_info_float(hdr, rec, "GenTrain_Score", &total_score, 1); if (frac_a == 0) bcf_update_info_float(hdr, rec, "FRAC_A", &frac[0], 1); if (frac_c == 0) bcf_update_info_float(hdr, rec, "FRAC_C", &frac[1], 1); if (frac_g == 0) bcf_update_info_float(hdr, rec, "FRAC_G", &frac[2], 1); if (frac_t == 0) bcf_update_info_float(hdr, rec, "FRAC_T", &frac[3], 1); } if (!intensity_only) { if (allele_a_idx >= 0 && allele_b_idx >= 0) { gts_to_gt_arr(gt_arr, gts, n, allele_a_idx, allele_b_idx); } else { for (i = 0; i < n; i++) { gt_arr[2 * i] = bcf_gt_missing; gt_arr[2 * i + 1] = bcf_gt_missing; } } bcf_update_genotypes(hdr, rec, gt_arr, n * 2); if (gs_output[GS_IGC]) { for (i = 0; i < n; i++) { gq_arr[i] 
= (int)(-10 * log10(1 - igc_arr[i]) + .5); if (gq_arr[i] < 0) gq_arr[i] = 0; if (gq_arr[i] > 50) gq_arr[i] = 50; } bcf_update_format_float(hdr, rec, "IGC", (float *)gs_igc.ptr, n); if (flags && FORMAT_GQ) bcf_update_format_int32(hdr, rec, "GQ", gq_arr, n); } } if (gs_output[GS_BAF]) bcf_update_format_float(hdr, rec, "BAF", baf_arr, n); if (gs_output[GS_LRR]) bcf_update_format_float(hdr, rec, "LRR", lrr_arr, n); if (gs_output[GS_NORMX]) bcf_update_format_float(hdr, rec, "NORMX", norm_x_arr, n); if (gs_output[GS_NORMY]) bcf_update_format_float(hdr, rec, "NORMY", norm_y_arr, n); if (gs_output[GS_R]) bcf_update_format_float(hdr, rec, "R", ilmn_r_arr, n); if (gs_output[GS_THETA]) bcf_update_format_float(hdr, rec, "THETA", ilmn_theta_arr, n); if (gs_output[GS_X]) bcf_update_format_int32(hdr, rec, "X", raw_x_arr, n); if (gs_output[GS_Y]) bcf_update_format_int32(hdr, rec, "Y", raw_y_arr, n); if (bcf_write(out_fh, hdr, rec) < 0) error("Unable to write to output VCF file\n"); } else { if (flags & VERBOSE) fprintf(stderr, "Failed to process marker %s\n", rec->d.id); n_skipped++; } } fprintf(stderr, "Lines total/missing-reference/skipped:\t%d/%d/%d\n", n_total, n_missing, n_skipped); free(line.s); free(col2sample); free(gts); free(gt_arr); free(gq_arr); free(igc_arr); free(baf_arr); free(lrr_arr); free(norm_x_arr); free(norm_y_arr); free(ilmn_r_arr); free(ilmn_theta_arr); free(raw_x_arr); free(raw_y_arr); free(top_strand_alleles); free(ref_strand_alleles); tsv_destroy(tsv); free(str.s); free(allele_a.s); free(allele_b.s); free(flank.s); bcf_destroy(rec); bcf_hdr_destroy(hdr); if (hts_close(gs_fh) < 0) error("Error: close failed: %s\n", gs_fh->fn); } /**************************************** * PLUGIN * ****************************************/ const char *about(void) { return "Convert Illumina GTC files to VCF.\n"; } static const char *usage_text(void) { return "\n" "About: convert Illumina GTC files containing intensity data into VCF. 
" "(version " GTC2VCF_VERSION " http://github.com/freeseek/gtc2vcf)\n" "Usage: bcftools +gtc2vcf [options] [ ...]\n" "\n" "Plugin options:\n" " -l, --list-tags list available FORMAT tags with description for VCF output\n" " -t, --tags LIST list of output FORMAT tags [" TAG_LIST_DFLT "]\n" " -b, --bpm BPM manifest file\n" " -c, --csv CSV manifest file (can be gzip compressed)\n" " -e, --egt EGT cluster file\n" " -f, --fasta-ref reference sequence in fasta format\n" " --set-cache-size select fasta cache size in bytes\n" " --gc-window-size window size in bp used to compute the GC content (-1 for no " "estimate) [" GC_WIN_DFLT "]\n" " -g, --gtcs GTC genotype files from directory or list from file\n" " -i, --idat input IDAT files rather than GTC files\n" " --capacity number of variants to read from intensity files per I/O operation " "[" CAPACITY_DFLT "]\n" " --adjust-clusters adjust cluster centers in (Theta, R) space (requires --bpm and " "--egt)\n" " --use-gtc-sample-names use sample name in GTC files rather than GTC file name\n" " --do-not-check-bpm do not check whether BPM and GTC files match manifest file name\n" " --do-not-check-eof do not check whether the BPM and EGT readers reach the end of the " "file\n" " --genome-studio input a GenomeStudio final report file (in matrix format)\n" " --no-version do not append version and command line to the header\n" " -o, --output write output to a file [standard output]\n" " -O, --output-type u|b|v|z|t[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF\n" " t: GenomeStudio tab-delimited text output, 0-9: compression level " "[v]\n" " --threads number of extra output compression threads [0]\n" " -x, --extra write GTC metadata to a file\n" " -v, --verbose print verbose information\n" " -W, --write-index[=FMT] Automatically index the output files [off]\n" "\n" "Manifest options:\n" " --beadset-order output BeadSetID normalization order (requires --bpm and --csv)\n" " --fasta-flank output flank sequence in FASTA format 
(requires --csv)\n" " -s, --sam-flank input flank sequence alignment in SAM/BAM format (requires --csv)\n" " --genome-build genome build ID used to update the manifest file [" GENOME_BUILD_DFLT "]\n" "\n" "Examples:\n" " bcftools +gtc2vcf -i 5434246082_R03C01_Grn.idat\n" " bcftools +gtc2vcf 5434246082_R03C01.gtc\n" " bcftools +gtc2vcf -b HumanOmni2.5-4v1_H.bpm -c HumanOmni2.5-4v1_H.csv\n" " bcftools +gtc2vcf -e HumanOmni2.5-4v1_H.egt\n" " bcftools +gtc2vcf -c GSA-24v3-0_A1.csv -e GSA-24v3-0_A1_ClusterFile.egt -f human_g1k_v37.fasta -o " "GSA-24v3-0_A1.vcf\n" " bcftools +gtc2vcf -c HumanOmni2.5-4v1_H.csv -f human_g1k_v37.fasta 5434246082_R03C01.gtc -o " "5434246082_R03C01.vcf\n" " bcftools +gtc2vcf -f human_g1k_v37.fasta --genome-studio GenotypeReport.txt -o GenotypeReport.vcf\n" "\n" "Examples of manifest file options:\n" " bcftools +gtc2vcf -b GSA-24v3-0_A1.bpm -c GSA-24v3-0_A1.csv --beadset-order\n" " bcftools +gtc2vcf -c GSA-24v3-0_A1.csv --fasta-flank -o GSA-24v3-0_A1.fasta\n" " bwa mem -M Homo_sapiens_assembly38.fasta GSA-24v3-0_A1.fasta -o GSA-24v3-0_A1.sam\n" " bcftools +gtc2vcf -c GSA-24v3-0_A1.csv --sam-flank GSA-24v3-0_A1.sam -o GSA-24v3-0_A1.GRCh38.csv\n" "\n"; } static int parse_tags(const char *str) { int i, flags = 0, n; char **tags = hts_readlist(str, 0, &n); for (i = 0; i < n; i++) { if (!strcasecmp(tags[i], "GT")) flags |= FORMAT_GT; else if (!strcasecmp(tags[i], "GQ")) flags |= FORMAT_GQ; else if (!strcasecmp(tags[i], "IGC")) flags |= FORMAT_IGC; else if (!strcasecmp(tags[i], "X")) flags |= FORMAT_X; else if (!strcasecmp(tags[i], "Y")) flags |= FORMAT_Y; else if (!strcasecmp(tags[i], "NORMX")) flags |= FORMAT_NORMX; else if (!strcasecmp(tags[i], "NORMY")) flags |= FORMAT_NORMY; else if (!strcasecmp(tags[i], "R")) flags |= FORMAT_R; else if (!strcasecmp(tags[i], "THETA")) flags |= FORMAT_THETA; else if (!strcasecmp(tags[i], "LRR")) flags |= FORMAT_LRR; else if (!strcasecmp(tags[i], "BAF")) flags |= FORMAT_BAF; else error("Error parsing \"--tags 
%s\": the tag \"%s\" is not supported\n", str, tags[i]); free(tags[i]); } if (n) free(tags); return flags; } static void list_tags(void) { error( "FORMAT/GT Number:1 Type:String .. Genotype\n" "FORMAT/GQ Number:1 Type:Integer .. Genotype Quality\n" "FORMAT/IGC Number:1 Type:Float .. Illumina GenCall Confidence Score\n" "FORMAT/BAF Number:1 Type:Float .. B Allele Frequency\n" "FORMAT/LRR Number:1 Type:Float .. Log R Ratio\n" "FORMAT/NORMX Number:1 Type:Float .. Normalized X intensity\n" "FORMAT/NORMY Number:1 Type:Float .. Normalized Y intensity\n" "FORMAT/R Number:1 Type:Float .. Normalized R value\n" "FORMAT/THETA Number:1 Type:Float .. Normalized Theta value\n" "FORMAT/X Number:1 Type:Integer .. Raw X intensity\n" "FORMAT/Y Number:1 Type:Integer .. Raw Y intensity\n"); } int run(int argc, char *argv[]) { const char *tag_list = TAG_LIST_DFLT; const char *bpm_fname = NULL; const char *csv_fname = NULL; const char *egt_fname = NULL; const char *gs_fname = NULL; const char *output_fname = "-"; const char *ref_fname = NULL; const char *pathname = NULL; const char *extra_fname = NULL; const char *sam_fname = NULL; const char *genome_build = GENOME_BUILD_DFLT; char *index_fname; char *tmp; int i, j; int flags = 0; int output_type = FT_VCF; int clevel = -1; size_t capacity = 0; int cache_size = 0; int gc_win = (int)strtol(GC_WIN_DFLT, NULL, 0); int gtc_sample_names = 0; int bpm_check = 1; int eof_check = 1; int n_threads = 0; int record_cmd_line = 1; int write_index = 0; int binary_to_csv = 0; int beadset_order = 0; int fasta_flank = 0; faidx_t *fai = NULL; htsFile *out_fh = NULL; FILE *out_txt = NULL; static struct option loptions[] = { {"list-tags", no_argument, NULL, 'l'}, {"tags", required_argument, NULL, 't'}, {"bpm", required_argument, NULL, 'b'}, {"csv", required_argument, NULL, 'c'}, {"egt", required_argument, NULL, 'e'}, {"fasta-ref", required_argument, NULL, 'f'}, {"set-cache-size", required_argument, NULL, 1}, {"gc-window-size", required_argument, NULL, 2}, 
{"gtcs", required_argument, NULL, 'g'}, {"idat", no_argument, NULL, 'i'}, {"capacity", required_argument, NULL, 3}, {"adjust-clusters", no_argument, NULL, 4}, {"use-gtc-sample-names", no_argument, NULL, 5}, {"do-not-check-bpm", no_argument, NULL, 6}, {"do-not-check-eof", no_argument, NULL, 7}, {"genome-studio", required_argument, NULL, 8}, {"no-version", no_argument, NULL, 9}, {"output", required_argument, NULL, 'o'}, {"output-type", required_argument, NULL, 'O'}, {"threads", required_argument, NULL, 10}, {"extra", required_argument, NULL, 'x'}, {"verbose", no_argument, NULL, 'v'}, {"beadset-order", no_argument, NULL, 12}, {"fasta-flank", no_argument, NULL, 13}, {"sam-flank", required_argument, NULL, 's'}, {"genome-build", required_argument, NULL, 14}, {"write-index", optional_argument, NULL, 'W'}, {NULL, 0, NULL, 0}}; int c; while ((c = getopt_long(argc, argv, "h?lt:b:c:e:f:g:io:O:x:vs:W::", loptions, NULL)) >= 0) { switch (c) { case 'l': list_tags(); break; case 't': tag_list = optarg; break; case 'b': bpm_fname = optarg; break; case 'c': csv_fname = optarg; break; case 'e': egt_fname = optarg; break; case 'f': ref_fname = optarg; break; case 1: cache_size = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --set-cache-size %s\n", optarg); break; case 2: gc_win = (int)strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --gc-window-size %s\n", optarg); if (gc_win <= 0) flags |= NO_INFO_GC; break; case 'g': pathname = optarg; break; case 'i': flags |= LOAD_IDAT; break; case 3: capacity = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --capacity %s\n", optarg); break; case 4: flags |= ADJUST_CLUSTERS; break; case 5: gtc_sample_names = 1; break; case 6: bpm_check = 0; break; case 7: eof_check = 0; break; case 8: gs_fname = optarg; break; case 9: record_cmd_line = 0; break; case 'o': output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': output_type = FT_BCF_GZ; break; case 'u': output_type = FT_BCF; break; case 'z': 
output_type = FT_VCF_GZ; break; case 'v': output_type = FT_VCF; break; case 't': output_type = FT_TAB_TEXT; break; default: { clevel = strtol(optarg, &tmp, 10); if (*tmp || clevel < 0 || clevel > 9) error("The output type \"%s\" not recognised\n", optarg); } } if (optarg[1]) { clevel = strtol(optarg + 1, &tmp, 10); if (*tmp || clevel < 0 || clevel > 9) error("Could not parse argument: --compression-level %s\n", optarg + 1); } break; case 10: n_threads = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse argument: --threads %s\n", optarg); break; case 'x': extra_fname = optarg; break; case 'v': flags |= VERBOSE; break; case 12: beadset_order = 1; break; case 13: fasta_flank = 1; break; case 's': sam_fname = optarg; break; case 14: genome_build = optarg; break; case 'W': if (!(write_index = write_index_parse(optarg))) error("Unsupported index format '%s'\n", optarg); break; case 'h': case '?': default: error("%s", usage_text()); } } if ((((bpm_fname != NULL) || (csv_fname != NULL)) + (egt_fname != NULL) + (argc - optind > 0) + (pathname != NULL) == 1) && ref_fname == NULL && gs_fname == NULL) binary_to_csv = 1; if (sam_fname && (csv_fname == NULL)) error("The --sam-flank option requires the --csv option\n%s", usage_text()); if (binary_to_csv) { if (beadset_order && (bpm_fname == NULL || csv_fname == NULL)) error("The --beadset-order option requires both the --bpm and the --csv options\n%s", usage_text()); if (fasta_flank && (csv_fname == NULL)) error("The --fasta-flank option requires the --csv option\n%s", usage_text()); if (beadset_order + fasta_flank + (sam_fname != NULL) > 1) error( "Only one of --beadset-order or --fasta-flank or --sam-flank options can be " "used at once\n%s", usage_text()); } else { if (flags & LOAD_IDAT) error("The --idat option can only be used alone or with option --gtcs\n%s", usage_text()); if (beadset_order) error("The --beadset-order option can only be used with options --bpm and --csv\n%s", usage_text()); if (fasta_flank) 
error("The --fasta-flank option can only be used with options --bpm and --csv\n%s", usage_text()); if (!bpm_fname && !csv_fname && !gs_fname) error("Manifest file required when converting to VCF\n%s", usage_text()); if (!egt_fname && (flags & ADJUST_CLUSTERS)) error("Cluster file required when adjusting cluster centers\n%s", usage_text()); if (gs_fname && (argc - optind > 0 || pathname)) error("If a GenomeStudio final report file is provided, do not pass GTC files\n%s", usage_text()); if (gs_fname && output_type == FT_TAB_TEXT) error("If a GenomeStudio final report file is provided, you cannot output in GenomeStudio format\n%s", usage_text()); if (argc - optind > 0 && pathname) error("GTC files cannot be listed through both command interface and file list\n%s", usage_text()); if (!gs_fname && output_type != FT_TAB_TEXT && extra_fname) out_txt = get_file_handle(extra_fname); } flags |= parse_tags(tag_list); // beginning of plugin run fprintf(stderr, "gtc2vcf " GTC2VCF_VERSION " http://github.com/freeseek/gtc2vcf\n"); int nfiles = 0; char **filenames = NULL; if (pathname) { filenames = get_file_list(pathname, flags & LOAD_IDAT ? 
"idat" : "gtc", &nfiles); } else { nfiles = argc - optind; filenames = argv + optind; } void **files = (void **)malloc(nfiles * sizeof(void *)); // make sure the process is allowed to open enough files struct rlimit lim; getrlimit(RLIMIT_NOFILE, &lim); if (nfiles + 10 > lim.rlim_max) error("On this system you cannot open more than %ld files at once while %d is required\n", lim.rlim_max, nfiles + 10); if (nfiles + 10 > lim.rlim_cur) { lim.rlim_cur = nfiles + 10; fprintf(stderr, "Adjusting the limit of how many files can be open at once to %ld\n", lim.rlim_cur); setrlimit(RLIMIT_NOFILE, &lim); } if ((flags & ADJUST_CLUSTERS) && nfiles < 100) fprintf(stderr, "Warning: adjusting clusters with %d sample(s) is not recommended\n", nfiles); if (binary_to_csv || output_type == FT_TAB_TEXT) { out_txt = get_file_handle(output_fname); } else { char wmode[8]; set_wmode(wmode, output_type, (char *)output_fname, clevel); out_fh = hts_open(output_fname, hts_bcf_wmode(output_type)); if (out_fh == NULL) error("[%s] Error: cannot write to \"%s\": %s\n", __func__, output_fname, strerror(errno)); if (n_threads) hts_set_threads(out_fh, n_threads); if (!ref_fname) error("VCF output requires the --fasta-ref option\n"); fai = fai_load(ref_fname); if (!fai) error("Could not load the reference %s\n", ref_fname); if (cache_size) fai_set_cache_size(fai, cache_size); if (extra_fname) out_txt = get_file_handle(extra_fname); } bpm_t *bpm = NULL; if (bpm_fname) { fprintf(stderr, "Reading BPM file %s\n", bpm_fname); bpm = bpm_init(bpm_fname, eof_check, gs_fname != NULL); flags |= BPM_LOADED; if (binary_to_csv && !csv_fname) bpm_to_csv(bpm, out_txt, flags); } if (csv_fname) { fprintf(stderr, "Reading CSV file %s\n", csv_fname); bpm = bpm_csv_init(csv_fname, bpm, gs_fname != NULL); flags |= CSV_LOADED; if (binary_to_csv && !sam_fname && !beadset_order && !fasta_flank) bpm_to_csv(bpm, out_txt, flags); } // output source sequences in FASTA format to be realigned by bwa mem if (fasta_flank) { for (i = 
0; i < bpm->num_loci; i++) flank2fasta(bpm->locus_entries[i].ilmn_id, bpm->locus_entries[i].source_seq, out_txt); } // input source sequence alignments in SAM format to generate new coordinates for the // CSV manifest file if (sam_fname) { fprintf(stderr, "Reading SAM file %s\n", sam_fname); bpm = sam_csv_init(sam_fname, bpm, genome_build, flags); if (binary_to_csv) bpm_to_csv(bpm, out_txt, flags); } // the BeadSet normalization order is the only information in the BPM manifest file // missing from the CSV manifest file kstring_t str = {0, 0, NULL}; if ((flags & BPM_LOADED) && (flags & CSV_LOADED)) { int32_t norm_id_to_beadset_id[100] = {0}; for (i = 0; i < bpm->num_loci; i++) { uint8_t norm_id = bpm->norm_ids[i] % 100; if (norm_id_to_beadset_id[norm_id] != 0 && norm_id_to_beadset_id[norm_id] != bpm->locus_entries[i].beadset_id) { if (norm_id > 4) // exception for possible overflow with Omni5 arrays error("Normalization ID %d corresponds to multiple BeadSet IDs %d and %d\n", norm_id, norm_id_to_beadset_id[norm_id], bpm->locus_entries[i].beadset_id); if (bpm->norm_ids[i] < 100) continue; } norm_id_to_beadset_id[norm_id] = bpm->locus_entries[i].beadset_id; } for (i = 0, j = 0; i < 100; i++) { if (norm_id_to_beadset_id[i] == 0) continue; if (i != j) error("Normalization ID %d not corresponding to any BeadSet ID", j); if (i > 0) kputc(',', &str); kputw(norm_id_to_beadset_id[i], &str); j++; } if (beadset_order && out_txt) fprintf(out_txt, "%s,%s\n", bpm->manifest_name, str.s); } if ((flags & ADJUST_CLUSTERS) && !(flags & BPM_LOADED)) error("Cannot adjust clusters as couldn't generate the normalization lookup table\n"); egt_t *egt = NULL; if (egt_fname) { fprintf(stderr, "Reading EGT file %s\n", egt_fname); egt = egt_init(egt_fname, eof_check); if (binary_to_csv) egt_to_csv(egt, out_txt, flags & VERBOSE); else flags |= EGT_LOADED; } if (bpm && egt) { if (bpm->num_loci < egt->num_records) fprintf(stderr, "Warning: Manifest file includes less loci (%d) than records in the 
cluster file (%d)\n", bpm->num_loci, egt->num_records); else if (bpm->num_loci > egt->num_records) error("Manifest file includes more loci (%d) than records in the cluster file (%d)\n", bpm->num_loci, egt->num_records); } if (gs_fname) flags |= GENOME_STUDIO; for (i = 0; i < nfiles; i++) { if (flags & LOAD_IDAT) { fprintf(stderr, "Reading IDAT file %s\n", filenames[i]); idat_t *idat = idat_init(filenames[i], nfiles == 1); files[i] = (void *)idat; } else { fprintf(stderr, "Reading GTC file %s\n", filenames[i]); gtc_t *gtc = gtc_init(filenames[i], capacity); // GenCall fills the GTC SNP manifest with the BPM file name rather than // the BPM manifest name if (bpm && bpm->fn && bpm_check && strncmp(bpm->manifest_name, gtc->snp_manifest, strlen(bpm->manifest_name)) && strcmp(strrchr(bpm->fn, '/') ? strrchr(bpm->fn, '/') + 1 : bpm->fn, gtc->snp_manifest)) error( "Manifest name %s in BPM file %s does not match manifest name %s in GTC " "file %s\nUse --do-not-check-bpm to suppress this check\n", bpm->manifest_name, bpm->fn, gtc->snp_manifest, gtc->fn); files[i] = (void *)gtc; } } if (binary_to_csv && nfiles > 0) { if (flags & LOAD_IDAT) { if (nfiles == 1) idat_to_csv((idat_t *)files[0], out_txt, flags & VERBOSE); else idats_to_tsv((idat_t **)files, nfiles, out_txt); } else { if (nfiles == 1) gtc_to_csv((gtc_t *)files[0], out_txt, flags & VERBOSE); else gtcs_to_tsv((gtc_t **)files, nfiles, out_txt); } } if (!binary_to_csv) { if (nfiles == 1) fprintf(stderr, "Warning: it is recommended to convert multiple GTC files at once\n"); if (output_type == FT_TAB_TEXT) { fprintf(stderr, "Writing GenomeStudio final report file\n"); gtcs_to_gs((gtc_t **)files, nfiles, bpm, egt, out_txt, flags); } else { fprintf(stderr, "Writing VCF file\n"); bcf_hdr_t *hdr = hdr_init(fai, flags); if (bpm_fname) bcf_hdr_printf(hdr, "##BPM=%s", strrchr(bpm_fname, '/') ? strrchr(bpm_fname, '/') + 1 : bpm_fname); if (csv_fname) bcf_hdr_printf(hdr, "##CSV=%s", strrchr(csv_fname, '/') ? 
strrchr(csv_fname, '/') + 1 : csv_fname); if (egt_fname) bcf_hdr_printf(hdr, "##EGT=%s", strrchr(egt_fname, '/') ? strrchr(egt_fname, '/') + 1 : egt_fname); if (sam_fname) bcf_hdr_printf(hdr, "##SAM=%s", strrchr(sam_fname, '/') ? strrchr(sam_fname, '/') + 1 : sam_fname); if ((flags & BPM_LOADED) && (flags & CSV_LOADED)) bcf_hdr_printf(hdr, "##BeadSet_Order=%s", str.s); if (record_cmd_line) bcf_hdr_append_version(hdr, argc, argv, "bcftools_gtc2vcf"); if (gs_fname) { htsFile *gs_fh = hts_open(gs_fname, "r"); bcf_hdr_printf(hdr, "##GenomeStudio=%s", strrchr(gs_fname, '/') ? strrchr(gs_fname, '/') + 1 : gs_fname); gs_to_vcf(fai, bpm, egt, gs_fh, out_fh, hdr, output_fname, index_fname, write_index, flags, gc_win); } else { if (extra_fname) gtcs_to_tsv((gtc_t **)files, nfiles, out_txt); for (i = 0; i < nfiles; i++) { gtc_t *gtc = (gtc_t *)files[i]; const char *sample_name = (gtc_sample_names && gtc->sample_name) ? gtc->sample_name : gtc->display_name; if (bcf_hdr_add_sample(hdr, sample_name) < 0) error("GTC files must correspond to different samples\n"); } if (bcf_hdr_write(out_fh, hdr) < 0) error("Unable to write to output VCF file\n"); if (init_index2(out_fh, hdr, output_fname, &index_fname, write_index) < 0) error("Error: failed to initialise index for %s\n", output_fname); gtcs_to_vcf(fai, bpm, egt, (gtc_t **)files, nfiles, out_fh, hdr, flags, gc_win); } if (write_index) { if (bcf_idx_save(out_fh) < 0) { if (hts_close(out_fh) != 0) error("Close failed %s\n", strcmp(output_fname, "-") ? output_fname : "stdout"); error("Error: cannot write to index %s\n", index_fname); } free(index_fname); } if (hts_close(out_fh) != 0) error("Close failed %s\n", strcmp(output_fname, "-") ? 
output_fname : "stdout"); } } free(str.s); fai_destroy(fai); egt_destroy(egt); bpm_destroy(bpm); if (pathname) { for (i = 0; i < nfiles; i++) free(filenames[i]); free(filenames); } for (i = 0; i < nfiles; i++) { if (flags & LOAD_IDAT) idat_destroy((idat_t *)files[i]); else gtc_destroy((gtc_t *)files[i]); } free(files); if (out_txt && out_txt != stdout && out_txt != stderr) fclose(out_txt); return 0; } ================================================ FILE: gtc2vcf.h ================================================ /* The MIT License Copyright (c) 2018-2025 Giulio Genovese Author: Giulio Genovese Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #define min(a, b) \ ({ \ __typeof__(a) _a = (a); \ __typeof__(b) _b = (b); \ _a < _b ? _a : _b; \ }) #define max(a, b) \ ({ \ __typeof__(a) _a = (a); \ __typeof__(b) _b = (b); \ _a > _b ? 
_a : _b; \ }) // tests the end-of-file indicator for an hFILE static inline int heof(hFILE *hfile) { if (hgetc(hfile) == EOF) return 1; hfile->begin--; return 0; } // read or skip a fixed number of bytes static inline void read_bytes(hFILE *hfile, void *buffer, size_t nbytes) { if (buffer) { if (hread(hfile, buffer, nbytes) < nbytes) { error("Failed to read %ld bytes from stream\n", nbytes); } } else { int i, c = 0; for (i = 0; i < nbytes; i++) c = hgetc(hfile); if (c == EOF) error("Failed to reposition stream forward %ld bytes\n", nbytes); } } static inline char **get_file_list(const char *pathname, const char *extension, int *nfiles) { char **filenames = NULL; DIR *d = opendir(pathname); if (d) { // check if d is a directory struct dirent *dir; int mfiles = 0; int p = strlen(pathname); while ((dir = readdir(d))) { const char *ptr = strrchr(dir->d_name, '.'); if (ptr && strcmp(ptr + 1, extension) == 0) { hts_expand0(char *, *nfiles + 1, mfiles, filenames); int q = strlen(dir->d_name); filenames[*nfiles] = (char *)malloc((p + q + 2) * sizeof(char)); memcpy(filenames[*nfiles], pathname, p); filenames[*nfiles][p] = '/'; memcpy(filenames[*nfiles] + p + 1, dir->d_name, q + 1); (*nfiles)++; } } closedir(d); } else { filenames = hts_readlines(pathname, nfiles); if (!filenames) error("Failed to read from file %s\n", pathname); } if (*nfiles == 0) error("No .%s files found in %s\n", extension, pathname); return filenames; } static inline FILE *get_file_handle(const char *str) { if (!str) return NULL; FILE *ret; if (strcmp(str, "-") == 0) { ret = stdout; } else { ret = fopen(str, "w"); if (!ret) error("Failed to open %s: %s\n", str, strerror(errno)); } return ret; } static inline void flank2fasta(const char *name, const char *flank, FILE *stream) { if (!flank) return; const char *left = strchr(flank, '['); const char *middle = strchr(flank, '/'); const char *right = strchr(flank, ']'); fprintf(stream, "@%s:1\n", name); if (!left && !middle && !right) { fprintf(stream, 
"%s\n", flank); return; } if (!left || !middle || !right) error("Flank sequence is malformed: %s\n", flank); if (*(middle - 1) == '-') fprintf(stream, "%.*s%s\n", (int)(left - flank), flank, right + 1); else fprintf(stream, "%.*s%.*s%s\n", (int)(left - flank), flank, (int)(middle - left) - 1, left + 1, right + 1); fprintf(stream, "@%s:2\n", name); if (*(middle - 1) == '-') fprintf(stream, "%.*s%.*s%s\n", (int)(left - flank), flank, (int)(right - middle) - 1, middle + 1, right + 1); else fprintf(stream, "%.*s%.*s%s\n", (int)(left - flank), flank, (int)(right - middle) - 1, middle + 1, right + 1); } static inline int bcf_hdr_name2id_flexible(const bcf_hdr_t *hdr, char *chr) { if (!chr) return -1; char buf[] = {'c', 'h', 'r', '\0', '\0', '\0'}; int rid = bcf_hdr_name2id(hdr, chr); if (rid >= 0) return rid; if (strncmp(chr, "chr", 3) == 0) rid = bcf_hdr_name2id(hdr, chr + 3); if (rid >= 0) return rid; strncpy(buf + 3, chr, 2); rid = bcf_hdr_name2id(hdr, buf); if (rid >= 0) return rid; if (strcmp(chr, "23") == 0 || strcmp(chr, "25") == 0 || strcmp(chr, "XY") == 0 || strcmp(chr, "XX") == 0 || strcmp(chr, "PAR1") == 0 || strcmp(chr, "PAR2") == 0) { rid = bcf_hdr_name2id(hdr, "X"); if (rid >= 0) return rid; rid = bcf_hdr_name2id(hdr, "chrX"); } else if (strcmp(chr, "24") == 0) { rid = bcf_hdr_name2id(hdr, "Y"); if (rid >= 0) return rid; rid = bcf_hdr_name2id(hdr, "chrY"); } else if (strcmp(chr, "26") == 0 || strcmp(chr, "MT") == 0 || strcmp(chr, "chrM") == 0) { rid = bcf_hdr_name2id(hdr, "MT"); if (rid >= 0) return rid; rid = bcf_hdr_name2id(hdr, "chrM"); } return rid; } static inline char rev_nt(char iupac) { static const char iupac_complement[128] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, '-', 0x2E, '/', 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 
#define MAX_LENGTH_LEFT_ALLELE 256
// reverse complements, in place, a flank sequence of the form LEFT[A/B]RIGHT
//
// the two bracketed alleles are swapped first so that, once the whole string has been reversed and passed through
// rev_nt() (which also exchanges '[' and ']'), the original allele A is again the first allele inside the brackets
//
// the call is fatal (through error()) if any of '[', '/' and ']' is missing, if they do not appear in this order,
// or if allele A is longer than MAX_LENGTH_LEFT_ALLELE characters
static inline void flank_reverse_complement(char *flank) {
    char *left = strchr(flank, '[');
    char *middle = strchr(flank, '/');
    char *right = strchr(flank, ']');
    // the delimiters must also be correctly ordered, otherwise the allele lengths below would be negative
    if (!left || !middle || !right || left > middle || middle > right)
        error("Flank sequence is malformed: %s\n", flank);

    size_t len_a = (size_t)(middle - left) - 1;
    size_t len_b = (size_t)(right - middle) - 1;
    char buf[MAX_LENGTH_LEFT_ALLELE];
    if (len_a > MAX_LENGTH_LEFT_ALLELE) error("Cannot swap alleles in flank sequence %s\n", flank);

    // swap the alleles: [A/B] becomes [B/A]; allele B is moved with memmove() as source and destination may overlap
    memcpy(buf, left + 1, len_a);
    memmove(left + 1, middle + 1, len_b);
    left[len_b + 1] = '/';
    memcpy(left + len_b + 2, buf, len_a);

    // reverse the whole string while complementing each character
    size_t i, len = strlen(flank);
    for (i = 0; i < len / 2; i++) {
        char tmp = flank[i];
        flank[i] = rev_nt(flank[len - i - 1]);
        flank[len - i - 1] = rev_nt(tmp);
    }
    if (len % 2 == 1) flank[len / 2] = rev_nt(flank[len / 2]);
}
// left shifts, in place, the bracketed alleles of a flank sequence of the form LEFT[A/B]RIGHT
// this mirrors the way Illumina left shifts indels, see
// http://github.com/Illumina/GTCtoVCF/blob/develop/BPMRecord.py
//
// the brackets are first moved left by whole copies of allele B for as long as the characters immediately
// preceding '[' equal allele B, and then, only if allele B consists of a single repeated character, one character
// at a time for as long as the character preceding '[' equals it
//
// returns the total number of characters the brackets were moved by
// the call is fatal (through error()) if '[', '/' and ']' are not all present in this order
static inline int flank_left_shift(char *flank) {
    char *left = strchr(flank, '[');
    char *middle = strchr(flank, '/');
    char *right = strchr(flank, ']');
    if (!left || !middle || !right || left > middle || middle > right)
        error("Flank sequence is malformed: %s\n", flank);

    int n = 0;
    int len = (int)(right - middle) - 1;
    // with an empty allele B there is nothing to shift by, and strncmp() over zero characters always reports a
    // match so the loop below would never terminate
    if (len == 0) return 0;

    while ((left - flank >= len) && (strncmp(left - len, middle + 1, len) == 0)) {
        // slide "[A/B]" over the matching copy on its left, then restore that copy right after ']'
        memmove(left - len, left, right - left + 1);
        left -= len;
        middle -= len;
        right -= len;
        memmove(right + 1, middle + 1, len);
        n += len;
    }

    // single character shifts only apply when allele B is a run of one character; a separate flag is used so the
    // test does not depend on whether plain char is signed
    char nt = *(middle + 1);
    int is_homopolymer = 1;
    const char *ptr;
    for (ptr = middle + 2; ptr < right; ptr++)
        if (*ptr != nt) is_homopolymer = 0;

    // left > flank keeps *(left - 1) inside the string once the brackets have reached its beginning
    while (is_homopolymer && left > flank && *(left - 1) == nt) {
        memmove(left - 1, left, right - left + 1);
        *right = nt;
        left--;
        middle--;
        right--;
        n++;
    }
    return n;
}
1 : -1); if (idx < 0) error("Query ID %s found in SAM file does not end with :1 or :2\n", qname); chromosome_pair[idx] = sam_hdr_tid2name(sam_hdr, b->core.tid); position_pair[idx] = 0; strand_pair[idx] = -1; if (!(b->core.flag & BAM_FUNMAP)) { strand_pair[idx] = bam_is_rev(b); int n_cigar = b->core.n_cigar; const uint32_t *cigar = bam_get_cigar(b); position_pair[idx] = b->core.pos; int qlen = cnv ? (strlen(flank) + 1) / 2 : (bam_is_rev(b) ? strlen(flank) - (right - flank) : left - flank + 1); if (strchr(flank, '-')) { if (left_shift) { int len = (int)(right - middle) - 1; char nt = toupper(*(middle + 1)); const char *ptr; for (ptr = middle + 2; ptr < right; ptr++) if (*ptr != nt) nt = -1; if (bam_is_rev(b)) { ptr = right + 1; while (strncasecmp(middle + 1, ptr, len) == 0) { qlen -= len; ptr += len; } while (nt > 0 && toupper(*ptr) == nt) { qlen--; ptr++; } } else { ptr = left - len; while (ptr >= flank && (strncasecmp(ptr, middle + 1, len) == 0)) { qlen -= len; ptr -= len; } ptr += len - 1; while (nt > 0 && toupper(*ptr) == nt) { qlen--; ptr--; } } } if (idx == 0) qlen--; } int k; for (k = 0; k < n_cigar && qlen > 1; k++) { int type = bam_cigar_type(bam_cigar_op(cigar[k])); int len = bam_cigar_oplen(cigar[k]); if ((type & 1) && (type & 2)) { // consume reference sequence ( case M ) position_pair[idx] += min(len, qlen); qlen -= len; } else if (type & 1) { // consume query sequence ( case I ) qlen -= len; if (qlen <= 0) // we skipped the base pair that needed // to be localized { position_pair[idx] = 0; } } else if (type & 2) { position_pair[idx] += len; // consume reference sequence ( case D ) } } if (qlen == 1) position_pair[idx]++; } uint8_t *as = bam_aux_get(b, "AS"); aln_score_pair[idx] = bam_aux2i(as); } if (ret < -1) return -1; if (!cnv && ((aln_score_pair[0] == aln_score_pair[1] && position_pair[0] != position_pair[1]) || (position_pair[0] == 0 && position_pair[1] == 0))) { idx = -1; *chromosome = NULL; *position = 0; *strand = -1; } else { idx = cnv ? 
// returns the fraction of G/C bases among the A/C/G/T bases in the range [beg, end), ignoring case
//
// any other character (N, IUPAC ambiguity codes, ...) is not counted at all
// if the range contains no A/C/G/T base the result is 0/0, which is NaN with IEEE 754 arithmetic
static inline float get_gc_ratio(const char *beg, const char *end) {
    int at_cnt = 0, cg_cnt = 0;
    const char *ptr;
    for (ptr = beg; ptr < end; ptr++) {
        // <ctype.h> functions are only defined for values representable as unsigned char (and EOF), so a plain
        // char that may be negative has to be converted first
        switch (toupper((unsigned char)*ptr)) {
        case 'A':
        case 'T':
            at_cnt++;
            break;
        case 'C':
        case 'G':
            cg_cnt++;
            break;
        default:
            break;
        }
    }
    return (float)(cg_cnt) / (float)(at_cnt + cg_cnt);
}
// http://github.com/Illumina/GTCtoVCF/blob/develop/BPMRecord.py
// For an insertion relative to the reference, the position of the base immediately 5' to the
// insertion (on the plus strand) is given. For a deletion relative to the reference, the
// position of the most 5' deleted base (on the plus strand) is given
//
// compares the flank sequence LEFT[A/B]RIGHT with the reference around ref[win] to decide whether the reference
// looks like the form without the bracketed sequence ("deletion") or the form with it ("insertion"), and then
// rewrites allele_a and allele_b accordingly
//
// allele_a, allele_b: on entry only the first character of allele_b is inspected ('D' marks allele B as the
//                     deletion); on success both strings are overwritten
//                     NOTE(review): allele_b->s is dereferenced unconditionally, presumably callers always pass a
//                     non-empty string - confirm
// flank:              flank sequence, must contain '[', '/' and ']' (fatal error otherwise)
// ref, win:           reference sequence and the index within it that the comparisons are anchored at
// len:                not referenced in the body
// shift:              offset subtracted from win when the deletion comparison is retried
//
// returns 1 if the reference matched the deletion form, 0 if it matched the insertion form, and -1 (leaving the
// alleles untouched) if neither form is supported even after the retry
static inline int get_indel_alleles(kstring_t *allele_a, kstring_t *allele_b, const char *flank, const char *ref,
                                    int win, int len, int shift) {
    const char *left = strchr(flank, '[');
    const char *middle = strchr(flank, '/');
    const char *right = strchr(flank, ']');
    if (!left || !middle || !right) error("Flank sequence is malformed: %s\n", flank);

    // deletion form: LEFT is compared backwards from ref[win] and RIGHT forwards from ref[win + 1]
    int del_left = len_common_suffix(left - 1, &ref[win], left - flank);
    int del_right = len_common_prefix(right + 1, &ref[win] + 1, strlen(right + 1));
    // insertion form: the sequence between '/' and ']' must be found in the reference starting at ref[win]
    int ins_match = strncmp(middle + 1, &ref[win], right - middle - 1) == 0; // same as indel_sequence_match
    // ... with LEFT compared backwards from ref[win - 1] and RIGHT forwards from just past that sequence
    int ins_left = len_common_suffix(left - 1, &ref[win] - 1, left - flank);
    int ins_right = len_common_prefix(right + 1, &ref[win] + (right - middle) - 1, strlen(right + 1));
    // the deletion form wins if it matches at least as much as the insertion form on both sides
    int ref_is_del = (del_left >= ins_left) && (del_right >= ins_right);
    // the chosen form is rejected if one of its sides does not match at all (or, for the insertion form, if the
    // bracketed sequence is not found in the reference)
    if ((ref_is_del && del_left * del_right == 0) || (!ref_is_del && (!ins_match || ins_left * ins_right == 0))) {
        // computes it again but with shifted coordinates to better match Illumina's _calculate_is_deletion()
        // only the deletion lengths are recomputed, the insertion lengths keep their unshifted values
        del_left = len_common_suffix(left - 1, &ref[win - shift], left - flank);
        del_right = len_common_prefix(right + 1, &ref[win - shift] + 1, strlen(right + 1));
        ref_is_del = (del_left >= ins_left) && (del_right >= ins_right);
        if ((ref_is_del && del_left * del_right == 0) || (!ref_is_del && (!ins_match || ins_left * ins_right == 0)))
            return -1;
    }
    // must be read before the alleles are cleared below
    int allele_b_is_del = allele_b->s[0] == 'D';
    allele_a->l = allele_b->l = 0;
    // both alleles start with the same reference base: ref[win] for the deletion form, ref[win - 1] otherwise
    kputc(ref[win - 1 + ref_is_del], allele_a);
    kputc(ref[win - 1 + ref_is_del], allele_b);
    // the allele that is not the deletion additionally receives the inserted sequence, copied from the flank when
    // the reference lacks it and from the reference otherwise
    kputsn(ref_is_del ? middle + 1 : &ref[win], right - middle - 1, allele_b_is_del ? allele_a : allele_b);
    return ref_is_del;
}
// fills the VCF allele array from the A/B alleles, given which VCF allele index allele B occupies
//
// allele_b_idx == -1: only the reference base is stored
// allele_b_idx ==  0: allele B is the reference allele and allele A the alternate
// allele_b_idx ==  1: allele A is the reference allele and allele B the alternate
// allele_b_idx ==  2: the reference base is stored first, followed by allele A and allele B
//
// when allele B is at index 0 or 1, an alternate allele starting with '.' is not stored
// returns the number of entries written to alleles, or -1 for any other allele_b_idx
static inline int alleles_ab_to_vcf(const char **alleles, const char *ref_base, const char *allele_a,
                                    const char *allele_b, int allele_b_idx) {
    if (allele_b_idx == -1) {
        alleles[0] = ref_base;
        return 1;
    }

    if (allele_b_idx == 2) {
        alleles[0] = ref_base;
        alleles[1] = allele_a;
        alleles[2] = allele_b;
        return 3;
    }

    if (allele_b_idx == 0 || allele_b_idx == 1) {
        const char *ref_allele = allele_b_idx == 0 ? allele_b : allele_a;
        const char *alt_allele = allele_b_idx == 0 ? allele_a : allele_b;
        alleles[0] = ref_allele;
        if (*alt_allele == '.') return 1;
        alleles[1] = alt_allele;
        return 2;
    }

    return -1;
}
switch (ra | rb) { case 1 | 2: // A and C case 1 | 4: // A and G case 2 | 8: // C and T case 4 | 8: // G and T return ra & (2 | 4); // A or T case 1 | 8: // A and T case 2 | 4: // C and G continue; default: return -1; // Flanking reference alleles are not valid alleles for TOP/BOT strand determination } } return -1; // Unable to determine reference sequence strand default: return -1; // Alleles are not TOP alleles } } // compute BAF and LRR from Theta and R as explained in Peiffer, D. A. et al. High-resolution genomic profiling of // chromosomal aberrations using Infinium whole-genome genotyping. Genome Res. 16, 1136–1148 (2006) static inline void get_baf_lrr(float ilmn_theta, float ilmn_r, float aa_theta, float ab_theta, float bb_theta, float aa_r, float ab_r, float bb_r, float r_mean, float *baf, float *lrr) { float r_ref; if (ilmn_theta == ab_theta) { r_ref = ab_r; *baf = 0.5f; } else if (ilmn_theta < ab_theta) { float slope = (aa_r - ab_r) / (aa_theta - ab_theta); float b = aa_r - (aa_theta * slope); r_ref = (slope * ilmn_theta) + b; *baf = 0.5f - (ab_theta - ilmn_theta) * 0.5f / (ab_theta - aa_theta); } else if (ilmn_theta > ab_theta) { float slope = (ab_r - bb_r) / (ab_theta - bb_theta); float b = ab_r - (ab_theta * slope); r_ref = (slope * ilmn_theta) + b; *baf = 1.0f - (bb_theta - ilmn_theta) * 0.5f / (bb_theta - ab_theta); } else { *lrr = -NAN; *baf = -NAN; return; } // for non-polymorphic (Illumina) markers we compute the LRR using the clusters mean *lrr = logf(ilmn_r / (isnan(r_mean) ? 
r_ref : r_mean)) * (float)M_LOG2E; } ================================================ FILE: gtc2vcf_plot.R ================================================ #!/usr/bin/env Rscript ### # The MIT License # # Copyright (C) 2019-2025 Giulio Genovese # # Author: Giulio Genovese # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
### options(error = function() {traceback(3); q("no", 1)}) gtc2vcf_plot_version <- '2025-08-19' suppressPackageStartupMessages(library(optparse)) suppressPackageStartupMessages(library(data.table)) suppressPackageStartupMessages(library(ggplot2)) suppressPackageStartupMessages(library(grid)) suppressPackageStartupMessages(library(gridExtra)) if (capabilities()[['cairo']]) options(bitmapType = 'cairo') parser <- OptionParser('usage: gtc2vcf_plot.R [options] --illumina|--affymetrix --vcf --chrom --pos --pdf|--png ') parser <- add_option(parser, c('--vcf'), type = 'character', help = 'input VCF file', metavar = '') parser <- add_option(parser, c('--illumina'), action = 'store_true', default = FALSE, help = 'whether the input VCF file contains Illumina data') parser <- add_option(parser, c('--affymetrix'), action = 'store_true', default = FALSE, help = 'whether the input VCF file contains Affymetrix data') parser <- add_option(parser, c('--birdseed'), action = 'store_true', default = FALSE, help = 'whether the input VCF file contains Affymetrix data from Birdseed') parser <- add_option(parser, c('--pdf'), type = 'character', help = 'output PDF file', metavar = '') parser <- add_option(parser, c('--png'), type = 'character', help = 'output PNG file', metavar = '') parser <- add_option(parser, c('--width'), type = 'double', default = 7.0, help = 'inches width of the output file [7.0]', metavar = '') parser <- add_option(parser, c('--height'), type = 'double', default = 7.0, help = 'inches height of the output file [7.0]', metavar = '') parser <- add_option(parser, c('--fontsize'), type = 'integer', default = 12, help = 'font size [12]', metavar = '') parser <- add_option(parser, c('--chrom'), type = 'character', help = 'chromosome', metavar = '') parser <- add_option(parser, c('--pos'), type = 'integer', help = 'chromosome position', metavar = '') parser <- add_option(parser, c('--id'), type = 'character', help = 'variant ID', metavar = '') parser <- add_option(parser, 
c('--samples'), type = 'character', help = 'comma-separated list of samples to include', metavar = '') parser <- add_option(parser, c('--samples-file'), type = 'character', help = 'file with list of samples to include', metavar = '') parser <- add_option(parser, c('--minimal'), action = 'store_true', default = FALSE, help = 'only plot NORMX/NORMY and BAF/LRR plots') parser <- add_option(parser, c('--zcall'), action = 'store_true', default = FALSE, help = 'plot ZCall thresholds') args <- parse_args(parser, commandArgs(trailingOnly = TRUE), convert_hyphens_to_underscores = TRUE) write(paste('gtc2vcf_plot.R', gtc2vcf_plot_version, 'http://github.com/freeseek/gtc2vcf'), stderr()) # make sure VCF is passed if (is.null(args$vcf)) {print_help(parser); stop('option --vcf is required')} if (is.null(args$chrom)) {print_help(parser); stop('option --chrom is required')} if (is.null(args$pos)) {print_help(parser); stop('option --pos is required')} if (args$illumina && args$affymetrix) {print_help(parser); stop('cannot use --illumina and --affymetrix at the same time')} if (args$illumina && args$birdseed) {print_help(parser); stop('cannot use --illumina and --birdseed at the same time')} if (args$affymetrix && args$zcall) {print_help(parser); stop('cannot use --affymetrix and --zcall at the same time')} if (is.null(args$pdf) && is.null(args$png)) {print_help(parser); stop('either --pdf or --png is required')} if (!is.null(args$pdf) && !is.null(args$png)) {print_help(parser); stop('cannot use --pdf and --png at the same time')} if (!is.null(args$png) && !capabilities('png')) {print_help(parser); stop('unable to start device PNG: no png support in this version of R\nyou need to reinstall R with support for PNG to use the --png option\n')} if (!is.null(args$samples) && !is.null(args$samples_file)) {print_help(parser); stop('cannot use --samples and --samples-file at the same time')} base <- c('CHROM', 'POS', 'ID') if (args$illumina) { info <- c('meanR_AA', 'meanR_AB', 'meanR_BB', 
'meanTHETA_AA', 'meanTHETA_AB', 'meanTHETA_BB', 'devR_AA', 'devR_AB', 'devR_BB', 'devTHETA_AA', 'devTHETA_AB', 'devTHETA_BB') format <- c('GT', 'X', 'Y', 'NORMX', 'NORMY', 'R', 'THETA', 'BAF', 'LRR') if (args$zcall) { info <- c(info, c('zthresh_X', 'zthresh_Y')) } } else if (args$affymetrix) { info <- c('meanX_AA', 'meanX_AB', 'meanX_BB', 'meanY_AA', 'meanY_AB', 'meanY_BB', 'varX_AA', 'varX_AB', 'varX_BB', 'varY_AA', 'varY_AB', 'varY_BB', 'covarXY_AA', 'covarXY_AB', 'covarXY_BB') info <- c(info, paste0(info, '.1')) format <- c('GT', 'NORMX', 'NORMY', 'DELTA', 'SIZE', 'BAF', 'LRR') } else { info <- c() format <- c('GT', 'BAF', 'LRR') } fmt <- paste0('"[%', paste(base, collapse = '\\t%'), paste(c('', info), collapse = '\\t%INFO/'), paste(c('', format), collapse = '\\t%'), '\\n]"') names <- c(base, info, format) cmd <- paste0('bcftools query --format ', fmt, ' ', args$vcf, ' -r ', args$chrom, ':', args$pos, '-', args$pos) if (!is.null(args$samples)) cmd <- paste(cmd, '--samples', args$samples) if (!is.null(args$samples_file)) cmd <- paste(cmd, '--samples-file', args$samples_file) write(paste('Command:', cmd), stderr()) if (packageVersion('data.table') < '1.11.6') { df <- setNames(fread(cmd, sep = '\t', header = FALSE, na.strings = '.', data.table = FALSE), names) } else { df <- setNames(fread(cmd = cmd, sep = '\t', header = FALSE, na.strings = '.', data.table = FALSE), names) } if (!is.null(args$id)) { if (!(args$id %in% unique(df$ID))) stop('Specified ID not present at specified location') df <- df[df$ID == args$id,] } else { if ( length(unique(df$ID)) > 1 ) stop('More than one variant at the specified position, use --id to specify which variant to plot') } v <- sapply(df[, info], unique) if (args$illumina) { p1 <- ggplot(df, aes(x = Y, y = X, color = GT, shape = GT)) + geom_point(size = .5) + scale_x_continuous(limits = c(0, NA), expand = expand_scale(mult = c(0, .05))) + scale_y_continuous(limits = c(0, NA), expand = expand_scale(mult = c(0, .05))) + 
theme_bw(base_size = args$fontsize) + theme(legend.position = 'none') p2 <- ggplot(df, aes(x = NORMY, y = NORMX, color = GT, shape = GT)) + geom_point(size = .5) + scale_x_continuous(limits = c(0, NA), expand = expand_scale(mult = c(0, .05))) + scale_y_continuous(limits = c(0, NA), expand = expand_scale(mult = c(0, .05))) + theme_bw(base_size = args$fontsize) + theme(legend.position = 'none') if (args$zcall) { zthresh_X <- unique(df$zthresh_X) zthresh_Y <- unique(df$zthresh_Y) p2 <- p2 + geom_vline(xintercept = zthresh_Y, color = 'gray') + geom_hline(yintercept = zthresh_X, color = 'gray') } p3 <- ggplot(df, aes(x = THETA, y = R, color = GT, shape = GT)) + geom_point(size = .5) + scale_x_continuous(limits = c(0,1), expand = expand_scale(0)) + theme_bw(base_size = args$fontsize) + theme(legend.position = 'none') for (gt in c('AA', 'AB', 'BB')) { t <- seq(0, 2*pi, length.out = 100) x <- unname(v[paste0('meanTHETA_', gt)]) + unname(v[paste0('devTHETA_', gt)])*cos(t) y <- unname(v[paste0('meanR_', gt)]) + unname(v[paste0('devR_', gt)])*sin(t) p3 <- p3 + annotate('path', x = x, y = y) } } else if (args$affymetrix) { p2 <- ggplot(df, aes(x = NORMX, y = NORMY, color = GT, shape = GT)) + geom_point(size = .5) + scale_x_continuous(limits = c(0, NA), expand = expand_scale(mult = c(0, .05))) + scale_y_continuous(limits = c(0, NA), expand = expand_scale(mult = c(0, .05))) + theme_bw(base_size = args$fontsize) + theme(legend.position = 'none') p3 <- ggplot(df, aes(x = DELTA, y = SIZE, color = GT, shape = GT)) + geom_point(size = .5) + theme_bw(base_size = args$fontsize) + theme(legend.position = 'none') for (gt in c('AA', 'AB', 'BB', 'AA.1', 'BB.1')) { a <- unname(v[paste0('varX_', gt)]) b <- unname(v[paste0('covarXY_', gt)]) c <- unname(v[paste0('varY_', gt)]) lambda1 <- (a+c)/2 + sqrt(((a-c)/2)^2+b^2) lambda2 <- (a+c)/2 - sqrt(((a-c)/2)^2+b^2) theta <- atan2(lambda1 - a, b) t <- seq(0, 2*pi, length.out = 100) x <- unname(v[paste0('meanX_', gt)]) + 
sqrt(lambda1)*cos(theta)*cos(t) - sqrt(lambda2)*sin(theta)*sin(t) y <- unname(v[paste0('meanY_', gt)]) + sqrt(lambda1)*sin(theta)*cos(t) + sqrt(lambda2)*cos(theta)*sin(t) if (args$birdseed) { p2 <- p2 + annotate('path', x = x, y = y) } else { p3 <- p3 + annotate('path', x = x, y = y) } } } p4 <- ggplot(df, aes(x = BAF, y = LRR, color = GT, shape = GT)) + geom_point(size = .5) + theme_bw(base_size = args$fontsize) + theme(legend.position = 'bottom', legend.box = 'horizontal') if (!is.null(args$pdf)) { pdf(args$pdf, width = args$width, height = args$height) } else { png(args$png, width = args$width, height = args$height, units = 'in', res = 150) } if (args$minimal) { grid.arrange(p2, p4, nrow = 2, ncol = 1, heights = c(3, 4), top = unique(df$ID)) } else { if (args$illumina) grid.arrange(p1, p2, p3, p4, nrow = 4, ncol = 1, heights = c(3, 3, 3, 4), top = unique(df$ID)) else if (args$affymetrix) grid.arrange(p2, p3, p4, nrow = 3, ncol = 1, heights = c(3, 3, 4), top = unique(df$ID)) else grid.arrange(p4, nrow = 1, ncol = 1, top = unique(df$ID)) } invisible(dev.off()) ================================================ FILE: idat2gtc.c ================================================ /* The MIT License Copyright (c) 2024-2026 Giulio Genovese Author: Giulio Genovese Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ // the code in this file reimplements functionalities and ideas present in: // - AutoConvert (v1.6.3.1) // - GTCtoVCF // - BeadArrayFiles // these resources were provided by Illumina without license restrictions // the code in this file can be used as a replacement of the Illumina AutoCall software to convert IDAT intensity files // into GTC genotype files for Infinium arrays which was implemented over time in different proprietary software: // - AutoConvert (v1.6.3.1) - http://support.illumina.com/downloads/beeline_software_v10.html // - AutoConvert 2.0 (v2.0.1.179) - http://support.illumina.com/downloads/beeline-software-2-0.html // - IAAP CLI (v1.1) - http://support.illumina.com/downloads/iaap-genotyping-cli.html // - Array Analysis CLI (v2.1) - // http://support.illumina.com/downloads/illumina-microarray-analytics-array-analysis-cli-v2-installers.html // the Illumina AutoCall software performs three main steps: // - Normalization // - Genotyping // - Gender Estimation // if AutoConvert and AutoConvert 2.0 are run without an input cluster file, only the normalization will be performed // the normalization, clustering, and genotype calling functionalities of Illumina AutoCall were covered by the // following patents: // - http://patents.google.com/patent/US7035740 - covers normalization algorithm (2024-05-05) // - http://patents.google.com/patent/US7467117 - divisional, covers clustering and genotyping (2024-03-24) // - http://patents.google.com/patent/US20050216207 - same as US7035740 // - 
http://patents.google.com/patent/US20060224529 - same as US7467117 // GenCall GenTrain 2.0 uses the following algorithms: // - Normalization algorithm (version 1.1.2) // - Clustering algorithm (version 6.3.1) // - Genotyping algorithm (version 6.3.0) // GenCall GenTrain 3.0 uses the following algorithms: // - Normalization algorithm version 1.2.0 // - Clustering algorithm version 7.0.0 // - Genotyping algorithm version 7.0.0 // the Illumina GenCall Source Code (http://support.illumina.com/downloads/gencall_software.html) includes: // - NormalizationGoldenGate.cs - normalization routines (version 1.1.0) // - NormalizationInfinium.cs - normalization routines (version 1.1.2) // - GenTrain60.cs - clustering (version 6.3.1) and genotyping (6.3.0) routines // - Utils.cs - closest points to axis, MATLAB robust fit, and other MATLAB routines // the InfiniumIDATParser Java implementation of the normalization algorithm (version 1.1.2) by Jay Carey includes: // - InfiniumIDATParser.java - IDAT parsing routines (2010-02-25) // - InfiniumNormalization.java - normalization routines (version 1.1.2) (2010-01-07) // - InfiniumUtils.java - closest points to axis, MATLAB robust fit, and other MATLAB routines (2010-01-08) // this software was used in the 1000 Genomes project (Supplementary chapter 5.3 of http://doi.org/10.1038/nature15394) // as part of the intensity rank sum test (IRS test) in the Genome STRiP software // the differences between the normalization algorithm version 1.1.2 and version 1.2.0 are: // - the original implementation of the madsigma function for robust line fitting is updated as it was updated in MATLAB // - HandleScale will not use loci with missing data anymore for sub-bead pool bins with less than 192 loci // - NormalizeSingleBinSingleChannel handles Infinium I (A/T and C/G) probes for sub-bead pool bins with less than 192 // loci // for which version 1.1.2 would previously not attempt to compute a background intensity offset // each AutoCall software 
determines gender in a slightly different way: // - AutoConvert (v1.6.3.1) - only uses X chromosome heterozygosity and checks whether it is higher than 0.1 // - AutoConvert 2.0 (v2.0.1.179) - checks whether Y chromosome intensity R values are higher than 0.3 if autosomal call // rate is higher than 0.97 // - IAAP CLI (v1.1) - same as above but there is a bug in the determination of the autosomal call rate that includes // loci with null cluster scores as missing // - Array Analysis CLI (v2.1) - same as above but with the bug removed // we follow the approach of AutoConvert 2.0 and Array Analysis CLI as default and allow the user to use the approach of // AutoConvert if requested for inexplicable reasons, AutoConvert 2.0, IAAP CLI, and Array Analysis CLI downsample to // 10000 random autosomal loci to estimate the autosomal call rate this behavior can be suppressed by setting the // autosomal call rate threshold from 0.97 to 0.0. However, this cannot be done with Array Analysis CLI // to replicate the functionality for interoperability purposes, the following bugs were reimplemented: // matlab_robustfit0 deviates from the original MATLAB implementation (statrobustfit) to match Illumina implementation // (robustLineFit) when input option addconst/calcoffset is false by erroneously summing the vector into a scalar and // causing the adjfactor variable to be always equal to 100.0 normalization IDs are allowed to overflow beyond 255, // which happens with some probes in the Omni5 arrays, which can cause some Infinium I (G/C) probes to be normalized // together with some Infinium II probes probe pairs with missing values are still used in the normalization step as // probes with zero values the additional code included in GenTrain 3.0 in the Illumina implementation // (NormalizeSingleBinSingleChannel) calls MATLAB function trimmean on an array where some values are artificially set // to zero for no good reasons while other values are left out when determining scale_x 
with GenTrain 2.0 for // normalization bins with less than 192 loci we include failed loci as AA loci /**************************************** * LITERATURE MENTIONING NORMALIZATION * ****************************************/ // http://doi.org/10.1101/sqb.2003.68.69 // Fan,J.B. et al. (2003) Highly parallel SNP genotyping. Cold Spring Harb Symp Quant Biol, 68, 69–78 // first document that mentions GenCall and GenTrain // http://patents.google.com/patent/US7035740 // Kermani 2005, Artificial intelligence and global normalization methods for genotyping // explains how normalization works // http://patents.google.com/patent/US7467117 // Kermani 2006, Artificial intelligence and global normalization methods for genotyping // also explains how normalization works(???) // http://www.illumina.com/Documents/products/technotes/technote_gencall_data_analysis_software.pdf // Illumina 2005, Illumina GenCall Data Analysis Software // it does not describe the normalization but it refers to it // http://doi.org/10.1016/j.mrfmmm.2004.07.022 // Shen 2005, High-throughput SNP genotyping on universal bead arrays // introduces the GenTrain algorithm. It explains the GenScores are computed using fuzzy logic // http://doi.org/10.1038/sj.ejhg.5201528 // Moorhead et al. 2006, Optimal genotype determination in highly multiplexed SNP data // in the supplement a normalization procedure very similar to Illumina's is proposed // http://dnatech.genomecenter.ucdavis.edu/wp-content/uploads/2013/06/illumina_gt_normalization.pdf // http://dnatech.genomecenter.ucdavis.edu/documents/illumina_gt_normalization.pdf // Illumina 2006, Illumina’s Genotyping Data Normalization Methods // has color versions of the patent figures with details that are missing from the patent including the use of 400 // homozygotes // http://doi.org/10.1101/gr.5402306 // Peiffer et al. 
2006, High-resolution genomic profiling of chromosomal aberrations using Infinium whole-genome // genotyping explains Illumina normalization with minimum details // http://www.illumina.com/documents/products/technotes/technote_cnv_algorithms.pdf // Illumina 2007, DNA Copy Number and Loss of Heterozygosity Analysis Algorithms // explains how LRR and BAF behave over CNVs // http://doi.org/10.1093/bioinformatics/btm443 // Teo et al. 2007, A genotype calling algorithm for the Illumina BeadArray platform // explains Illumina normalization with details that are missing from the patent including the use of 400 homozygotes // (paper about Illuminus caller) // http://doi.org/10.1101/gr.5686107 // Oosting et al. 2007, High-resolution copy number analysis of paraffin-embedded archival tissue using SNP BeadArrays // explains an alternative normalization strategy // http://doi.org/10.1101/gr.6861907 // Wang et al. 2007, PennCNV: An integrated hidden Markov model designed for high-resolution copy number variation // detection in whole-genome SNP genotyping data explains Illumina normalization with minimum details // http://doi.org/10.1093/bioinformatics/btn386 // Giannoulatou et al. 2008 GenoSNP: a variational Bayes within-sample SNP genotyping algorithm that does not require a // reference population explains an alternative normalization strategy still based on beadpools (paper about GenoSNP // caller) // http://doi.org/10.1186/1471-2105-9-409 // Staaf et al. 2008 Normalization of Illumina Infinium whole-genome SNP data improves copy number estimates and allelic // intensity ratios explains Illumina normalization with minimum details // http://www.illumina.com/documents/products/technotes/technote_gentrain2.pdf // Illumina 2009, Improved Cluster Generation with Gentrain2 // explains Gentrain 2.0 // http://doi.org/10.1093/bioinformatics/btp470 // Ritchie et al. 
2009 R/Bioconductor software for Illumina’s Infinium whole-genome genotyping BeadChips // explains an alternative normalization strategy // http://doi.org/10.1093/nar/gkp552 // LaFramboise et al. 2009 Single nucleotide polymorphism arrays: a decade of biological, computational and // technological advances explains Illumina normalization with minimum details but defines it as "The computational // workhorse in the Illumina protocol" // http://support.illumina.com/documents/products/technotes/technote_array_analysis_workflows.pdf // Illumina 2011, Microarray Data Analysis Workflows // explains how IDAT are converted to GTC with AutoCall // http://doi.org/10.1186/1471-2105-12-68 // Ritchie et al. 2011 Comparing genotyping algorithms for Illumina’s Infinium whole-genome SNP BeadChips // explains Illumina normalization with minimum details (paper comparing GenCall GenTrain 1.0, Illuminus, GenoSNP, CRLMM) // http://doi.org/10.1007/978-1-61779-555-8_29 // Teo 2011 Genotype Calling for the Illumina Platform // explains Illumina normalization with details that are missing from the patent including the use of 400 homozygotes // http://doi.org/10.1093/bioinformatics/bts479 // Goldstein et al. 2012 zCall: a rare variant caller for array-based genotyping // uses Illumina normalization but no details provided // http://doi.org/10.1093/bioinformatics/btr673 // Li et al. 2012, M3 : an improved SNP calling algorithm for Illumina BeadArray data // explains Illumina normalization with minimum details (paper about M3 caller) // http://doi.org/10.1093/bioinformatics/bts180 // Shah et al. 2012, optiCall: a robust genotype-calling algorithm for rare, low-frequency and common variants // explains Illumina normalization with minimum details (paper about optiCall caller which uses Illumina normalization) // http://doi.org/10.1093/bioinformatics/btu107 // Zhou et al. 
2014, iCall: a genotype-calling algorithm for rare, low-frequency and common variants on the Illumina // exome array paper about iCall which uses Illumina normalization // http://web.stat.tamu.edu/sheather/PDF/WZhou_MSProject.pdf // Zhou 2014, Segmentation-Based Detection of Mosaic Chromosomal Abnormality in Bladder Cancer Cells Using Whole Genome // SNP Array includes explanation of the normalization following Illumina's technical note // http://doi.org/10.1111/pbi.12183 // Wang,S. et al. (2014) Characterization of polyploid wheat genomic diversity using a high-density 90,000 single // nucleotide polymorphism array. Plant Biotechnol J, 12, 787–796. introduces the polyploid clustering algorithm // released by Illumina on 2013-10-07 // http://emea.illumina.com/content/dam/illumina-marketing/documents/products/technotes/gentrain3-technical-note-370-2016-015.pdf // Illumina 2016, Improved Genotype Clustering with GenTrain 3.0 // explains that with less than 192 loci in a single normalization bin it will perform an affine normalization with two // degrees of freedom rather than six // http://www.illumina.com/content/dam/illumina/gcs/assembled-assets/marketing-literature/gentrain-tech-note-m-gl-01258/gentrain-tech-note-m-gl-01258.pdf // Illumina 2023, Genotype clustering with GenTrain 3.0 // explains that with less than 192 loci in a single normalization bin it will perform an affine normalization with two // degrees of freedom rather than six #include #include #include #include #include #include #include #include #include #include #include #include #include "bcftools.h" #define IDAT2GTC_VERSION "2026-01-26" #define AUTOCALL_DATE_FORMAT_DFLT "%m/%d/%y %#I:%M %p" // equivalent to "MM/dd/yyyy h:mm tt" #define AUTOCALL_VERSION_DFLT "3.0.0" KSORT_INIT_GENERIC(float) KSORT_INIT_GENERIC(int) // void error(const char *format, ...) 
//{ // va_list ap; // va_start(ap, format); // vfprintf(stderr, format, ap); // va_end(ap); // exit(-1); // } // // static inline int iupac2bitmask(char iupac) //{ // const int A = 1; // const int C = 2; // const int G = 4; // const int T = 8; // if ( iupac >= 97 ) iupac -= 32; // if ( iupac == 'A' ) return A; // if ( iupac == 'C' ) return C; // if ( iupac == 'G' ) return G; // if ( iupac == 'T' ) return T; // if ( iupac == 'M' ) return A|C; // if ( iupac == 'R' ) return A|G; // if ( iupac == 'W' ) return A|T; // if ( iupac == 'S' ) return C|G; // if ( iupac == 'Y' ) return C|T; // if ( iupac == 'K' ) return G|T; // if ( iupac == 'V' ) return A|C|G; // if ( iupac == 'H' ) return A|C|T; // if ( iupac == 'D' ) return A|G|T; // if ( iupac == 'B' ) return C|G|T; // if ( iupac == 'N' ) return A|C|G|T; // return -1; // } // ///** // * mkdir_p() - create new directory for a file $fname // * @fname: the file name to create the directory for, the part after last "/" is ignored // */ // void mkdir_p(const char *fmt, ...) 
//{ // va_list ap; // va_start(ap, fmt); // int n = vsnprintf(NULL, 0, fmt, ap) + 2; // va_end(ap); // // char *path = (char*)malloc(n); // va_start(ap, fmt); // vsnprintf(path, n, fmt, ap); // va_end(ap); // // char *tmp = strdup(path), *p = tmp+1; // while (*p) // { // while (*p && *p!='/') p++; // if ( !*p ) break; // char ctmp = *p; // *p = 0; // int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); // if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", path,strerror(errno)); // *p = ctmp; // while ( *p && *p=='/' ) p++; // } // free(tmp); // free(path); //} /**************************************** * hFILE READING FUNCTIONS * ****************************************/ static inline ssize_t HTS_RESULT_USED md5_hread(hFILE *fp, void *buffer, size_t nbytes, hts_md5_context *md5) { ssize_t ret = hread(fp, buffer, nbytes); if (md5 && ret > 0) hts_md5_update(md5, buffer, ret); return ret; } static inline int md5_hgetc(hFILE *fp, hts_md5_context *md5) { int c = hgetc(fp); if (md5 && c != EOF) hts_md5_update(md5, &c, 1); return c; } // read or skip a fixed number of bytes static void read_bytes(hFILE *hfile, void *buffer, size_t nbytes, hts_md5_context *md5) { if (buffer) { if (md5_hread(hfile, buffer, nbytes, md5) < nbytes) { error("Failed to read %ld bytes from stream\n", nbytes); } } else { int i, c = 0; for (i = 0; i < nbytes; i++) c = md5_hgetc(hfile, md5); if (c == EOF) error("Failed to reposition stream forward %ld bytes\n", nbytes); } } // tests the end-of-file indicator for an hFILE static int heof(hFILE *hfile) { if (hgetc(hfile) == EOF) return 1; hfile->begin--; return 0; } // read or skip a fixed length array static void read_array(hFILE *hfile, void **arr, size_t *m_arr, size_t nmemb, size_t size, size_t term, hts_md5_context *md5) { if (arr) { if (!m_arr) { *arr = malloc((nmemb + term) * size); if (!*arr) error("Failed to allocate memory for array\n"); } else if (*m_arr < nmemb + term) { void *tmp = realloc(*arr, (nmemb + term) 
* size); if (!tmp) error("Failed to allocate memory for array\n"); *arr = tmp; *m_arr = nmemb + term; } if (md5_hread(hfile, *arr, nmemb * size, md5) < nmemb * size) { error("Failed to read %ld bytes from stream\n", nmemb * size); } } else { int i, c = 0; for (i = 0; i < nmemb * size; i++) c = md5_hgetc(hfile, md5); if (c == EOF) error("Failed to reposition stream forward %ld bytes\n", nmemb * size); } } // read or skip a length-prefixed string // http://en.wikipedia.org/wiki/LEB128#Decode_unsigned_integer static void read_pfx_string(hFILE *hfile, char **str, size_t *m_str, hts_md5_context *md5) { uint8_t byte; size_t n = 0, shift = 0; while (1) { if (md5_hread(hfile, (void *)&byte, 1, md5) < 1) { error("Failed to read 1 byte from stream\n"); } n |= (size_t)(byte & 0x7F) << shift; if (!(byte & 0x80)) break; shift += 7; } if (n || m_str) { read_array(hfile, (void **)str, m_str, n, 1, 1, md5); if (str) (*str)[n] = '\0'; } } // check whether file is compressed with gzip static int is_gzip(hFILE *hfile) { uint8_t buffer[2]; if (hpeek(hfile, (void *)buffer, 2) < 2) error("Failed to read 2 bytes from stream\n"); return (buffer[0] == 0x1f && buffer[1] == 0x8b); } static inline int hwrite_uint16(hFILE *hfile, uint16_t num) { return hwrite(hfile, &num, sizeof(uint16_t)); } static inline int hwrite_int32(hFILE *hfile, int32_t num) { return hwrite(hfile, &num, sizeof(int32_t)); } // http://en.wikipedia.org/wiki/LEB128#Encode_unsigned_integer static int hwrite_pfx_string(hFILE *hfile, const char *str) { if (!str) { hputc(0, hfile); return 0; } size_t n = strlen(str); size_t value = n; int ret = n; do { uint8_t byte = value & 0x7f; value >>= 7; if (value) byte ^= 0x80; if (hputc(byte, hfile) == EOF) return -1; ret++; } while (value); if (hwrite(hfile, str, n) < 0) return -1; return ret; } /**************************************** * IDAT FILE IMPLEMENTATION * ****************************************/ // http://github.com/snewhouse/glu-genetics/blob/master/glu/lib/illumina.py // 
http://github.com/HenrikBengtsson/illuminaio/blob/master/R/readIDAT.R // /humgen/cnp04/sandbox/bobh/idat_parser/src/edu/mit/broad/gapcore/apps/infinium_idat_parser/InfiniumIDATParser.java #define NUM_SNPS_READ 1000 // ID_N_CORES // #define ... 100 // ID_BACKGROUNDS - not used // #define ... 101 // ID_BACKGROUND_DEVS - not used #define ILLUMINA_ID 102 // ID_BEAD_TYPES #define SD 103 // ID_DEVS #define MEAN 104 // ID_MEANS // #define ... 105 // ID_MEDIANS - not used // #define ... 106 // ID_N_BEADS - not used #define NBEADS 107 // ID_N_GOOD_BEADS // #define ... 108 // ID_TRIMMED_MEANS - not used #define MID_BLOCK 200 // ID_ILLUMICODES #define RUN_INFO 300 // ID_PROCESS_HISTORY #define RED_GREEN 400 // ID_TENTH_PERCENTILE #define IDAT_SNP_MANIFEST 401 // ID_SAMPLE_BEADSET #define SENTRIX_BARCODE 402 // ID_BARCODE #define CHIP_TYPE 403 // ID_SENTRIX_FORMAT #define SENTRIX_POSITION 404 // ID_SECTION_LABEL #define BEADSET 405 // ID_BEADSET #define IDAT_SAMPLE_NAME 406 // ID_DNA #define DESCRIPTION 407 // ID_OPA #define IDAT_SAMPLE_PLATE 408 // ID_DNA_PLATE #define IDAT_SAMPLE_WELL 409 // ID_WELL #define IDAT_SAMPLE_COUNT 410 // ID_SAMPLE_COUNT // #define ... 
411 // ID_DX - not used #define IDAT_VLN 510 // ID_VLN typedef struct { const char *chip_type; int num_snps; int num_mid_blocks; const char *chip_type_guess; } chip_type_t; static chip_type_t chip_types[] = { {"1-95um_multi-swath_for_4x5M", 4568350, 4568350, "HumanOmni5-4-v1-0"}, {"1-95um_multi-swath_for_4x5M", 4640213, 4640213, "HumanOmni5-4v1-1"}, {"1-95um_multi-swath_for_4x5M", 4685673, 4685673, "InfiniumOmni5-4v1-2"}, {"1-95um_multi-swath_for_4x5M", 4696316, 4696316, "HumanOmni5-4-v1-0"}, {"1-95um_multi-swath_for_8x2-5M", 2266191, 2266191, "Multi-EthnicGlobal"}, {"1-95um_multi-swath_for_8x2-5M", 2266367, 2266367, "Multi-EthnicGlobal"}, {"1-95um_multi-swath_for_8x2-5M", 2266404, 2266404, "Multi-EthnicGlobal"}, {"1-95um_multi-swath_for_8x2-5M", 2266406, 2266406, "Multi-EthnicGlobal"}, {"1-95um_multi-swath_for_8x2-5M", 2268676, 2268676, "MEGAEx_BioVU_15075710"}, {"1-95um_multi-swath_for_8x2-5M", 2315574, 2315574, "Multi-EthnicGlobal"}, {"1-95um_multi-swath_for_8x2-5M", 2389000, 2389000, "CCPMBiobankMEGA2_20002558X345183"}, {"1-95um_multi-swath_for_8x2-5M", 2508689, 2508689, "GDA-8v1-0"}, {"1-95um_multi-swath_for_8x2-5M", 2550870, 2550870, "HumanOmni2.5-8v1"}, {"1-95um_multi-swath_for_8x2-5M", 2563064, 2563064, "HumanOmni25M-8v1-1"}, {"1-95um_multi-swath_for_8x2-5M", 2575219, 2575219, "HumanOmni2.5-8v1"}, {"1-95um_multi-swath_for_8x2-5M", 2605775, 2605775, "HumanOmni25M-8v1-1"}, {"BeadChip 12x1", 55300, 55300, "humanmethylation27_270596_v1-2 ???"}, {"BeadChip 12x1Q", 191668, 191668, "CanineHD"}, {"BeadChip 12x1Q", 299260, 299260, "HumanCytoSNP-12v2-1"}, {"BeadChip 12x8", 301084, 301084, "HumanCore-12v1-0"}, {"BeadChip 12x8", 304138, 304138, "HumanExome-12v1-1"}, {"BeadChip 12x8", 567727, 567727, "HumanCoreExome-12-v1-0"}, {"BeadChip 12x8", 569060, 569060, "HumanCoreExome-12-v1-0"}, {"BeadChip 12x8", 573012, 573012, "HumanCoreExome-12-v1-1"}, {"BeadChip 12x8", 576769, 576769, "HumanCoreExome-12-v1-1"}, {"BeadChip 12x8", 622399, 622399, 
"humanmethylation450_15017482_v-1-2 ???"}, {"BeadChip 12x8", 722405, 722405, "HumanOmniExpress-12-v1-1"}, {"BeadChip 12x8", 734889, 734889, "HumanOmniExpress-12-v1-0"}, {"BeadChip 12x8", 736136, 736136, "HumanOmniExpress-12-v1-0"}, {"BeadChip 1x12", 577085, 8627, "HumanHap550v3"}, {"BeadChip 1x12", 661182, 49163, "HumanHap650Yv3"}, {"BeadChip 1x40", 1129736, 57373, "Human1Mv1"}, {"BeadChip 1x40 66", 1078890, 52497, "Human1Mv1"}, {"BeadChip 24x1x4", 306776, 306776, "InfiniumCore-24v1-2"}, {"BeadChip 24x1x4", 527136, 527136, "OncoArray-500K"}, {"BeadChip 24x1x4", 577781, 577781, "HumanCoreExome-24v1-0"}, {"BeadChip 24x1x4", 581261, 581261, "HumanCoreExome-24v1-2"}, {"BeadChip 24x1x4", 582684, 582684, "HumanCoreExome-24v1-1"}, {"BeadChip 24x1x4", 611866, 611866, "HumanCoreExome-24v1-4"}, {"BeadChip 24x1x4", 623302, 623302, "PsychChip_15048346"}, {"BeadChip 24x1x4", 623513, 623513, "InfiniumPsychArray-24v1-1"}, {"BeadChip 24x1x4", 638714, 638714, "PsychChip_v1-1_15073391"}, {"BeadChip 24x1x4", 647864, 647864, "InfiniumPsychArray-24v1-3"}, {"BeadChip 24x1x4", 663209, 663209, "GSA-24v1-0"}, {"BeadChip 24x1x4", 704215, 704215, "GSA-24v3-0"}, {"BeadChip 24x1x4", 708013, 708013, "DeCodeGenetics_V1_20012591"}, {"BeadChip 24x1x4", 710576, 710576, "GSAMD-24v1-0_20011747"}, {"BeadChip 24x1x4", 710606, 710606, "GSAMD-24v1-0_20011747"}, {"BeadChip 24x1x4", 710608, 710608, "GSAMD-24v1-0_20011747"}, {"BeadChip 24x1x4", 715653, 715653, "HumanOmniExpress-24v1-1"}, {"BeadChip 24x1x4", 716279, 716279, "InfiniumOmniExpress-24v1-2"}, {"BeadChip 24x1x4", 718963, 718963, "HumanOmniExpress-24-v1-0"}, {"BeadChip 24x1x4", 719234, 719234, "HumanOmniExpress-24-v1-0"}, {"BeadChip 24x1x4", 729110, 729110, "ASA-24v1-0"}, {"BeadChip 24x1x4", 733354, 733354, "GSA-24v2-0"}, {"BeadChip 24x1x4", 749019, 749019, "DeCodeGenetics_V3_20032937X331991"}, {"BeadChip 24x1x4", 751614, 751614, "GSAMD-24v3-0-EA_20034606"}, {"BeadChip 24x1x4", 766804, 766804, "JSA-24v1-0"}, {"BeadChip 24x1x4", 776509, 776509, 
"ASA-24v1-0"}, {"BeadChip 24x1x4", 780343, 780343, "GSAMD-24v2-0_20024620"}, {"BeadChip 24x1x4", 780509, 780509, "GSAMD-24v2-0_20024620"}, {"BeadChip 24x1x4", 818205, 818205, "GSA-24v2-0"}, {"BeadChip 2x10", 321354, 37161, "HumanHap300v2"}, {"BeadChip 2x12", 381079, 29275, "HumanCNV370v1"}, {"BeadChip 2x20", 561686, 54936, "HumanHap550v3"}, {"BeadChip 2x6Q", 1224000, 180026, "Human1M-Duov3"}, {"BeadChip 2x6Q", 1224629, 180026, "Human1M-Duov3"}, {"BeadChip 48x4", 730546, 730546, "GSA-MD-48v4-0_20098041"}, {"BeadChip 4x10", 2623923, 1300482, "HumanOmni2.5-4v1"}, {"BeadChip 4x10", 2623923, 1323441, "HumanOmni2.5-4v1"}, {"BeadChip 4x10", 2624666, 1300941, "HumanOmni2.5-4v1"}, {"BeadChip 4x10", 2624666, 1323725, "HumanOmni2.5-4v1"}, {"BeadChip 4x10", 2624671, 1323726, "HumanOmni2.5-4v1"}, {"BeadChip 4x10", 2655594, 1354653, "HumanOmni2.5-4v1"}, {"BeadChip 4X1X14", 1186430, 1186430, "HumanOmni1-Quad_v1-0"}, {"BeadChip 4x2Q", 376216, 186490, "HumanCNV370-Quadv3"}, {"BeadChip 4x3Q", 626122, 208778, "Human610-Quadv1"}, {"BeadChip 4x3Q", 667447, 208778, "Human660W-Quad_v1"}, {"BeadChip 8x5", 1052641, 1052641, "infinium-methylationepic-v-1-0 ???"}, {"BeadChip 8x5", 867478, 867478, "CytoSNP-850K"}, {"BeadChip 8x5", 988240, 988240, "HumanOmniExpressExome-8-v1-1"}, {"BeadChip 8x5", 989536, 989536, "HumanOmniExpressExome-8-v1-1"}, {"BeadChip 8x5", 992824, 992824, "HumanOmniExpressExome-8-v1-4"}, {"BeadChip 8x5", 996003, 996003, "HumanOmniExpressExome-8-v1-2"}, {"BeadChip 8x5", 996055, 996055, "HumanOmniExpressExome-8-v1-2"}, {"SLIDE.15028542.24x1x3", 307984, 307984, "HumanCore-24v1-0"}, {"SLIDE.15028542.24x1x3", 311460, 311460, "HumanCore-24v1-0"}, {NULL, 0, 0, NULL}}; typedef struct { char *run_time; char *block_type; char *block_pars; char *block_code; char *code_version; } RunInfo; typedef struct { char *fn; hFILE *hfile; int64_t version; int32_t number_toc_entries; uint16_t *id; int64_t *toc; int32_t num_snps; int32_t num_mid_blocks; int32_t *ilmn_id; uint16_t *sd; uint16_t 
*mean; uint8_t *nbeads; const uint16_t *trimmed_mean; // only used for historical purposes uint8_t *mid_block; uint8_t red_green[4]; char *snp_manifest; char *sentrix_barcode; char *chip_type; char *sentrix_position; char *beadset; char *sample_name; char *description; char *sample_plate; char *sample_well; int32_t sample_count; char *vln; RunInfo *run_infos; int32_t m_run_infos; const char *chip_type_guess; const char *imaging_date; const char *scanner_data; void *ilmn_id2index; } idat_t; KHASH_MAP_INIT_INT(32, int32_t) static int idat_read(idat_t *idat, uint16_t id) { int i; for (i = 0; i < idat->number_toc_entries && id != idat->id[i]; i++); if (i == idat->number_toc_entries) return -1; if (hseek(idat->hfile, idat->toc[i], SEEK_SET) < 0) error("Fail to seek to position %ld in IDAT %s file\n", idat->toc[i], idat->fn); switch (id) { case NUM_SNPS_READ: read_bytes(idat->hfile, (void *)&idat->num_snps, sizeof(int32_t), NULL); break; case ILLUMINA_ID: idat->ilmn_id = (int32_t *)malloc(idat->num_snps * sizeof(int32_t)); read_bytes(idat->hfile, (void *)idat->ilmn_id, idat->num_snps * sizeof(int32_t), NULL); int ret; idat->ilmn_id2index = kh_init(32); khash_t(32) *hash = (khash_t(32) *)idat->ilmn_id2index; for (i = 0; i < idat->num_snps; i++) { khiter_t k = kh_put(32, hash, idat->ilmn_id[i], &ret); if (ret < 0) error("Unable to insert Illumina ID %d in hash table\n", idat->ilmn_id[i]); if (ret > 0) kh_val(hash, k) = kh_size(hash) - 1; else error("Duplicate Illumina ID %d in hash table\n", idat->ilmn_id[i]); } break; case SD: idat->sd = (uint16_t *)malloc(idat->num_snps * sizeof(uint16_t)); read_bytes(idat->hfile, (void *)idat->sd, idat->num_snps * sizeof(uint16_t), NULL); break; case MEAN: idat->mean = (uint16_t *)malloc(idat->num_snps * sizeof(uint16_t)); read_bytes(idat->hfile, (void *)idat->mean, idat->num_snps * sizeof(uint16_t), NULL); idat->trimmed_mean = idat->mean; break; case NBEADS: idat->nbeads = (uint8_t *)malloc(idat->num_snps * sizeof(uint8_t)); 
read_bytes(idat->hfile, (void *)idat->nbeads, idat->num_snps * sizeof(uint8_t), NULL); break; case MID_BLOCK: read_bytes(idat->hfile, (void *)&idat->num_mid_blocks, sizeof(int32_t), NULL); idat->mid_block = (uint8_t *)malloc(idat->num_mid_blocks * sizeof(uint8_t)); read_bytes(idat->hfile, (void *)idat->mid_block, idat->num_mid_blocks * sizeof(uint8_t), NULL); break; case RED_GREEN: read_bytes(idat->hfile, (void *)&idat->red_green, 4 * sizeof(uint8_t), NULL); break; case IDAT_SNP_MANIFEST: read_pfx_string(idat->hfile, &idat->snp_manifest, NULL, NULL); break; case SENTRIX_BARCODE: read_pfx_string(idat->hfile, &idat->sentrix_barcode, NULL, NULL); break; case CHIP_TYPE: read_pfx_string(idat->hfile, &idat->chip_type, NULL, NULL); break; case SENTRIX_POSITION: read_pfx_string(idat->hfile, &idat->sentrix_position, NULL, NULL); break; case BEADSET: read_pfx_string(idat->hfile, &idat->beadset, NULL, NULL); break; case IDAT_SAMPLE_NAME: read_pfx_string(idat->hfile, &idat->sample_name, NULL, NULL); break; case DESCRIPTION: read_pfx_string(idat->hfile, &idat->description, NULL, NULL); break; case IDAT_SAMPLE_PLATE: read_pfx_string(idat->hfile, &idat->sample_plate, NULL, NULL); break; case IDAT_SAMPLE_WELL: read_pfx_string(idat->hfile, &idat->sample_well, NULL, NULL); break; case IDAT_SAMPLE_COUNT: read_bytes(idat->hfile, (void *)&idat->sample_count, sizeof(int32_t), NULL); break; case IDAT_VLN: read_pfx_string(idat->hfile, &idat->vln, NULL, NULL); break; case RUN_INFO: read_bytes(idat->hfile, (void *)&idat->m_run_infos, sizeof(int32_t), NULL); idat->run_infos = (RunInfo *)calloc(idat->m_run_infos, sizeof(RunInfo)); for (i = 0; i < idat->m_run_infos; i++) { read_pfx_string(idat->hfile, &idat->run_infos[i].run_time, NULL, NULL); read_pfx_string(idat->hfile, &idat->run_infos[i].block_type, NULL, NULL); read_pfx_string(idat->hfile, &idat->run_infos[i].block_pars, NULL, NULL); read_pfx_string(idat->hfile, &idat->run_infos[i].block_code, NULL, NULL); read_pfx_string(idat->hfile, 
&idat->run_infos[i].code_version, NULL, NULL); } break; default: error("IDAT file format does not support TOC entry %d\n", id); break; } return 0; } static idat_t *idat_init(const char *fn, int load_arrays) { idat_t *idat = (idat_t *)calloc(1, sizeof(idat_t)); idat->fn = strdup(fn); idat->hfile = hopen(idat->fn, "rb"); if (idat->hfile == NULL) error("Could not open %s: %s\n", idat->fn, strerror(errno)); if (is_gzip(idat->hfile)) error("File %s is gzip compressed and currently cannot be sought\n", idat->fn); int i; uint8_t buffer[4]; if (hread(idat->hfile, (void *)buffer, 4) < 4) error("Failed to read magic number from %s file\n", idat->fn); if (memcmp(buffer, "IDAT", 4) != 0) error("IDAT file %s format identifier is bad\n", idat->fn); read_bytes(idat->hfile, (void *)&idat->version, sizeof(int64_t), NULL); if (idat->version < 3) error("Cannot read IDAT file %s. Unsupported IDAT file format version: %ld\n", idat->fn, idat->version); read_bytes(idat->hfile, (void *)&idat->number_toc_entries, sizeof(int32_t), NULL); idat->id = (uint16_t *)malloc(idat->number_toc_entries * sizeof(uint16_t)); idat->toc = (int64_t *)malloc(idat->number_toc_entries * sizeof(int64_t)); for (i = 0; i < idat->number_toc_entries; i++) { read_bytes(idat->hfile, (void *)&idat->id[i], sizeof(uint16_t), NULL); read_bytes(idat->hfile, (void *)&idat->toc[i], sizeof(int64_t), NULL); } for (i = 0; i < idat->number_toc_entries; i++) { if (!load_arrays && idat->id[i] <= MID_BLOCK) { if (idat->id[i] == MID_BLOCK) { if (hseek(idat->hfile, idat->toc[i], SEEK_SET) < 0) error("Fail to seek to position %ld in IDAT %s file\n", idat->toc[i], idat->fn); read_bytes(idat->hfile, (void *)&idat->num_mid_blocks, sizeof(int32_t), NULL); } continue; } idat_read(idat, idat->id[i]); } if (idat->chip_type) { const chip_type_t *ptr; for (ptr = chip_types; ptr->chip_type; ptr++) { if (strcmp(idat->chip_type, ptr->chip_type) == 0 && ptr->num_snps == idat->num_snps && ptr->num_mid_blocks == idat->num_mid_blocks) 
idat->chip_type_guess = ptr->chip_type_guess; } } for (i = 0; i < idat->m_run_infos; i++) { if (strcmp(idat->run_infos[i].block_type, "Scan") != 0) continue; idat->imaging_date = idat->run_infos[i].run_time; idat->scanner_data = idat->run_infos[i].block_pars; } return idat; } static void idat_destroy(idat_t *idat) { if (!idat) return; if (hclose(idat->hfile) < 0) error("Error closing IDAT file %s\n", idat->fn); free(idat->fn); free(idat->id); free(idat->toc); free(idat->snp_manifest); free(idat->sentrix_barcode); free(idat->chip_type); free(idat->sentrix_position); free(idat->beadset); free(idat->sample_name); free(idat->description); free(idat->sample_plate); free(idat->sample_well); free(idat->vln); int i; for (i = 0; i < idat->m_run_infos; i++) { free(idat->run_infos[i].run_time); free(idat->run_infos[i].block_type); free(idat->run_infos[i].block_pars); free(idat->run_infos[i].block_code); free(idat->run_infos[i].code_version); } free(idat->run_infos); free(idat->ilmn_id); free(idat->sd); free(idat->mean); free(idat->nbeads); free(idat->mid_block); if (idat->ilmn_id2index) kh_destroy(32, idat->ilmn_id2index); free(idat); } static void idat_to_csv(const idat_t *idat, FILE *stream, int verbose) { int i; fprintf(stream, "Illumina, Inc.\n"); fprintf(stream, "[Heading]\n"); fprintf(stream, "Descriptor File Name,%s\n", strrchr(idat->fn, '/') ? strrchr(idat->fn, '/') + 1 : idat->fn); fprintf(stream, "IDAT file version,%ld\n", idat->version); fprintf(stream, "Number of TOC entries,%d\n", idat->number_toc_entries); fprintf(stream, "Probes Count,%d\n", idat->num_snps); fprintf(stream, "Mid Blocks Count,%d\n", idat->num_mid_blocks); fprintf(stream, "Red Green,%02x %02x %02x %02x\n", idat->red_green[0], idat->red_green[1], idat->red_green[2], idat->red_green[3]); fprintf(stream, "SNP Manifest,%s\n", idat->snp_manifest ? 
idat->snp_manifest : ""); fprintf(stream, "Sentrix Barcode,%s\n", idat->sentrix_barcode); fprintf(stream, "Chip Type,%s\n", idat->chip_type); fprintf(stream, "Sentrix Position,%s\n", idat->sentrix_position); fprintf(stream, "BeadSet,%s\n", idat->beadset ? idat->beadset : ""); fprintf(stream, "Sample Name,%s\n", idat->sample_name ? idat->sample_name : ""); fprintf(stream, "Description,%s\n", idat->description ? idat->description : ""); fprintf(stream, "Sample Plate,%s\n", idat->sample_plate ? idat->sample_plate : ""); fprintf(stream, "Sample Well,%s\n", idat->sample_well ? idat->sample_well : ""); fprintf(stream, "Sample Count,%d\n", idat->sample_count); fprintf(stream, "Vln,%s\n", idat->vln ? idat->vln : ""); fprintf(stream, "Chip Prefix (Guess),%s\n", idat->chip_type_guess ? idat->chip_type_guess : "Unknown"); fprintf(stream, "[Assay]\n"); fprintf(stream, "IlmnID,Sd,Mean,Nbeads\n"); if (verbose) { for (i = 0; i < idat->num_snps; i++) fprintf(stream, "%d,%d,%d,%d\n", idat->ilmn_id[i], idat->sd[i], idat->mean[i], idat->nbeads[i]); fprintf(stream, "[Mid Blocks]\n"); for (i = 0; i < idat->num_mid_blocks; i++) fprintf(stream, "%d\n", idat->mid_block[i]); } else { fprintf(stream, "... use --verbose to visualize Assay data ...\n"); fprintf(stream, "[Mid Blocks]\n"); fprintf(stream, "... 
use --verbose to visualize Mid Blocks data ...\n"); } fprintf(stream, "[Run Infos]\n"); for (i = 0; i < idat->m_run_infos; i++) { fprintf(stream, "%s\t%s\t%s\t%s\t%s\n", idat->run_infos[i].run_time, idat->run_infos[i].block_type, idat->run_infos[i].block_pars, idat->run_infos[i].block_code, idat->run_infos[i].code_version); } } static void idats_to_tsv(idat_t **idats, int n, FILE *stream) { fprintf(stream, "idat\tnumber_probes\tnumber_mid_blocks\tred_green\tmanifest_file\tsentrix_" "barcode\tchip_type\t" "sentrix_position\tbeadset\tsample_name\tdescription\tsample_plate\tsample_" "well\tsample_count\tvln\t" "chip_type_guess\tscan_date\tscanner_data\n"); int i; for (i = 0; i < n; i++) { idat_t *idat = idats[i]; fprintf(stream, "%s\t%d\t%d\t%02x %02x %02x " "%02x\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\n", strrchr(idat->fn, '/') ? strrchr(idat->fn, '/') + 1 : idat->fn, idat->num_snps, idat->num_mid_blocks, idat->red_green[0], idat->red_green[1], idat->red_green[2], idat->red_green[3], idat->snp_manifest ? idat->snp_manifest : "", idat->sentrix_barcode, idat->chip_type, idat->sentrix_position, idat->beadset ? idat->beadset : "", idat->sample_name ? idat->sample_name : "", idat->description ? idat->description : "", idat->sample_plate ? idat->sample_plate : "", idat->sample_well ? idat->sample_well : "", idat->sample_count, idat->vln ? idat->vln : "", idat->chip_type_guess ? idat->chip_type_guess : "Unknown", idat->imaging_date ? idat->imaging_date : "", idat->scanner_data ? 
idat->scanner_data : ""); } } /**************************************** * GTC FILE IMPLEMENTATION * ****************************************/ // http://github.com/broadinstitute/picard/blob/master/src/main/java/picard/arrays/illumina/InfiniumGTCFile.java // http://github.com/Illumina/BeadArrayFiles/blob/develop/docs/GTC_File_Format_v5.pdf // http://github.com/Illumina/BeadArrayFiles/blob/develop/module/GenotypeCalls.py #define NUM_SNPS 1 #define PLOIDY 2 // AutoConvert 2.0 #define PLOIDY_TYPE 3 // AutoConvert 2.0 #define GTC_SAMPLE_NAME 10 #define GTC_SAMPLE_PLATE 11 #define GTC_SAMPLE_WELL 12 #define CLUSTER_FILE 100 #define GTC_SNP_MANIFEST 101 #define IMAGING_DATE 200 #define AUTOCALL_DATE 201 #define AUTOCALL_VERSION 300 #define NORMALIZATION_TRANSFORMS 400 #define CONTROLS_X 500 #define CONTROLS_Y 501 #define RAW_X 1000 #define RAW_Y 1001 #define GENOTYPES 1002 #define BASE_CALLS 1003 #define GENOTYPE_SCORES 1004 #define SCANNER_DATA 1005 #define CALL_RATE 1006 #define GENDER 1007 #define LOGR_DEV 1008 #define GC10 1009 #define DX 1010 #define SAMPLE_DATA 1011 #define B_ALLELE_FREQS 1012 // AutoConvert 2.0 #define LOGR_RATIOS 1013 // AutoConvert 2.0 #define PERCENTILES_X 1014 // AutoConvert 2.0 #define PERCENTILES_Y 1015 // AutoConvert 2.0 #define SLIDE_IDENTIFIER 1016 // AutoConvert 2.0 // static const char *code2genotype[] = { // "NC", "AA", "AB", "BB", "NULL", "A", "B", "AAA", // "AAB", "ABB", "BBB", "AAAA", "AAAB", "AABB", "ABBB", "BBBB", // "AAAAA", "AAAAB", "AAABB", "AABBB", "ABBBB", "BBBBB", "AAAAAA", "AAAAAB", // "AAAABB", "AAABBB", "AABBBB", "ABBBBB", "BBBBBB", "AAAAAAA", "AAAAAAB", "AAAAABB", // "AAAABBB", "AAABBBB", "AABBBBB", "ABBBBBB", "BBBBBBB", "AAAAAAAA", "AAAAAAAB", "AAAAAABB", // "AAAAABBB", "AAAABBBB", "AAABBBBB", "AABBBBBB", "ABBBBBBB", "BBBBBBBB"}; typedef struct { int32_t version; float offset_x; float offset_y; float scale_x; float scale_y; float shear; float theta; float cvx; float cvy; float nn12; float rr12; float taa; float tbb; } 
// Size in bytes of a string once serialized with its variable-length length prefix:
// the characters themselves plus one prefix byte for every 7 bits needed to encode
// the character count (always at least one byte).
// A NULL string is sized like an empty one, i.e. a single prefix byte.
static int leb128_strlen(const char *s) {
    size_t payload = s ? strlen(s) : 0;
    size_t prefix_bytes = 1;
    size_t remaining = payload >> 7;
    // each further group of 7 significant bits costs one more prefix byte
    while (remaining != 0) {
        prefix_bytes++;
        remaining >>= 7;
    }
    return payload + prefix_bytes;
}
24 : 31; if (hwrite_int32(hfile, number_toc_entries) < 0) return -1; int offset = 4 + sizeof(int32_t) + number_toc_entries * (sizeof(uint16_t) + sizeof(int32_t)); if (hwrite_uint16(hfile, NUM_SNPS) < 0) return -1; if (hwrite_int32(hfile, gtc->num_snps) < 0) return -1; if (gtc_file_version != 3) { if (hwrite_uint16(hfile, PLOIDY) < 0) return -1; if (hwrite_int32(hfile, gtc->ploidy) < 0) return -1; if (hwrite_uint16(hfile, PLOIDY_TYPE) < 0) return -1; if (hwrite_int32(hfile, gtc->ploidy_type) < 0) return -1; } if (hwrite_uint16(hfile, GTC_SAMPLE_NAME) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += leb128_strlen(gtc->sample_name); if (hwrite_uint16(hfile, GTC_SAMPLE_PLATE) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += leb128_strlen(gtc->sample_plate); if (hwrite_uint16(hfile, GTC_SAMPLE_WELL) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += leb128_strlen(gtc->sample_well); if (hwrite_uint16(hfile, CLUSTER_FILE) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += leb128_strlen(gtc->cluster_file); if (hwrite_uint16(hfile, GTC_SNP_MANIFEST) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += leb128_strlen(gtc->snp_manifest); if (hwrite_uint16(hfile, IMAGING_DATE) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += leb128_strlen(gtc->imaging_date); if (hwrite_uint16(hfile, AUTOCALL_DATE) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += leb128_strlen(gtc->autocall_date); if (hwrite_uint16(hfile, AUTOCALL_VERSION) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += leb128_strlen(gtc->autocall_version); if (hwrite_uint16(hfile, NORMALIZATION_TRANSFORMS) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += gtc->m_normalization_transforms * sizeof(XForm) + sizeof(int32_t); if (hwrite_uint16(hfile, CONTROLS_X) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) 
return -1; offset += gtc->m_controls_x * sizeof(uint16_t) + sizeof(int32_t); if (hwrite_uint16(hfile, CONTROLS_Y) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += gtc->m_controls_y * sizeof(uint16_t) + sizeof(int32_t); if (hwrite_uint16(hfile, RAW_X) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += gtc->num_snps * sizeof(uint16_t) + sizeof(int32_t); if (hwrite_uint16(hfile, RAW_Y) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += gtc->num_snps * sizeof(uint16_t) + sizeof(int32_t); if (hwrite_uint16(hfile, GENOTYPES) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += gtc->num_snps * sizeof(uint8_t) + sizeof(int32_t); if (hwrite_uint16(hfile, BASE_CALLS) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += gtc->num_snps * sizeof(BaseCall) + sizeof(int32_t); if (hwrite_uint16(hfile, GENOTYPE_SCORES) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += gtc->num_snps * sizeof(float) + sizeof(int32_t); if (hwrite_uint16(hfile, SCANNER_DATA) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += leb128_strlen(gtc->scanner_data.scanner_name) + sizeof(float) + sizeof(float) + leb128_strlen(gtc->scanner_data.scanner_version) + leb128_strlen(gtc->scanner_data.imaging_user); if (hwrite_uint16(hfile, CALL_RATE) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += sizeof(float); if (hwrite_uint16(hfile, GENDER) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += sizeof(char); if (hwrite_uint16(hfile, LOGR_DEV) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += sizeof(float); if (hwrite_uint16(hfile, GC10) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += sizeof(float); if (hwrite_uint16(hfile, DX) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += sizeof(int32_t); if (hwrite_uint16(hfile, SAMPLE_DATA) < 0) return 
-1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += sizeof(SampleData); if (gtc_file_version != 3) { if (hwrite_uint16(hfile, B_ALLELE_FREQS) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += gtc->num_snps * sizeof(float) + sizeof(int32_t); if (hwrite_uint16(hfile, LOGR_RATIOS) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += gtc->num_snps * sizeof(float) + sizeof(int32_t); if (hwrite_uint16(hfile, PERCENTILES_X) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += sizeof(Percentiles); if (hwrite_uint16(hfile, PERCENTILES_Y) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += sizeof(Percentiles); if (hwrite_uint16(hfile, SLIDE_IDENTIFIER) < 0) return -1; if (hwrite_int32(hfile, offset) < 0) return -1; offset += leb128_strlen(gtc->sentrix_id); } if (hwrite_pfx_string(hfile, gtc->sample_name) < 0) return -1; if (hwrite_pfx_string(hfile, gtc->sample_plate) < 0) return -1; if (hwrite_pfx_string(hfile, gtc->sample_well) < 0) return -1; if (hwrite_pfx_string(hfile, gtc->cluster_file) < 0) return -1; if (hwrite_pfx_string(hfile, gtc->snp_manifest) < 0) return -1; if (hwrite_pfx_string(hfile, gtc->imaging_date) < 0) return -1; if (hwrite_pfx_string(hfile, gtc->autocall_date) < 0) return -1; if (hwrite_pfx_string(hfile, gtc->autocall_version) < 0) return -1; if (hwrite(hfile, (const void *)>c->m_normalization_transforms, sizeof(int32_t)) < 0) return -1; if (hwrite(hfile, (const void *)gtc->normalization_transforms, gtc->m_normalization_transforms * sizeof(XForm)) < 0) return -1; if (hwrite(hfile, (const void *)>c->m_controls_x, sizeof(int32_t)) < 0) return -1; if (hwrite(hfile, (const void *)gtc->controls_x, gtc->m_controls_x * sizeof(uint16_t)) < 0) return -1; if (hwrite(hfile, (const void *)>c->m_controls_y, sizeof(int32_t)) < 0) return -1; if (hwrite(hfile, (const void *)gtc->controls_y, gtc->m_controls_y * sizeof(uint16_t)) < 0) return -1; if (hwrite(hfile, (const void 
*)>c->num_snps, sizeof(int32_t)) < 0) return -1; if (hwrite(hfile, (const void *)gtc->raw_x, gtc->num_snps * sizeof(uint16_t)) < 0) return -1; if (hwrite(hfile, (const void *)>c->num_snps, sizeof(int32_t)) < 0) return -1; if (hwrite(hfile, (const void *)gtc->raw_y, gtc->num_snps * sizeof(uint16_t)) < 0) return -1; if (hwrite(hfile, (const void *)>c->num_snps, sizeof(int32_t)) < 0) return -1; if (hwrite(hfile, (const void *)gtc->genotypes, gtc->num_snps * sizeof(uint8_t)) < 0) return -1; if (hwrite(hfile, (const void *)>c->num_snps, sizeof(int32_t)) < 0) return -1; if (hwrite(hfile, (const void *)gtc->base_calls, gtc->num_snps * sizeof(BaseCall)) < 0) return -1; if (hwrite(hfile, (const void *)>c->num_snps, sizeof(int32_t)) < 0) return -1; if (hwrite(hfile, (const void *)gtc->genotype_scores, gtc->num_snps * sizeof(float)) < 0) return -1; if (hwrite_pfx_string(hfile, gtc->scanner_data.scanner_name) < 0) return -1; if (hwrite(hfile, >c->scanner_data.pmt_green, sizeof(float)) < 0) return -1; if (hwrite(hfile, >c->scanner_data.pmt_red, sizeof(float)) < 0) return -1; if (hwrite_pfx_string(hfile, gtc->scanner_data.scanner_version) < 0) return -1; if (hwrite_pfx_string(hfile, gtc->scanner_data.imaging_user) < 0) return -1; if (hwrite(hfile, >c->call_rate, sizeof(float)) < 0) return -1; if (hwrite(hfile, >c->gender, sizeof(char)) < 0) return -1; if (hwrite(hfile, >c->logr_dev, sizeof(float)) < 0) return -1; if (hwrite(hfile, >c->p10gc, sizeof(float)) < 0) return -1; if (hwrite(hfile, >c->dx, sizeof(int32_t)) < 0) return -1; if (hwrite(hfile, >c->sample_data, sizeof(SampleData)) < 0) return -1; if (gtc_file_version != 3) { if (hwrite(hfile, (const void *)>c->num_snps, sizeof(int32_t)) < 0) return -1; if (hwrite(hfile, (const void *)gtc->b_allele_freqs, gtc->num_snps * sizeof(float)) < 0) return -1; if (hwrite(hfile, (const void *)>c->num_snps, sizeof(int32_t)) < 0) return -1; if (hwrite(hfile, (const void *)gtc->logr_ratios, gtc->num_snps * sizeof(float)) < 0) return -1; if 
(hwrite(hfile, (const void *)gtc->percentiles_x, sizeof(Percentiles)) < 0) return -1; if (hwrite(hfile, (const void *)gtc->percentiles_y, sizeof(Percentiles)) < 0) return -1; if (hwrite_pfx_string(hfile, gtc->sentrix_id) < 0) return -1; } if (hclose(hfile) < 0) error("Error closing GTC file %s\n", fn); return 0; } static void gtc_destroy(gtc_t *gtc) { if (!gtc) return; if (gtc->hfile && hclose(gtc->hfile) < 0) error("Error closing GTC file %s\n", gtc->fn); free(gtc->fn); free(gtc->id); free(gtc->toc); free(gtc->sample_name); free(gtc->sample_plate); free(gtc->sample_well); free(gtc->cluster_file); free(gtc->snp_manifest); free(gtc->imaging_date); free(gtc->autocall_date); free(gtc->autocall_version); free(gtc->normalization_transforms); free(gtc->controls_x); free(gtc->controls_y); free(gtc->scanner_data.scanner_name); free(gtc->scanner_data.scanner_version); free(gtc->scanner_data.imaging_user); free(gtc->sentrix_id); free(gtc->display_name); free(gtc->sin_theta); free(gtc->cos_theta); free(gtc->raw_x); free(gtc->raw_y); free(gtc->genotypes); free(gtc->base_calls); free(gtc->genotype_scores); free(gtc->b_allele_freqs); free(gtc->logr_ratios); free(gtc); } /**************************************** * BPM FILE IMPLEMENTATION * ****************************************/ // http://github.com/snewhouse/glu-genetics/blob/master/glu/lib/illumina.py // http://github.com/Illumina/BeadArrayFiles/blob/develop/module/BeadPoolManifest.py typedef struct { int32_t version; uint8_t norm_id; // Normalization lookups from manifest. 
This indexes into list of // normalization transforms read from GTC file char *ilmn_id; // IlmnID (probe identifier) of locus char *name; // Name (variant identifier) of locus int32_t index; char *ilmn_strand; // TOP BOT PLUS MINUS or Top Bot P M char *snp; // SNP value for locus (e.g., [A/C]) char *chrom; // Chromosome for the locus (e.g., XY) char *ploidy; char *species; char *map_info; // Mapping location of locus char *customer_strand; int32_t address_a; // AddressA ID of locus char *allele_a_probe_seq; // CSV files or BPM files with version 4 data block int32_t address_b; // AddressB ID of locus (0 if none) char *allele_b_probe_seq; // CSV files or BPM files with version 4 data block (empty if // none) char *genome_build; char *source; char *source_version; char *source_strand; char *source_seq; // CSV files or BPM files with version 4 data block char *top_genomic_seq; // CSV files or BPM files with version 4 data block int32_t beadset_id; // CSV files uint8_t exp_clusters; uint8_t intensity_only; uint8_t assay_type; // Identifies type of assay (0 - Infinium II, 1 - Infinium I (A/T), // 2 - Infinium I (G/C) uint8_t assay_type_csv; float frac_a; float frac_c; float frac_g; float frac_t; char *ref_strand; // RefStrand annotation } LocusEntry; // retrieve assay type following (allele_a_probe_seq, source_seq) -> assay_type map // (...W., ...W[./.]W...) -> 1 // (...S., ...S[./.]S...) -> 2 // (...S., ...S[./.]W...) -> 1 // (...S., ...W[./.]S...) -> 1 // (...W., ...S[./.]W...) -> 2 // (...W., ...W[./.]S...) 
// Infers the assay type of a locus (0 - Infinium II, 1 - Infinium I (A/T),
// 2 - Infinium I (G/C)) from its probe sequences and source sequence.
//
// retrieve assay type following (allele_a_probe_seq, source_seq) -> assay_type map
// (...W., ...W[./.]W...) -> 1
// (...S., ...S[./.]S...) -> 2
// (...S., ...S[./.]W...) -> 1
// (...S., ...W[./.]S...) -> 1
// (...W., ...S[./.]W...) -> 2
// (...W., ...W[./.]S...) -> 2
//
// allele_a_probe_seq: allele A probe sequence, may be NULL
// allele_b_probe_seq: allele B probe sequence, NULL when the locus has no B probe
// source_seq:         source sequence with the variant in brackets, may be NULL
//
// Returns 0 when there is no allele B probe, 1 or 2 when a rule applies, and 0xFF
// when allele_a_probe_seq or source_seq is NULL or when no rule applies (in the
// latter case a warning is printed to stderr). A source sequence lacking '[' or ']'
// is reported through error().
static uint8_t get_assay_type(const char *allele_a_probe_seq, const char *allele_b_probe_seq,
                              const char *source_seq) {
    if (!allele_a_probe_seq || !source_seq) return 0xFF;
    if (!allele_b_probe_seq) return 0;
    const char *left = strchr(source_seq, '[');
    const char *right = strchr(source_seq, ']');
    if (!left || !right) error("Source sequence is malformed: %s\n", source_seq);
    // a bracket that opens the sequence has no flanking base: use NUL rather than
    // reading the byte located before the start of the buffer
    char trail_left = left > source_seq ? (char)toupper((unsigned char)left[-1]) : '\0';
    char trail_right = (char)toupper((unsigned char)right[1]);
    if ((trail_left == 'A' || trail_left == 'T') && (trail_right == 'A' || trail_right == 'T')) return 1;
    if ((trail_left == 'C' || trail_left == 'G') && (trail_right == 'C' || trail_right == 'G')) return 2;

    // walk backwards from the second to last base until both probes carry compatible
    // bases, never stepping past the start of the shorter probe
    size_t len_a = strlen(allele_a_probe_seq);
    size_t len_b = strlen(allele_b_probe_seq);
    size_t max_i = len_a < len_b ? len_a : len_b;
    size_t i = 2;
    while (i <= max_i
           && !(iupac2bitmask(allele_a_probe_seq[len_a - i]) & iupac2bitmask(allele_b_probe_seq[len_b - i])))
        i++;

    if (i <= max_i) {
        char trail_a_probe_seq = (char)toupper((unsigned char)allele_a_probe_seq[len_a - i]);
        if (trail_a_probe_seq == 'C' || trail_a_probe_seq == 'G' || trail_a_probe_seq == 'S') return 1;
        if (trail_a_probe_seq == 'A' || trail_a_probe_seq == 'T' || trail_a_probe_seq == 'W') return 2;
        // these unusual rules were deduced from manifests for array GDA_PGx-8v1-0_20042614
        if (trail_a_probe_seq == 'Y' && trail_right == 'G') return 1;
        if (trail_a_probe_seq == 'Y' && trail_right == 'T') return 1;
        if (trail_a_probe_seq == 'Y' && trail_right == 'A') return 2;
        if (trail_a_probe_seq == 'K' && trail_right == 'C') return 1;
        if (trail_a_probe_seq == 'K' && trail_right == 'A') return 2;
        if (trail_a_probe_seq == 'M' && trail_right == 'G') return 1;
        if (trail_a_probe_seq == 'M' && trail_right == 'T') return 2;
        if (trail_a_probe_seq == 'R' && trail_right == 'C') return 1;
        if (trail_a_probe_seq == 'R' && trail_right == 'T') return 2;
    }
    fprintf(stderr, "Warning: Unable to retrieve assay type: %s %s %s\n", allele_a_probe_seq, allele_b_probe_seq,
            source_seq);
    return 0xFF;
}
0xFF; read_bytes(hfile, (void *)&locus_entry->version, sizeof(int32_t), md5); if (locus_entry->version < 4 || locus_entry->version == 5 || locus_entry->version > 8) error("Locus version %d in manifest file not supported\n", locus_entry->version); read_pfx_string(hfile, &locus_entry->ilmn_id, NULL, md5); read_pfx_string(hfile, &locus_entry->name, NULL, md5); read_pfx_string(hfile, NULL, NULL, md5); read_pfx_string(hfile, NULL, NULL, md5); read_pfx_string(hfile, NULL, NULL, md5); read_bytes(hfile, (void *)&locus_entry->index, sizeof(int32_t), md5); read_pfx_string(hfile, NULL, NULL, md5); read_pfx_string(hfile, &locus_entry->ilmn_strand, NULL, md5); read_pfx_string(hfile, &locus_entry->snp, NULL, md5); read_pfx_string(hfile, &locus_entry->chrom, NULL, md5); read_pfx_string(hfile, &locus_entry->ploidy, NULL, md5); read_pfx_string(hfile, &locus_entry->species, NULL, md5); read_pfx_string(hfile, &locus_entry->map_info, NULL, md5); read_pfx_string(hfile, &locus_entry->top_genomic_seq, NULL, md5); // only version 4 read_pfx_string(hfile, &locus_entry->customer_strand, NULL, md5); read_bytes(hfile, (void *)&locus_entry->address_a, sizeof(int32_t), md5); read_bytes(hfile, (void *)&locus_entry->address_b, sizeof(int32_t), md5); read_pfx_string(hfile, &locus_entry->allele_a_probe_seq, NULL, md5); // only version 4 read_pfx_string(hfile, &locus_entry->allele_b_probe_seq, NULL, md5); // only version 4 read_pfx_string(hfile, &locus_entry->genome_build, NULL, md5); read_pfx_string(hfile, &locus_entry->source, NULL, md5); read_pfx_string(hfile, &locus_entry->source_version, NULL, md5); read_pfx_string(hfile, &locus_entry->source_strand, NULL, md5); read_pfx_string(hfile, &locus_entry->source_seq, NULL, md5); // only version 4 if (locus_entry->source_seq) { char *ptr = strchr(locus_entry->source_seq, '-'); if (ptr && *(ptr - 1) == '/') { *ptr = *(ptr - 2); *(ptr - 2) = '-'; } } if (locus_entry->version >= 6) { read_bytes(hfile, NULL, 1, md5); read_bytes(hfile, (void 
*)&locus_entry->exp_clusters, sizeof(int8_t), md5); read_bytes(hfile, (void *)&locus_entry->intensity_only, sizeof(int8_t), md5); read_bytes(hfile, (void *)&locus_entry->assay_type, sizeof(uint8_t), md5); if (locus_entry->assay_type < 0 || locus_entry->assay_type > 2) error("Format error in reading assay type from locus entry\n"); if (locus_entry->address_b == 0 && locus_entry->assay_type != 0) error("Manifest format error: Assay type is inconsistent with address B\n"); if (locus_entry->address_b != 0 && locus_entry->assay_type == 0) error("Manifest format error: Assay type is inconsistent with address B\n"); } else { locus_entry->assay_type = get_assay_type(locus_entry->allele_a_probe_seq, locus_entry->allele_b_probe_seq, locus_entry->source_seq); } if (locus_entry->version >= 7) { read_bytes(hfile, &locus_entry->frac_a, sizeof(float), md5); read_bytes(hfile, &locus_entry->frac_c, sizeof(float), md5); read_bytes(hfile, &locus_entry->frac_t, sizeof(float), md5); read_bytes(hfile, &locus_entry->frac_g, sizeof(float), md5); } if (locus_entry->version >= 8) read_pfx_string(hfile, &locus_entry->ref_strand, NULL, md5); } typedef struct { char *fn; hFILE *hfile; // bpm file htsFile *fp; // csv file int32_t version; char *manifest_name; // Name of manifest char *control_config; // Control description from manifest int32_t num_loci; // Number of loci in manifest int32_t *indexes; char **names; // Names of loci from manifest void *names2index; uint8_t *norm_ids; LocusEntry *locus_entries; uint8_t *norm_lookups; char **header; size_t m_header; char unsigned md5_buf[16]; } bpm_t; static uint8_t *bpm_norm_lookups(bpm_t *bpm) { int i; uint8_t sorted_norm_ids[256]; for (i = 0; i < 256; i++) sorted_norm_ids[i] = 0xFF; for (i = 0; i < bpm->num_loci; i++) { int norm_id = bpm->locus_entries[i].norm_id; sorted_norm_ids[norm_id] = norm_id; } int j = 0; for (i = 0; i < 256; i++) if (sorted_norm_ids[i] != 0xFF) sorted_norm_ids[j++] = sorted_norm_ids[i]; uint8_t *norm_lookups = (uint8_t 
*)malloc(256 * sizeof(uint8_t *)); memset((void *)norm_lookups, 0xFF, 256 * sizeof(uint8_t *)); for (i = 0; i < j; i++) norm_lookups[sorted_norm_ids[i]] = i; return norm_lookups; } static bpm_t *bpm_init(const char *fn, int eof_check, int make_dict, int checksum) { bpm_t *bpm = (bpm_t *)calloc(1, sizeof(bpm_t)); bpm->fn = strdup(fn); bpm->hfile = hopen(bpm->fn, "rb"); if (bpm->hfile == NULL) error("Could not open %s: %s\n", bpm->fn, strerror(errno)); if (is_gzip(bpm->hfile)) error("File %s is gzip compressed and currently cannot be sought\n", bpm->fn); hts_md5_context *md5 = checksum ? hts_md5_init() : NULL; int i; uint8_t buffer[4]; if (md5_hread(bpm->hfile, (void *)buffer, 4, md5) < 4) error("Failed to read magic number from %s file\n", bpm->fn); if (memcmp(buffer, "BPM", 3) != 0) error("BPM file %s format identifier is bad\n", bpm->fn); if (buffer[3] != 1) error("BPM file %s version is unknown\n", bpm->fn); read_bytes(bpm->hfile, (void *)&bpm->version, sizeof(int32_t), md5); if (bpm->version & 0x1000) bpm->version ^= 0x1000; if (bpm->version > 5 || bpm->version < 3) error("BPM file %s version %d is unsupported\n", bpm->fn, bpm->version); read_pfx_string(bpm->hfile, &bpm->manifest_name, NULL, md5); if (bpm->version > 1) read_pfx_string(bpm->hfile, &bpm->control_config, NULL, md5); read_bytes(bpm->hfile, (void *)&bpm->num_loci, sizeof(int32_t), md5); read_array(bpm->hfile, (void **)&bpm->indexes, NULL, bpm->num_loci, sizeof(int32_t), 0, md5); bpm->names = (char **)malloc(bpm->num_loci * sizeof(char *)); for (i = 0; i < bpm->num_loci; i++) read_pfx_string(bpm->hfile, &bpm->names[i], NULL, md5); if (make_dict) { bpm->names2index = khash_str2int_init(); for (i = 0; i < bpm->num_loci; i++) { if (khash_str2int_has_key(bpm->names2index, bpm->names[i])) error("Illumina probe %s present multiple times in file %s\n", bpm->names[i], fn); khash_str2int_inc(bpm->names2index, bpm->names[i]); } } read_array(bpm->hfile, (void **)&bpm->norm_ids, NULL, bpm->num_loci, 
sizeof(uint8_t), 0, md5); bpm->locus_entries = (LocusEntry *)malloc(bpm->num_loci * sizeof(LocusEntry)); LocusEntry locus_entry; for (i = 0; i < bpm->num_loci; i++) { memset(&locus_entry, 0, sizeof(LocusEntry)); locusentry_read(&locus_entry, bpm->hfile, md5); int idx = locus_entry.index - 1; if (idx < 0 || idx >= bpm->num_loci) error("Locus entry index %d is out of boundaries\n", locus_entry.index); if (bpm->norm_ids[idx] > 100) error("Manifest format error: read invalid normalization ID %d\n", bpm->norm_ids[idx]); // To mimic the flawed byte-wrapping behavior from GenomeStudio, AutoCall, and // IAAP, this value is allowed to overflow beyond 255, which happens with some // probes in the Omni5 arrays bpm->norm_ids[idx] += 100 * locus_entry.assay_type; locus_entry.norm_id = bpm->norm_ids[idx]; memcpy(&bpm->locus_entries[idx], &locus_entry, sizeof(LocusEntry)); } bpm->norm_lookups = bpm_norm_lookups(bpm); for (i = 0; i < bpm->num_loci; i++) { if (i != bpm->locus_entries[i].index - 1) error("Manifest format error: read invalid number of assay entries\n"); } if (bpm->locus_entries[0].version < 8) fprintf(stderr, "Warning: RefStrand annotation missing from manifest file %s\n", bpm->fn); read_bytes(bpm->hfile, (void *)&bpm->m_header, sizeof(int32_t), md5); bpm->header = (char **)malloc(bpm->m_header * sizeof(char *)); for (i = 0; i < bpm->m_header; i++) read_pfx_string(bpm->hfile, &bpm->header[i], NULL, md5); if (!heof(bpm->hfile)) { if (eof_check) error( "BPM reader did not reach the end of file %s at position %ld\nUse --do-not-check-eof to suppress this " "check\n", bpm->fn, htell(bpm->hfile)); if (checksum) while (md5_hgetc(bpm->hfile, md5) != EOF); } if (md5) { hts_md5_final(bpm->md5_buf, md5); hts_md5_destroy(md5); } return bpm; } static void bpm_destroy(bpm_t *bpm) { if (!bpm) return; int i; if (bpm->hfile && hclose(bpm->hfile) < 0) error("Error closing BPM file %s\n", bpm->fn); free(bpm->fn); if (bpm->fp && hts_close(bpm->fp) < 0) error("Error closing CSV file 
%s\n", bpm->fp->fn); free(bpm->manifest_name); free(bpm->control_config); free(bpm->indexes); if (bpm->names) { for (i = 0; i < bpm->num_loci; i++) free(bpm->names[i]); free(bpm->names); } khash_str2int_destroy(bpm->names2index); free(bpm->norm_ids); for (i = 0; i < bpm->num_loci; i++) { LocusEntry *locus_entry = &bpm->locus_entries[i]; free(locus_entry->ilmn_id); free(locus_entry->name); free(locus_entry->ilmn_strand); free(locus_entry->snp); free(locus_entry->chrom); free(locus_entry->ploidy); free(locus_entry->species); free(locus_entry->map_info); free(locus_entry->customer_strand); free(locus_entry->allele_a_probe_seq); free(locus_entry->allele_b_probe_seq); free(locus_entry->genome_build); free(locus_entry->source); free(locus_entry->source_version); free(locus_entry->source_strand); free(locus_entry->source_seq); free(locus_entry->top_genomic_seq); free(locus_entry->ref_strand); } free(bpm->locus_entries); free(bpm->norm_lookups); for (i = 0; i < bpm->m_header; i++) free(bpm->header[i]); free(bpm->header); free(bpm); } /**************************************** * EGT FILE IMPLEMENTATION * ****************************************/ // http://github.com/broadinstitute/picard/blob/master/src/main/java/picard/arrays/illumina/InfiniumEGTFile.java // http://github.com/Illumina/BeadArrayFiles/blob/develop/module/ClusterFile.py typedef struct { int32_t N; // Number of samples assigned to cluster during training float r_dev; // R (intensity) std deviation value float r_mean; // R (intensity) mean value float theta_dev; // Theta std devation value float theta_mean; // Theta mean value } ClusterStats; typedef struct { float cluster_separation; // A score measure the separation between genotype clusters float total_score; // The GenTrain score float original_score; // The original score before editing this cluster uint8_t edited; // Whether this cluster has been manually manipulated } ClusterScore; typedef struct { ClusterStats aa_cluster_stats; // Describes AA genotype 
cluster ClusterStats ab_cluster_stats; // Describes AB genotype cluster ClusterStats bb_cluster_stats; // Describes BB genotype cluster float intensity_threshold; // Intensity threshold for no-call ClusterScore cluster_score; // Various scores for cluster int32_t address; // Bead type identifier for probe A float r_mean; // precomputed clusters mean } ClusterRecord; typedef struct { char *fn; hFILE *hfile; int32_t version; char *gencall_version; // The GenCall version char *cluster_version; // The clustering algorithm version char *call_version; // The genotyping algorithm version char *normalization_version; // The normalization algorithm version char *date_created; // The date the cluster file was created (e.g., 3/9/2017 2:18:30 PM) uint8_t is_wgt; int32_t data_block_version; char *opa; char *manifest_name; // The manifest name used to build this cluster file int32_t num_records; ClusterRecord *cluster_records; char **names; // Names of records from manifest void *names2index; char unsigned md5_buf[16]; } egt_t; static void clusterscore_read(ClusterScore *clusterscore, hFILE *hfile, hts_md5_context *md5) { read_bytes(hfile, (void *)&clusterscore->cluster_separation, sizeof(float), md5); read_bytes(hfile, (void *)&clusterscore->total_score, sizeof(float), md5); read_bytes(hfile, (void *)&clusterscore->original_score, sizeof(float), md5); read_bytes(hfile, (void *)&clusterscore->edited, sizeof(uint8_t), md5); } static void clusterrecord_read(ClusterRecord *clusterrecord, hFILE *hfile, int32_t data_block_version, hts_md5_context *md5) { read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.N, sizeof(int32_t), md5); read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.N, sizeof(int32_t), md5); read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.N, sizeof(int32_t), md5); read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.r_dev, sizeof(float), md5); read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.r_dev, sizeof(float), md5); 
read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.r_dev, sizeof(float), md5); read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.r_mean, sizeof(float), md5); read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.r_mean, sizeof(float), md5); read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.r_mean, sizeof(float), md5); read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.theta_dev, sizeof(float), md5); read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.theta_dev, sizeof(float), md5); read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.theta_dev, sizeof(float), md5); read_bytes(hfile, (void *)&clusterrecord->aa_cluster_stats.theta_mean, sizeof(float), md5); read_bytes(hfile, (void *)&clusterrecord->ab_cluster_stats.theta_mean, sizeof(float), md5); read_bytes(hfile, (void *)&clusterrecord->bb_cluster_stats.theta_mean, sizeof(float), md5); if (data_block_version >= 7) { read_bytes(hfile, (void *)&clusterrecord->intensity_threshold, sizeof(float), md5); read_bytes(hfile, NULL, 14 * sizeof(float), md5); } else { clusterrecord->intensity_threshold = NAN; } } static egt_t *egt_init(const char *fn, int eof_check, int checksum) { int i; egt_t *egt = (egt_t *)calloc(1, sizeof(egt_t)); egt->fn = strdup(fn); egt->hfile = hopen(egt->fn, "rb"); if (egt->hfile == NULL) error("Could not open %s: %s\n", egt->fn, strerror(errno)); if (is_gzip(egt->hfile)) error("File %s is gzip compressed and currently cannot be sought\n", egt->fn); hts_md5_context *md5 = checksum ? 
hts_md5_init() : NULL; read_bytes(egt->hfile, (void *)&egt->version, sizeof(int32_t), md5); if (egt->version != 3) error("EGT cluster file version %d not supported\n", egt->version); read_pfx_string(egt->hfile, &egt->gencall_version, NULL, md5); read_pfx_string(egt->hfile, &egt->cluster_version, NULL, md5); read_pfx_string(egt->hfile, &egt->call_version, NULL, md5); read_pfx_string(egt->hfile, &egt->normalization_version, NULL, md5); read_pfx_string(egt->hfile, &egt->date_created, NULL, md5); read_bytes(egt->hfile, (void *)&egt->is_wgt, sizeof(uint8_t), md5); if (egt->is_wgt != 1) error("Only WGT cluster file version supported\n"); read_pfx_string(egt->hfile, &egt->manifest_name, NULL, md5); read_bytes(egt->hfile, (void *)&egt->data_block_version, sizeof(int32_t), md5); if (egt->data_block_version < 5 || egt->data_block_version == 6 || egt->data_block_version > 9) error("Data block version %d in cluster file not supported\n", egt->data_block_version); read_pfx_string(egt->hfile, &egt->opa, NULL, md5); read_bytes(egt->hfile, (void *)&egt->num_records, sizeof(int32_t), md5); egt->cluster_records = (ClusterRecord *)malloc(egt->num_records * sizeof(ClusterRecord)); for (i = 0; i < egt->num_records; i++) clusterrecord_read(&egt->cluster_records[i], egt->hfile, egt->data_block_version, md5); for (i = 0; i < egt->num_records; i++) clusterscore_read(&egt->cluster_records[i].cluster_score, egt->hfile, md5); // toss useless strings such as aa_ab_bb/aa_ab/aa_bb/ab_bb for (i = 0; i < egt->num_records; i++) read_pfx_string(egt->hfile, NULL, NULL, md5); egt->names = (char **)malloc(egt->num_records * sizeof(char *)); egt->names2index = khash_str2int_init(); for (i = 0; i < egt->num_records; i++) { read_pfx_string(egt->hfile, &egt->names[i], NULL, md5); if (khash_str2int_has_key(egt->names2index, egt->names[i])) error("Illumina probe %s present multiple times in file %s\n", egt->names[i], fn); khash_str2int_inc(egt->names2index, egt->names[i]); } for (i = 0; i < egt->num_records; 
i++) read_bytes(egt->hfile, (void *)&egt->cluster_records[i].address, sizeof(int32_t), md5); int32_t aa_n, ab_n, bb_n; for (i = 0; i < egt->num_records; i++) { read_bytes(egt->hfile, (void *)&aa_n, sizeof(int32_t), md5); read_bytes(egt->hfile, (void *)&ab_n, sizeof(int32_t), md5); read_bytes(egt->hfile, (void *)&bb_n, sizeof(int32_t), md5); if (egt->cluster_records[i].aa_cluster_stats.N != aa_n || egt->cluster_records[i].ab_cluster_stats.N != ab_n || egt->cluster_records[i].bb_cluster_stats.N != bb_n) error("Cluster counts don't match with EGT cluster file %s\n", egt->fn); } if (egt->data_block_version == 9) read_bytes(egt->hfile, NULL, egt->num_records * sizeof(float), md5); if (eof_check && !heof(egt->hfile)) error( "EGT reader did not reach the end of file %s at position %ld\nUse --do-not-check-eof to suppress this " "check\n", egt->fn, htell(egt->hfile)); if (!heof(egt->hfile)) { if (eof_check) error( "EGT reader did not reach the end of file %s at position %ld\nUse --do-not-check-eof to suppress this " "check\n", egt->fn, htell(egt->hfile)); if (checksum) while (md5_hgetc(egt->hfile, md5) != EOF); } if (md5) { hts_md5_final(egt->md5_buf, md5); hts_md5_destroy(md5); } for (i = 0; i < egt->num_records; i++) { ClusterStats *aa = &egt->cluster_records[i].aa_cluster_stats; ClusterStats *ab = &egt->cluster_records[i].ab_cluster_stats; ClusterStats *bb = &egt->cluster_records[i].bb_cluster_stats; egt->cluster_records[i].r_mean = (aa->N * aa->r_mean + ab->N * ab->r_mean + bb->N * bb->r_mean) / (aa->N + ab->N + bb->N); } return egt; } static void egt_destroy(egt_t *egt) { if (!egt) return; int i; if (hclose(egt->hfile) < 0) error("Error closing EGT file %s\n", egt->fn); free(egt->fn); free(egt->gencall_version); free(egt->cluster_version); free(egt->call_version); free(egt->normalization_version); free(egt->date_created); free(egt->opa); free(egt->manifest_name); free(egt->cluster_records); for (i = 0; i < egt->num_records; i++) free(egt->names[i]); free(egt->names); 
khash_str2int_destroy(egt->names2index); free(egt); } // static void egt_to_csv(const egt_t *egt, FILE *stream, int verbose) { // fprintf(stream, "Illumina, Inc.\n"); // fprintf(stream, "[Heading]\n"); // fprintf(stream, "Descriptor File Name,%s\n", strrchr(egt->fn, '/') ? strrchr(egt->fn, '/') + 1 : egt->fn); // fprintf(stream, "GenCall version,%s\n", egt->gencall_version); // fprintf(stream, "Clustering algorithm version,%s\n", egt->cluster_version); // fprintf(stream, "Genotyping algorithm version,%s\n", egt->call_version); // fprintf(stream, "Normalization algorithm version,%s\n", egt->normalization_version); // fprintf(stream, "Date Manufactured,%s\n", egt->date_created); // fprintf(stream, "Manifest name used to build this cluster file,%s\n", egt->manifest_name); // fprintf(stream, "OPA,%s\n", egt->opa ? egt->opa : ""); // fprintf(stream, "Loci Count,%d\n", egt->num_records); // fprintf(stream, "[Assay]\n"); // fprintf(stream, // "Name,AA.N,AA.R_dev,AA.R_mean,AA.Theta_dev,AA.Theta_mean,AB.N,AB.R_dev,AB.R_mean,AB." 
// "Theta_dev,AB.Theta_mean,BB.N,BB.R_dev,BB.R_mean,BB.Theta_dev,BB.Theta_mean,Intensity " // "Threshold,Cluster Separation,GenTrain Score,Original Score,Edited,Address\n"); // if (verbose) { // int i; // for (i = 0; i < egt->num_records; i++) { // ClusterRecord *cluster_record = &egt->cluster_records[i]; // fprintf(stream, "%s,%d,%f,%f,%f,%f,%d,%f,%f,%f,%f,%d,%f,%f,%f,%f,%f,%f,%f,%f,%d,%d\n", egt->names[i], // cluster_record->aa_cluster_stats.N, cluster_record->aa_cluster_stats.r_dev, // cluster_record->aa_cluster_stats.r_mean, cluster_record->aa_cluster_stats.theta_dev, // cluster_record->aa_cluster_stats.theta_mean, cluster_record->ab_cluster_stats.N, // cluster_record->ab_cluster_stats.r_dev, cluster_record->ab_cluster_stats.r_mean, // cluster_record->ab_cluster_stats.theta_dev, cluster_record->ab_cluster_stats.theta_mean, // cluster_record->bb_cluster_stats.N, cluster_record->bb_cluster_stats.r_dev, // cluster_record->bb_cluster_stats.r_mean, cluster_record->bb_cluster_stats.theta_dev, // cluster_record->bb_cluster_stats.theta_mean, cluster_record->intensity_threshold, // cluster_record->cluster_score.cluster_separation, cluster_record->cluster_score.total_score, // cluster_record->cluster_score.original_score, cluster_record->cluster_score.edited, // cluster_record->address); // } // } else { // fprintf(stream, "... 
// use --verbose to visualize Assay data ...\n");
//     }
// }

/****************************************
 * MATLAB ROBUST FIT ROUTINES           *
 ****************************************/

// the code for these routines was derived from the Statistics and Machine Learning Toolbox in MATLAB
// Illumina implemented the whole robustfit() function despite the fact that they only needed
// the one dimensional case of it that could easily do away with matrices
// the implementation here reimplements one dimensional linear regression to solve the linear least squares problem
// and so doing away with the need for matrix routines for computing the QR matrix factorization
// the original implementation of robustfit() from Tom Lane was the one adopted in GenTrain 2.0:
// http://github.com/iarsenal95/computer_vision/blob/master/final_project/MATLAB/boosting/weightedstats/private/statrobustfit.m
// in 2002 Tom Lane realized that the MATLAB implementation of the madsigma() sub-routine was problematic:
// http://groups.google.com/g/comp.soft-sys.matlab/c/Raf-VYUh9yY/m/gIi16wAR4VQJ
// this must have led to the new version of madsigma() being adopted in GenTrain 3.0:
// http://github.com/stephane-on/Spectral_analysis/blob/master/statrobustfit.m

// square of a double
inline static double sqr(double x) { return x * x; }

// square of a float
inline static float sqrf(float x) { return x * x; }

// least squares slope of y = m*x (no intercept), accumulated in double precision
// returns 1 without touching *m when sum(x^2) is zero, 0 otherwise
// equivalent to MATLAB linsolve(x,y)
// http://www.mathworks.com/help/matlab/ref/linsolve.html
static int matlab_linsolve0(int n, const float *x, const float *y, double *m) {
    double sxx = 0.0;
    double sxy = 0.0;
    for (int k = 0; k < n; k++) {
        const double xk = (double)x[k];
        const double yk = (double)y[k];
        sxx += sqr(xk);
        sxy += xk * yk;
    }
    if (sxx == 0) return 1;
    *m = sxy / sxx;
    return 0;
}

// least squares intercept b and slope m of y = b + m*x, accumulated in double precision
// returns 1 without touching the outputs when n*sum(x^2) - sum(x)^2 is zero, 0 otherwise
// equivalent to MATLAB linsolve([ones(n,1), x],y)
// http://www.mathworks.com/help/matlab/ref/linsolve.html
static int matlab_linsolve1(int n, const float *x, const float *y, double *b, double *m) {
    double sxx = 0.0;
    double sxy = 0.0;
    double sx = 0.0;
    double sy = 0.0;
    for (int k = 0; k < n; k++) {
        const double xk = (double)x[k];
        const double yk = (double)y[k];
        sxx += sqr(xk);
        sxy += yk * xk;
        sx += xk;
        sy += yk;
    }
    const double det = (double)n * sxx - sx * sx;
    if (det == 0) return 1;
    *m = (n * sxy - sx * sy) / det;
    *b = (sy * sxx - sx * sxy) / det;
    return 0;
}

// weighted counterpart of matlab_linsolve0: each observation k contributes with weight w[k]
// returns 1 without touching *m when the weighted sum of x^2 is zero, 0 otherwise
// equivalent to MATLAB wfit(y,x,w) which is equivalent to linsolve(diag(sqrt(w))*x,diag(sqrt(w))*y)
// stats/private/statrobustfit.m
static int matlab_wfit0(int n, const float *y, const float *x, const double *w, double *m) {
    double swxx = 0.0;
    double swxy = 0.0;
    for (int k = 0; k < n; k++) {
        const double xk = (double)x[k];
        const double yk = (double)y[k];
        swxx += w[k] * sqr(xk);
        swxy += w[k] * xk * yk;
    }
    if (swxx == 0) return 1;
    *m = swxy / swxx;
    return 0;
}

// weighted counterpart of matlab_linsolve1: each observation k contributes with weight w[k]
// returns 1 without touching the outputs when the weighted normal equations are singular, 0 otherwise
// equivalent to MATLAB wfit(y,[ones(n,1),x],w) which is equivalent to
// linsolve(diag(sqrt(w))*[ones(n,1),x],diag(sqrt(w))*y) stats/private/statrobustfit.m
static int matlab_wfit1(int n, const float *y, const float *x, const double *w, double *b, double *m) {
    double swxx = 0.0;
    double swxy = 0.0;
    double swx = 0.0;
    double swy = 0.0;
    double sw = 0.0;
    for (int k = 0; k < n; k++) {
        const double xk = (double)x[k];
        const double yk = (double)y[k];
        swxx += w[k] * sqr(xk);
        swxy += w[k] * xk * yk;
        swx += w[k] * xk;
        swy += w[k] * yk;
        sw += w[k];
    }
    const double det = sw * swxx - swx * swx;
    if (det == 0) return 1;
    *m = (sw * swxy - swx * swy) / det;
    *b = (swy * swxx - swx * swxy) / det;
    return 0;
}

// mean of the non-NaN entries; NaN for an empty array (and, through 0/0, when every entry is NaN)
// http://www.mathworks.com/help/stats/nanmean.html
static float matlab_nanmean(int n, const float *vals) {
    if (n == 0) return NAN;
    double total = 0.0;
    int valid = 0;
    for (int k = 0; k < n; k++) {
        if (isnan(vals[k])) continue;
        total += vals[k];
        valid++;
    }
    return (float)(total / (double)valid);
}

// arithmetic mean accumulated in double precision; NaN for an empty array
// http://www.mathworks.com/help/matlab/ref/mean.html
static float matlab_mean(int n, const float *vals) {
    if (n == 0) return NAN;
    double total = 0.0;
    for (int k = 0; k < n; k++) total += vals[k];
    return (float)(total / (double)n);
}

//
// the input array does not need to be sorted
// http://www.mathworks.com/help/matlab/ref/median.html
// vals is sorted in place as a side effect
// returns 0 when there is nothing to take the median of; n can be negative when a caller
// passes n - (p - 1) with fewer than p - 1 residuals, and must not index before the array
static float matlab_median(int n, float *vals) {
    if (n <= 0) return 0.0f;
    ks_introsort_float((size_t)n, vals);
    if (n % 2 == 1) return vals[n / 2];
    return (vals[n / 2 - 1] + vals[n / 2]) * 0.5f;
}

// stats/private/statrobustfit.m
// function s = madsigma(r,p)
// %MADSIGMA    Compute sigma estimate using MAD of residuals from 0
// rs = sort(abs(r));
// s = median(rs(max(1,p):end)) / 0.6745; % 0.6745 ~ qnorm(0.75)
//
// r holds n residuals; the p - 1 smallest absolute residuals are left out of the median
// residuals are narrowed to float before sorting
// when the median is exactly zero the estimate falls back to half the mean absolute residual
// returns NaN when there are no residuals
static double matlab_madsigma_new(int n, const double *r, int p) {
    if (n <= 0) return NAN;
    int i;
    float *rs = (float *)malloc(n * sizeof(float));
    if (!rs) error("Could not allocate memory for %d residuals\n", n);
    for (i = 0; i < n; i++) rs[i] = (float)fabs(r[i]);
    ks_introsort_float((size_t)n, rs);
    double s = (double)matlab_median(n - (p - 1), rs + (p - 1)) / 0.6745;
    if (s == 0.0) s = 0.5 * (double)matlab_mean(n, rs);
    free(rs);
    return s;
}

// a separate implementation from Illumina can be found in function madsigma in file Utils.cs
// the code follows the original implementation from Tom Lane in 2000
// stats/private/statrobustfit.m
// function s = madsigma(r,p);
// %MADSIGMA    Compute sigma estimate using MAD of residuals
// m = median(r);
// rs = sort(abs(r-m));
// if (abs(m) > rs(end))
//     % Unexpectedly all residuals are very small
//     rs = sort(abs(r));
// end
// s = median(rs(p:end)) / 0.6745; % 0.6745 ~ qnorm(0.75)
// if (s==0), s = .5*mean(rs); end
//
// same contract as matlab_madsigma_new, except that deviations are taken from the median residual
// unless that median is larger in magnitude than every deviation
// returns NaN when there are no residuals (the original indexed rs[n - 1] in that case)
static double matlab_madsigma_old(int n, const double *r, int p) {
    if (n <= 0) return NAN;
    int i;
    float *rs = (float *)malloc(n * sizeof(float));
    if (!rs) error("Could not allocate memory for %d residuals\n", n);
    for (i = 0; i < n; i++) rs[i] = (float)r[i];
    float m = matlab_median(n, rs);
    for (i = 0; i < n; i++) rs[i] = fabsf(rs[i] - m);
    ks_introsort_float((size_t)n, rs);
    if (fabsf(m) > rs[n - 1]) {
        for (i = 0; i < n; i++) rs[i] = fabsf((float)r[i]);
        ks_introsort_float((size_t)n, rs);
    }
    double s = (double)matlab_median(n - (p - 1), rs + (p - 1)) / 0.6745;
    if (s == 0.0) s = 0.5 * (double)matlab_mean(n, rs);
    free(rs);
    return s;
}

// roughly equivalent to MATLAB
// robustfit(x,y,'bisquare',4.685,'off')
// http://www.mathworks.com/help/stats/robustfit.html
// stats/private/statrobustfit.m
// stats/private/statrobustwfun.m
//
// iteratively reweighted least squares fit of y = m*x with bisquare weights (tuning constant 4.685)
// starts from the ordinary least squares slope and stops after 50 iterations or once the slope
// changes by no more than 1e-6 relative to its magnitude; madsigma provides the residual scale
static void matlab_robustfit0(int n, const float *x, const float *y, double (*madsigma)(int, const double *, int),
                              float *out_m) {
    int i;
    double *r = (double *)malloc(n * sizeof(double));
    double *w = (double *)malloc(n * sizeof(double));
    double m, m0 = 0.0;
    if (matlab_linsolve0(n, x, y, &m)) error("Error while running linsolve0\n");

    // [Q,R] = qr(x,0);
    // R = [sqrt(sum(x.^2)]
    // E = X/R = [x/sqrt(sum(x.^2)]
    // h = min(.9999, sum(E.*E,2)) = min(.9999, x.^2 / sum(x.^2))
    // adjfactor = 1 ./ sqrt(1-h)
    // as GenCall messed up the implementation, here we use instead
    // h = min(.9999, sum(E.*E)) = min(.9999, sum(x.^2) / sum(x.^2)) = .9999
    // adjfactor = 1 / sqrt(1 - 0.9999) ~ 100;
    double adjfactor = 100.0 + 24832 * DBL_EPSILON;

    int iter = 0;
    do {
        // as Illumina messed up the implementation, here we use adjfactor instead of adjfactor[i]
        for (i = 0; i < n; i++) r[i] = ((double)y[i] - m * (double)x[i]) * adjfactor;
        double s = madsigma(n, r, 1);
        if (s == 0.0) s = 1.0;
        for (i = 0; i < n; i++) {
            r[i] *= 1.0 / (s * 4.685);
            w[i] = fabs(r[i]) < 1 ? sqr(1.0 - sqr(r[i])) : 0.0;
        }
        m0 = m;
        if (matlab_wfit0(n, y, x, w, &m)) error("Error while running wfit0\n");
        iter++;
    } while (iter < 50 && fabs(m - m0) > 1e-6 * (double)fmaxf((float)fabs(m), (float)fabs(m0)));
    free(r);
    free(w);
    *out_m = (float)m;
}

// roughly equivalent to MATLAB robustfit(x,y,'bisquare',4.685,'on')
// http://www.mathworks.com/help/stats/robustfit.html
// stats/private/statrobustfit.m
// stats/private/statrobustwfun.m
//
// same scheme as matlab_robustfit0 for y = b + m*x, with one leverage adjustment per observation
// the caller must pass n > 0
static void matlab_robustfit1(int n, const float *x, const float *y, double (*madsigma)(int, const double *, int),
                              float *out_b, float *out_m) {
#ifdef __GNUC__
    if (n <= 0) __builtin_unreachable(); // to prevent an unnecessary "may be used uninitialized" warning
#endif
    int i;
    double *adjfactor = (double *)malloc(n * sizeof(double));
    double *r = (double *)malloc(n * sizeof(double));
    double *w = (double *)malloc(n * sizeof(double));
    double b, m, b0 = 0.0, m0 = 0.0;
    if (matlab_linsolve1(n, x, y, &b, &m))
        error(
            "Error while running linsolve1\nFailed to normalize and gencall\nThis typically happens when the wrong "
            "manifest file is used\n");

    // [Q,R] = qr([ones(n,1),x],0);
    // R = [-sqrt(n), -sum(x)/sqrt(n); 0, sqrt(sum(x.^2)-sum(x)^2/n)]
    // E = X/R = [-ones(n,1)/sqrt(n), (sum(x)/n-x)/sqrt(sum(x.^2)-sum(x)^2/n)]
    // h = min(.9999, sum(E.*E,2)) = min(.9999, (n*x.^2 - 2*sum(x)*x + sum(x.^2))/(n*sum(x.^2) - sum(x)^2))
    double sumx = 0.0;
    double sumx2 = 0.0;
    for (i = 0; i < n; i++) {
        sumx += (double)x[i];
        sumx2 += sqr((double)x[i]);
    }
    // same determinant that matlab_linsolve1 has just verified to be non-zero
    double denom = (double)n * sumx2 - sqr(sumx);
    for (i = 0; i < n; i++) {
        double h = fmin(.9999, ((double)n * sqr((double)x[i]) - 2.0 * sumx * (double)x[i] + sumx2) / denom);
        adjfactor[i] = 1.0 / sqrt(1.0 - h);
    }

    int iter = 0;
    do {
        for (i = 0; i < n; i++) r[i] = ((double)y[i] - b - m * (double)x[i]) * adjfactor[i];
        double s = madsigma(n, r, 2);
        if (s == 0.0) s = 1.0;
        for (i = 0; i < n; i++) {
            r[i] *= 1.0 / (s * 4.685);
            w[i] = fabs(r[i]) < 1 ? sqr(1.0 - sqr(r[i])) : 0.0;
        }
        b0 = b;
        m0 = m;
        if (matlab_wfit1(n, y, x, w, &b, &m)) error("Error while running wfit1\n");
        iter++;
    } while (iter < 50
             && (fabs(b - b0) > 1e-6 * (double)fmaxf((float)fabs(b), (float)fabs(b0))
                 || fabs(m - m0) > 1e-6 * (double)fmaxf((float)fabs(m), (float)fabs(m0))));
    free(adjfactor);
    free(r);
    free(w);
    *out_b = (float)b;
    *out_m = (float)m;
}

/****************************************
 * NEAREST NEIGHBOR ROUTINES            *
 ****************************************/

// a separate implementation from Illumina of these functions in GenCall can be found in file Utils.cs
// It seems like Illumina at first used a function with O(n^2) complexity for the same task and then when they switched
// from GoldenGate to larger Infinium arrays this solution did not scale anymore. This led to a reimplementation in C as
// the C# version was not fast enough. For this reason AutoConvert, an almost entirely C# executable, requires this
// specific function as unmanaged C code while IAAP and ACLI have their equivalent version in C#, maybe because by then
// computers had become fast enough

// scratch state of findClosestSitesToPointsAlongAxis: 12 bins along the swept axis plus 11 bins shifted by
// half a bin width; the counters must be zero on entry and are reset to zero before a successful return
int elementsInBin[12];
int *binData[12];
int elementsInShiftedBin[11];
int *binDataShifted[11];

// a separate implementation from Illumina of this function can be found in function ClosestPointsB
//
// for each of the n_axis query points (axis_x[i], axis_y[i]), all lying on one coordinate axis, stores in
// ret[i] the index of the closest raw point among those sharing the query's bin, or -1 if that bin is empty
// only raw points within one bin width of the swept axis are binned at all
// the bin width is derived from the last query point, so queries are expected in increasing order
// returns 0 on success and -1, leaving ret untouched, when
//  - the query points do not all lie on the x axis or all on the y axis
//  - there are no query points (the original read axis_a[-1])
//  - the bin width is not positive (the original converted inf/NaN to int, which is undefined)
int findClosestSitesToPointsAlongAxis(int n_raw, float *raw_x, float *raw_y, int n_axis, float *axis_x, float *axis_y,
                                      int *ret) {
    int i;
    float *raw_a = NULL;
    float *raw_b = NULL;
    float *axis_a = NULL;
    float axis_max_val;
    float bin_width;
    int bin_idx;
    float quotient;
    float reminder;
    int *curr_bin_data;
    int curr_bin_size;
    float curr_axis_x;
    float curr_axis_y;
    float x_dist;
    float y_dist;
    double best_val;
    int best_idx;
    int j;
    int curr_idx;
    double sq_dist;
    double axis_max_dist;
    int use_y = 1;
    int use_x = 1;

    if (n_axis <= 0) return -1;

    // sweep along y if every query has x ~ 0, else along x if every query has y ~ 0
    for (i = 0; i < n_axis; i++) {
        if (axis_x[i] > 0.0001) {
            use_y = 0;
            break;
        }
    }
    for (i = 0; i < n_axis; i++) {
        if (axis_y[i] > 0.0001) {
            use_x = 0;
            break;
        }
    }
    if (use_y) {
        raw_a = raw_y;
        raw_b = raw_x;
        axis_a = axis_y;
    } else if (use_x) {
        raw_a = raw_x;
        raw_b = raw_y;
        axis_a = axis_x;
    } else {
        return -1;
    }

    axis_max_val = axis_a[n_axis - 1];
    bin_width = axis_max_val / 12.0f;
    if (!(bin_width > 0.0f)) return -1;
    axis_max_dist = (double)bin_width;

    // first pass: count the raw points falling in each regular and each shifted bin
    for (i = 0; i < n_raw; i++) {
        if ((double)raw_b[i] > axis_max_dist) continue;
        bin_idx = (int)(raw_a[i] / bin_width);
        if (bin_idx < 0) bin_idx = 0;
        if (bin_idx > 11) bin_idx = 11;
        elementsInBin[bin_idx]++;
        bin_idx = (int)(raw_a[i] / bin_width - 0.5f);
        if (bin_idx < 0) bin_idx = 0;
        if (bin_idx > 10) bin_idx = 10;
        elementsInShiftedBin[bin_idx]++;
    }

    // allocate the bins and reset the counters so that they can serve as fill positions
    for (i = 0; i <= 11; i++) {
        binData[i] = (int *)malloc((size_t)elementsInBin[i] * sizeof(int));
        elementsInBin[i] = 0;
        if (i == 11) continue;
        binDataShifted[i] = (int *)malloc((size_t)elementsInShiftedBin[i] * sizeof(int));
        elementsInShiftedBin[i] = 0;
    }

    // second pass: store the raw point indices in their bins
    for (i = 0; i < n_raw; i++) {
        if ((double)raw_b[i] > axis_max_dist) continue;
        bin_idx = (int)(raw_a[i] / bin_width);
        if (bin_idx < 0) bin_idx = 0;
        if (bin_idx > 11) bin_idx = 11;
        binData[bin_idx][elementsInBin[bin_idx]] = i;
        elementsInBin[bin_idx]++;
        bin_idx = (int)(raw_a[i] / bin_width - 0.5f);
        if (bin_idx < 0) bin_idx = 0;
        if (bin_idx > 10) bin_idx = 10;
        binDataShifted[bin_idx][elementsInShiftedBin[bin_idx]] = i;
        elementsInShiftedBin[bin_idx]++;
    }

    for (i = 0; i < n_axis; i++) {
        quotient = axis_a[i] / bin_width;
        bin_idx = (int)quotient;
        reminder = quotient - (float)bin_idx;
        curr_bin_data = NULL;
        curr_bin_size = 0;
        if (bin_idx < 0) bin_idx = 0;
        if (bin_idx > 11) bin_idx = 11;
        // queries in the middle half of a regular bin use that bin; queries near an edge use the
        // shifted bin straddling that edge, unless it is the outer edge of the first or last bin
        if (0.25f <= reminder && reminder <= 0.75f) {
            curr_bin_data = binData[bin_idx];
            curr_bin_size = elementsInBin[bin_idx];
        } else {
            if (reminder < 0.25f) {
                if (bin_idx == 0) {
                    curr_bin_data = binData[bin_idx];
                    curr_bin_size = elementsInBin[bin_idx];
                } else {
                    curr_bin_data = binDataShifted[bin_idx - 1];
                    curr_bin_size = elementsInShiftedBin[bin_idx - 1];
                }
            } else if (bin_idx == 11) {
                curr_bin_data = binData[bin_idx];
                curr_bin_size = elementsInBin[bin_idx];
            } else {
                curr_bin_data = binDataShifted[bin_idx];
                curr_bin_size = elementsInShiftedBin[bin_idx];
            }
        }
        curr_axis_x = axis_x[i];
        curr_axis_y = axis_y[i];
        best_val = 1e20;
        best_idx = -1;
        for (j = 0; j < curr_bin_size; j++) {
            curr_idx = curr_bin_data[j];
            x_dist = raw_x[curr_idx] - curr_axis_x;
            y_dist = raw_y[curr_idx] - curr_axis_y;
            sq_dist = (double)(x_dist * x_dist + y_dist * y_dist);
            if (sq_dist < best_val) {
                best_val = sq_dist;
                best_idx = curr_idx;
            }
        }
        ret[i] = best_idx;
    }

    for (i = 0; i <= 11; i++) {
        free((void *)binData[i]);
        elementsInBin[i] = 0;
        if (i > 10) continue;
        free((void *)binDataShifted[i]);
        elementsInShiftedBin[i] = 0;
    }
    return 0;
}

// a separate implementation from Illumina of this function can be found in function ClosestPointsSlow
// as explained in the patent, this approach is slow as it runs in O(n^2)
//
// for each of the n query points returns the index of the closest of the nref reference points
// (the first one in case of ties); the returned array is owned by the caller
// nref must be at least 1 as reference point 0 seeds the search
static int *closest_points_slow(int nref, const float *xref, const float *yref, int n, float *x, float *y) {
    int i, j, *closest_sites = (int *)malloc(n * sizeof(int));
    for (i = 0; i < n; i++) {
        float xv = x[i];
        float yv = y[i];
        double mindist = (xv - xref[0]) * (xv - xref[0]) + (yv - yref[0]) * (yv - yref[0]);
        int mini = 0;
        for (j = 1; j < nref; j++) {
            double dist = (xv - xref[j]) * (xv - xref[j]) + (yv - yref[j]) * (yv - yref[j]);
            if (dist < mindist) {
                mindist = dist;
                mini = j;
            }
        }
        closest_sites[i] = mini;
    }
    return closest_sites;
}

#define SAMPLE 2000 // mentioned in file Utils.cs

// a separate implementation from Illumina of this function can be found in function ClosestPoints
//
// returns, for each of the n query points, the index of a closest reference point in a newly
// allocated array owned by the caller; with SAMPLE or more reference points the binned search is
// used, in which case an entry can be -1
// if the binned search cannot handle the input the exhaustive search is used instead
// (the original ignored the failure and returned an uninitialized array)
static int *closest_points(int nref, float *xref, float *yref, int n, float *x, float *y) {
    if (nref < SAMPLE) return closest_points_slow(nref, xref, yref, n, x, y);
    int *closest_sites = (int *)malloc(n * sizeof(int));
    if (findClosestSitesToPointsAlongAxis(nref, xref, yref, n, x, y, closest_sites) != 0) {
        free(closest_sites);
        return closest_points_slow(nref, xref, yref, n, x, y);
    }
    return closest_sites;
}

/****************************************
 * MATLAB UTILS ROUTINES                *
 ****************************************/

// a separate implementation from Illumina of these functions in GenCall can
// be found in file Utils.cs

// the input array does need to be sorted
// linearly interpolates between the sorted values, with value i sitting at percentile 100*(i+0.5)/n
// requests below the first or above the last such position return the first or last value
// returns NaN for an empty array
static float percentile(int n, const float *vals, int percentile) {
    if (n == 0) return NAN;
    int i1 = n * percentile / 100;
    float f = (float)(n * percentile) / 100.0f - (float)i1;
    if (f < 0.5f) {
        i1--;
    }
    if (i1 < 0) {
        return vals[0];
    }
    if (i1 >= n - 1) {
        return vals[n - 1];
    }
    float x1 = 100.0f * ((float)i1 + 0.5f) / (float)n;
    float x2 = 100.0f * ((float)(i1 + 1) + 0.5f) / (float)n;
    float y1 = (float)vals[i1];
    float y2 = (float)vals[i1 + 1];
    float m = (y2 - y1) / (x2 - x1);
    return y1 + m * ((float)percentile - x1);
}

// interquartile range (75th minus 25th percentile) computed on a sorted copy, vals is left untouched
// http://www.mathworks.com/help/matlab/ref/iqr.html
static float matlab_iqr(int n, const float *vals) {
    int i;
    float *vs = (float *)malloc(n * sizeof(float));
    for (i = 0; i < n; i++) vs[i] = vals[i];
    ks_introsort_float((size_t)n, vs);
    float iqr = percentile(n, vs, 75) - percentile(n, vs, 25);
    free(vs);
    return iqr;
}

// mean of the values lying between the percent/2 and the 100 - percent/2 percentiles, bounds included
// vals is sorted in place as a side effect
// http://www.mathworks.com/help/stats/trimmean.html
static float matlab_trimmean(int n, float *vals, int percent) {
    ks_introsort_float((size_t)n, vals);
    float high = percentile(n, vals, 100 - percent / 2);
    float low = percentile(n, vals, percent / 2);
    double sum = 0.0;
    int i, count = 0;
    for (i = 0; i < n; i++) {
        if (vals[i] >= low && vals[i] <= high) {
            sum += (double)vals[i];
            count++;
        }
    }
    return (float)sum / (float)count;
}

// returns a newly allocated array of n evenly spaced values from minv to maxv, owned by the caller
// as in MATLAB a single point is the upper endpoint (the original computed 0/0 = NaN for n == 1)
// http://www.mathworks.com/help/matlab/ref/linspace.html
static float *matlab_linspace(int n, float minv, float maxv) {
    int i;
    float *vals = (float *)malloc(n * sizeof(float));
    if (n == 1) {
        if (vals) vals[0] = maxv;
        return vals;
    }
    for (i = 0; i < n; i++) vals[i] = minv + (maxv - minv) * (float)i / (float)(n - 1);
    return vals;
}

// the input array does not need to be sorted
// sorts indices in place, drops the leading -1 entries (the "not found" marker) and compacts the
// remaining distinct values to the front of the array; returns how many distinct values were kept
// returns 0 when the array is empty or holds nothing but -1 (the original scanned past its end)
// http://www.mathworks.com/help/matlab/ref/unique.html
static int matlab_unique(int n, int *indices) {
    int i, j;
    ks_introsort_int((size_t)n, indices);
    for (i = 0; i < n && indices[i] == -1; i++);
    if (i >= n) return 0;
    indices[0] = indices[i++];
    for (j = 1; i < n; i++)
        if (indices[i] != indices[i - 1]) indices[j++] = indices[i];
    return j;
}

// the input arrays do not
// need to be sorted
// http://www.mathworks.com/help/matlab/ref/union.html
// concatenates a and b into a newly allocated array owned by the caller, then reduces it with
// matlab_unique; *n receives the number of values that matlab_unique kept
static int *matlab_union(int na, const int *a, int nb, const int *b, int *n) {
    const int total = na + nb;
    int *merged = (int *)malloc(total * sizeof(int));
    int k;
    for (k = 0; k < na; k++) merged[k] = a[k];
    for (k = 0; k < nb; k++) merged[na + k] = b[k];
    *n = matlab_unique(total, merged);
    return merged;
}

// smallest non-NaN value, FLT_MAX if there is none
// http://www.mathworks.com/help/matlab/ref/min.html
static float matlab_min(int n, const float *vals) {
    float lowest = FLT_MAX;
    for (int k = 0; k < n; k++) {
        const float v = vals[k];
        if (!isnan(v) && v < lowest) lowest = v;
    }
    return lowest;
}

// largest non-NaN value, -FLT_MAX if there is none
// http://www.mathworks.com/help/matlab/ref/max.html
static float matlab_max(int n, const float *vals) {
    float highest = -FLT_MAX;
    for (int k = 0; k < n; k++) {
        const float v = vals[k];
        if (!isnan(v) && v > highest) highest = v;
    }
    return highest;
}

/****************************************
 * NORMALIZATION ROUTINES               *
 ****************************************/

// a thorough explanation of the normalization steps can be found in the document
// Kermani, B. G. Artificial intelligence and global normalization methods for genotyping. U.S. Patent No. 7,035,740
// (2005-09-29) http://patents.google.com/patent/US7035740 Peiffer, D. A. et al. High-resolution genomic profiling of
// chromosomal aberrations using Infinium whole-genome genotyping. Genome Res., 16, 1136–1148 (2006-08-09)
// http://doi.org/10.1101/gr.5402306
// Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010 (2006-09-26)
// http://dnatech.genomecenter.ucdavis.edu/wp-content/uploads/2013/06/illumina_gt_normalization.pdf
// Illumina, Inc. Improved Genotype Clustering with GenTrain 3.0. Pub. No.
// 370-2016-015-A (2016)
// http://emea.illumina.com/content/dam/illumina-marketing/documents/products/technotes/gentrain3-technical-note-370-2016-015.pdf
// http://www.illumina.com/content/dam/illumina/gcs/assembled-assets/marketing-literature/gentrain-tech-note-m-gl-01258/gentrain-tech-note-m-gl-01258.pdf

// a separate implementation from Illumina of these functions in GenCall can be found in file NormalizationInfinium.cs
// http://support.illumina.com/downloads/gencall_software.html

// Illumina software includes three normalization protocols for Infinium arrays:
// 1.1.0 Normalization10 not used
// 1.1.2 Normalization111+Normalization10 used in AutoConvert and IAAP Genotyping CLI with option --gentrain-id 2
// 1.2.0 NormalizationDragonfish+Normalization111_Dragonfish+Normalization10_Dragonfish used in AutoConvert 2.0, IAAP
// Genotyping CLI, and Array Analysis CLI
// We implement versions 1.1.2 and 1.2.0 for interoperability purposes with existing Illumina cluster files

// Peiffer, D. A. et al. High-resolution genomic profiling of chromosomal aberrations using Infinium whole-genome
// genotyping. Genome Res., 16, 1136–1148 (2006-08-09) The data for each BeadChip is self-normalized using information
// contained within the array. This normalization algorithm removes outliers, adjusts for channel-dependent
// background and global intensity differences, and also scales the data.
// The X and Y color channels undergo an affine coordinate transformation
// to make the data appear as canonical as possible with
// the homozygotes lying along the transformed x- and y-axes.
// The following five steps are applied: (1) outlier removal; (2) a translation
// correction in which the asymptotes are fitted to candidate
// AA and BB homozygotes; the intersection of these fit lines defines
// the translated origin; (3) rotational correction: the angle of
// the AA homozygote asymptote with respect to the translated
// X-axis is used to define the rotational correction; (4) shear correction:
// the angle of the BB homozygote asymptote with respect
// to the translated and rotated y-axis is used to define the shear
// correction; (5) scaling correction: statistical centroids are computed
// for the candidate AA homozygotes to define an x-axis scaling
// parameter, and for candidate BB homozygotes to define a
// y-axis scaling parameter. The translated, rotated, shear-corrected
// data are normalized to a scale of ∼1 using the scaling parameters

// number of points sampled along each axis when looking for candidate homozygotes; remove_outliers
// also leaves inputs with fewer points than this untouched
#define SAMPLING 400 // mentioned in Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010 (2006)
// minimum number of points for remove_offset to estimate an offset (it reports a zero offset below this)
#define ROBUST_THRESHOLD                                                                                               \
    192 // mentioned in Improved Genotype Clustering with GenTrain 3.0. Pub. No. 370-2016-015-A (2016)

// a separate implementation from Illumina can be found in functions RemoveOutliers from classes
// Normalization10_Dragonfish and Normalization10 Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub.
// No. 970-2006-010 (2006-09-26) Outlier SNPs are removed from consideration during normalization parameter estimation.
// These SNPs are only considered outliers during the normalization process and are not excluded from downstream
// analysis. A SNP is considered an outlier if its intensity meets any of the following criteria:
// - Its value of x, y, or x/(x+y) is smaller than either the 5th smallest or the 1st percentile (whichever is smaller)
// of those values across all SNPs.
// - Its value of x, y, or x/(x+y) is larger than either the 5th largest or the 99th percentile (whichever is larger)
// of those values across all SNPs.
// NOTE(review): the implementation that follows computes the ratio as y/(x+y) rather than x/(x+y)
// Compacts x[] and y[] in place so that only non-outlier loci remain and stores the surviving count in *n.
// Bins with fewer than SAMPLING loci are returned untouched. A locus is dropped when its x, its y, or its ratio
// y/(x+y) lies at or beyond the low/high cutoffs; each cutoff is the more extreme of the 5th ranked value and the
// 1st/99th percentile of the corresponding sorted values.
static void remove_outliers(int *n, float *x, float *y) {
    const int total = *n;
    if (total < SAMPLING) return;

    // sorted copies of the two channels and of the ratio, used only to derive the cutoffs
    float *sorted_x = (float *)malloc(total * sizeof(float));
    float *sorted_y = (float *)malloc(total * sizeof(float));
    float *sorted_t = (float *)malloc(total * sizeof(float));
    for (int k = 0; k < total; k++) {
        sorted_x[k] = x[k];
        sorted_y[k] = y[k];
        // the tiny constant keeps the denominator from being exactly zero
        sorted_t[k] = y[k] / (FLT_MIN * FLT_EPSILON + x[k] + y[k]);
    }
    ks_introsort_float((size_t)total, sorted_x);
    ks_introsort_float((size_t)total, sorted_y);
    ks_introsort_float((size_t)total, sorted_t);

    const int rank = 5;            // 5th smallest / 5th largest value
    const int pct = 1;             // 1st / 99th percentile
    const int low = rank - 1;      // index of the 5th smallest value
    const int high = total - rank; // index of the 5th largest value

    const float t_lo = fminf(sorted_t[low], percentile(total, sorted_t, pct));
    const float t_hi = fmaxf(sorted_t[high], percentile(total, sorted_t, 100 - pct));
    const float x_lo = fminf(sorted_x[low], percentile(total, sorted_x, pct));
    const float x_hi = fmaxf(sorted_x[high], percentile(total, sorted_x, 100 - pct));
    const float y_lo = fminf(sorted_y[low], percentile(total, sorted_y, pct));
    const float y_hi = fmaxf(sorted_y[high], percentile(total, sorted_y, 100 - pct));

    free(sorted_x);
    free(sorted_y);
    free(sorted_t);

    int kept = 0;
    for (int src = 0; src < total; src++) {
        const int channel_outlier = y[src] <= y_lo || x[src] <= x_lo || y[src] >= y_hi || x[src] >= x_hi;
        if (channel_outlier) continue;
        // the ratio is recomputed in double precision without the guard constant, as the cutoffs are compared here
        const double ratio = y[src] / (double)(y[src] + x[src]);
        if (ratio <= t_lo || ratio >= t_hi) continue;
        x[kept] = x[src];
        y[kept] = y[src];
        kept++;
    }
    *n = kept;
}

// a separate implementation from Illumina can be found in function RemoveOffset from class Normalization10_Dragonfish
// and Normalization10
// Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010 (2006-09-26)
// a. An x-sweep is performed by sampling 400 points along the x-axis, from the smallest x value to the largest. The
// closest SNP to each sampled point along the axis is added to the set of candidate homozygote As.
// b. The same analysis is performed along the y-axis to find the candidate homozygote Bs.
// c.
A straight line is fit into // candidate homozygote A alleles. d. A straight line is fit into candidate homozygote B alleles. e. The intercept of // the two lines is computed, and this coordinate corresponds to offset_x and offset_y. static void remove_offset(int n, float *x, float *y, int *naa, int **iaa, int *nbb, int **ibb, double (*madsigma)(int, const double *, int), float *offset_x, float *offset_y) { if (n < ROBUST_THRESHOLD) { *offset_x = 0.0f; *offset_y = 0.0f; return; } int i; float mx = matlab_min(n, x); float my = matlab_min(n, y); float *xt = (float *)malloc(n * sizeof(float)); float *yt = (float *)malloc(n * sizeof(float)); for (i = 0; i < n; i++) { xt[i] = x[i] - mx; yt[i] = y[i] - my; } float *xsweep = matlab_linspace(SAMPLING, 0.0f, matlab_max(n, xt)); float *ysweep = matlab_linspace(SAMPLING, 0.0f, matlab_max(n, yt)); float *zeros = (float *)calloc(SAMPLING, sizeof(float)); *iaa = closest_points(n, xt, yt, SAMPLING, xsweep, zeros); *ibb = closest_points(n, xt, yt, SAMPLING, zeros, ysweep); *naa = matlab_unique(SAMPLING, *iaa); *nbb = matlab_unique(SAMPLING, *ibb); float *xaa = (float *)malloc(*naa * sizeof(float)); float *yaa = (float *)malloc(*naa * sizeof(float)); for (i = 0; i < *naa; i++) { xaa[i] = xt[(*iaa)[i]]; yaa[i] = yt[(*iaa)[i]]; } float *xbb = (float *)malloc(*nbb * sizeof(float)); float *ybb = (float *)malloc(*nbb * sizeof(float)); for (i = 0; i < *nbb; i++) { xbb[i] = xt[(*ibb)[i]]; ybb[i] = yt[(*ibb)[i]]; } float baa, maa; float bbb, mbb; matlab_robustfit1(*naa, xaa, yaa, madsigma, &baa, &maa); matlab_robustfit1(*nbb, ybb, xbb, madsigma, &bbb, &mbb); float ox = (bbb + mbb * baa) / (1.0f - mbb * maa); float oy = baa + maa * ox; *offset_x = ox + mx; *offset_y = oy + my; for (i = 0; i < n; i++) { x[i] -= *offset_x; y[i] -= *offset_y; } free(xt); free(yt); free(xsweep); free(ysweep); free(zeros); free(xaa); free(yaa); free(xbb); free(ybb); } // a separate implementation from Illumina can be found in function HandleRotation from class 
Normalization10_Dragonfish // and Normalization10 Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010 // (2006-09-26) a. The points are corrected for translation and another x-sweep is performed to determine a set of // control points. b. A straight line is fit into the control points. The angle between this line and the x-axis defines // the amount of rotation in the data. This angle corresponds to the theta parameter. static void handle_rotation(int n, float *x, float *y, int *naa, int **iaa, int *nbb, int **ibb, double (*madsigma)(int, const double *, int), float *theta) { if (n < ROBUST_THRESHOLD) { *theta = 0.0f; return; } int i; float *xsweep = matlab_linspace(SAMPLING, matlab_min(n, x), matlab_max(n, x)); float *ysweep = matlab_linspace(SAMPLING, matlab_min(n, y), matlab_max(n, y)); float *zeros = (float *)calloc(SAMPLING, sizeof(float)); int *tiaa = closest_points(n, x, y, SAMPLING, xsweep, zeros); int *tibb = closest_points(n, x, y, SAMPLING, zeros, ysweep); int naa_in = *naa, *iaa_in = *iaa; int nbb_in = *nbb, *ibb_in = *ibb; *iaa = matlab_union(naa_in, iaa_in, SAMPLING, tiaa, naa); *ibb = matlab_union(nbb_in, ibb_in, SAMPLING, tibb, nbb); float *tx = (float *)malloc(*naa * sizeof(float)); float *ty = (float *)malloc(*naa * sizeof(float)); for (i = 0; i < *naa; i++) { tx[i] = x[(*iaa)[i]]; ty[i] = y[(*iaa)[i]]; } float m; matlab_robustfit0(*naa, tx, ty, madsigma, &m); double taa = atan((double)m); double ct = cos(taa); double st = sin(taa); for (i = 0; i < n; i++) { float tmp = x[i]; x[i] = (float)(ct * (double)x[i] + st * (double)y[i]); y[i] = (float)((0.0 - st) * (double)tmp + ct * (double)y[i]); } *theta = (float)taa; free(xsweep); free(ysweep); free(zeros); free(iaa_in); free(ibb_in); free(tiaa); free(tibb); free(tx); free(ty); } // a separate implementation from Illumina can be found in function HandleShear from class Normalization10_Dragonfish // and Normalization10 Illumina, Inc. 
Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010 // (2006-09-26) a. The points are corrected for rotation and another y-sweep is performed to determine a set of control // points. b. A straight line is fit to these control points. The angle of this line identifies the shear parameter static void handle_shear(int n, float *x, float *y, int *nbb, int **ibb, double (*madsigma)(int, const double *, int), float *shear) { if (n < ROBUST_THRESHOLD) { *shear = 0.0f; return; } int i; float *ysweep = matlab_linspace(SAMPLING, 0.0f, matlab_max(n, y)); float *zeros = (float *)calloc(SAMPLING, sizeof(float)); int *tibb = closest_points(n, x, y, SAMPLING, zeros, ysweep); int nbb_in = *nbb, *ibb_in = *ibb; *ibb = matlab_union(nbb_in, ibb_in, SAMPLING, tibb, nbb); float *tx = (float *)malloc(*nbb * sizeof(float)); float *ty = (float *)malloc(*nbb * sizeof(float)); for (i = 0; i < *nbb; i++) { tx[i] = x[(*ibb)[i]]; ty[i] = y[(*ibb)[i]]; } float m; matlab_robustfit0(*nbb, ty, tx, madsigma, &m); double tbb = atan((double)m); double shy = tan(tbb); for (i = 0; i < n; i++) x[i] = (float)((double)x[i] - shy * (double)y[i]); *shear = (float)shy; free(ibb_in); free(ysweep); free(zeros); free(tibb); free(tx); free(ty); } // a separate implementation from Illumina can be found in function HandleScale from classes Normalization10_Dragonfish // and Normalization10 0.7413 ~ 1/(2*qnorm(0.75)) static void base_handle_scale(int n, float *x, float *y, int gentrain_version, float *scale_x, float *scale_y) { int i, naa, nbb, *iaa, *ibb; // this should never happen for (i = 0; i < n; i++) { if (x[i] < 0.0f) x[i] = 0.0f; if (y[i] < 0.0f) y[i] = 0.0f; } if (n < ROBUST_THRESHOLD) { float *t = (float *)malloc(n * sizeof(float)); // for GenTrain 2.0 we replicate the bug by allowing failed probes as AA points for (i = 0; i < n; i++) t[i] = x[i] > 0.0f || y[i] > 0.0f || gentrain_version == 2 ? 
(float)(180.0 * M_1_PI * atan2((double)y[i], (double)x[i])) : NAN; naa = 0; nbb = 0; for (i = 0; i < n; i++) { if (t[i] < 10.0f) naa++; if (t[i] > 80.0f) nbb++; } iaa = (int *)malloc(naa * sizeof(int)); ibb = (int *)malloc(nbb * sizeof(int)); naa = 0; nbb = 0; for (i = 0; i < n; i++) { if (t[i] < 10.0f) iaa[naa++] = i; if (t[i] > 80.0f) ibb[nbb++] = i; } free(t); } else { float *xsweep = matlab_linspace(SAMPLING, 0.0f, matlab_max(n, x)); float *ysweep = matlab_linspace(SAMPLING, 0.0f, matlab_max(n, y)); float *zeros = (float *)calloc(SAMPLING, sizeof(float)); iaa = closest_points(n, x, y, SAMPLING, xsweep, zeros); ibb = closest_points(n, x, y, SAMPLING, zeros, ysweep); naa = matlab_unique(SAMPLING, iaa); nbb = matlab_unique(SAMPLING, ibb); free(xsweep); free(ysweep); free(zeros); } float *xaa = (float *)malloc(naa * sizeof(float)); float *ybb = (float *)malloc(nbb * sizeof(float)); for (i = 0; i < naa; i++) xaa[i] = x[iaa[i]]; for (i = 0; i < nbb; i++) ybb[i] = y[ibb[i]]; if (n < ROBUST_THRESHOLD) { *scale_x = matlab_trimmean(naa, xaa, 20); *scale_y = matlab_trimmean(nbb, ybb, 20); } else { *scale_x = 0.5f * matlab_trimmean(naa, xaa, 50) + 0.7413f * matlab_iqr(naa, xaa); *scale_y = 0.5f * matlab_trimmean(nbb, ybb, 50) + 0.7413f * matlab_iqr(nbb, ybb); } for (i = 0; i < n; i++) { x[i] /= *scale_x; y[i] /= *scale_y; } free(iaa); free(ibb); free(xaa); free(ybb); } // a separate implementation from Illumina can be found in function HandleScale from class Normalization111_Dragonfish // and Normalization111 Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010 // (2006-09-26) a. The points are corrected for shear, and another x-sweep is performed to identify a set of virtual // points. b. A statistical robust measure of the mean of these control points is used to determine scale_x. c. A // Y-sweep is done, and some virtual points are identified via triangulation. 
A statistical robust measure of the mean // of these control points is used to determine scale_y. static void handle_scale(int n, float *x, float *y, int gentrain_version, float *scale_x, float *scale_y) { if (n < ROBUST_THRESHOLD) { base_handle_scale(n, x, y, gentrain_version, scale_x, scale_y); return; } int i; int naa = 0; int nbb = 0; float xthrsh = 0.1f * percentile(n, x, 99); float ythrsh = 0.1f * percentile(n, y, 99); for (i = 0; i < n; i++) { if (x[i] > 5.0f * y[i] && x[i] > xthrsh) naa++; if (y[i] > 5.0f * x[i] && y[i] > ythrsh) nbb++; } int *iaa = (int *)malloc(naa * sizeof(int)); int *ibb = (int *)malloc(nbb * sizeof(int)); naa = 0; nbb = 0; for (i = 0; i < n; i++) { if (x[i] > 5.0f * y[i] && x[i] > xthrsh) iaa[naa++] = i; if (y[i] > 5.0f * x[i] && y[i] > ythrsh) ibb[nbb++] = i; } float *xaa = (float *)malloc(naa * sizeof(float)); float *ybb = (float *)malloc(nbb * sizeof(float)); for (i = 0; i < naa; i++) xaa[i] = x[iaa[i]]; for (i = 0; i < nbb; i++) ybb[i] = y[ibb[i]]; float xscale = matlab_trimmean(naa, xaa, 50); float yscale = matlab_trimmean(nbb, ybb, 50); for (i = 0; i < n; i++) { x[i] /= xscale; y[i] /= yscale; } *scale_x = (float)xscale; *scale_y = (float)yscale; free(iaa); free(ibb); free(xaa); free(ybb); } static void get_nn12_rr12(int n, const float *x, const float *y, float *nn12, float *rr12) { int i, j; float *xs = (float *)malloc(n * sizeof(float)); float *ys = (float *)malloc(n * sizeof(float)); for (i = 0; i < n; i++) { xs[i] = x[i]; ys[i] = y[i]; } ks_introsort_float((size_t)n, xs); ks_introsort_float((size_t)n, ys); float xthrsh = 0.2f * percentile(n, xs, 95); float ythrsh = 0.2f * percentile(n, ys, 95); int count = 0; for (i = 0; i < n; i++) if (x[i] < xthrsh && y[i] < ythrsh) count++; *nn12 = (float)count / (float)n; if (count) { float *xy = (float *)malloc(n * sizeof(float)); for (i = 0; i < n; i++) xy[i] = x[i] + y[i]; float *xy12 = (float *)malloc(count * sizeof(float)); for (i = 0, j = 0; i < n; i++) if (x[i] < xthrsh && y[i] < 
ythrsh) xy12[j++] = xy[i]; float mean_xy12 = matlab_nanmean(count, xy12); float mean_xy = matlab_nanmean(n, xy); *rr12 = mean_xy12 / mean_xy; free(xy); free(xy12); } else { *rr12 = 1.0f; } free(xs); free(ys); } // a separate implementation from Illumina can be found in function NormalizeSingleBin from class // Normalization111_Dragonfish static void normalize_single_bin(int n, float *x, float *y, int gentrain_version, XForm *xform) { int naa, *iaa = NULL, nbb, *ibb = NULL; double (*madsigma)(int, const double *, int) = gentrain_version == 3 ? matlab_madsigma_new : matlab_madsigma_old; xform->version = 1; remove_outliers(&n, x, y); remove_offset(n, x, y, &naa, &iaa, &nbb, &ibb, madsigma, &xform->offset_x, &xform->offset_y); get_nn12_rr12(n, x, y, &xform->nn12, &xform->rr12); handle_rotation(n, x, y, &naa, &iaa, &nbb, &ibb, madsigma, &xform->theta); free(iaa); handle_shear(n, x, y, &nbb, &ibb, madsigma, &xform->shear); free(ibb); handle_scale(n, x, y, gentrain_version, &xform->scale_x, &xform->scale_y); xform->taa = (float)((double)(xform->theta * 180.0f) * M_1_PI); xform->tbb = (float)((atan((double)xform->shear) - (double)xform->theta) * 180.0 * M_1_PI); } // a separate implementation from Illumina can be found in function MirrorData from class NormalizationDragonfish static void mirror_data(int n, float *x, float *y) { int i; for (i = 0; i < n; i++) { if (y[i] > x[i]) { float tmp = x[i]; x[i] = y[i]; y[i] = tmp; } } } // a separate implementation from Illumina can be found in function GetAA_Values from class NormalizationDragonfish static int *get_aa_values(int n, const float *r, const float *t, int *naa) { int i, j; int *iaa = (int *)malloc(n * sizeof(int)); for (i = 0, j = 0; i < n; i++) if (t[i] < 0.1f && !isnan(r[i]) && r[i] != FLT_MIN * FLT_EPSILON) iaa[j++] = i; *naa = j; return iaa; } // a separate implementation from Illumina can be found in functions RectToPolar from classes // Normalization111_Dragonfish and Normalization111 static void rect_to_polar(int 
n, float *x, float *y) { int i; float *r = x; float *t = y; for (i = 0; i < n; i++) { if (x[i] == 0.0f && y[i] == 0.0f) { r[i] = NAN; t[i] = NAN; continue; } float tmp = x[i]; r[i] = x[i] < 0.0f && y[i] < 0.0f ? FLT_MIN * FLT_EPSILON : fabsf(x[i]) + fabsf(y[i]); t[i] = (float)(atan2((double)y[i], (double)tmp) * M_2_PI); } } // a separate implementation from Illumina can be found in function NormalizeSingleBinSingleChannel from class // NormalizationDragonfish Illumina, Inc. Improved Genotype Clustering with GenTrain 3.0. Pub. No. 370-2016-015-A (2016) // In the sample intensity normalization process, specific // groups of loci are normalized together in “normalization // bins.” Due to differences in probe design, Infinium I loci // (two probes per locus) and Infinium II loci (one probe per // locus) are normalized in separate bins. If the number of loci // in a normalization bin is small (< 192 loci), the normalization // process can be negatively impacted. With the low bead // pool complexity supported on the Infinium XT platform, // the occurrence of small normalization bins may be more // prevalent, especially with normalization bins consisting // of Infinium I loci. With the GenTrain 2.0 algorithm, small // normalization bin size negatively impacts the normalization // of intensity data for the given locus (Figure 1A). // The GenTrain 3.0 algorithm improves the normalization // of small bins by taking advantage of the special nature // of Infinium I assay data, where the signal intensity for // both alleles originates in the same color channel. This // affords the possibility to fit a normalization model with // only two free parameters, instead of six. When applied to // the same data mishandled by GenTrain 2.0, GenTrain 3.0 // improves the performance of the intensity normalization // and generates tight clusters (Figure 1B). 
The GenTrain 3.0 // algorithm applies the improved normalization model for any // normalization bin containing fewer than 192 Infinium I loci. static void normalize_single_bin_single_channel(int n, float *x, float *y, XForm *xform) { int i, j, k; mirror_data(n, x, y); float *aux = (float *)malloc(n * sizeof(float)); for (i = 0; i < n; i++) aux[i] = y[i]; ks_introsort_float((size_t)n, aux); float ythrsh = percentile(n, aux, 2); for (i = 0; i < n; i++) { x[i] -= ythrsh; y[i] -= ythrsh; aux[i] = y[i]; } int naa; rect_to_polar(n, x, y); float *r = x; float *t = y; int *iaa = get_aa_values(n, r, t, &naa); for (i = 0; i < naa; i++) aux[i] = aux[iaa[i]]; float ymean = matlab_trimmean(naa, aux, 20) - 50.0f; // here we replicate the bug in the normalization protocol for (i = 0, j = 0; i < n; i++) if (!isnan(r[i])) j++; for (i = 0, k = 0; i < j; i++) if (!isnan(r[i])) r[k++] = r[i]; for (i = k; i < j; i++) r[i] = 0.0f; float rmean = matlab_trimmean(j, r, 20) - 2.0f * ymean; ythrsh += ymean; xform->version = 1; xform->offset_x = ythrsh; xform->offset_y = ythrsh; xform->theta = 0.0f; xform->shear = 0.0f; xform->scale_x = rmean; xform->scale_y = rmean; xform->rr12 = 1.0f; free(iaa); free(aux); } // a separate implementation from Illumina can be found in function Normalize from class NormalizationDragonfish static XForm *normalize(int n, const uint16_t *xin, const uint16_t *yin, const uint8_t *norm_ids, int gentrain_version, size_t *n_xforms) { int i, j, max_count = 0, counts[256]; *n_xforms = 0; memset(counts, 0, 256 * sizeof(int)); int *aux = (int32_t *)malloc(n * sizeof(int)); // count size of sub-bead pool bins and sort coordinates by bin for (i = 0; i < n; i++) { aux[i] = (norm_ids[i] << 23) + i; counts[norm_ids[i]]++; } ks_introsort_int((size_t)n, aux); // compute number of sub-bead pool bins and size of the largest bin for (i = 0, j = 0; i < 256; i++) { if (counts[i]) { if (counts[i] > max_count) max_count = counts[i]; counts[j++] = counts[i]; } } *n_xforms = j; XForm 
*xform = (XForm *)calloc(*n_xforms, sizeof(XForm)); float *x = (float *)malloc(max_count * sizeof(float)); float *y = (float *)malloc(max_count * sizeof(float)); // compute the normalization transform one sub-bead pool bin at a time int k = 0; for (i = 0; i < *n_xforms; i++) { if (counts[i] < 10) error("Error in normalization. Not enough good loci. Found %d\n", counts[i]); uint8_t norm_id = aux[k] >> 23; for (j = 0; j < counts[i]; j++) { int idx = aux[k] & 0x7FFFFF; x[j] = (float)xin[idx]; y[j] = (float)yin[idx]; k++; } if (gentrain_version == 3 && norm_id >= 100 && counts[i] < ROBUST_THRESHOLD) { // GenTrain 3.0 and Infinium I normalize_single_bin_single_channel(counts[i], x, y, &xform[i]); } else { // GenTrain 2.0 or Infinium II normalize_single_bin(counts[i], x, y, gentrain_version, &xform[i]); } } free(aux); free(x); free(y); return xform; } /**************************************** * MATH ROUTINES * ****************************************/ // a separate implementation from Illumina of these functions in GenCall can be found in file Utils.cs // http://www.mathworks.com/help/fuzzy/zmf.html static float matlab_zmf(float x, float a, float b) { if (a >= b) error("Invalid arguments for zmf (a >= b)"); if (x <= a) return 1; if (a < x && x <= (a + b) / 2.0f) return 1.0f - 2.0f * sqrf((x - a) / (b - a)); if ((a + b) / 2.0f < x && x <= b) return 2.0f * sqrf((x - b) / (b - a)); return 0; } // http://www.mathworks.com/help/fuzzy/smf.html static float matlab_smf(float x, float a, float b) { if (a >= b) return x >= (a + b) / 2.0f ? 
1.0f : 0.0f; if (x <= a) return 0; if (a < x && x <= (a + b) / 2.0f) return 2.0f * sqrf((x - a) / (b - a)); if ((a + b) / 2.0f < x && x <= b) return 1.0f - 2.0f * sqrf((x - b) / (b - a)); return 1; } // http://www.mathworks.com/help/stats/normpdf.html static double matlab_normpdf_vleft(float x, float mu, float sigma) { if (sigma <= 0.0f) return NAN; if (x < mu) return 0.5 * M_2_SQRTPI * M_SQRT1_2 / (double)sigma; return exp(-0.5 * sqr((double)((x - mu) / sigma))) * 0.5 * M_2_SQRTPI * M_SQRT1_2 / (double)sigma; } // http://www.mathworks.com/help/stats/normpdf.html static double matlab_normpdf_vright(float x, float mu, float sigma) { if (sigma <= 0.0f) return NAN; if (x > mu) return 0.5 * M_2_SQRTPI * M_SQRT1_2 / (double)sigma; return exp(-0.5 * sqr((double)((x - mu) / sigma))) * 0.5 * M_2_SQRTPI * M_SQRT1_2 / (double)sigma; } // http://www.mathworks.com/help/stats/normpdf.html static double matlab_normpdf(float x, float mu, float sigma) { if (sigma <= 0.0f) return NAN; return exp(-0.5 * sqr((double)((x - mu) / sigma))) * 0.5 * M_2_SQRTPI * M_SQRT1_2 / (double)sigma; } /**************************************** * GENOTYPE CALLING ROUTINES * ****************************************/ // compute normalized intensities (http://github.com/Illumina/BeadArrayFiles/blob/develop/docs/GTC_File_Format_v5.pdf) // a separate implementation from Illumina can be found in function Transform from class NormalizationTransform static inline void raw_x_y2norm_x_y(uint16_t raw_x, uint16_t raw_y, float offset_x, float offset_y, float cos_theta, float sin_theta, float shear, float scale_x, float scale_y, float *norm_x, float *norm_y) { float temp_x = (float)raw_x - offset_x; float temp_y = (float)raw_y - offset_y; float temp_x2 = cos_theta * temp_x + sin_theta * temp_y; float temp_y2 = -sin_theta * temp_x + cos_theta * temp_y; float temp_x3 = temp_x2 - shear * temp_y2; *norm_x = temp_x3 < 0.0f ? 0.0f : temp_x3 / scale_x; *norm_y = temp_y2 < 0.0f ? 
0.0f : temp_y2 / scale_y; } // compute Theta and R from raw intensities static inline void norm_x_y2ilmn_theta_r(float norm_x, float norm_y, float *ilmn_theta, float *ilmn_r) { if (norm_x == 0.0f && norm_y == 0.0f) { *ilmn_theta = (float)NAN; *ilmn_r = (float)NAN; return; } *ilmn_theta = (float)(atan2((double)norm_y, (double)norm_x) * M_2_PI); if (norm_x < 0.0f && norm_y < 0.0f) { *ilmn_r = FLT_MIN * FLT_EPSILON; } else { *ilmn_r = fabsf(norm_x) + fabsf(norm_y); } } // http://stackoverflow.com/questions/23392321/most-efficient-way-to-find-median-of-three-integers static inline float median3(float a, float b, float c) { return fmaxf(fminf(a, b), fminf(fmaxf(a, b), c)); } // a separate implementation from Illumina can be found in function gen_std_flair from class GenTrain62 or in file // GenTrain60.cs static ClusterRecord *gen_std_flair(const ClusterRecord *cluster_record) { ClusterRecord *out_cluster = (ClusterRecord *)malloc(sizeof(ClusterRecord)); memcpy(out_cluster, cluster_record, sizeof(ClusterRecord)); int Mtight = 3; int Mloose = 3; float z1 = 0.5f * (cluster_record->ab_cluster_stats.theta_mean - cluster_record->aa_cluster_stats.theta_mean) / (cluster_record->ab_cluster_stats.theta_dev + cluster_record->aa_cluster_stats.theta_dev); float z2 = 0.5f * (cluster_record->bb_cluster_stats.theta_mean - cluster_record->ab_cluster_stats.theta_mean) / (cluster_record->bb_cluster_stats.theta_dev + cluster_record->ab_cluster_stats.theta_dev); float mz = fminf(z1, z2); float alpha = fmaxf((1.0f / (float)Mtight) * mz, 1.0f); float beta = fminf((1.0f / (float)Mloose) * mz, 1.0f); float eta = alpha * beta; out_cluster->aa_cluster_stats.theta_dev *= eta; out_cluster->ab_cluster_stats.theta_dev *= eta; out_cluster->bb_cluster_stats.theta_dev *= eta; float min_dispersion_t = 0.02f; if (out_cluster->aa_cluster_stats.theta_dev < min_dispersion_t) out_cluster->aa_cluster_stats.theta_dev = min_dispersion_t; if (out_cluster->ab_cluster_stats.theta_dev < min_dispersion_t) 
out_cluster->ab_cluster_stats.theta_dev = min_dispersion_t; if (out_cluster->bb_cluster_stats.theta_dev < min_dispersion_t) out_cluster->bb_cluster_stats.theta_dev = min_dispersion_t; float min_dispersion_r_cte = 0.2f; int M = 7; float min_dispersion_r = min_dispersion_r_cte; // compute median of the three values float med = median3(cluster_record->aa_cluster_stats.theta_dev, cluster_record->ab_cluster_stats.theta_dev, cluster_record->bb_cluster_stats.theta_dev); if (min_dispersion_r < med) min_dispersion_r = med; float min_dispersion_r_aa = cluster_record->aa_cluster_stats.r_mean / (float)M; float min_dispersion_r_ab = cluster_record->ab_cluster_stats.r_mean / (float)M; float min_dispersion_r_bb = cluster_record->bb_cluster_stats.r_mean / (float)M; if (min_dispersion_r_aa < min_dispersion_r) min_dispersion_r_aa = min_dispersion_r; if (min_dispersion_r_ab < min_dispersion_r) min_dispersion_r_ab = min_dispersion_r; if (min_dispersion_r_bb < min_dispersion_r) min_dispersion_r_bb = min_dispersion_r; if (out_cluster->aa_cluster_stats.r_dev < min_dispersion_r_aa) out_cluster->aa_cluster_stats.r_dev = min_dispersion_r_aa; if (out_cluster->ab_cluster_stats.r_dev < min_dispersion_r_ab) out_cluster->ab_cluster_stats.r_dev = min_dispersion_r_ab; if (out_cluster->bb_cluster_stats.r_dev < min_dispersion_r_bb) out_cluster->bb_cluster_stats.r_dev = min_dispersion_r_bb; return out_cluster; } // a separate implementation from Illumina can be found in function modilik from class GenTrain62 or in file // GenTrain60.cs this function computes the likelihood for each cluster static void modilik(ClusterRecord *c, float t, float r, double *Laa, double *Lab, double *Lbb) { double alpha = 100.0; // what is the relevance of this? 
*Laa = 0.0; *Lab = 0.0; *Lbb = 0.0; // computes the Mahalanobis distance double daa = alpha * (double)fabsf(t - c->aa_cluster_stats.theta_mean) + (double)fabsf(r - c->aa_cluster_stats.r_mean); double dab = alpha * (double)fabsf(t - c->ab_cluster_stats.theta_mean) + (double)fabsf(r - c->ab_cluster_stats.r_mean); double dbb = alpha * (double)fabsf(t - c->bb_cluster_stats.theta_mean) + (double)fabsf(r - c->bb_cluster_stats.r_mean); int bCovered = 0; if (daa <= dbb && dab <= dbb && c->aa_cluster_stats.r_mean <= c->ab_cluster_stats.r_mean && !isnan(t)) { *Laa = matlab_normpdf_vleft(t, c->aa_cluster_stats.theta_mean, c->aa_cluster_stats.theta_dev) * matlab_normpdf_vleft(r, c->aa_cluster_stats.r_mean, c->aa_cluster_stats.r_dev); *Lab = matlab_normpdf_vright(t, c->ab_cluster_stats.theta_mean, c->ab_cluster_stats.theta_dev) * matlab_normpdf_vright(r, c->ab_cluster_stats.r_mean, c->ab_cluster_stats.r_dev); bCovered = 1; } if (daa <= dab && dbb <= dab && c->aa_cluster_stats.r_mean <= c->bb_cluster_stats.r_mean && !isnan(t)) { *Laa = matlab_normpdf_vleft(t, c->aa_cluster_stats.theta_mean, c->aa_cluster_stats.theta_dev) * matlab_normpdf_vleft(r, c->aa_cluster_stats.r_mean, c->aa_cluster_stats.r_dev); *Lbb = matlab_normpdf_vright(t, c->bb_cluster_stats.theta_mean, c->bb_cluster_stats.theta_dev) * matlab_normpdf_vright(r, c->bb_cluster_stats.r_mean, c->bb_cluster_stats.r_dev); bCovered = 1; } if (dab <= daa && dbb <= daa && c->ab_cluster_stats.r_mean <= c->bb_cluster_stats.r_mean && !isnan(t)) { *Lab = matlab_normpdf_vleft(t, c->ab_cluster_stats.theta_mean, c->ab_cluster_stats.theta_dev) * matlab_normpdf_vleft(r, c->ab_cluster_stats.r_mean, c->ab_cluster_stats.r_dev); *Lbb = matlab_normpdf_vright(t, c->bb_cluster_stats.theta_mean, c->bb_cluster_stats.theta_dev) * matlab_normpdf_vright(r, c->bb_cluster_stats.r_mean, c->bb_cluster_stats.r_dev); bCovered = 1; } if (daa <= dbb && dab <= dbb && c->aa_cluster_stats.r_mean > c->ab_cluster_stats.r_mean && !isnan(t)) { *Laa = 
matlab_normpdf_vleft(t, c->aa_cluster_stats.theta_mean, c->aa_cluster_stats.theta_dev) * matlab_normpdf_vright(r, c->aa_cluster_stats.r_mean, c->aa_cluster_stats.r_dev); *Lab = matlab_normpdf_vright(t, c->ab_cluster_stats.theta_mean, c->ab_cluster_stats.theta_dev) * matlab_normpdf_vleft(r, c->ab_cluster_stats.r_mean, c->ab_cluster_stats.r_dev); bCovered = 1; } if (dab <= daa && dbb <= daa && c->ab_cluster_stats.r_mean > c->bb_cluster_stats.r_mean && !isnan(t)) { *Lab = matlab_normpdf_vleft(t, c->ab_cluster_stats.theta_mean, c->ab_cluster_stats.theta_dev) * matlab_normpdf_vright(r, c->ab_cluster_stats.r_mean, c->ab_cluster_stats.r_dev); *Lbb = matlab_normpdf_vright(t, c->bb_cluster_stats.theta_mean, c->bb_cluster_stats.theta_dev) * matlab_normpdf_vleft(r, c->bb_cluster_stats.r_mean, c->bb_cluster_stats.r_dev); bCovered = 1; } if (daa <= dab && dbb <= dab && c->aa_cluster_stats.r_mean > c->bb_cluster_stats.r_mean && !isnan(t)) { *Laa = matlab_normpdf_vleft(t, c->aa_cluster_stats.theta_mean, c->aa_cluster_stats.theta_dev) * matlab_normpdf_vright(r, c->aa_cluster_stats.r_mean, c->aa_cluster_stats.r_dev); *Lbb = matlab_normpdf_vright(t, c->bb_cluster_stats.theta_mean, c->bb_cluster_stats.theta_dev) * matlab_normpdf_vleft(r, c->bb_cluster_stats.r_mean, c->bb_cluster_stats.r_dev); bCovered = 1; } if (!bCovered) { *Laa = matlab_normpdf(t, c->aa_cluster_stats.theta_mean, c->aa_cluster_stats.theta_dev) * matlab_normpdf(r, c->aa_cluster_stats.r_mean, c->aa_cluster_stats.r_dev); *Lab = matlab_normpdf(t, c->ab_cluster_stats.theta_mean, c->ab_cluster_stats.theta_dev) * matlab_normpdf(r, c->ab_cluster_stats.r_mean, c->ab_cluster_stats.r_dev); *Lbb = matlab_normpdf(t, c->bb_cluster_stats.theta_mean, c->bb_cluster_stats.theta_dev) * matlab_normpdf(r, c->bb_cluster_stats.r_mean, c->bb_cluster_stats.r_dev); } } // a separate implementation from Illumina can be found in function computeScoreCallPrelim from class GenTrain62 or in // file GenTrain60.cs Illumina, Inc. 
// Illumina GenCall Data Analysis Software. Pub. No. 370-2004-009 (2004)
// To call genotypes for an individual’s DNA, the calling algorithm takes the DNA’s intensity values and the
// information generated by the clustering algorithm; subsequently, it then identifies to which cluster the data for
// any specific locus (of the DNA of interest) corresponds. The DNA data is first normalized (using the same procedure
// as for the clustering algorithm). The calling operation (classification) is performed using a Bayesian model. The
// score for each call (GenCall Score) is the product of the GenTrain Score and a data-to-model fit score. After
// scoring all the loci in the DNA of interest, the application computes a composite score for that DNA (DNA Score).
// Subsequently, the GenCall score of each locus for this DNA is further penalized by the DNA Score.
// Shen, R. et al. High-throughput SNP genotyping on universal bead arrays. Mutat Res, 573, 70–82 (2005-06-03)
// A quality score, the GenCall score, is calculated for each SNP call, reflecting the degree of separation between
// homozygote and heterozygote clusters for that SNP and the placement of the individual call within a cluster. To
// make a genotype call, the software looks at many factors but one of the first is the distribution of beads of the
// same type and in this way outliers are rejected to ensure genotyping accuracy. The GenCall score is composed of
// various sub-scores, of which the most important one is the clustering score. This score is a locus-specific score,
// and is computed by a fuzzy logic inference system. It varies from 0.0 to 1.0, and correlates with accuracy of the
// genotype call. GenCall scores have been shown to correlate with the accuracy of the genotyping call.
static float compute_score_call_prelim(float r, float t, const ClusterRecord *cluster_record, uint8_t *iAPmax) { if (r < cluster_record->intensity_threshold) { *iAPmax = (uint8_t)0; return (float)NAN; } double omega = 1.0; double Den = 1.0 + 3.0 * omega; ClusterRecord *c = gen_std_flair(cluster_record); // likelihoods double Laa; double Lab; double Lbb; if (isnan(t)) { Laa = Lab = Lbb = NAN; } else { modilik(c, t, r, &Laa, &Lab, &Lbb); if (isnan(Laa)) Laa = 0.0; if (isnan(Lab)) Lab = 0.0; if (isnan(Lbb)) Lbb = 0.0; } int N = c->aa_cluster_stats.N + c->ab_cluster_stats.N + c->bb_cluster_stats.N; // priors double Paa = ((double)((float)c->aa_cluster_stats.N / (float)N) + omega) / Den; double Pab = ((double)((float)c->ab_cluster_stats.N / (float)N) + omega) / Den; double Pbb = ((double)((float)c->bb_cluster_stats.N / (float)N) + omega) / Den; double Evidence = Laa * Paa + Lab * Pab + Lbb * Pbb; // posteriors double APaa = Laa * Paa / Evidence; double APab = Lab * Pab / Evidence; double APbb = Lbb * Pbb / Evidence; // if (APaa >= APab && APaa >= APbb) *iAPmax = (uint8_t)1; // AA else if (APab >= APaa && APab >= APbb) *iAPmax = (uint8_t)2; // AB else if (APbb >= APaa && APbb >= APab) *iAPmax = (uint8_t)3; // BB else *iAPmax = (uint8_t)0; // NC double mx = 0.0; double scndmx = 0.0; if (APaa > APab) { mx = APaa; scndmx = APab; } else { mx = APab; scndmx = APaa; } if (APbb > mx) { scndmx = mx; mx = APbb; } else if (APbb > scndmx) { scndmx = APbb; } double ap_ratio = mx / (DBL_MIN * DBL_EPSILON + scndmx); double ap_lod = log10(ap_ratio); double score_ap = matlab_smf((float)ap_lod, 0.0f, 2.0f); float score_r1 = matlab_smf(r, 0.0f, 0.1f); float score_r2 = 0.0f; float score_r3 = 0.0f; float score_r4 = 0.0f; int numClusters = 0; if (c->aa_cluster_stats.N > 0) numClusters++; if (c->ab_cluster_stats.N > 0) numClusters++; if (c->bb_cluster_stats.N > 0) numClusters++; float score_misclust; if (numClusters == 1 && c->ab_cluster_stats.N == 0) score_misclust = 0.7f; else if 
(numClusters != 3) score_misclust = 0.95f; else score_misclust = 1.0f; float score_t = 1.0f; float RdropBegin = 6.0f; float RdropEnd = 12.0f; switch (*iAPmax) { case 1: // AA score_t = matlab_zmf(t, c->aa_cluster_stats.theta_mean + 2.0f * c->aa_cluster_stats.theta_dev, c->aa_cluster_stats.theta_mean + 6.0f * c->aa_cluster_stats.theta_dev); score_r2 = matlab_smf(r, 0.0f, c->aa_cluster_stats.r_mean / 10.0f); score_r3 = matlab_smf(r, c->aa_cluster_stats.r_mean - 6.0f * c->aa_cluster_stats.r_dev, c->aa_cluster_stats.r_mean - 2.0f * c->aa_cluster_stats.r_dev); score_r4 = matlab_zmf(r, c->aa_cluster_stats.r_mean + RdropBegin * c->aa_cluster_stats.r_dev, c->aa_cluster_stats.r_mean + RdropEnd * c->aa_cluster_stats.r_dev); break; case 2: // AB score_t = matlab_zmf(fabsf((t - c->ab_cluster_stats.theta_mean) / c->ab_cluster_stats.theta_dev), 2.0f, 6.0f); score_r2 = matlab_smf(r, 0.0f, c->ab_cluster_stats.r_mean / 10.0f); score_r3 = matlab_smf(r, c->ab_cluster_stats.r_mean - 6.0f * c->ab_cluster_stats.r_dev, c->ab_cluster_stats.r_mean - 2.0f * c->ab_cluster_stats.r_dev); score_r4 = matlab_zmf(r, c->ab_cluster_stats.r_mean + RdropBegin * c->ab_cluster_stats.r_dev, c->ab_cluster_stats.r_mean + RdropEnd * c->ab_cluster_stats.r_dev); break; case 3: // BB score_t = matlab_smf(t, c->bb_cluster_stats.theta_mean - 6.0f * c->bb_cluster_stats.theta_dev, c->bb_cluster_stats.theta_mean - 2.0f * c->bb_cluster_stats.theta_dev); score_r2 = matlab_smf(r, 0.0f, c->bb_cluster_stats.r_mean / 10.0f); score_r3 = matlab_smf(r, c->bb_cluster_stats.r_mean - 6.0f * c->bb_cluster_stats.r_dev, c->bb_cluster_stats.r_mean - 2.0f * c->bb_cluster_stats.r_dev); score_r4 = matlab_zmf(r, c->bb_cluster_stats.r_mean + RdropBegin * c->bb_cluster_stats.r_dev, c->bb_cluster_stats.r_mean + RdropEnd * c->bb_cluster_stats.r_dev); break; } float score_r = score_r1 * score_r2 * score_r3 * score_r4; float score_n = 1.0f; if (isnan(cluster_record->cluster_score.total_score)) score_t = score_r = (float)NAN; if (isnan(t)) 
score_t = score_r = score_n = (float)NAN; float score_call_prelim = (float)score_ap * cluster_record->cluster_score.total_score * score_t * score_r * score_n * score_misclust; free(c); return score_call_prelim; } // http://www.mathworks.com/help/fuzzy/gbellmf.html static double matlab_gbellmf(double x, double a, double b, double c) { double tmp = sqr((x - c) / a); if (tmp == 0.0 && b == 0.0) return 0.5; return 1.0 / (1.0 + pow(tmp, b)); } // a separate implementation from Illumina can be found in function gencall_score_map from class GenTrain62 or in file // GenTrain60.cs 0.35 = 0.5 * 0.7 0.504 = 0.8 × 0.7 × 0.9 1.71 = 0.9 * 1.9 1.08 = 0.9 * 1.2 static double gencall_score_map(double x) { return pow(x, 0.35) * matlab_gbellmf(x, 0.504, 1.71, 1.08); } static inline char rev_allele(char allele) { static const char allele_complement[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'T', 0, 'G', 'D', 0, 0, 'C', 0, 'I', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'A', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; if (allele > 95) return 0; return allele_complement[(int)allele]; } // a separate implementation from Illumina can be found in function GetBaseCall from class AutoCallPollerThread static void get_base_call(const char *snp, const char *ilmn_strand, uint8_t genotype, BaseCall *base_call) { char a = toupper(ilmn_strand[0]) == 'T' ? snp[1] : rev_allele(snp[1]); char b = toupper(ilmn_strand[0]) == 'T' ? 
snp[3] : rev_allele(snp[3]); switch (genotype) { case 1: (*base_call)[0] = a; (*base_call)[1] = a; return; case 2: (*base_call)[0] = a; (*base_call)[1] = b; return; case 3: (*base_call)[0] = b; (*base_call)[1] = b; return; } (*base_call)[0] = '-'; (*base_call)[1] = '-'; } // a separate implementation from Illumina can be found in function MakeCalls from class AutoCallPollerThread static void make_calls(gtc_t *gtc, const bpm_t *bpm, const egt_t *egt, float gencall_cutoff, int allow_missing_clusters) { int i, n = bpm->num_loci; gtc->sample_data.num_calls = 0; gtc->sample_data.num_intensity_only = 0; for (i = 0; i < n; i++) { LocusEntry *locus_entry = &bpm->locus_entries[i]; uint16_t raw_x = gtc->raw_x[i]; uint16_t raw_y = gtc->raw_y[i]; float norm_x = -NAN; float norm_y = -NAN; float ilmn_theta = -NAN; float ilmn_r = -NAN; if (raw_x || raw_y) { int norm_id = bpm->norm_lookups[bpm->norm_ids[i]]; XForm *xform = >c->normalization_transforms[norm_id]; raw_x_y2norm_x_y(gtc->raw_x[i], gtc->raw_y[i], xform->offset_x, xform->offset_y, gtc->cos_theta[norm_id], gtc->sin_theta[norm_id], xform->shear, xform->scale_x, xform->scale_y, &norm_x, &norm_y); norm_x_y2ilmn_theta_r(norm_x, norm_y, &ilmn_theta, &ilmn_r); int idx; int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx); if (ret < 0) { if (allow_missing_clusters) { fprintf(stderr, "Warning: Illumina probe %s not found in cluster file\n", locus_entry->name); gtc->genotype_scores[i] = 0.0f; gtc->genotypes[i] = 0; } else { error("Illumina probe %s not found in cluster file\nUse --allow-missing-clusters to allow this\n", locus_entry->name); } } else { ClusterRecord *cluster_record = &egt->cluster_records[idx]; float min_dispersion_r = 0.1f; if (cluster_record->aa_cluster_stats.r_dev < min_dispersion_r) cluster_record->aa_cluster_stats.r_dev = min_dispersion_r; if (cluster_record->ab_cluster_stats.r_dev < min_dispersion_r) cluster_record->ab_cluster_stats.r_dev = min_dispersion_r; if 
(cluster_record->bb_cluster_stats.r_dev < min_dispersion_r) cluster_record->bb_cluster_stats.r_dev = min_dispersion_r; uint8_t genotype; float score_call_prelim = compute_score_call_prelim(ilmn_r, ilmn_theta, cluster_record, &genotype); float score_call = (float)gencall_score_map(score_call_prelim); gtc->genotype_scores[i] = isnan(score_call) ? 0.0f : score_call; gtc->genotypes[i] = genotype; } } else { gtc->genotype_scores[i] = 0.0f; gtc->genotypes[i] = 0; } if (gtc->genotype_scores[i] < gencall_cutoff) gtc->genotypes[i] = 0; if (locus_entry->intensity_only) { gtc->genotypes[i] = 0; gtc->sample_data.num_intensity_only++; } if (gtc->genotypes[i]) gtc->sample_data.num_calls++; get_base_call(locus_entry->snp, locus_entry->ilmn_strand, gtc->genotypes[i], >c->base_calls[i]); } gtc->sample_data.num_no_calls = gtc->num_snps - gtc->sample_data.num_intensity_only - gtc->sample_data.num_calls; gtc->call_rate = (float)gtc->sample_data.num_calls / ((float)gtc->num_snps - (float)gtc->sample_data.num_intensity_only + FLT_MIN * FLT_EPSILON); } typedef struct { int version; int min_loci; int max_loci; int min_x_loci; int min_y_loci; float call_rate_threshold; float y_threshold; float x_threshold; float x_het_rate_threshold; } gender_t; // a separate implementation from Illumina can be found in function EstimateGender from class AutoCallPollerThread // TODO what happened here to gender->max_loci? static void estimate_gender(gtc_t *gtc, const bpm_t *bpm, const egt_t *egt, const gender_t *gender) { int i, n = bpm->num_loci; int x_count = 0; int x_hets_count = 0; int x_non_missing_count = 0; int y_count = 0; int auto_count = 0; int auto_non_missing_count = 0; float *r_x = (float *)malloc(n * sizeof(float)); float *r_y = (float *)malloc(n * sizeof(float)); for (i = 0; i < n; i++) { LocusEntry *locus_entry = &bpm->locus_entries[i]; const char *chrom = strncasecmp(locus_entry->chrom, "CHR", 3) == 0 ? 
locus_entry->chrom + 3 : locus_entry->chrom; int idx; int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx); if (ret < 0) continue; ClusterRecord *cluster_record = &egt->cluster_records[idx]; int norm_id = bpm->norm_lookups[bpm->norm_ids[i]]; XForm *xform = >c->normalization_transforms[norm_id]; float norm_x, norm_y, t; if (cluster_record->cluster_score.total_score != 0.0f) { if (strcmp(chrom, "X") == 0) { raw_x_y2norm_x_y(gtc->raw_x[i], gtc->raw_y[i], xform->offset_x, xform->offset_y, gtc->cos_theta[norm_id], gtc->sin_theta[norm_id], xform->shear, xform->scale_x, xform->scale_y, &norm_x, &norm_y); norm_x_y2ilmn_theta_r(norm_x, norm_y, &t, &r_x[x_count]); if (gtc->genotypes[i] == 2) x_hets_count++; if (gtc->genotypes[i]) x_non_missing_count++; x_count++; } else if (strcmp(chrom, "Y") == 0) { raw_x_y2norm_x_y(gtc->raw_x[i], gtc->raw_y[i], xform->offset_x, xform->offset_y, gtc->cos_theta[norm_id], gtc->sin_theta[norm_id], xform->shear, xform->scale_x, xform->scale_y, &norm_x, &norm_y); norm_x_y2ilmn_theta_r(norm_x, norm_y, &t, &r_y[y_count]); y_count++; } else if (strcmp(chrom, "XY") != 0 && strcmp(chrom, "MT") != 0) { auto_count++; if (gtc->genotypes[i]) auto_non_missing_count++; } } } gtc->gender = 'U'; if (gender->version == 1 || y_count < gender->min_y_loci || auto_count < gender->min_loci) { if (x_non_missing_count > gender->min_x_loci) { gtc->gender = !((double)((float)x_hets_count / (float)x_non_missing_count) > gender->x_het_rate_threshold) ? 
'M' : 'F'; } } else if (auto_count > 0 && (double)auto_non_missing_count / (double)auto_count > gender->call_rate_threshold) { for (i = 0; i < y_count; i++) if (isnan(r_y[i]) || isinf(r_y[i])) r_y[i] = 0.0f; float y_med = matlab_median(y_count, r_y); if ((double)y_med > gender->y_threshold) { gtc->gender = 'M'; } else if (x_count < gender->min_x_loci) { gtc->gender = 'F'; } else { for (i = 0; i < x_count; i++) if (isnan(r_x[i]) || isinf(r_x[i])) r_x[i] = 0.0f; float x_med = matlab_median(x_count, r_x); gtc->gender = (double)x_med > gender->x_threshold ? 'F' : 'U'; } } free(r_x); free(r_y); } // compute BAF and LRR from Theta and R as explained in Peiffer, D. A. et al. High-resolution genomic profiling of // chromosomal aberrations using Infinium whole-genome genotyping. Genome Res. 16, 1136–1148 (2006) // Peiffer, D. A. et al. High-resolution genomic profiling of chromosomal aberrations using Infinium whole-genome // genotyping. Genome Res., 16, 1136–1148 (2006-08-09) static inline void get_baf_lrr(float ilmn_theta, float ilmn_r, float aa_theta, float ab_theta, float bb_theta, float aa_r, float ab_r, float bb_r, float r_mean, float *baf, float *lrr) { float r_ref; if (ilmn_theta == ab_theta) { r_ref = ab_r; *baf = 0.5f; } else if (ilmn_theta < ab_theta) { r_ref = aa_r + (ilmn_theta - aa_theta) * (aa_r - ab_r) / (aa_theta - ab_theta); *baf = (ilmn_theta - aa_theta) / (ab_theta - aa_theta) * 0.5f; } else if (ilmn_theta > ab_theta) { r_ref = ab_r + (ilmn_theta - ab_theta) * (bb_r - ab_r) / (bb_theta - ab_theta); *baf = 0.5f + (ilmn_theta - ab_theta) / (bb_theta - ab_theta) * 0.5f; } else { *lrr = -NAN; *baf = -NAN; return; } *lrr = ilmn_r != 0.0f ? (float)log2(ilmn_r / (isnan(r_mean) ? 
r_ref : r_mean)) : -FLT_MAX; } // a separate implementation from Illumina can be found in functions CalculateLogRDev and CalculateBAlleleFreq from // class AutoCallPollerThread static void calculate_baf_lrr(gtc_t *gtc, const bpm_t *bpm, const egt_t *egt) { int i, count = 0, n = bpm->num_loci; double sum = 0.0; double sum2 = 0.0; for (i = 0; i < n; i++) { LocusEntry *locus_entry = &bpm->locus_entries[i]; int idx; int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx); if (ret < 0) continue; ClusterRecord *c = &egt->cluster_records[idx]; float baf = -NAN; float lrr = -NAN; if ((gtc->raw_x[i] || gtc->raw_y[i]) && c) { int norm_id = bpm->norm_lookups[bpm->norm_ids[i]]; XForm *xform = >c->normalization_transforms[norm_id]; float norm_x, norm_y, t, r; raw_x_y2norm_x_y(gtc->raw_x[i], gtc->raw_y[i], xform->offset_x, xform->offset_y, gtc->cos_theta[norm_id], gtc->sin_theta[norm_id], xform->shear, xform->scale_x, xform->scale_y, &norm_x, &norm_y); norm_x_y2ilmn_theta_r(norm_x, norm_y, &t, &r); get_baf_lrr(t, r, c->aa_cluster_stats.theta_mean, c->ab_cluster_stats.theta_mean, c->bb_cluster_stats.theta_mean, c->aa_cluster_stats.r_mean, c->ab_cluster_stats.r_mean, c->bb_cluster_stats.r_mean, locus_entry->intensity_only ? c->r_mean : NAN, &baf, &lrr); } gtc->b_allele_freqs[i] = baf < 0.0 ? 0.0f : baf > 1.0 ? 1.0f : (float)baf; gtc->logr_ratios[i] = (float)lrr; char start_chrom = strncasecmp(locus_entry->chrom, "CHR", 3) == 0 ? 
toupper(locus_entry->chrom[3]) : toupper(locus_entry->chrom[0]); if (!locus_entry->intensity_only && (start_chrom != 'X' && start_chrom != 'Y' && start_chrom != 'M') && !isinf(lrr) && !isnan(lrr)) { sum += (double)lrr; sum2 += sqr((double)lrr); count++; } } gtc->logr_dev = (float)sqrt(sum2 / (double)count - sqr(sum / (double)count)); } // a separate implementation from Illumina can be found in function CalculateIntensityPercentiles from class // AutoCallPollerThread static void calculate_intensity_percentiles(gtc_t *gtc) { int i, n = gtc->num_snps; float *xs = (float *)malloc(n * sizeof(float)); float *ys = (float *)malloc(n * sizeof(float)); for (i = 0; i < n; i++) { xs[i] = (float)gtc->raw_x[i]; ys[i] = (float)gtc->raw_y[i]; } ks_introsort_float((size_t)n, xs); ks_introsort_float((size_t)n, ys); gtc->percentiles_x[0] = (uint16_t)percentile(n, xs, 5); gtc->percentiles_x[1] = (uint16_t)percentile(n, xs, 50); gtc->percentiles_x[2] = (uint16_t)percentile(n, xs, 95); gtc->percentiles_y[0] = (uint16_t)percentile(n, ys, 5); gtc->percentiles_y[1] = (uint16_t)percentile(n, ys, 50); gtc->percentiles_y[2] = (uint16_t)percentile(n, ys, 95); free(xs); free(ys); } // a separate implementation from Illumina can be found in function ComputeSampleStats from class AutoCallPollerThread // Illumina, Inc. Illumina GenCall Data Analysis Software. Pub. No. 370-2004-009 (2004) // GenCall Scores may be averaged among DNAs and // among loci for purposes of evaluating the quality of the // genotyping within a particular DNA or locus. For example, // we often evaluate “GC10” and “GC50” scores that are calcu- // lated by taking the 10th percentile and the 50th percentile // (median) of the GenCall Scores for a certain locus, respec- // tively. Using GC10 and GC50 Scores, a user may choose // to fail particularly poor performing loci, for instance, // by discarding loci with GC10 of 0.1 or lower. 
Also, a series // of aggregate statistics (i.e., average) of the GC10 or GC50 // scores for each DNA can be used to identify low-quality // DNAs (for instance, a user may discard DNA samples with // average GC10 scores of 0.2 or lower). The GenCall Score // can also be used in situations where users have a mini- // mum required call rate. This rate translates to making // calls on a certain percentile of the data. Users can sort // all their genotypes based on the GenCall Score, and then // choose the top (Nth) percentile of interest for their study. static void compute_sample_stats(gtc_t *gtc, const bpm_t *bpm, float gencall_cutoff) { int i, j, n = gtc->num_snps; float *gs = (float *)malloc(n * sizeof(float)); for (i = 0, j = 0; i < n; i++) { LocusEntry *locus_entry = &bpm->locus_entries[i]; if (gtc->genotype_scores[i] > gencall_cutoff && !locus_entry->intensity_only) gs[j++] = (float)gtc->genotype_scores[i]; } ks_introsort_float((size_t)j, gs); gtc->p10gc = percentile(j, gs, 10); gtc->sample_data.p50gc = percentile(j, gs, 50); free(gs); } /**************************************** * CREATE NEW GTC STRUCTURE * ****************************************/ // a separate implementation from Illumina can be found in class MD5ChecksumFile of the Array Analysis CLI static char *basename(const char *fn, const char unsigned *md5_buf) { const char str[] = "(MD5Checksum="; char *ptr = strrchr(fn, '/'); if (ptr) ptr++; else ptr = (char *)fn; char *ret; if (md5_buf) { int len = strlen(ptr); ret = (char *)malloc((len + 47) * sizeof(char)); memcpy((void *)ret, (void *)ptr, (size_t)len); ptr = ret + len; memcpy((void *)ptr, &str, sizeof(str) - 1); ptr += sizeof(str) - 1; hts_md5_hex(ptr, md5_buf); ptr += 32; *(ptr++) = ')'; *ptr = 0; } else { ret = strdup(ptr); } return ret; } // TODO this should be done once only for the BPM structure static int32_t *get_control_addresses(const char *str, int *n_addresses) { int i, j; int moff = 0, *off = NULL; int moff2 = 0, *off2 = NULL; int32_t 
*addresses = NULL; int m_addresses = 0; *n_addresses = 0; char *s = strdup(str); int noff = ksplit_core(s, '\n', &moff, &off); for (i = 0; i < noff; i++) { char *ptr = strchr(&s[off[i]], ','); *ptr = '\0'; int noff2 = ksplit_core(&s[off[i]], ':', &moff2, &off2); hts_expand(int32_t, *n_addresses + noff2, m_addresses, addresses); for (j = 0; j < noff2; j++) { char *endptr; addresses[*n_addresses + j] = (int32_t)strtol(&s[off[i] + off2[j]], &endptr, 10); } *n_addresses += noff2; } free(s); free(off); free(off2); return addresses; } static char *get_string_parameter(const char *str, const char *id) { const char *ptr = strstr(str, id); if (!ptr) return NULL; ptr += strlen(id); if (*ptr != '=') return NULL; ptr++; const char *ptr2 = strchr(ptr, '|'); return strndup(ptr, ptr2 ? ptr2 - ptr : strlen(ptr)); } static int32_t get_int32_parameter(const char *str, const char *id) { const char *ptr = strstr(str, id); if (!ptr) return 0; ptr += strlen(id); if (*ptr != '=') return 0; ptr++; char *endptr; return (int32_t)strtol(ptr, &endptr, 10); } // a separate implementation from Illumina can be found in function LoadSampleSection from class SampleData // AutoConvert used the creation time of the IDAT file for the imaging date field of the GTC file: // imagingDate = fileInfo.CreationTime.ToLongDateString() + " " + fileInfo.CreationTime.ToLongTimeString(); // this was later updated to instead use the imaging date field of the last Scan entry in the IDAT file static void load_sample_section(gtc_t *gtc, const idat_t *idat, int imaging_date) { int i; RunInfo *run_info = NULL; for (i = 0; i < idat->m_run_infos; i++) if (strcmp(idat->run_infos[i].block_type, "Scan") == 0) run_info = &idat->run_infos[i]; if (run_info) { gtc->imaging_date = imaging_date ? 
strdup(run_info->run_time) : NULL; gtc->scanner_data.scanner_name = get_string_parameter(run_info->block_pars, "sherlockID"); gtc->scanner_data.pmt_green = get_int32_parameter(run_info->block_pars, "PMTGainCY3"); gtc->scanner_data.pmt_red = get_int32_parameter(run_info->block_pars, "PMTGainCY5"); gtc->scanner_data.scanner_version = strdup(run_info->code_version); gtc->scanner_data.imaging_user = get_string_parameter(run_info->block_pars, "Username"); } } static int32_t get32_index(void *dict, int32_t key) { khash_t(32) *hash = (khash_t(32) *)dict; khiter_t k = kh_get(32, hash, key); if (k == kh_end(hash)) return -1; return kh_val(hash, k); } // a separate implementation from Illumina can be found in function fillArray from class SampleData static void fill_array(const idat_t *grn_idat, const idat_t *red_idat, const bpm_t *bpm, gtc_t *gtc) { int i; int32_t idx1, idx2; for (i = 0; i < bpm->num_loci; i++) { LocusEntry *locus_entry = &bpm->locus_entries[i]; if (locus_entry->assay_type == 0) { // 0 - Infinium II probes idx1 = get32_index(red_idat->ilmn_id2index, locus_entry->address_a); idx2 = get32_index(grn_idat->ilmn_id2index, locus_entry->address_a); if (idx1 == -1 || idx2 == -1) continue; // warning? if (red_idat->nbeads[idx1] >= 2 && grn_idat->nbeads[idx2] >= 2) { gtc->raw_x[i] = red_idat->trimmed_mean[idx1]; gtc->raw_y[i] = grn_idat->trimmed_mean[idx2]; } } else if (locus_entry->assay_type == 1) { // 1 - Infinium I (A/T) probes idx1 = get32_index(red_idat->ilmn_id2index, locus_entry->address_a); idx2 = get32_index(red_idat->ilmn_id2index, locus_entry->address_b); if (idx1 == -1 || idx2 == -1) continue; // warning? 
if (red_idat->nbeads[idx1] >= 2 && red_idat->nbeads[idx2] >= 2) { gtc->raw_x[i] = red_idat->trimmed_mean[idx1]; gtc->raw_y[i] = red_idat->trimmed_mean[idx2]; } } else if (locus_entry->assay_type == 2) { // 2 - Infinium I (G/C) probes idx1 = get32_index(grn_idat->ilmn_id2index, locus_entry->address_a); idx2 = get32_index(grn_idat->ilmn_id2index, locus_entry->address_b); if (idx1 == -1 || idx2 == -1) continue; // warning? if (grn_idat->nbeads[idx1] >= 2 && grn_idat->nbeads[idx2] >= 2) { gtc->raw_x[i] = grn_idat->trimmed_mean[idx1]; gtc->raw_y[i] = grn_idat->trimmed_mean[idx2]; } } else { error("Assay type %d for probe %s not valid\n", locus_entry->assay_type, locus_entry->ilmn_id); } } } // a separate implementation from Illumina can be found in function fillControlsArray from class SampleData static void fill_controls_array(const idat_t *grn_idat, const idat_t *red_idat, const bpm_t *bpm, gtc_t *gtc) { int i, n_controls; int32_t idx1, idx2; int *control_addresses = get_control_addresses(bpm->control_config, &n_controls); gtc->m_controls_x = n_controls; gtc->m_controls_y = n_controls; gtc->controls_x = (uint16_t *)calloc(n_controls, sizeof(uint16_t)); gtc->controls_y = (uint16_t *)calloc(n_controls, sizeof(uint16_t)); for (i = 0; i < n_controls; i++) { idx1 = get32_index(red_idat->ilmn_id2index, control_addresses[i]); idx2 = get32_index(grn_idat->ilmn_id2index, control_addresses[i]); if (idx1 == -1 || idx2 == -1) continue; // warning? 
gtc->controls_x[i] = red_idat->trimmed_mean[idx1]; gtc->controls_y[i] = grn_idat->trimmed_mean[idx2]; } free(control_addresses); } // a separate implementation from Illumina can be found in function Process from class AutoCallPollerThread static gtc_t *gtc_init(const idat_t *grn_idat, const idat_t *red_idat, const bpm_t *bpm, const egt_t *egt, int gentrain_version, int gtc_file_version, float gencall_cutoff, int sample_name, int checksums, int imaging_date, const char *autocall_date_format, const char *autocall_version, int allow_missing_clusters, const gender_t *gender) { if (!grn_idat || !red_idat || !bpm) return NULL; gtc_t *gtc = (gtc_t *)calloc(1, sizeof(gtc_t)); gtc->version = gtc_file_version; gtc->ploidy = 2; gtc->ploidy_type = 1; gtc->sample_name = red_idat->sample_name ? strdup(red_idat->sample_name) : NULL; char *ptr, *ptr2; if (!gtc->sample_name && sample_name) { ptr = strrchr(grn_idat->fn, '/'); if (ptr) ptr++; else ptr = grn_idat->fn; ptr2 = strstr(ptr, "_Grn.idat"); gtc->sample_name = strndup(ptr, ptr2 - ptr); } gtc->sample_plate = red_idat->sample_plate ? strdup(red_idat->sample_plate) : NULL; gtc->sample_well = red_idat->sample_well ? strdup(red_idat->sample_well) : NULL; gtc->sentrix_id = red_idat->sentrix_barcode ? strdup(red_idat->sentrix_barcode) : NULL; if (egt) gtc->cluster_file = basename(egt->fn, checksums && egt ? egt->md5_buf : NULL); gtc->snp_manifest = basename(bpm->fn, checksums ? 
bpm->md5_buf : NULL); time_t timer; char buffer[26]; struct tm *tm_info; timer = time(NULL); tm_info = localtime(&timer); strftime(buffer, 26, autocall_date_format, tm_info); gtc->autocall_date = strdup(buffer); gtc->autocall_version = strdup(autocall_version); gtc->num_snps = bpm->num_loci; gtc->raw_x = (uint16_t *)calloc(gtc->num_snps, sizeof(uint16_t)); gtc->raw_y = (uint16_t *)calloc(gtc->num_snps, sizeof(uint16_t)); gtc->genotypes = (uint8_t *)calloc(gtc->num_snps, sizeof(uint8_t)); gtc->base_calls = (BaseCall *)malloc(gtc->num_snps * sizeof(BaseCall)); memset(gtc->base_calls, '-', gtc->num_snps * sizeof(BaseCall)); gtc->genotype_scores = (float *)calloc(gtc->num_snps, sizeof(float)); gtc->b_allele_freqs = (float *)calloc(gtc->num_snps, sizeof(float)); gtc->logr_ratios = (float *)calloc(gtc->num_snps, sizeof(float)); fill_array(grn_idat, red_idat, bpm, gtc); fprintf(stderr, "Normalizing...\n"); gtc->normalization_transforms = normalize(gtc->num_snps, gtc->raw_x, gtc->raw_y, bpm->norm_ids, gentrain_version, >c->m_normalization_transforms); gtc->sin_theta = (float *)malloc(gtc->m_normalization_transforms * sizeof(float)); gtc->cos_theta = (float *)malloc(gtc->m_normalization_transforms * sizeof(float)); int i; for (i = 0; i < gtc->m_normalization_transforms; i++) { gtc->sin_theta[i] = (float)sin((double)gtc->normalization_transforms[i].theta); gtc->cos_theta[i] = (float)cos((double)gtc->normalization_transforms[i].theta); } if (egt) { fprintf(stderr, "Calling...\n"); make_calls(gtc, bpm, egt, gencall_cutoff, allow_missing_clusters); fprintf(stderr, "Call rate: %.7f\n", gtc->call_rate); estimate_gender(gtc, bpm, egt, gender); fprintf(stderr, "Gender: %s\n", gtc->gender == 'M' ? "Male" : gtc->gender == 'F' ? 
"Female" : "Unknown"); calculate_baf_lrr(gtc, bpm, egt); compute_sample_stats(gtc, bpm, gencall_cutoff); } calculate_intensity_percentiles(gtc); fill_controls_array(grn_idat, red_idat, bpm, gtc); load_sample_section(gtc, red_idat, imaging_date); return gtc; } /**************************************** * PLUGIN * ****************************************/ const char *about(void) { return "Convert Illumina IDAT files for Infinium arrays to GTC files.\n"; } static const char *usage_text(void) { return "\n" "About: convert Illumina IDAT files for Infinium arrays to GTC files.\n" "(version " IDAT2GTC_VERSION " http://github.com/freeseek/idat2vcf)\n" "[ Kermani, B. G. Artificial intelligence and global normalization methods for\n" " genotyping. U.S. Patents No. 7,035,740 (2005-09-29) and 7,467,117 (2006-10-05) ]\n" "[ Peiffer, D. A. et al. High-resolution genomic profiling of chromosomal aberrations\n" " using Infinium whole-genome genotyping. Genome Res., 16, 1136–1148 (2006-08-09) ]\n" "[ Illumina, Inc. Illumina GenCall Data Analysis Software. Pub. No. 370-2004-009 (2004) ]\n" "[ Illumina, Inc. Illumina’s Genotyping Data Normalization Methods. Pub. No. 970-2006-010 (2006-09-26) ]\n" "[ Illumina, Inc. Improved Cluster Generation with Gentrain2. Pub. No. 037-2009-015 (2009-01-26)]\n" "[ Illumina, Inc. Improved Genotype Clustering with GenTrain 3.0. Pub. No. 
370-2016-015-A (2016) ]\n" "Usage: bcftools +idat2gtc --bpm [options]\n" "\n" "Plugin options:\n" " -b, --bpm BPM manifest file\n" " -e, --egt EGT cluster file\n" " -i, --idats IDAT files from directory\n" " -g, --grn-idats file with list of green IDATs\n" " -r, --red-idats file with list of red IDATs\n" " -o, --output write output to a directory\n" " -v, --gentrain-version whether to use GenTrain 2.0 (2) or GenTrain 3.0 (3) for " "normalization [3]\n" " -c, --gencall-cutoff cutoff score for GenCall algorithm [0.15]\n" " --snp-map create SNP map file\n" " --do-not-check-eof do not check whether the BPM and EGT readers reach the end of the " "file\n" " --preset Illumina AutoCall software to emulate [4]\n" " AutoConvert (1), AutoConvert 2.0 (2), IAAP CLI (3), Array Analysis " "CLI (4)\n" "GTC output files options:\n" " --gtc-version whether use the old (3) or the new (5) GTC file format [5]\n" " --no-sample-name leave sample name empty if missing from IDAT files\n" " --no-checksums do not include cluster and manifest files checksums\n" " --no-imaging-date do not include imaging date\n" " --autocall-date AutoCall date format to use [" AUTOCALL_DATE_FORMAT_DFLT "]\n" " --autocall-version AutoCall version label to use [" AUTOCALL_VERSION_DFLT "]\n" " --allow-missing-clusters BPM manifest file variants can be missing from the EGT cluster file" "\n" "Gender estimation options:\n" " --gender-version whether to only use heterozygosity (1) or also intensities (2) " "[2]\n" " --min-loci minimum number of autosomal loci for gender estimation [100]\n" " --max-loci maximum number of autosomal loci for gender estimation [10000]\n" " --min-x-loci minimum number of X loci for gender estimation [20]\n" " --min-y-loci minimum number of Y loci for gender estimation [20]\n" " --call-rate-threshold threshold for autosomal call rate for gender estimation [0.0]\n" " --y-threshold threshold for Y intensity for gender estimation [0.3]\n" " --x-threshold threshold for X intensity for gender 
estimation [0.9]\n" " --x-het-rate-threshold threshold for X Het Rate for gender estimation [0.1]\n" "\n" "Examples:\n" " bcftools +idat2gtc --bpm GSA-24v3-0_A1.bpm --egt GSA-24v3-0_A1_ClusterFile.egt \\\n" " 5434246082_R03C01_Grn.idat 5434246082_R03C01_Red.idat\n" " bcftools +idat2gtc --bpm GSA-24v3-0_A1.bpm --egt GSA-24v3-0_A1_ClusterFile.egt --snp-map " "GSA-24v3-0_A1.bpm.csv\n" " bcftools +idat2gtc --bpm GSA-24v3-0_A1.bpm --egt GSA-24v3-0_A1_ClusterFile.egt --gentrain-version 2 " "--gtc-version 3 \\\n" " --no-sample-name --no-checksums --no-imaging-date --autocall-date \"\" --autocall-version 1.6.3.1 " "--gender-version 1\n" " bcftools +idat2gtc --bpm GSA-24v3-0_A1.bpm --egt GSA-24v3-0_A1_ClusterFile.egt --no-sample-name \\\n" " --no-checksums --autocall-date \"\" --autocall-version 2.0.1.179 --min-loci 10 --max-loci 100\n" " bcftools +idat2gtc --bpm GSA-24v3-0_A1.bpm --egt GSA-24v3-0_A1_ClusterFile.egt --no-sample-name \\\n" " --no-checksums --autocall-date \"\"\n" "\n"; } static inline FILE *get_file_handle(const char *str) { if (!str) return NULL; FILE *ret; if (strcmp(str, "-") == 0) { ret = stdout; } else { ret = fopen(str, "w"); if (!ret) error("Failed to open %s: %s\n", str, strerror(errno)); } return ret; } // to recapitulate the .NET behavior of ToString() I need to add a small value // http://stackoverflow.com/questions/2085449 // http://stackoverflow.com/questions/11085052 // http://stackoverflow.com/questions/14325214 static double round_adjust(double x) { double y = 5e-8, z; if (x > 1.0) { z = 1.0; while (x > z) { y *= 10.0; z *= 10.0; } } else { z = 0.1; while (x < z) { y *= 0.1; z *= 0.1; } } return x + y; } // this is the same file that can be generated by AutoConvert, AutoConvert 2.0 or by Picard // BpmToNormalizationManifestCsv most likely this file was generated by AutoConvert to allow other software such as // Illuminus, GenoSNP, Birdseed, optiCall, zCall, and iCall, to normalize intensities across sub-bead pools // 
http://gatk.broadinstitute.org/hc/en-us/articles/360057440631-BpmToNormalizationManifestCsv-Picard // a separate implementation from Illumina can be found in function CreateSNPMapFile from class AutoCallPollerThread static void snp_map_write(const bpm_t *bpm, const egt_t *egt, const char *fn) { int i; FILE *out_txt = get_file_handle(fn); fprintf(out_txt, "Index,Name,Chromosome,Position,GenTrain Score,SNP,ILMN Strand,Customer Strand,NormID\n"); for (i = 0; i < bpm->num_loci; i++) { LocusEntry *locus_entry = &bpm->locus_entries[i]; const char *chrom = strncasecmp(locus_entry->chrom, "CHR", 3) == 0 ? locus_entry->chrom + 3 : locus_entry->chrom; double gentrain_score = NAN; if (egt) { int idx; int ret = khash_str2int_get(egt->names2index, locus_entry->name, &idx); if (ret < 0) error("Illumina probe %s not found in cluster file\n", locus_entry->name); ClusterRecord *cluster_record = &egt->cluster_records[idx]; gentrain_score = round_adjust(cluster_record->cluster_score.total_score); } fprintf(out_txt, "%d,%s,%s,%s,%.4f,%s,%s,%s,%d\n", locus_entry->index, locus_entry->name, chrom, locus_entry->map_info, gentrain_score, locus_entry->snp, locus_entry->ilmn_strand, locus_entry->source_strand, locus_entry->norm_id); } if (out_txt != stdout && out_txt != stderr) fclose(out_txt); } void mkdir_p(const char *fmt, ...); int run(int argc, char *argv[]) { const char *bpm_fname = NULL; const char *egt_fname = NULL; const char *snp_map_fname = NULL; const char *idat_pathname = NULL; const char *grn_idat_fname = NULL; const char *red_idat_fname = NULL; const char *output_pathname = "."; const char *autocall_date_format = AUTOCALL_DATE_FORMAT_DFLT; const char *autocall_version = AUTOCALL_VERSION_DFLT; char *tmp; int gentrain_version = 3; float gencall_cutoff = 0.15; int eof_check = 1; int preset = 4; int gtc_file_version = 5; int sample_name = 1; int checksums = 1; int imaging_date = 1; int allow_missing_clusters = 0; gender_t gender; gender.version = 2; // 1 in AutoConvert 
gender.min_loci = 100; // version 2 gender.max_loci = 10000; // version 2 for downsampling gender.min_x_loci = 20; // shared between version 1 and 2 gender.min_y_loci = 20; // version 2 gender.call_rate_threshold = 0.0; // changed from 0.97 gender.y_threshold = 0.3; // version 2 gender.x_threshold = 0.9; // version 2 gender.x_het_rate_threshold = 0.1; // version 1 static struct option loptions[] = {{"bpm", required_argument, NULL, 'b'}, {"egt", required_argument, NULL, 'e'}, {"idats", required_argument, NULL, 'i'}, {"grn-idats", required_argument, NULL, 'g'}, {"red-idats", required_argument, NULL, 'r'}, {"output", required_argument, NULL, 'o'}, {"gentrain-version", required_argument, NULL, 'v'}, {"gencall-cutoff", required_argument, NULL, 'c'}, {"snp-map", required_argument, NULL, 1}, {"do-not-check-eof", no_argument, NULL, 2}, {"preset", required_argument, NULL, 3}, {"gtc-version", required_argument, NULL, 4}, {"no-sample-name", no_argument, NULL, 5}, {"no-cheksums", no_argument, NULL, 6}, {"no-imaging-date", no_argument, NULL, 7}, {"autocall-date", required_argument, NULL, 8}, {"autocall-version", required_argument, NULL, 9}, {"allow-missing-clusters", no_argument, NULL, 10}, {"gender-version", no_argument, NULL, 11}, {"min-loci", no_argument, NULL, 12}, {"max-loci", no_argument, NULL, 13}, {"min-x-loci", no_argument, NULL, 14}, {"min-y-loci", no_argument, NULL, 15}, {"call-rate-threshold", no_argument, NULL, 16}, {"y-threshold", no_argument, NULL, 17}, {"x-threshold", no_argument, NULL, 18}, {"x-het-rate-threshold", no_argument, NULL, 19}, {NULL, 0, NULL, 0}}; int c; while ((c = getopt_long(argc, argv, "h?b:e:i:g:r:o:v:c:", loptions, NULL)) >= 0) { switch (c) { case 'b': bpm_fname = optarg; break; case 'e': egt_fname = optarg; break; case 'i': idat_pathname = optarg; break; case 'g': grn_idat_fname = optarg; break; case 'r': red_idat_fname = optarg; break; case 'o': output_pathname = optarg; break; case 'v': gentrain_version = strtol(optarg, &tmp, 0); if (*tmp) 
error("Could not parse: --gentrain-version %s\n", optarg); if (gentrain_version != 2 && gentrain_version != 3) error("The --gentrain-version option only allows values 2, and 3\n%s", usage_text()); break; case 'c': gencall_cutoff = strtof(optarg, &tmp); if (*tmp) error("Could not parse: --gencall-cutoff %s\n", optarg); break; case 1: snp_map_fname = optarg; break; case 2: eof_check = 0; break; case 3: preset = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --preset %s\n", optarg); if (preset < 1 || preset > 4) error("The --preset option only allows values 1, 2, 3, and 4\n%s", usage_text()); switch (preset) { case 1: gentrain_version = 2; gender.version = 1; gtc_file_version = 3; sample_name = 0; checksums = 0; imaging_date = 0; autocall_version = "1.6.3.1"; break; case 2: gentrain_version = 3; gender.version = 2; gender.call_rate_threshold = 0.97; gtc_file_version = 5; sample_name = 0; checksums = 0; imaging_date = 1; autocall_version = "2.0.1.179"; break; case 3: gentrain_version = 3; gender.version = 2; // we did not reimplement the bug of estimating the autosomal call rate including loci with 0 cluster // scores as missing gender.call_rate_threshold = 0.97; gtc_file_version = 5; sample_name = 0; checksums = 0; imaging_date = 1; autocall_version = "3.0.0"; break; case 4: gentrain_version = 3; gender.version = 2; gender.call_rate_threshold = 0.97; gtc_file_version = 5; sample_name = 1; checksums = 1; imaging_date = 1; autocall_version = "3.0.0"; break; } break; case 4: gtc_file_version = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --gtc-version %s\n", optarg); if (gtc_file_version != 3 && gtc_file_version != 5) error("The --gtc-version option only allows values 3, and 5\n%s", usage_text()); break; case 5: sample_name = 0; break; case 6: checksums = 0; break; case 7: imaging_date = 0; break; case 8: autocall_date_format = optarg; break; case 9: autocall_version = optarg; break; case 10: allow_missing_clusters = 1; break; case 11: 
gender.version = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --gender-version %s\n", optarg); if (gender.version != 1 && gender.version != 2) error("The --gender-version option only allows values 1 and 2\n%s", usage_text()); break; case 12: gender.min_loci = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --min-loci %s\n", optarg); break; case 13: gender.max_loci = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --max-loci %s\n", optarg); break; case 14: gender.min_x_loci = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --min-x-loci %s\n", optarg); break; case 15: gender.min_y_loci = strtol(optarg, &tmp, 0); if (*tmp) error("Could not parse: --min-y-loci %s\n", optarg); break; case 16: gender.call_rate_threshold = strtof(optarg, &tmp); if (*tmp) error("Could not parse: --call-rate-threshold %s\n", optarg); break; case 17: gender.y_threshold = strtof(optarg, &tmp); if (*tmp) error("Could not parse: --y-threshold %s\n", optarg); break; case 18: gender.x_threshold = strtof(optarg, &tmp); if (*tmp) error("Could not parse: --x-threshold %s\n", optarg); break; case 19: gender.x_het_rate_threshold = strtof(optarg, &tmp); if (*tmp) error("Could not parse: --x-het-rate-threshold %s\n", optarg); break; case 'h': case '?': default: error("%s", usage_text()); } } if (bpm_fname == NULL) error("The --bpm option is required\n%s", usage_text()); if (idat_pathname != NULL && (grn_idat_fname != NULL || red_idat_fname != NULL)) error("Cannot use option --idats with either option --grn-idats or --red-idats\n%s", usage_text()); if (grn_idat_fname != NULL && red_idat_fname == NULL) error("Option --grn-idats requires option --red-idats\n%s", usage_text()); if (grn_idat_fname == NULL && red_idat_fname != NULL) error("Option --red-idats requires option --grn-idats\n%s", usage_text()); if (idat_pathname == NULL && grn_idat_fname == NULL && red_idat_fname == NULL) { if (snp_map_fname == NULL && argc - optind == 0) error("No IDAT files provided 
as input\n%s", usage_text()); if (argc - optind % 2 == 1) error( "If options --idats/--grn-idats/--red-idats are not used, input an alternating list of green and red " "IDATs\n%s", usage_text()); } fprintf(stderr, "idat2gtc " IDAT2GTC_VERSION " http://github.com/freeseek/gtc2vcf\n"); fprintf(stderr, "Using normalization algorithm version %s\n", gentrain_version == 2 ? "1.1.2" : "1.2.0"); if (strcmp(output_pathname, ".") != 0) mkdir_p("%s/", output_pathname); // read SNP manifest file fprintf(stderr, "Reading BPM file %s\n", bpm_fname); bpm_t *bpm = bpm_init(bpm_fname, eof_check, 0, checksums); // read cluster file egt_t *egt = NULL; if (egt_fname) { fprintf(stderr, "Reading EGT file %s\n", egt_fname); egt = egt_init(egt_fname, eof_check, checksums); if (!strcmp(egt->normalization_version, "1.2.0")) { if (gentrain_version != 3) fprintf(stderr, "Normalization algorithm version %s for cluster file %s corresponds to GenTrain 3.0\n", egt->normalization_version, egt->fn); } else if (!strcmp(egt->normalization_version, "1.1.2")) { if (gentrain_version != 2) fprintf(stderr, "Normalization algorithm version %s for cluster file %s corresponds to GenTrain 2.0\n", egt->normalization_version, egt->fn); } else if (!strcmp(egt->normalization_version, "1.1.0")) { if (gentrain_version != 1) fprintf(stderr, "Normalization algorithm version %s for cluster file %s corresponds to GenTrain 1.0\n", egt->normalization_version, egt->fn); } else { fprintf(stderr, "Normalization algorithm version %s for cluster file %s is not recognized\n", egt->normalization_version, egt->fn); } } else { fprintf(stderr, "No cluster file specified or forcing no cluster use\n"); if (!gentrain_version) gentrain_version = 3; } // write SNP map file if requested if (snp_map_fname) snp_map_write(bpm, egt, snp_map_fname); // generate lists of green and red IDATs to process int i, n = 0; char **grn_idats = NULL; char **red_idats = NULL; if (idat_pathname != NULL) { // this code for now does not recursively looks 
for IDAT files DIR *d = opendir(idat_pathname); if (!d) error("Failed to open directory %s\n", idat_pathname); struct dirent *dir; int m_grn = 0; int m_red = 0; int p = strlen(idat_pathname); grn_idats = NULL; red_idats = NULL; while ((dir = readdir(d))) { char *ptr = strstr(dir->d_name, "_Grn.idat"); if (!ptr) continue; hts_expand0(char *, n + 1, m_grn, grn_idats); hts_expand0(char *, n + 1, m_red, red_idats); int q = strlen(dir->d_name); grn_idats[n] = (char *)malloc((p + q + 2) * sizeof(char)); memcpy(grn_idats[n], idat_pathname, p); grn_idats[n][p] = '/'; memcpy(&grn_idats[n][p + 1], dir->d_name, q + 1); dir->d_name[q - 8] = 'R'; dir->d_name[q - 7] = 'e'; dir->d_name[q - 6] = 'd'; red_idats[n] = (char *)malloc((p + q + 2) * sizeof(char)); memcpy(red_idats[n], idat_pathname, p); red_idats[n][p] = '/'; memcpy(&red_idats[n][p + 1], dir->d_name, q + 1); n++; } closedir(d); } else if (grn_idat_fname != NULL && red_idat_fname != NULL) { grn_idats = hts_readlines(grn_idat_fname, &n); int n_check; red_idats = hts_readlines(red_idat_fname, &n_check); if (n != n_check) error("File %s contains %d filenames while file %s contains %d filenames\n", grn_idat_fname, n, red_idat_fname, n_check); } else if (argc > optind) { n = (argc - optind) / 2; grn_idats = (char **)malloc(n * sizeof(char *)); red_idats = (char **)malloc(n * sizeof(char *)); for (i = 0; i < n; i++) { grn_idats[i] = argv[optind++]; red_idats[i] = argv[optind++]; } } if (n > 0) { if (egt) { fprintf(stderr, "Using genotyping algorithm version %s\n", gentrain_version == 2 ? 
"6.3.0" : "7.0.0"); fprintf(stderr, "Gender estimation parameters\n"); fprintf(stderr, "\tVersion: %d\n", gender.version); fprintf(stderr, "\tMinX_Loci: %d\n", gender.min_x_loci); fprintf(stderr, "\tX_HetRateThreshold: %f\n", gender.x_het_rate_threshold); fprintf(stderr, "\tMinAutosomalLoci: %d\n", gender.min_loci); fprintf(stderr, "\tMaxAutosomalLoci: %d\n", gender.max_loci); fprintf(stderr, "\tMinY_Loci: %d\n", gender.min_y_loci); fprintf(stderr, "\tAutosomalCallRateThreshold: %f\n", gender.call_rate_threshold); fprintf(stderr, "\tX_IntensityThreshold: %f\n", gender.x_threshold); fprintf(stderr, "\tY_IntensityThreshold: %f\n", gender.y_threshold); } DIR *d = opendir(output_pathname); if (!d) error("Failed to open directory %s\n", output_pathname); kstring_t gtc_fname = {0, 0, NULL}; for (i = 0; i < n; i++) { fprintf(stderr, "Reading GRN IDAT file %s\n", grn_idats[i]); idat_t *grn_idat = idat_init(grn_idats[i], 1); fprintf(stderr, "Reading RED IDAT file %s\n", red_idats[i]); idat_t *red_idat = idat_init(red_idats[i], 1); gtc_t *gtc = gtc_init(grn_idat, red_idat, bpm, egt, gentrain_version, gtc_file_version, gencall_cutoff, sample_name, checksums, imaging_date, autocall_date_format, autocall_version, allow_missing_clusters, &gender); const char *ptr = strstr(grn_idats[i], "_Grn.idat"); if (!ptr) ptr = strstr(grn_idats[i], ".idat"); const char *ptr2 = strrchr(grn_idats[i], '/'); if (ptr2) ptr2++; else ptr2 = grn_idats[i]; ksprintf(>c_fname, "%s/%.*s.gtc", output_pathname, (int)(ptr ? 
ptr - ptr2 : strlen(ptr2)), ptr2); idat_destroy(grn_idat); idat_destroy(red_idat); fprintf(stderr, "Writing GTC file %s\n", gtc_fname.s); if (gtc_write(gtc, gtc_fname.s, gtc_file_version) < 0) error("Failed to write GTC file: %s\n", gtc_fname.s); gtc_destroy(gtc); gtc_fname.l = 0; } free(gtc_fname.s); closedir(d); } if (idat_pathname != NULL || grn_idat_fname != NULL || red_idat_fname != NULL) { for (i = 0; i < n; i++) { free(grn_idats[i]); free(red_idats[i]); } } free(grn_idats); free(red_idats); bpm_destroy(bpm); egt_destroy(egt); return 0; } ================================================ FILE: nearest_neighbor.c ================================================ /* The MIT License Copyright (c) 2018 Giulio Genovese Author: Giulio Genovese Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/* end of license header */

#include <stdlib.h>

// scratch storage used by findClosestSitesToPointsAlongAxis(): element counts and index
// lists for the 12 regular bins and for the 11 bins shifted by half a bin width;
// all counters are zero between calls
int elementsInBin[12];
int *binData[12];
int elementsInShiftedBin[11];
int *binDataShifted[11];

// truncates a float towards zero like an (int) cast, but maps NaN to 0 and saturates
// values outside +/-2^30, for which a plain cast would be undefined behavior
static int nn_float_to_int(float v) {
    if (v != v) return 0;
    if (v >= 1073741824.0f) return 1073741824;
    if (v <= -1073741824.0f) return -1073741824;
    return (int)v;
}

// restricts a bin index to the range [0, last]
static int nn_clamp_bin(int idx, int last) {
    if (idx < 0) return 0;
    if (idx > last) return last;
    return idx;
}

// frees the per-call index lists and zeroes all counters
static void nn_release_bins(void) {
    int i;
    for (i = 0; i <= 11; i++) {
        free((void *)binData[i]);
        binData[i] = NULL;
        elementsInBin[i] = 0;
    }
    for (i = 0; i <= 10; i++) {
        free((void *)binDataShifted[i]);
        binDataShifted[i] = NULL;
        elementsInShiftedBin[i] = 0;
    }
}

// For each of the n_axis points (axis_x[i], axis_y[i]) finds the closest of the n_raw points
// (raw_x[j], raw_y[j]) by squared Euclidean distance and stores its index in ret[i],
// or -1 when the bin searched for that axis point holds no raw point.
//
// The axis points must lie along one coordinate axis: either no axis_x value exceeds 0.0001
// (points along y) or no axis_y value exceeds 0.0001 (points along x). The coordinate of the
// last axis point along that axis, divided by 12, is the bin width. Only raw points whose
// other coordinate does not exceed one bin width are candidates. Candidates are stored twice,
// in 12 regular bins and in 11 bins shifted by half a bin width; each axis point is searched
// in the regular bin when it falls in the middle half of that bin (or in the first/last bin)
// and in the neighbouring shifted bin otherwise.
//
// Returns 0 on success (including n_axis <= 0, where there is nothing to do) and -1, with ret
// left untouched, when the axis points follow neither axis, when the bin width is zero or NaN,
// or when memory allocation fails.
int findClosestSitesToPointsAlongAxis(int n_raw, float *raw_x, float *raw_y, int n_axis, float *axis_x, float *axis_y,
                                      int *ret) {
    float *raw_a = NULL;
    float *raw_b = NULL;
    float *axis_a = NULL;
    float bin_width;
    double axis_max_dist;
    int use_y = 1;
    int use_x = 1;
    int i, j;

    // without axis points axis_a[n_axis - 1] below would read out of bounds
    if (n_axis <= 0) return 0;

    for (i = 0; i < n_axis; i++) {
        if (axis_x[i] > 0.0001) {
            use_y = 0;
            break;
        }
    }
    for (i = 0; i < n_axis; i++) {
        if (axis_y[i] > 0.0001) {
            use_x = 0;
            break;
        }
    }

    if (use_y) {
        raw_a = raw_y;
        raw_b = raw_x;
        axis_a = axis_y;
    } else if (use_x) {
        raw_a = raw_x;
        raw_b = raw_y;
        axis_a = axis_x;
    } else {
        return -1;
    }

    bin_width = axis_a[n_axis - 1] / 12.0f;
    // a zero or NaN width cannot be used to assign points to bins
    if (bin_width == 0.0f || bin_width != bin_width) return -1;
    axis_max_dist = (double)bin_width;

    // start from a clean state without assuming how a previous call ended
    for (i = 0; i <= 11; i++) {
        binData[i] = NULL;
        elementsInBin[i] = 0;
    }
    for (i = 0; i <= 10; i++) {
        binDataShifted[i] = NULL;
        elementsInShiftedBin[i] = 0;
    }

    // first pass: count the candidates of every bin
    for (i = 0; i < n_raw; i++) {
        if ((double)raw_b[i] > axis_max_dist) continue;
        elementsInBin[nn_clamp_bin(nn_float_to_int(raw_a[i] / bin_width), 11)]++;
        elementsInShiftedBin[nn_clamp_bin(nn_float_to_int(raw_a[i] / bin_width - 0.5f), 10)]++;
    }

    // allocate the index lists; the counters are reset and reused as fill positions
    for (i = 0; i <= 11; i++) {
        int count = elementsInBin[i];
        binData[i] = (int *)malloc((size_t)count * sizeof(int));
        if (count > 0 && !binData[i]) {
            nn_release_bins();
            return -1;
        }
        elementsInBin[i] = 0;
    }
    for (i = 0; i <= 10; i++) {
        int count = elementsInShiftedBin[i];
        binDataShifted[i] = (int *)malloc((size_t)count * sizeof(int));
        if (count > 0 && !binDataShifted[i]) {
            nn_release_bins();
            return -1;
        }
        elementsInShiftedBin[i] = 0;
    }

    // second pass: record the candidate indices
    for (i = 0; i < n_raw; i++) {
        int bin_idx;
        if ((double)raw_b[i] > axis_max_dist) continue;
        bin_idx = nn_clamp_bin(nn_float_to_int(raw_a[i] / bin_width), 11);
        binData[bin_idx][elementsInBin[bin_idx]] = i;
        elementsInBin[bin_idx]++;
        bin_idx = nn_clamp_bin(nn_float_to_int(raw_a[i] / bin_width - 0.5f), 10);
        binDataShifted[bin_idx][elementsInShiftedBin[bin_idx]] = i;
        elementsInShiftedBin[bin_idx]++;
    }

    for (i = 0; i < n_axis; i++) {
        float quotient = axis_a[i] / bin_width;
        int bin_idx = nn_float_to_int(quotient);
        // fractional position inside the bin, taken before the index is clamped
        float reminder = quotient - (float)bin_idx;
        int *curr_bin_data;
        int curr_bin_size;
        double best_val = 1e20;
        int best_idx = -1;

        bin_idx = nn_clamp_bin(bin_idx, 11);
        if (0.25f <= reminder && reminder <= 0.75f) {
            curr_bin_data = binData[bin_idx];
            curr_bin_size = elementsInBin[bin_idx];
        } else if (reminder < 0.25f) {
            // near the lower edge: use the shifted bin straddling that edge, if there is one
            if (bin_idx == 0) {
                curr_bin_data = binData[bin_idx];
                curr_bin_size = elementsInBin[bin_idx];
            } else {
                curr_bin_data = binDataShifted[bin_idx - 1];
                curr_bin_size = elementsInShiftedBin[bin_idx - 1];
            }
        } else if (bin_idx == 11) {
            curr_bin_data = binData[bin_idx];
            curr_bin_size = elementsInBin[bin_idx];
        } else {
            // near the upper edge: use the shifted bin straddling that edge
            curr_bin_data = binDataShifted[bin_idx];
            curr_bin_size = elementsInShiftedBin[bin_idx];
        }

        for (j = 0; j < curr_bin_size; j++) {
            int curr_idx = curr_bin_data[j];
            float x_dist = raw_x[curr_idx] - axis_x[i];
            float y_dist = raw_y[curr_idx] - axis_y[i];
            double sq_dist = (double)(x_dist * x_dist + y_dist * y_dist);
            if (sq_dist < best_val) {
                best_val = sq_dist;
                best_idx = curr_idx;
            }
        }
        ret[i] = best_idx;
    }

    nn_release_bins();
    return 0;
}