[
  {
    "path": "Build/README.md",
    "content": "### The Build folder.\nHere will be the executable after compilation.\n\n\n"
  },
  {
    "path": "D.cpp",
    "content": "//\n//  D.cpp\n//  Dsuite\n//\n//  Created by Milan Malinsky on 11/04/2019.\n//\n\n#include \"D.h\"\n#include \"Dsuite_common.h\"\n#include \"kstest.h\"\n#include <deque>\n#include <list>\n#define SUBPROGRAM \"Dinvestigate\"\n\n#define DEBUG 1\n#define MIN_SETS 3\n\nstatic const char *ABBA_USAGE_MESSAGE =\n\"Usage: \" PROGRAM_BIN \" \" SUBPROGRAM \" [OPTIONS] INPUT_FILE.vcf.gz SETS.txt test_trios.txt\\n\"\n\"Outputs D, f_d (Martin et al. 2014 MBE), f_dM (Malinsky et al., 2015), and d_f (Pfeifer & Kapan, 2019) in genomic windows\\n\"\n\"The SETS.txt file should have two columns: SAMPLE_ID    POPULATION_ID\\n\"\n\"The test_trios.txt should contain names of three populations for which the statistics will be calculated:\\n\"\n\"POP1   POP2    POP3\\n\"\n\"There can be multiple lines and then the program generates multiple ouput files, named like POP1_POP2_POP3_localFstats_SIZE_STEP.txt\\n\"\n\"\\n\"\n\"       -h, --help                              display this help and exit\\n\"\n\"       -w SIZE,STEP --window=SIZE,STEP         (required) D, f_D, f_dM, and d_f statistics for windows containing SIZE useable SNPs, moving by STEP (default: 50,25)\\n\"\n\"       -g, --use-genotype-probabilities        (optional) use probabilities (GP tag) or calculate them from likelihoods (GL or PL tags) using a Hardy-Weinberg prior\\n\"\n\"                                               the probabilities are used to estimate allele frequencies in each population/species\\n\"\n\"       -n, --run-name                          run-name will be included in the output file name\\n\"\n\"\\n\"\n\"\\nReport bugs to \" PACKAGE_BUGREPORT \"\\n\\n\";\n\n\n//enum { OPT_F_JK };\n\nstatic const char* shortopts = \"hw:n:g\";\n\n//static const int JK_WINDOW = 5000;\n\nstatic const struct option longopts[] = {\n    { \"run-name\",   required_argument, NULL, 'n' },\n    { \"window\",   required_argument, NULL, 'w' },\n    { \"help\",   no_argument, NULL, 'h' },\n    { 
\"use-genotype-probabilities\", no_argument, NULL, 'g'},\n    { NULL, 0, NULL, 0 }\n};\n\nnamespace opt\n{\n    static string vcfFile;\n    static string setsFile;\n    static string testTriosFile;\n    static string runName = \"\";\n    static int minScLength = 0;\n    static int windowSize = 50;\n    static int windowStep = 25;\n    static bool useGenotypeProbabilities = false;\n    //int jkWindowSize = JK_WINDOW;\n}\n\n\nvoid doAbbaBaba() {\n    string line; // for reading the input files\n    \n    std::istream* vcfFile = createReader(opt::vcfFile);\n    std::ifstream* testTriosFile = new std::ifstream(opt::testTriosFile.c_str());\n    if (!testTriosFile->good()) { std::cerr << \"The file \" << opt::testTriosFile << \" could not be opened. Exiting...\" << std::endl; exit(EXIT_FAILURE);}\n    \n    // Get the sample sets\n    SetInformation setInfo(opt::setsFile, MIN_SETS, OutgroupRequired);\n    \n    // Get the test trios\n    std::vector<std::ofstream*> outFiles;\n    std::vector<std::ofstream*> outFilesGenes;\n    std::vector<std::vector<string> > testTrios;\n    while (getline(*testTriosFile,line)) {\n        line.erase(std::remove(line.begin(), line.end(), '\\r'), line.end()); // Deal with any left over \\r from files prepared on Windows\n        // std::cerr << line << std::endl;\n        std::vector<string> threePops = split(line, '\\t'); assert(threePops.size() == 3);\n        for (int i = 0; i != threePops.size(); i++) { // Check that the test trios are in the sets file\n            if (setInfo.popToIDsMap.count(threePops[i]) == 0) {\n                std::cerr << threePops[i] << \" is present in the \" << opt::testTriosFile << \" but missing from the \" << opt::setsFile << std::endl;\n            }\n        }\n        std::ofstream* outFile = new std::ofstream(threePops[0] + \"_\" + threePops[1] + \"_\" + threePops[2]+ \"_localFstats_\" + opt::runName + \"_\" + numToString(opt::windowSize) + \"_\" + numToString(opt::windowStep) + \".txt\");\n        
*outFile << \"chr\\twindowStart\\twindowEnd\\tD\\tf_d\\tf_dM\\td_f\" << std::endl;\n        outFiles.push_back(outFile);\n        testTrios.push_back(threePops);\n    }\n    \n    // Create objects to hold the results for each trio\n    TestTrioInfo info(opt::windowSize); std::vector<TestTrioInfo> testTrioInfos(testTrios.size(), info);\n    \n    // Now go through the vcf and calculate D\n    int totalVariantNumber = 0;\n    int reportProgressEvery = 1000; string chr; string coord;\n\n   // int lastPrint = 0; int lastWindowVariant = 0;\n    std::vector<string> sampleNames; std::vector<std::string> fields;\n    clock_t start = clock(); // clock_t startGettingCounts; clock_t startCalculation;\n    //double durationGettingCounts; double durationCalculation;\n    while (getline(*vcfFile, line)) {\n        line.erase(std::remove(line.begin(), line.end(), '\\r'), line.end()); // Deal with any left over \\r from files prepared on Windows\n        if (line[0] == '#' && line[1] == '#')\n            continue;\n        else if (line[0] == '#' && line[1] == 'C') {\n            fields = split(line, '\\t');\n            std::vector<std::string> sampleNames(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());\n            setInfo.linkSetsAndVCFpositions(sampleNames);\n        } else {\n            totalVariantNumber++;\n            if (totalVariantNumber % reportProgressEvery == 0) reportProgessVCF(totalVariantNumber, start);\n        \n            fields = split(line, '\\t'); chr = fields[0]; coord = fields[1];\n            std::vector<std::string> genotypes(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());\n            // Only consider biallelic SNPs\n            string refAllele = fields[3]; string altAllele = fields[4];\n            if (refAllele.length() > 1 || altAllele.length() > 1 || altAllele == \"*\") {\n                refAllele.clear(); refAllele.shrink_to_fit(); altAllele.clear(); altAllele.shrink_to_fit();\n                genotypes.clear(); 
genotypes.shrink_to_fit(); continue;\n            }\n            \n            // startGettingCounts = clock();\n            GeneralSetCounts* c = new GeneralSetCounts(setInfo.popToPosMap, (int)genotypes.size());\n            try { c->getSetVariantCounts(genotypes, setInfo.posToPopMap); } catch (const std::out_of_range& oor) {\n                std::cerr << \"Problems getting splitCounts for \" << chr << \" \" << coord << std::endl; }\n            if (opt::useGenotypeProbabilities) {\n                int likelihoodsOrProbabilitiesTagPosition = c->checkForGenotypeLikelihoodsOrProbabilities(fields);\n                if (likelihoodsOrProbabilitiesTagPosition == LikelihoodsProbabilitiesAbsent) {\n                    printMissingLikelihoodsWarning(fields[0], fields[1]);\n                    opt::useGenotypeProbabilities = false;\n                } else c->getAFsFromGenotypeLikelihoodsOrProbabilities(genotypes,setInfo.posToPopMap,likelihoodsOrProbabilitiesTagPosition);\n            }\n            genotypes.clear(); genotypes.shrink_to_fit();\n            // durationGettingCounts = ( clock() - startGettingCounts ) / (double) CLOCKS_PER_SEC;\n            \n            // startCalculation = clock();\n            double p_O; try { p_O = c->setDAFs.at(\"Outgroup\"); } catch (const std::out_of_range& oor) {\n                std::cerr << \"Counts don't contain derived allele frequency for the Outgroup\" << std::endl; }\n            if (p_O == -1) { delete c; continue; } // We need to make sure that the outgroup is defined\n            \n            double p_S1; double p_S2; double p_S3; double ABBA; double BABA; double F_d_denom; double F_dM_denom;\n            for (int i = 0; i != testTrios.size(); i++) {\n                try {\n                    if (!opt::useGenotypeProbabilities) p_S1 = c->setDAFs.at(testTrios[i][0]);\n                    else p_S1 = c->setDAFsFromLikelihoods.at(testTrios[i][0]);\n                } catch (const std::out_of_range& oor) {\n                
std::cerr << \"Counts don't contain derived allele frequency for \" << testTrios[i][0] << std::endl; }\n                if (p_S1 == -1) continue;  // If any member of the trio has entirely missing data, just move on to the next trio\n                try {\n                    if (!opt::useGenotypeProbabilities) p_S2 = c->setDAFs.at(testTrios[i][1]);\n                    else p_S2 = c->setDAFsFromLikelihoods.at(testTrios[i][1]);\n                } catch (const std::out_of_range& oor) {\n                    std::cerr << \"Counts don't contain derived allele frequency for \" << testTrios[i][1] << std::endl; }\n                if (p_S2 == -1) continue;\n                try {\n                    if (!opt::useGenotypeProbabilities) p_S3 = c->setDAFs.at(testTrios[i][2]);\n                    else p_S3 = c->setDAFsFromLikelihoods.at(testTrios[i][2]);\n                } catch (const std::out_of_range& oor) {\n                    std::cerr << \"Counts don't contain derived allele frequency for \" << testTrios[i][2] << std::endl; }\n                if (p_S3 == -1) continue;\n                //if (p_S3 == 0) continue; // XXAA pattern is not informative\n                if (p_S1 == 0 && p_S2 == 0 && p_S3 == 0) continue; // Checking if the SNP is variable in the trio\n                if (p_S1 == 1 && p_S2 == 1 && p_S3 == 1) continue; // Checking if the SNP is variable in the trio\n                //if (p_S1 == 1 && p_S2 == 1) continue; // BBAA pattern is not informative\n                //if (p_S1 == 0 && p_S2 == 0) continue; // AABA pattern is not informative\n                \n                \n                ABBA = ((1-p_S1)*p_S2*p_S3*(1-p_O)); testTrioInfos[i].ABBAtotal += ABBA;\n                if(ABBA > 0.5) {\n                    testTrioInfos[i].ABBAsitePositionsPerChomosome[chr].push_back(atoi(coord.c_str()));\n                }\n                BABA = (p_S1*(1-p_S2)*p_S3*(1-p_O)); testTrioInfos[i].BABAtotal += BABA;\n                if(BABA > 0.5) {\n                 
   testTrioInfos[i].BABAsitePositionsPerChomosome[chr].push_back(atoi(coord.c_str()));\n                }\n                \n                if (p_S2 > p_S3) {\n                    F_d_denom = ((1-p_S1)*p_S2*p_S2*(1-p_O)) - (p_S1*(1-p_S2)*p_S2*(1-p_O));\n                } else {\n                    F_d_denom = ((1-p_S1)*p_S3*p_S3*(1-p_O)) - (p_S1*(1-p_S3)*p_S3*(1-p_O));\n                } testTrioInfos[i].F_d_denom += F_d_denom; testTrioInfos[i].interimF_d_denom += F_d_denom;\n                \n                if (p_S1 <= p_S2) {\n                    if (p_S2 > p_S3) {\n                        F_dM_denom = ((1-p_S1)*p_S2*p_S2*(1-p_O)) - (p_S1*(1-p_S2)*p_S2*(1-p_O));\n                    } else {\n                        F_dM_denom = ((1-p_S1)*p_S3*p_S3*(1-p_O)) - (p_S1*(1-p_S3)*p_S3*(1-p_O));\n                    }\n                } else {\n                    if (p_S1 > p_S3) {\n                        F_dM_denom = -(((1-p_S1)*p_S2*p_S1*(1-p_O)) - (p_S1*(1-p_S2)*p_S1)*(1-p_O));\n                    } else {\n                        F_dM_denom = -(((1-p_S3)*p_S2*p_S3*(1-p_O)) - (p_S3*(1-p_S2)*p_S3)*(1-p_O));\n                    }\n                } testTrioInfos[i].F_dM_denom += F_dM_denom; testTrioInfos[i].interimF_dM_denom += F_dM_denom;\n                \n                \n                // d_f\n                double d13 = p_S1 + p_S3 - (2*p_S1*p_S3); double d23 = p_S2 + p_S3 - (2*p_S2*p_S3);\n                double dfNum = p_S2 * d13 - p_S1 * d23;\n                double dfDenom = p_S2 * d13 + p_S1 * d23;\n                \n                double ABBAplusBABA = ABBA + BABA;\n                if (ABBAplusBABA != 0) {\n                    testTrioInfos[i].windowABBAs.push_back(ABBA);  testTrioInfos[i].windowBABAs.push_back(BABA);\n                    testTrioInfos[i].windowF_d_denoms.push_back(testTrioInfos[i].interimF_d_denom);\n                    testTrioInfos[i].windowF_dM_denoms.push_back(testTrioInfos[i].interimF_dM_denom);\n                    
testTrioInfos[i].window_d_f_nums.push_back(dfNum); testTrioInfos[i].window_d_f_denoms.push_back(dfDenom);\n                    testTrioInfos[i].windowInformativeSitesCords.push_back(atoi(coord.c_str()));\n                    testTrioInfos[i].windowABBAs.pop_front(); testTrioInfos[i].windowBABAs.pop_front();\n                    testTrioInfos[i].windowF_d_denoms.pop_front(); testTrioInfos[i].windowF_dM_denoms.pop_front();\n                    testTrioInfos[i].windowInformativeSitesCords.pop_front();\n                    testTrioInfos[i].window_d_f_nums.pop_front(); testTrioInfos[i].window_d_f_denoms.pop_front();\n                    testTrioInfos[i].interimF_d_denom = 0; testTrioInfos[i].interimF_dM_denom = 0;\n                    testTrioInfos[i].usedVars++;\n                \n                    if ((testTrioInfos[i].usedVars > opt::windowSize) && (testTrioInfos[i].usedVars % opt::windowStep == 0)) {\n                        double windowABBAtotal = vector_sum(testTrioInfos[i].windowABBAs); double windowBABAtotal = vector_sum(testTrioInfos[i].windowBABAs);\n                        double windowF_d_denom = vector_sum(testTrioInfos[i].windowF_d_denoms); double windowF_dM_denom = vector_sum(testTrioInfos[i].windowF_dM_denoms);\n                        double wDnum = windowABBAtotal - windowBABAtotal; double wDdenom = windowABBAtotal + windowBABAtotal;\n                        double w_d_f_num = vector_sum(testTrioInfos[i].window_d_f_nums);\n                        double w_d_f_denom = vector_sum(testTrioInfos[i].window_d_f_denoms);\n                        if ((atoi(coord.c_str()) - testTrioInfos[i].windowInformativeSitesCords[0]) > 0) {\n                            *outFiles[i] << std::fixed << chr << \"\\t\" << testTrioInfos[i].windowInformativeSitesCords[0] << \"\\t\" << coord << \"\\t\" << wDnum/wDdenom << \"\\t\" << wDnum/windowF_d_denom << \"\\t\" << wDnum/windowF_dM_denom << \"\\t\" << w_d_f_num/w_d_f_denom << std::endl;\n                        }\n            
        }\n                }\n            }\n           // durationCalculation = ( clock() - startCalculation ) / (double) CLOCKS_PER_SEC;\n            delete c;\n        }\n    }\n    \n    for (int i = 0; i != testTrios.size(); i++) {\n        testTrioInfos[i].mergeABBA_BABA_SiteCoordsOverChoms(); testTrioInfos[i].testIfSitesUniformlyDistributed();\n        \n        std::cout << testTrios[i][0] << \"\\t\" << testTrios[i][1] << \"\\t\" << testTrios[i][2] << std::endl;\n        std::cout << \"D=\" << (double)(testTrioInfos[i].ABBAtotal-testTrioInfos[i].BABAtotal)/(testTrioInfos[i].ABBAtotal+testTrioInfos[i].BABAtotal) << std::endl;\n        std::cout << \"f_d=\" << (double)(testTrioInfos[i].ABBAtotal-testTrioInfos[i].BABAtotal)/testTrioInfos[i].F_d_denom << \"\\t\" << (testTrioInfos[i].ABBAtotal-testTrioInfos[i].BABAtotal) << \"/\" << testTrioInfos[i].F_d_denom << std::endl;\n        std::cout << \"f_dM=\" << (double)(testTrioInfos[i].ABBAtotal-testTrioInfos[i].BABAtotal)/testTrioInfos[i].F_dM_denom << \"\\t\" << (testTrioInfos[i].ABBAtotal-testTrioInfos[i].BABAtotal) << \"/\" << testTrioInfos[i].F_dM_denom << std::endl;\n        std::cout << \"ABBA_KSpval = \" << testTrioInfos[i].ABBA_KSpval << std::endl;\n        std::cout << \"BABA_KSpval = \" << testTrioInfos[i].BABA_KSpval << std::endl;\n        std::cout << std::endl;\n    }\n}\n\n\nint abbaBabaMain(int argc, char** argv) {\n    parseAbbaBabaOptions(argc, argv);\n    doAbbaBaba();\n    return 0;\n    \n}\n\nvoid TestTrioInfo::testIfSitesUniformlyDistributed() {\n    // Take care of the splits by random sampling with replacement:\n    std::random_device rd;     // only used once to initialise (seed) engine\n    std::mt19937 rng(rd());    // random-number engine used (Mersenne-Twister in this case)\n    std::uniform_int_distribution<int> uniABBA(0,linearABBApos.back()); // guaranteed unbiased\n    std::uniform_int_distribution<int> uniBABA(0,linearBABApos.back()); // guaranteed unbiased\n    std::list<int64_t> 
uniABBAvals; std::list<int64_t> uniBABAvals;\n    // uniABBAvals.re(linearABBApos.size()); uniBABAvals.resize(linearBABApos.size());\n    \n    \n    int numUniformSamples = (int)linearABBApos.size(); if (numUniformSamples < 10000) { numUniformSamples = 10000; }\n    for (int i = 0; i < numUniformSamples; i++) {\n        uniABBAvals.push_back(uniABBA(rng));\n    }\n    \n    numUniformSamples = (int)linearBABApos.size(); if (numUniformSamples < 10000) { numUniformSamples = 10000; }\n    for (int i = 0; i < numUniformSamples; i++) {\n        uniBABAvals.push_back(uniBABA(rng));\n    }\n    \n    std::list<int64_t> linearABBAposList(linearABBApos.begin(),linearABBApos.end());\n    std::list<int64_t> linearBABAposList(linearBABApos.begin(),linearBABApos.end());\n    \n    ABBA_KSpval = ks_test(uniABBAvals, linearABBAposList, std::cerr, false);\n    BABA_KSpval = ks_test(uniBABAvals, linearBABAposList, std::cerr, false);\n    \n    //double BABApval = ks_test(uniBABAvals, linearBABApos, std::cerr);\n    \n}\n\n \n\n\nvoid TestTrioInfo::mergeABBA_BABA_SiteCoordsOverChoms() {\n    int totalNumABBAsites = 0;\n    for(std::map<string,std::vector<int>>::iterator it = ABBAsitePositionsPerChomosome.begin(); it != ABBAsitePositionsPerChomosome.end(); it++) {\n        totalNumABBAsites = totalNumABBAsites + (int)it->second.size();\n    } linearABBApos.reserve(totalNumABBAsites);\n    \n    int linearPosSoFar = 0;\n    for(std::map<string,std::vector<int>>::iterator it = ABBAsitePositionsPerChomosome.begin(); it != ABBAsitePositionsPerChomosome.end(); it++) {\n        for (std::vector<int>::size_type i = 0; i < it->second.size(); i++) {\n            linearABBApos.push_back(it->second[i] + linearPosSoFar);\n        }\n    }\n    \n    int totalNumBABAsites = 0;\n    for(std::map<string,std::vector<int>>::iterator it = BABAsitePositionsPerChomosome.begin(); it != BABAsitePositionsPerChomosome.end(); it++) {\n        totalNumBABAsites = totalNumBABAsites + (int)it->second.size();\n 
   } linearBABApos.reserve(totalNumBABAsites);\n    \n    linearPosSoFar = 0;\n    for(std::map<string,std::vector<int>>::iterator it = BABAsitePositionsPerChomosome.begin(); it != BABAsitePositionsPerChomosome.end(); it++) {\n        for (std::vector<int>::size_type i = 0; i < it->second.size(); i++) {\n            linearBABApos.push_back(it->second[i] + linearPosSoFar);\n        }\n    }\n    \n}\n\nvoid parseAbbaBabaOptions(int argc, char** argv) {\n    bool die = false;\n    std::vector<string> windowSizeStep;\n    for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)\n    {\n        std::istringstream arg(optarg != NULL ? optarg : \"\");\n        switch (c)\n        {\n            case '?': die = true; break;\n            case 'w':\n                windowSizeStep = split(arg.str(), ',');\n                if(windowSizeStep.size() != 2) {std::cerr << \"The -w option requires two arguments, separated by a comma ','\\n\"; exit(EXIT_FAILURE);}\n                opt::windowSize = atoi(windowSizeStep[0].c_str());\n                opt::windowStep = atoi(windowSizeStep[1].c_str());\n                break;\n            case 'n': arg >> opt::runName; break;\n            case 'g': opt::useGenotypeProbabilities = true; break;\n            case 'h':\n                std::cout << ABBA_USAGE_MESSAGE;\n                exit(EXIT_SUCCESS);\n        }\n    }\n    \n    if (argc - optind < 3) {\n        std::cerr << \"missing arguments\\n\";\n        die = true;\n    }\n    else if (argc - optind > 3)\n    {\n        std::cerr << \"too many arguments\\n\";\n        die = true;\n    }\n    \n    if (die) {\n        std::cout << \"\\n\" << ABBA_USAGE_MESSAGE;\n        exit(EXIT_FAILURE);\n    }\n    \n    // Parse the input filenames\n    opt::vcfFile = argv[optind++];\n    opt::setsFile = argv[optind++];\n    opt::testTriosFile = argv[optind++];\n}\n"
  },
  {
    "path": "D.h",
    "content": "//\n//  D.h\n//  Dsuite\n//\n//  Created by Milan Malinsky on 11/04/2019.\n//\n\n#ifndef D_h\n#define D_h\n\n#include \"Dsuite_utils.h\"\n\nclass TestTrioInfo {\npublic:\n    TestTrioInfo(int windowSize) {\n        windowABBAs.resize(windowSize); windowBABAs.resize(windowSize);\n        windowF_d_denoms.resize(windowSize); windowF_dM_denoms.resize(windowSize);\n        windowInformativeSitesCords.resize(windowSize);\n        window_d_f_nums.resize(windowSize); window_d_f_denoms.resize(windowSize);\n        interimF_d_denom = 0; interimF_dM_denom = 0;\n\n        usedVars = 0;\n        ABBAtotal = 0; BABAtotal = 0;\n        F_d_denom = 0; F_dM_denom = 0;\n        F_G_denom = 0; F_G_num = 0;\n        \n\n    };\n    \n    // string P1; string P2; string P3;\n    std::map<string,std::vector<int>> ABBAsitePositionsPerChomosome; std::vector<int> linearABBApos;\n    std::map<string,std::vector<int>> BABAsitePositionsPerChomosome; std::vector<int> linearBABApos;\n    std::deque<double> windowABBAs; std::deque<double> windowBABAs;\n    std::deque<double> windowF_d_denoms; std::deque<double> windowF_dM_denoms;\n    std::deque<double> window_d_f_nums; std::deque<double> window_d_f_denoms;\n    std::deque<int> windowInformativeSitesCords;\n    double interimF_d_denom; double interimF_dM_denom;\n    //double D1; double D2; double D3; double D1_p; double D2_p; double D3_p;\n    \n    double ABBAtotal; double BABAtotal;\n    double F_d_denom; double F_dM_denom; double F_G_denom; double F_G_num;\n    int usedVars;\n    double ABBA_KSpval; double BABA_KSpval; \n    \n    void mergeABBA_BABA_SiteCoordsOverChoms();\n    void testIfSitesUniformlyDistributed();\n    \n};\n\n\nvoid parseAbbaBabaOptions(int argc, char** argv);\nint abbaBabaMain(int argc, char** argv);\n#endif /* D_h */\n"
  },
  {
    "path": "Dmin.cpp",
    "content": "//\n//  Dmin.cpp\n//  Dsuite\n//\n//  Created by Milan Malinsky on 02/04/2019.\n//\n\n#include \"Dmin.h\"\n#include \"Dsuite_common.h\"\n\n#define SUBPROGRAM \"Dtrios\"\n\n#define DEBUG 0\n#define MIN_SETS 3\n\nstatic const char *DMIN_USAGE_MESSAGE =\n\"Usage: \" PROGRAM_BIN \" \" SUBPROGRAM \" [OPTIONS] INPUT_FILE.vcf SETS.txt\\n\"\n\"Calculate the D (ABBA/BABA) and f4-ratio statistics for all trios of species in the dataset (the outgroup being fixed)\\n\"\n\"The results are as definded in Patterson et al. 2012 (equivalent to Durand et al. 2011 when the Outgroup is fixed for the ancestral allele)\\n\"\n\"The SETS.txt should have two columns: SAMPLE_ID    SPECIES_ID\\n\"\n\"The outgroup (can be multiple samples) should be specified by using the keywork Outgroup in place of the SPECIES_ID\\n\"\n\"\\n\"\nstdInInfo\n\"       -h, --help                              display this help and exit\\n\"\n\"       -k, --JKnum                             (default=20) the number of Jackknife blocks to divide the dataset into; should be at least 20 for the whole dataset\\n\"\n\"       -j, --JKwindow                          (default=NA) Jackknife block size in number of informative SNPs (as used in v0.2)\\n\"\n\"                                               when specified, this is used in place of the --JKnum option\\n\"\nregionOption    // -r\ntreeOption      // -t\noutOption       // -o\n\"       -n, --run-name                          (optional) run-name will be included in the output file name after the PREFIX\\n\"\n\"       --no-f4-ratio                           (optional) don't calculate the f4-ratio\\n\"\n\"       -l NUMLINES                             (optional) the number of lines in the VCF input - required if reading the VCF via a unix pipe\\n\"\n\"       -g, --use-genotype-probabilities        (optional) use probabilities (GP tag) or calculate them from likelihoods (GL or PL tags) using a Hardy-Weinberg prior\\n\"\n\"                                 
              the probabilities are used to estimate allele frequencies in each population/species\\n\"\n\"       -p, --pool-seq=MIN_DEPTH                (optional) VCF contains pool-seq data; i.e., each 'individual' is a population\\n\"\n\"                                               allele frequencies are then estimated from the AD (Allelic Depth) field, as long as there are MIN_DEPTH reads\\n\"\n\"                                               e.g MIN_DEPTH=5 may be reasonable; when there are fewer reads, the allele frequency is set to missing\\n\"\n\"       -c, --no-combine                        (optional) do not output the \\\"_combine.txt\\\" and \\\"_combine_stderr.txt\\\" files\\n\"\n\"       --ABBAclustering                        (optional) Test whether strong ABBA-informative sites cluster along the genome\\n\"\n//\"                                               TYPE can be: 1 - clustering within a vector of all segregating sites\\n\"\n//\"                                                            2 - clustering within a vector of strong ABBA and BABA sites\\n\"\n// \"                                               TYPE=2 is less sensitive, but is robust to mutation rate variation\\n\"\n\"\\n\"\n\"\\nReport bugs to \" PACKAGE_BUGREPORT \"\\n\\n\";\n\n\nenum { OPT_NO_F4, OPT_KS_TEST };\nstatic const char* shortopts = \"hr:n:t:j:fk:l:o:gcp:\";\n\nstatic const struct option longopts[] = {\n    { \"run-name\",   required_argument, NULL, 'n' },\n    { \"no-combine\",   required_argument, NULL, 'c' },\n    { \"out-prefix\",   required_argument, NULL, 'o' },\n    { \"region\",   required_argument, NULL, 'r' },\n    { \"tree\",   required_argument, NULL, 't' },\n    { \"JKwindow\",   required_argument, NULL, 'j' },\n    { \"JKnum\",   required_argument, NULL, 'k' },\n    { \"help\",   no_argument, NULL, 'h' },\n    { \"no-f4-ratio\",   no_argument, NULL, OPT_NO_F4 },\n    { \"use-genotype-probabilities\", no_argument, NULL, 'g'},\n    { \"pool-seq\", 
required_argument, NULL, 'p'},\n    { \"KS-test-for-homoplasy\", no_argument , NULL, OPT_KS_TEST},\n    { \"ABBAclustering\", no_argument , NULL, OPT_KS_TEST},\n    { NULL, 0, NULL, 0 }\n};\n\nnamespace opt\n{\n    static string vcfFile;\n    static string setsFile;\n    static string treeFile = \"\";\n    static string runName = \"\";\n    static string providedOutPrefix = \"\";\n    static int jkWindowSize = 0;\n    static int jkNum = 20;\n    static int regionStart = -1;\n    static int regionLength = -1;\n    static int providedNumLines = -1;\n    static bool fStats = true;\n    static bool KStest = false;\n    static bool useGenotypeProbabilities = false;\n    static bool poolSeq = false;\n    static int poolMinDepth;\n    static bool combine = true;\n}\n\n\nint DminMain(int argc, char** argv) {\n    parseDminOptions(argc, argv);\n    string line; // for reading the input files\n    string outFileRoot = prepareOutFileRootString(opt::providedOutPrefix, opt::runName, opt::setsFile, opt::regionStart, opt::regionLength);\n    std::istream* treeFile; std::ofstream* outFileTree;\n    std::map<string,std::vector<int>> treeTaxonNamesToLoc; std::vector<int> treeLevels;\n    if (opt::treeFile != \"\") {\n        treeFile = new std::ifstream(opt::treeFile.c_str());\n        if (!treeFile->good()) { std::cerr << \"The file \" << opt::treeFile << \" could not be opened. 
Exiting...\" << std::endl; exit(1);}\n        outFileTree = new std::ofstream(outFileRoot + \"_tree.txt\");\n        getline(*treeFile, line);\n        assignTreeLevelsAndLinkToTaxa(line,treeTaxonNamesToLoc,treeLevels);\n        //for (std::map<string,std::vector<int>>::iterator it = treeTaxonNamesToLoc.begin(); it != treeTaxonNamesToLoc.end(); ++it) {\n        //    std::cout << \"{\" << it->first << \"}\\n\";\n        // }\n    }\n    \n    int VCFlineCount = assignNumLinesToAnalyse(opt::providedNumLines, opt::regionLength, opt::vcfFile);;\n    \n    std::istream* vcfFile;\n    if (opt::vcfFile == \"stdin\") {\n        vcfFile = &std::cin;\n    } else {\n        vcfFile = createReader(opt::vcfFile.c_str());\n    }\n    \n    // Get the sample sets\n    SetInformation setInfo(opt::setsFile, MIN_SETS, OutgroupRequired);\n\n    std::ofstream* outFileBBAA = new std::ofstream(outFileRoot+\"_BBAA.txt\"); assertFileOpen(*outFileBBAA, outFileRoot+\"_BBAA.txt\");\n    std::ofstream* outFileDmin = new std::ofstream(outFileRoot+\"_Dmin.txt\"); assertFileOpen(*outFileDmin, outFileRoot+\"_Dmin.txt\");\n    std::ofstream* outFileCombine; if (opt::combine) {\n        outFileCombine = new std::ofstream(outFileRoot+\"_combine.txt\");\n        assertFileOpen(*outFileCombine, outFileRoot+\"_combine.txt\");\n    }\n    std::ofstream* outFileCombineStdErr; if (opt::combine) {\n        outFileCombineStdErr = new std::ofstream(outFileRoot+\"_combine_stderr.txt\");\n        assertFileOpen(*outFileCombineStdErr, outFileRoot+\"_combine_stderr.txt\");\n    }\n    \n    int nCombinations = nChoosek((int)setInfo.populations.size(),3);\n    if (opt::fStats) std::cerr << \"Going to calculate D and f4-ratio values for \" << nCombinations << \" trios\" << std::endl;\n    else std::cerr << \"Going to calculate D values for \" << nCombinations << \" trios\" << std::endl;\n    \n    if (opt::treeFile != \"\") { // Check that the tree contains all the populations/species\n        
setInfo.checkIfTreeNamesMatch(treeTaxonNamesToLoc);\n    }\n    \n    // first, get all combinations of three sets (species):\n    std::vector<std::vector<string>> trios; trios.resize(nCombinations);\n    std::vector<std::vector<int>> triosInt; triosInt.resize(nCombinations);\n    std::vector<bool> v(setInfo.populations.size()); std::fill(v.begin(), v.begin() + 3, true); // prepare a selection vector\n    int pNum = 0;\n    do {\n        for (int i = 0; i < v.size(); ++i) {\n            if (v[i]) { trios[pNum].push_back(setInfo.populations[i]); triosInt[pNum].push_back(i); }\n        } pNum++;\n    } while (std::prev_permutation(v.begin(), v.end())); // Getting all permutations of the selection vector - so it selects all combinations\n    std::cerr << \"Done permutations\" << std::endl;\n    \n    // Create objects to hold the results for each trio\n    std::vector<TrioDinfo> trioInfos(nCombinations); for (int i = 0; i < nCombinations; i++) { TrioDinfo info; trioInfos[i] = info; }\n    \n    // And need to prepare the vectors to hold allele frequency values:\n    std::vector<double> allPs(setInfo.populations.size(),0.0);\n    std::vector<double> allSplit1Ps(setInfo.populations.size(),0.0); std::vector<int> allSplit1Counts(setInfo.populations.size(),0);\n    std::vector<double> allSplit2Ps(setInfo.populations.size(),0.0); std::vector<int> allSplit2Counts(setInfo.populations.size(),0);\n    std::vector<double> allCorrectionFactors(setInfo.populations.size(),0);\n    \n    int totalVariantNumber = 0;\n    std::vector<string> sampleNames; std::vector<std::string> fields;\n    // Find out how often to report progress, based on the number of trios\n    int reportProgressEvery; if (nCombinations < 1000) reportProgressEvery = 100000;\n    else if (nCombinations < 100000) reportProgressEvery = 10000;\n    else reportProgressEvery = 1000;\n    clock_t start = clock(); clock_t startGettingCounts; clock_t startCalculation;\n   // double durationGettingCounts; double 
durationCalculation;\n    int JKblockSizeBasedOnNum = 0;\n    \n    //int missingLikelihoodsCount = 0;\n    //int errCount = 0;\n    \n    while (getline(*vcfFile, line)) {\n        line.erase(std::remove(line.begin(), line.end(), '\\r'), line.end()); // Deal with any left over \\r from files prepared on Windows\n        if (line[0] == '#' && line[1] == '#') {\n            if (opt::regionStart == -1) { VCFlineCount--; } continue;\n        } else if (line[0] == '#' && line[1] == 'C') {\n            if (opt::regionStart == -1) { VCFlineCount--; } JKblockSizeBasedOnNum = (VCFlineCount/opt::jkNum)-1;\n            printInitialMessageTriosQuartets(opt::regionLength, VCFlineCount, JKblockSizeBasedOnNum, opt::jkWindowSize, opt::jkNum);\n            fields = split(line, '\\t');\n            std::vector<std::string> sampleNames(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());\n            setInfo.linkSetsAndVCFpositions(sampleNames);\n        } else {\n            totalVariantNumber++;\n            if (opt::regionStart != -1) {\n                if (totalVariantNumber < opt::regionStart)\n                    continue;\n                if (totalVariantNumber > (opt::regionStart+opt::regionLength)) {\n                    std::cerr << \"DONE\" << std::endl; break;\n                }\n            }\n            if (totalVariantNumber % JKblockSizeBasedOnNum == 0 && opt::jkWindowSize == 0) {\n                for (int i = 0; i != trios.size(); i++) {\n                    trioInfos[i].addRegionDs(P3isTrios2); trioInfos[i].addRegionDs(P3isTrios1); trioInfos[i].addRegionDs(P3isTrios0);\n                }\n            }\n            \n            if (totalVariantNumber % reportProgressEvery == 0) reportProgessVCF(totalVariantNumber, VCFlineCount, start);\n            \n            fields = split(line, '\\t'); checkGenotypesExist(fields, totalVariantNumber);\n            std::vector<std::string> genotypes(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());\n\n            // 
Only consider biallelic SNPs\n            string refAllele = fields[3]; string altAllele = fields[4];\n            if (refAllele.length() > 1 || altAllele.length() > 1 || altAllele == \"*\") {\n                refAllele.clear(); refAllele.shrink_to_fit(); altAllele.clear(); altAllele.shrink_to_fit();\n                genotypes.clear(); genotypes.shrink_to_fit(); continue;\n            }\n            \n            startGettingCounts = clock();\n            double p_O;\n            if (opt::fStats)  {\n                GeneralSetCountsWithSplits* c = new GeneralSetCountsWithSplits(setInfo.popToPosMap, (int)genotypes.size());\n                c->getSplitCountsNew(genotypes, setInfo.posToPopMap);\n                \n                if (opt::useGenotypeProbabilities) {\n                    int likelihoodsOrProbabilitiesTagPosition = c->checkForGenotypeLikelihoodsOrProbabilities(fields);\n                    if (likelihoodsOrProbabilitiesTagPosition == LikelihoodsProbabilitiesAbsent) {\n                        printMissingLikelihoodsWarning(fields[0], fields[1]);\n                        opt::useGenotypeProbabilities = false;\n                    } else c->getAFsFromGenotypeLikelihoodsOrProbabilitiesWithSplits(genotypes,setInfo.posToPopMap,likelihoodsOrProbabilitiesTagPosition, atoi(fields[1].c_str()));\n                }\n                \n                if (opt::poolSeq) {\n                    int ADtagPos = c->findADtagPosition(fields);\n                    c->getAFsFromADtagWithSplits(genotypes, setInfo.popToPosMap, ADtagPos, opt::poolMinDepth);\n                }\n                \n                p_O = c->setDAFs.at(\"Outgroup\"); if (p_O == -1) { delete c; continue; } // We need to make sure that the outgroup is defined\n                \n                if (opt::useGenotypeProbabilities) {\n                    for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {\n                        try {\n                            allPs[i] 
= c->setDAFsFromLikelihoods.at(setInfo.populations[i]);\n                            allSplit1Ps[i] = c->setDAFsplit1fromLikelihoods.at(setInfo.populations[i]);\n                            allSplit2Ps[i] = c->setDAFsplit2fromLikelihoods.at(setInfo.populations[i]);\n                            allSplit1Counts[i] = c->setAlleleCountsSplit1fromLikelihoods.at(setInfo.populations[i]);\n                            allSplit2Counts[i] = c->setAlleleCountsSplit2fromLikelihoods.at(setInfo.populations[i]);\n                            if(allSplit1Ps[i] < 0) {\n                                std::cerr << line << std::endl;\n                            std::cerr << \"setInfo.populations[i] \" << setInfo.populations[i] << std::endl;\n                            std::cerr << \"allPs[i] \" << allSplit1Ps[i] << std::endl;\n                            std::cerr << \"allSplit1Ps[i] \" << allSplit1Ps[i] << std::endl;\n                            std::cerr << \"allSplit2Ps[i] \" << allSplit2Ps[i] << std::endl;\n                            }\n                        } catch (const std::out_of_range& oor) { std::cerr << \"Counts are missing some info for \" << setInfo.populations[i] << std::endl; }\n                    }\n                   // print_vector(allPs, std::cerr);\n                } else if (opt::poolSeq) {\n                    for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {\n                        try {\n                                allPs[i] = c->setPoolDAFs.at(setInfo.populations[i]);\n                                allSplit1Ps[i] = c->setPoolDAFsplit1.at(setInfo.populations[i]);\n                                allSplit2Ps[i] = c->setPoolDAFsplit2.at(setInfo.populations[i]);\n                                allSplit1Counts[i] = 1; allSplit2Counts[i] = 1;\n                        } catch (const std::out_of_range& oor) { std::cerr << \"Counts are missing some info for \" << setInfo.populations[i] << std::endl; }\n                 
   }\n                \n                } else {\n                    for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {\n                        try {\n                                allPs[i] = c->setDAFs.at(setInfo.populations[i]);\n                                allSplit1Ps[i] = c->setDAFsplit1.at(setInfo.populations[i]);\n                                allSplit2Ps[i] = c->setDAFsplit2.at(setInfo.populations[i]);\n                                allSplit1Counts[i] = c->setAlleleCountsSplit1.at(setInfo.populations[i]);\n                                allSplit2Counts[i] = c->setAlleleCountsSplit2.at(setInfo.populations[i]);\n                                allCorrectionFactors[i] = c->setCorrectionFactors.at(setInfo.populations[i]);\n                            \n                            /*if (isnan(allPs[i])) {\n                                                          std::cerr << \"allPs[i]: \" << allPs[i] << \" ; Exiting ...\" << std::endl;\n                                                      std::cerr << \"allSplit1Ps[i]: \" << allSplit1Ps[i] << \" ; Exiting ...\" << std::endl;\n                                                      std::cerr << \"allSplit2Ps[i]: \" << allSplit2Ps[i] << \" ; Exiting ...\" << std::endl;\n                                                      std::cerr << \"allSplit1Counts[i]: \" << allSplit1Counts[i] << \" ; Exiting ...\" << std::endl;\n                                                      std::cerr << \"allSplit2Counts[i]: \" << allSplit2Counts[i] << \" ; Exiting ...\" << std::endl;\n                                                        //  std::cerr << fields[0] << \" \" << fields[1] << \" species[i]: \" << species[i] << \" ; Exiting ...\" << std::endl;\n                                                        //  std::cerr << genotypes[speciesToPosMap.at(species[i])[0]] << std::endl;\n                                                        //  exit(1);\n                           
                           } */\n                        } catch (const std::out_of_range& oor) { std::cerr << \"Counts are missing some info for \" << setInfo.populations[i] << std::endl; }\n                    }\n                    //print_vector(allPs, std::cerr);\n                }\n                delete c;\n            } else {\n                GeneralSetCounts* c2 = (GeneralSetCountsWithSplits*) new GeneralSetCounts(setInfo.popToPosMap, (int)genotypes.size());\n                c2->getSetVariantCounts(genotypes, setInfo.posToPopMap);\n                if (opt::useGenotypeProbabilities) {\n                    int likelihoodsOrProbabilitiesTagPosition = c2->checkForGenotypeLikelihoodsOrProbabilities(fields);\n                    if (likelihoodsOrProbabilitiesTagPosition == LikelihoodsProbabilitiesAbsent) {\n                        printMissingLikelihoodsWarning(fields[0], fields[1]);\n                        opt::useGenotypeProbabilities = false;\n                    } else c2->getAFsFromGenotypeLikelihoodsOrProbabilities(genotypes,setInfo.posToPopMap,likelihoodsOrProbabilitiesTagPosition);\n                }\n                \n                if (opt::poolSeq) {\n                    int ADtagPos = c2->findADtagPosition(fields);\n                    c2->getAFsFromADtag(genotypes,setInfo.popToPosMap,ADtagPos, opt::poolMinDepth);\n                }\n                \n                p_O = c2->setDAFs.at(\"Outgroup\"); if (p_O == -1) { delete c2; continue; } // We need to make sure that the outgroup is defined\n                if (opt::useGenotypeProbabilities) {\n                    for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {\n                        try { allPs[i] = c2->setDAFsFromLikelihoods.at(setInfo.populations[i]); }\n                        catch (const std::out_of_range& oor) { std::cerr << \"Counts are missing some info for \" << setInfo.populations[i] << std::endl; }\n                    }\n                 // 
print_vector(allPs, std::cerr);\n                } else if (opt::poolSeq) {\n                    for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {\n                        try {allPs[i] = c2->setPoolDAFs.at(setInfo.populations[i]); }\n                        catch (const std::out_of_range& oor) { std::cerr << \"Counts are missing some info for \" << setInfo.populations[i] << std::endl; }\n                    }\n                } else {\n                    for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {\n                        try {allPs[i] = c2->setDAFs.at(setInfo.populations[i]); }\n                        catch (const std::out_of_range& oor) { std::cerr << \"Counts are missing some info for \" << setInfo.populations[i] << std::endl; }\n                    }\n                //print_vector(allPs, std::cerr);\n                //exit(1);\n                }\n                delete c2;\n            }\n            genotypes.clear(); genotypes.shrink_to_fit();\n           // durationGettingCounts = ( clock() - startGettingCounts ) / (double) CLOCKS_PER_SEC;\n            \n            startCalculation = clock();\n            // Now calculate the D stats:\n            double p_S1; double p_S2; double p_S3; double ABBA; double BABA; double BBAA; double BAAB = 0; double ABAB = 0; double AABB = 0;\n            double correctionP3;\n            for (int i = 0; i != trios.size(); i++) {\n                p_S1 = allPs[triosInt[i][0]];\n                if (p_S1 == -1) continue;  // If any member of the trio has entirely missing data, just move on to the next trio\n                p_S2 = allPs[triosInt[i][1]];\n                if (p_S2 == -1) continue;\n                p_S3 = allPs[triosInt[i][2]];\n                if (p_S3 == -1) continue;\n                if (p_S1 == 0 && p_S2 == 0 && p_S3 == 0) continue; // Checking if the SNP is variable in the trio\n                if (p_S1 == 1 && p_S2 == 1 && 
p_S3 == 1) continue; // Checking if the SNP is variable in the trio\n                \n                // Also no need to calculate anything if the SNP is variable in only one population\n             /* if (p_S1 == 0 && p_S2 == 0 && p_O == 0) continue;\n              if (p_S1 == 1 && p_S2 == 1 && p_O == 1) continue;\n              if (p_S1 == 0 && p_S3 == 0 && p_O == 0) continue;\n              if (p_S1 == 1 && p_S3 == 1 && p_O == 1) continue;\n              if (p_S2 == 0 && p_S3 == 0 && p_O == 0) continue;\n              if (p_S2 == 1 && p_S3 == 1 && p_O == 1) continue; */\n                \n                //std::cerr << \"p_S1: \" << p_S1 << \" ; p_S2: \" << p_S2 << \" ; p_S3: \" << p_S3 << std::endl;\n                //std::cerr << std::endl;\n                \n                \n                ABBA = (1-p_S1)*p_S2*p_S3*(1-p_O);\n                BABA = p_S1*(1-p_S2)*p_S3*(1-p_O);\n                BBAA = p_S1*p_S2*(1-p_S3)*(1-p_O);\n                \n                if (p_O != 0) {\n                    BAAB = p_S1*(1-p_S2)*(1-p_S3)*p_O;\n                    ABAB = (1-p_S1)*p_S2*(1-p_S3)*p_O;\n                    AABB = (1-p_S1)*(1-p_S2)*p_S3*p_O;\n                    \n                    ABBA = ABBA + BAAB; BABA = BABA + ABAB; BBAA = BBAA + AABB;\n                }\n                \n                trioInfos[i].ABBAtotal += ABBA; trioInfos[i].BABAtotal += BABA; trioInfos[i].BBAAtotal += BBAA;\n                \n                if (ABBA > 0.5 && (ABBA + BABA) == 0) {\n                    std::cerr << \"ABBA : \" << ABBA << std::endl;\n                    std::cerr << \"BABA : \" << BABA << std::endl;\n                    std::cerr << \"(ABBA + BABA): \" << (ABBA + BABA) << std::endl;\n                }\n                if ((ABBA + BABA) != 0) { trioInfos[i].usedVars[0]++; trioInfos[i].totalUsedVars[0]++;\n                    trioInfos[i].localD1num += ABBA - BABA; trioInfos[i].localD1denom += ABBA + BABA; }\n                if ((ABBA + BBAA) != 0) { 
trioInfos[i].usedVars[1]++; trioInfos[i].totalUsedVars[1]++;\n                    trioInfos[i].localD2num += ABBA - BBAA; trioInfos[i].localD2denom += ABBA + BBAA; }\n                if ((BBAA + BABA) != 0) { trioInfos[i].usedVars[2]++; trioInfos[i].totalUsedVars[2]++;\n                    trioInfos[i].localD3num += BBAA - BABA; trioInfos[i].localD3denom += BBAA + BABA; }\n                \n                \n                if (opt::KStest) {\n                    if (ABBA > 0.5) {\n                       // trioInfos[i].linearStrongABBApos[0].push_back(trioInfos[i].totalUsedVars[0]);\n                       // trioInfos[i].linearStrongABBApos[1].push_back(trioInfos[i].totalUsedVars[1]);\n                        trioInfos[i].numStrongVars[0]++; trioInfos[i].numStrongVars[1]++;\n                        trioInfos[i].linearStrongABBApos[0].push_back(totalVariantNumber);\n                        trioInfos[i].linearStrongABBAposStrongSitesOnly[0].push_back(trioInfos[i].numStrongVars[0]);\n                        trioInfos[i].linearStrongABBApos[1].push_back(totalVariantNumber);\n                        trioInfos[i].linearStrongABBAposStrongSitesOnly[1].push_back(trioInfos[i].numStrongVars[1]);\n                    }\n                    if (BABA > 0.5) {\n                        //trioInfos[i].linearStrongBABApos[0].push_back(trioInfos[i].totalUsedVars[0]);\n                        //trioInfos[i].linearStrongBABApos[2].push_back(trioInfos[i].totalUsedVars[2]);\n                        trioInfos[i].numStrongVars[0]++; trioInfos[i].numStrongVars[2]++;\n                        trioInfos[i].linearStrongBABApos[0].push_back(totalVariantNumber);\n                        trioInfos[i].linearStrongBABAposStrongSitesOnly[0].push_back(trioInfos[i].numStrongVars[0]);\n                        trioInfos[i].linearStrongBABApos[2].push_back(totalVariantNumber);\n                        trioInfos[i].linearStrongBABAposStrongSitesOnly[2].push_back(trioInfos[i].numStrongVars[2]);\n         
           }\n                    if (BBAA > 0.5) {\n                        //trioInfos[i].linearStrongABBApos[2].push_back(trioInfos[i].totalUsedVars[2]);\n                        //trioInfos[i].linearStrongBABApos[1].push_back(trioInfos[i].totalUsedVars[1]);\n                        trioInfos[i].numStrongVars[1]++; trioInfos[i].numStrongVars[2]++;\n                        trioInfos[i].linearStrongABBApos[2].push_back(totalVariantNumber);\n                        trioInfos[i].linearStrongABBAposStrongSitesOnly[2].push_back(trioInfos[i].numStrongVars[2]);\n                        trioInfos[i].linearStrongBABApos[1].push_back(totalVariantNumber);\n                        trioInfos[i].linearStrongBABAposStrongSitesOnly[1].push_back(trioInfos[i].numStrongVars[1]);\n                    }\n                }\n                \n                \n                if (opt::fStats) {\n                    \n                    // f_G\n                 //   int c_S1a = 0; int c_S1b = 0; int c_S2a = 0; int c_S2b = 0;int c_S3a = 0; int c_S3b = 0;\n                  //  c_S3a = allSplit1Counts[triosInt[i][2]]; c_S3b = allSplit2Counts[triosInt[i][2]];\n                  //  c_S2a = allSplit1Counts[triosInt[i][1]]; c_S2b = allSplit2Counts[triosInt[i][1]];\n                  //  c_S1a = allSplit1Counts[triosInt[i][0]]; c_S1b = allSplit2Counts[triosInt[i][0]];\n                    \n                    \n                    \n                    double p_S1a = 0; double p_S1b = 0; double p_S2a = 0; double p_S2b = 0; double p_S3a = 0; double p_S3b = 0;\n                    \n                    correctionP3 = allCorrectionFactors[triosInt[i][2]];\n                    \n                    p_S3a = allSplit1Ps[triosInt[i][2]]; p_S3b = allSplit2Ps[triosInt[i][2]];\n                    p_S2a = allSplit1Ps[triosInt[i][1]]; p_S2b = allSplit2Ps[triosInt[i][1]];\n                    p_S1a = allSplit1Ps[triosInt[i][0]]; p_S1b = allSplit2Ps[triosInt[i][0]];\n                    \n               
   //  std::cerr << \"p_S1a : \" << p_S1a << \"; p_S1b : \" << p_S1b << std::endl;\n                  //  std::cerr << \"p_S2a : \" << p_S2a << \"; p_S2b : \" << p_S2b << std::endl;\n                  //  std::cerr << \"p_S3a : \" << p_S3a << \"; p_S3b : \" << p_S3b << std::endl;\n                    \n                    assert(p_S1a >= 0); assert(p_S1b >= 0);\n                    assert(p_S2a >= 0); assert(p_S2b >= 0);\n                    assert(p_S3a >= 0); assert(p_S3b >= 0);\n                    \n                    \n                    double thisFgDenom1 = fG_Denom_perVariant(p_S1,p_S3a,p_S3b,p_O);\n                    double thisFgDenom1_rev = fG_Denom_perVariant(p_S2,p_S3a,p_S3b,p_O);\n                    \n                    trioInfos[i].F_G_denom1 += fG_Denom_perVariant(p_S1,p_S3a,p_S3b,p_O);\n                    trioInfos[i].F_G_denom1_reversed += fG_Denom_perVariant(p_S2,p_S3a,p_S3b,p_O);\n                    trioInfos[i].F_G_denom2 += fG_Denom_perVariant(p_S1,p_S2a,p_S2b,p_O);\n                    trioInfos[i].F_G_denom2_reversed += fG_Denom_perVariant(p_S3,p_S2a,p_S2b,p_O);\n                    trioInfos[i].F_G_denom3 += fG_Denom_perVariant(p_S3,p_S1a,p_S1b,p_O);\n                    trioInfos[i].F_G_denom3_reversed += fG_Denom_perVariant(p_S2,p_S1a,p_S1b,p_O);\n                    \n                    \n                    \n                    \n                    if (p_O != 0) {\n                        thisFgDenom1 += fG_Denom_perVariant(1-p_S1,1-p_S3a,1-p_S3b,1-p_O);\n                        thisFgDenom1_rev += fG_Denom_perVariant(1-p_S2,1-p_S3a,1-p_S3b,1-p_O);\n                        trioInfos[i].F_G_denom1 += fG_Denom_perVariant(1-p_S1,1-p_S3a,1-p_S3b,1-p_O);\n                        trioInfos[i].F_G_denom1_reversed += fG_Denom_perVariant(1-p_S2,1-p_S3a,1-p_S3b,1-p_O);\n                        trioInfos[i].F_G_denom2 += fG_Denom_perVariant(1-p_S1,1-p_S2a,1-p_S2b,1-p_O);\n                        trioInfos[i].F_G_denom2_reversed += 
fG_Denom_perVariant(1-p_S3,1-p_S2a,1-p_S2b,1-p_O);\n                        trioInfos[i].F_G_denom3 += fG_Denom_perVariant(1-p_S3,1-p_S1a,1-p_S1b,1-p_O);\n                        trioInfos[i].F_G_denom3_reversed += fG_Denom_perVariant(1-p_S2,1-p_S1a,1-p_S1b,1-p_O);\n                    }\n                    \n                    /* investigating rare cases of unexpected f4-ratio values\n                    if (thisFgDenom1 < 0) {\n                        errCount++;\n                        std::cerr << \"thisFgDenom1: \" << thisFgDenom1 << \" ; thisFgDenom1_rev: \" << thisFgDenom1_rev << std::endl;\n                        std::cerr << \"ABBA: \" << ABBA << \" ; BABA: \" << BABA << \" ; ABBA-BABA: \" << ABBA-BABA << std::endl;\n                        std::cerr << \"p_S1: \" << p_S1 << std::endl;\n                        std::cerr << \"p_S2: \" << p_S2 << std::endl;\n                        std::cerr << \"p_S3: \" << p_S3 << \"; p_S3a: \" << p_S3a << \" ; p_S3b: \" << p_S3b << std::endl;\n                        std::cerr << \"correctionP3: \" << correctionP3 << std::endl;\n                        print_vector(allPs, std::cerr);\n                        print_vector(allCorrectionFactors, std::cerr);\n                        std::cerr << \"p_O: \" << p_O << std::endl;\n                        std::cerr << std::endl;\n                        if (errCount > 10) {\n                            exit(1);\n                        }\n                    \n                    }\n                    */\n                    \n                    \n               /*\n                // Find which topology is in agreement with the counts of the BBAA, BABA, and ABBA patterns\n                                   if (BBAAtotal >= BABAtotal && BBAAtotal >= ABBAtotal) {\n                                       BBAAarrangement = P3isTrios2;\n                                   } else if (BABAtotal >= BBAAtotal && BABAtotal >= ABBAtotal) {\n                                       
BBAAarrangement = P3isTrios1;\n                                   } else if (ABBAtotal >= BBAAtotal && ABBAtotal >= BABAtotal) {\n                                       BBAAarrangement = P3isTrios0;\n                                   }\n                if (totalVariantNumber % reportProgressEvery == 0) {\n                    std::cerr << trios[0][0] << \"\\t\" << trios[0][1] << \"\\t\" << trios[0][2] << \"\\n\";\n                    std::cerr << \"p_S1a: \" << p_S1a << \" ; p_S1b: \" << p_S1b << std::endl;\n                    std::cerr << \"p_S2a: \" << p_S2a << \" ; p_S2b: \" << p_S2b << std::endl;\n                    std::cerr << \"p_S3a: \" << p_S3a << \" ; p_S3b: \" << p_S3b << std::endl;\n                    \n                    \n                    std::cerr << \"ABBA-BABA: \" << trioInfos[i].ABBAtotal-trioInfos[i].BABAtotal << \"; ABBA - BBAA: \" << trioInfos[i].ABBAtotal - trioInfos[i].BBAAtotal << \"; ABBA - BBAA: \" << trioInfos[i].BBAAtotal - trioInfos[i].BABAtotal << std::endl;\n                    std::cerr << \"trioInfos[i].F_G_denom1: \" << trioInfos[i].F_G_denom1 << \"; trioInfos[i].F_G_denom2: \" << trioInfos[i].F_G_denom2 << \"; trioInfos[i].F_G_denom3: \" << trioInfos[i].F_G_denom3 << std::endl;\n                    std::cerr << \"trioInfos[i].F_G_denom1_reversed: \" << trioInfos[i].F_G_denom1_reversed << \"; trioInfos[i].F_G_denom2_reversed: \" << trioInfos[i].F_G_denom2_reversed << \"; trioInfos[i].F_G_denom3_reversed: \" << trioInfos[i].F_G_denom3_reversed << std::endl;\n                    \n                    std::cerr << std::endl;\n                    } */\n                }\n                \n                // std::cerr << \"trioInfos[i].localD1num\" << trioInfos[i].localD1denom << std::endl;\n                if (opt::jkWindowSize > 0) {\n                    if (trioInfos[i].usedVars[0] == opt::jkWindowSize) { trioInfos[i].addRegionDs(P3isTrios2); }\n                    if (trioInfos[i].usedVars[1] == opt::jkWindowSize) { 
trioInfos[i].addRegionDs(P3isTrios1); }\n                    if (trioInfos[i].usedVars[2] == opt::jkWindowSize) { trioInfos[i].addRegionDs(P3isTrios0); }\n                }\n                // } */\n            }\n           // durationCalculation = ( clock() - startCalculation ) / (double) CLOCKS_PER_SEC;\n        }\n    }\n    std::cerr << \"Done processing VCF. Preparing output files...\" << '\\n';\n    \n    string header = makeHeader(false, opt::fStats, opt::KStest);\n    *outFileBBAA << header << std::endl; *outFileDmin << header << std::endl;\n    if (opt::treeFile != \"\") *outFileTree << header << std::endl;\n    \n    int exceptionCount = 0;\n    for (int i = 0; i != trios.size(); i++) { //\n        // Get the D values\n        try {\n            trioInfos[i].calculateFinalDs();\n        } catch (const char* msg) {\n            exceptionCount++;\n            if (exceptionCount <= 10) {\n                std::cerr << msg << std::endl;\n                std::cerr << \"Could not calculate p-values for the trio: \" << trios[i][0] << \" \" << trios[i][1] << \" \" << trios[i][2] << std::endl;\n                if (opt::jkWindowSize > 0) std::cerr << \"You should probably decrease the jackknife block size (-j option)\" << std::endl;\n                else std::cerr << \"it looks like there aren't enough ABBA-BABA informative variants for this trio\" << std::endl;\n                std::cerr << std::endl;\n            }\n            trioInfos[i].D1_p = nan(\"\"); trioInfos[i].D2_p = nan(\"\"); trioInfos[i].D3_p = nan(\"\");\n        }\n        \n        // Find which topology is in agreement with the counts of BBAA, BABA, and ABBA\n        trioInfos[i].assignBBAAarrangement();\n        std::vector<string> BBAAoutVec = trioInfos[i].makeOutVec(trios[i], opt::fStats, opt::KStest, trioInfos[i].BBAAarrangement);\n        print_vector(BBAAoutVec,*outFileBBAA);\n        \n        // Find Dmin:\n        trioInfos[i].assignDminArrangement();\n        std::vector<string> 
DminOutVec = trioInfos[i].makeOutVec(trios[i], opt::fStats, opt::KStest, trioInfos[i].DminArrangement);\n        print_vector(DminOutVec,*outFileDmin);\n        \n        // Find which arrangement of trios is consistent with the input tree (if provided):\n        if (opt::treeFile != \"\") {\n            int loc1 = treeTaxonNamesToLoc[trios[i][0]][0];\n            int loc2 = treeTaxonNamesToLoc[trios[i][1]][0];\n            int loc3 = treeTaxonNamesToLoc[trios[i][2]][0];\n            trioInfos[i].treeArrangement = trioInfos[i].assignTreeArrangement(treeLevels, loc1, loc2, loc3);\n            std::vector<string> treeOutVec = trioInfos[i].makeOutVec(trios[i], opt::fStats, opt::KStest, trioInfos[i].treeArrangement);\n            print_vector(treeOutVec,*outFileTree);\n        }\n        \n        // Output a simple file that can be used for combining multiple local runs:\n        if (opt::combine) {\n            *outFileCombine << trios[i][0] << \"\\t\" << trios[i][1] << \"\\t\" << trios[i][2] << \"\\t\" << trioInfos[i].BBAAtotal << \"\\t\" << trioInfos[i].BABAtotal << \"\\t\" << trioInfos[i].ABBAtotal;\n            if (opt::fStats) {\n                *outFileCombine << \"\\t\" << trioInfos[i].F_G_denom1 << \"\\t\" << trioInfos[i].F_G_denom2 << \"\\t\" << trioInfos[i].F_G_denom3;\n                *outFileCombine << \"\\t\" << trioInfos[i].F_G_denom1_reversed << \"\\t\" << trioInfos[i].F_G_denom2_reversed << \"\\t\" << trioInfos[i].F_G_denom3_reversed;\n                *outFileCombine << std::endl;\n            } else {\n                *outFileCombine << std::endl;\n            }\n            print_vector(trioInfos[i].regionDs[0], *outFileCombineStdErr, ',', false); *outFileCombineStdErr << \"\\t\"; print_vector(trioInfos[i].regionDs[1], *outFileCombineStdErr, ',', false); *outFileCombineStdErr << \"\\t\";\n            print_vector(trioInfos[i].regionDs[2], *outFileCombineStdErr, ',',false); *outFileCombineStdErr << std::endl;\n        }\n        //std::cerr << 
trios[i][0] << \"\\t\" << trios[i][1] << \"\\t\" << trios[i][2] << \"\\t\" << D1 << \"\\t\" << D2 << \"\\t\" << D3 << \"\\t\" << BBAAtotals[i] << \"\\t\" << BABAtotals[i] << \"\\t\" << ABBAtotals[i] << std::endl;\n    }\n    if (exceptionCount > 10) {\n        std::cerr << \"...\" << std::endl;\n        std::cerr << \"p-value could not be calculated for \" << exceptionCount << \" trios\" << std::endl;\n        if (opt::jkWindowSize > 0) std::cerr << \"You should probably decrease the jackknife block size (-j option)\" << std::endl;\n        else std::cerr << \"it looks like there aren't enough ABBA-BABA informative variants for these trios\" << std::endl;\n        std::cerr << \"If this was a run for a subset of the genome (e.g. one chromosome), you may still get p-values for these trios from DtriosCombine\" << std::endl;\n        std::cerr << std::endl;\n    }\n    return 0;\n    \n}\n\n\n\nvoid parseDminOptions(int argc, char** argv) {\n    bool die = false; string regionArgString; std::vector<string> regionArgs;\n    for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)\n    {\n        std::istringstream arg(optarg != NULL ? 
optarg : \"\");\n        switch (c)\n        {\n            case '?': die = true; break;\n            case 'n': arg >> opt::runName; break;\n            case 't': arg >> opt::treeFile; break;\n            case 'j': arg >> opt::jkWindowSize; break;\n            case 'k': arg >> opt::jkNum; break;\n            case OPT_NO_F4: opt::fStats = false; break;\n            case OPT_KS_TEST: opt::KStest = true; break;\n            case 'c': opt::combine = false; break;\n            case 'g': opt::useGenotypeProbabilities = true; break;\n            case 'l': arg >> opt::providedNumLines; break;\n            case 'o': arg >> opt::providedOutPrefix; break;\n            case 'p': opt::poolSeq = true; arg >> opt::poolMinDepth; break;\n            case 'r': arg >> regionArgString; regionArgs = split(regionArgString, ',');\n                if (regionArgs.size() != 2) {\n                    std::cerr << \"the --region argument should be two numbers separated by a comma\\n\";\n                    die = true;\n                } else {\n                    opt::regionStart = (int)stringToDouble(regionArgs[0]); opt::regionLength = (int)stringToDouble(regionArgs[1]);  break;\n                }\n            case 'h':\n                std::cout << DMIN_USAGE_MESSAGE;\n                exit(EXIT_SUCCESS);\n        }\n    }\n    \n    int maxNumArgs = 2; int minNumArgs = 2; // if (opt::poolSeq) { minNumArgs = 1; }\n    \n    if (opt::poolSeq && opt::useGenotypeProbabilities) {\n        std::cerr << \"Error: The -p and -g options are not compatible. Please check your command line. 
Exiting ....\\n\";\n        die = true;\n    }\n    \n    if (argc - optind < minNumArgs) {\n        std::cerr << \"missing arguments\\n\";\n        die = true;\n    }\n    else if (argc - optind > maxNumArgs)\n    {\n        std::cerr << \"too many arguments\\n\";\n        die = true;\n    }\n    \n    if (die) {\n        std::cout << \"\\n\" << DMIN_USAGE_MESSAGE;\n        exit(EXIT_FAILURE);\n    }\n    \n    // Parse the input filenames\n    opt::vcfFile = argv[optind++];\n    opt::setsFile = argv[optind++];\n    \n    if (opt::vcfFile == \"stdin\" && opt::providedNumLines <= 0) {\n        std::cerr << \"If you want to read the VCF via a pipe, you need to specify the number of lines in the input via the -l option\\n\";\n        std::cerr << \"See the example above\\n\";\n        die = true;\n    }\n    \n    if (die) {\n        std::cout << \"\\n\" << DMIN_USAGE_MESSAGE;\n        exit(EXIT_FAILURE);\n    }\n}\n\n"
  },
  {
    "path": "Dmin.h",
    "content": "//\n//  Dmin.h\n//  Dsuite\n//\n//  Created by Milan Malinsky on 02/04/2019.\n//\n\n#ifndef Dmin_h\n#define Dmin_h\n#include \"Dsuite_utils.h\"\n\n\nvoid parseDminOptions(int argc, char** argv);\nint DminMain(int argc, char** argv);\n\n\n\n#endif /* Dmin_h */\n"
  },
  {
    "path": "Dmin_combine.cpp",
    "content": "//\n//  Dmin_combine.cpp\n//  Dsuite\n//\n//  Created by Milan Malinsky on 11/04/2019.\n//\n\n#include \"Dmin_combine.h\"\n#include \"Dsuite_common.h\"\n\n#define SUBPROGRAM \"DtriosCombine\"\n\n#define DEBUG 1\n\nstatic const char *DMINCOMBINE_USAGE_MESSAGE =\n\"Usage: \" PROGRAM_BIN \" \" SUBPROGRAM \" [OPTIONS] DminFile1 DminFile2 DminFile3 ....\\n\"\n\"Combine the BBAA, ABBA, and BABA counts from multiple files (e.g per-chromosome) and output the overall D stats,\\n\"\n\"p-values and f4-ratio values\\n\"\n\"\\n\"\n\"       -h, --help                              display this help and exit\\n\"\n\"       -o, --out-prefix=OUT_FILE_PREFIX        (optional) the prefix for the files where the results should be written\\n\"\n\"                                               output will be put in OUT_FILE_PREFIX_combined_BBAA.txt, OUT_FILE_PREFIX_combined_Dmin.txt, OUT_FILE_PREFIX_combined_tree.txt etc.\\n\"\n\"                                               by default, the prefix is \\\"out\\\"\\n\"\n\"       -n, --run-name                          (optional) run-name will be included in the output file name after the PREFIX\\n\"\n\"       -t , --tree=TREE_FILE.nwk               (optional) a file with a tree in the newick format specifying the relationships between populations/species\\n\"\n\"                                               D and f4-ratio values for trios arranged according to the tree will be output in a file with _tree.txt suffix\\n\"\n\"       -s , --subset=start,length              (optional) only process a subset of the trios\\n\"\n\"\\n\"\n\"\\nReport bugs to \" PACKAGE_BUGREPORT \"\\n\\n\";\n\n\nstatic const char* shortopts = \"hn:t:s:o:\";\n\nstatic const struct option longopts[] = {\n    { \"subset\",   required_argument, NULL, 's' },\n    { \"out-prefix\",   required_argument, NULL, 'o' },\n    { \"run-name\",   required_argument, NULL, 'n' },\n    { \"tree\",   required_argument, NULL, 't' },\n    { \"help\",   no_argument, 
NULL, 'h' },\n    { NULL, 0, NULL, 0 }\n};\n\nnamespace opt\n{\n    static std::vector<string> dminFiles;\n    static string providedOutPrefix = \"out\";\n    static string runName = \"\";\n    static string treeFile = \"\";\n    int subsetStart = -1;\n    int subsetLength = -1;\n}\n\n\nint DminCombineMain(int argc, char** argv) {\n    parseDminCombineOptions(argc, argv);\n    const bool KStestPossible = false;\n    \n    string line; // for reading the input files\n    \n    \n    string outFileRoot = prepareOutFileRootString(opt::providedOutPrefix, opt::runName, \"\", -1, -1);\n    \n    std::vector<std::istream*> dminstdErrFiles; std::vector<std::istream*> dminBBAAscoreFiles;\n    for (int i = 0; i < opt::dminFiles.size(); i++) {\n        std::istream* dminBBAAscoreFile;\n        if (file_exists(opt::dminFiles[i] + \"_combine.txt\")) {\n            dminBBAAscoreFile = createReader((opt::dminFiles[i] + \"_combine.txt\").c_str());\n        } else if(file_exists(opt::dminFiles[i] + \"_combine.txt.gz\")) {\n            dminBBAAscoreFile = createReader((opt::dminFiles[i] + \"_combine.txt.gz\").c_str());\n        } else {\n            std::cerr << \"Can't find the file: \" << opt::dminFiles[i] + \"_combine.txt\" << \" or \" << opt::dminFiles[i] + \"_combine.txt.gz. Exiting...\" << std::endl;\n            exit(EXIT_FAILURE);\n        }\n        dminBBAAscoreFiles.push_back(dminBBAAscoreFile);\n        std::istream* dminstdErrFile;\n        if (file_exists(opt::dminFiles[i] + \"_combine_stderr.txt\")) {\n            dminstdErrFile = createReader((opt::dminFiles[i] + \"_combine_stderr.txt\").c_str());\n        } else if(file_exists(opt::dminFiles[i] + \"_combine_stderr.txt.gz\")) {\n            dminstdErrFile = createReader((opt::dminFiles[i] + \"_combine_stderr.txt.gz\").c_str());\n        } else {\n            std::cerr << \"Can't find the file: \" << opt::dminFiles[i] + \"_combine_stderr.txt\" << \" or \" << opt::dminFiles[i] + \"_combine_stderr.txt.gz. 
Exiting...\" << std::endl;\n            exit(EXIT_FAILURE);\n        }\n        dminstdErrFiles.push_back(dminstdErrFile);\n        std::cerr << \"Reading file \" << opt::dminFiles[i] << std::endl;\n    }\n    \n    std::istream* treeFile; std::ofstream* outFileTree;\n    std::map<string,std::vector<int>> treeTaxonNamesToLoc; std::vector<int> treeLevels;\n    if (opt::treeFile != \"\") {\n        treeFile = new std::ifstream(opt::treeFile.c_str());\n        if (!treeFile->good()) { std::cerr << \"The file \" << opt::treeFile << \" could not be opened. Exiting...\" << std::endl; exit(1);}\n        outFileTree = new std::ofstream(outFileRoot + \"_combined_tree.txt\");\n        getline(*treeFile, line);\n        assignTreeLevelsAndLinkToTaxa(line,treeTaxonNamesToLoc,treeLevels);\n    }\n    // Now get the standard error values\n    std::ofstream* outFileBBAA = new std::ofstream(outFileRoot + \"_combined_BBAA.txt\"); std::ofstream* outFileDmin = new std::ofstream(outFileRoot + \"_combined_Dmin.txt\");\n    \n    std::vector<double> BBAA_local_Ds; std::vector<double> ABBA_local_Ds; std::vector<double> BABA_local_Ds;\n    string s1; string s2; string s3;\n    bool allDone = false; bool fIncluded = false;\n    int processedTriosNumber = 0; int exceptionCount = 0;\n    \n    getline(*dminBBAAscoreFiles[0], line); std::vector<string> patternCounts = split(line, '\\t');\n    if (patternCounts.size() == 12) fIncluded = true;\n    string header = makeHeader(false,fIncluded,KStestPossible);\n    *outFileBBAA << header << std::endl; *outFileDmin << header << std::endl;\n    if (opt::treeFile != \"\") *outFileTree << header << std::endl;\n    dminBBAAscoreFiles[0]->seekg(0, dminBBAAscoreFiles[0]->beg); // Go back to the beginning of this file\n    \n    do {\n        TrioDinfo info; processedTriosNumber++;\n        if (processedTriosNumber % 10000 == 0) { std::cerr << \"Processed \" << processedTriosNumber << \" trios\" << std::endl; }\n        \n        if (opt::subsetStart != 
-1) {\n            if (processedTriosNumber < opt::subsetStart) {\n                for (int i = 0; i < dminBBAAscoreFiles.size(); i++) { getline(*dminBBAAscoreFiles[i], line); }\n                for (int i = 0; i < dminstdErrFiles.size(); i++) { getline(*dminstdErrFiles[i], line); }\n                continue;\n            }\n            if (processedTriosNumber >= (opt::subsetStart+opt::subsetLength)) {\n                std::cerr << \"DONE\" << std::endl; break;\n            }\n        }\n        \n        \n        for (int i = 0; i < dminBBAAscoreFiles.size(); i++) {\n            if (getline(*dminBBAAscoreFiles[i], line)) {\n                std::vector<string> patternCounts = split(line, '\\t');\n                assert(patternCounts.size() == 6 || patternCounts.size() == 12);\n\n                if (i == 0) {\n                    s1 = patternCounts[0]; s2 = patternCounts[1]; s3 = patternCounts[2];\n                } else {\n                    assert(s1 == patternCounts[0]); assert(s2 == patternCounts[1]); assert(s3 == patternCounts[2]);\n                }\n                info.BBAAtotal += stringToDouble(patternCounts[3]);\n                info.BABAtotal += stringToDouble(patternCounts[4]);\n                info.ABBAtotal += stringToDouble(patternCounts[5]);\n                if (fIncluded) {\n                    info.F_G_denom1 += stringToDouble(patternCounts[6]);\n                    info.F_G_denom2 += stringToDouble(patternCounts[7]);\n                    info.F_G_denom3 += stringToDouble(patternCounts[8]);\n                    info.F_G_denom1_reversed += stringToDouble(patternCounts[9]);\n                    info.F_G_denom2_reversed += stringToDouble(patternCounts[10]);\n                    info.F_G_denom3_reversed += stringToDouble(patternCounts[11]);\n                }\n            } else {\n                allDone = true; break;\n            }\n        }\n        \n        for (int i = 0; i < dminstdErrFiles.size(); i++) {\n            if 
(getline(*dminstdErrFiles[i], line)) {\n                std::vector<string> localDs = split2(line, \"\\t\");\n                //assert(localDs.size() == 3 || localDs.size() == 0);\n                if (localDs.size() == 3) {\n                    std::vector<string> regionD_strings0 = split(localDs[0], ',');\n                    std::vector<string> regionD_strings1 = split(localDs[1], ',');\n                    std::vector<string> regionD_strings2 = split(localDs[2], ',');\n                    for (int j = 0; j < regionD_strings0.size(); j++) {\n                        double localD = stringToDouble(regionD_strings0[j]);\n                        if (!std::isnan(localD)) info.regionDs[0].push_back(localD);\n                    }\n                    for (int j = 0; j < regionD_strings1.size(); j++) {\n                        double localD = stringToDouble(regionD_strings1[j]);\n                        if (!std::isnan(localD)) info.regionDs[1].push_back(localD);\n                    }\n                    for (int j = 0; j < regionD_strings2.size(); j++) {\n                        double localD = stringToDouble(regionD_strings2[j]);\n                        if (!std::isnan(localD)) info.regionDs[2].push_back(localD);\n                    }\n                } else {\n                    print_vector(localDs,std::cerr); exit(EXIT_FAILURE);\n                }\n            } else {\n                allDone = true; break;\n            }\n        }\n        \n        \n        if (!allDone) {\n            try {\n                info.calculateFinalDs();\n            } catch (const char* msg) {\n                exceptionCount++;\n                if (exceptionCount <= 10) {\n                    std::cerr << msg << std::endl;\n                    std::cerr << \"Could not calculate p-values for the trio: \" << s1 << \" \" << s2 << \" \" << s3 << std::endl;\n                    std::cerr << \"You should probably decrease the jackknife block size (-j option)\" << std::endl;\n   
                 std::cerr << std::endl;\n                }\n                info.D1_p = nan(\"\"); info.D2_p = nan(\"\"); info.D3_p = nan(\"\");\n            }\n            \n            std::vector<string> trio; trio.push_back(s1); trio.push_back(s2); trio.push_back(s3);\n            // Find which topology is in agreement with the counts of BBAA, BABA, and ABBA\n            info.assignBBAAarrangement();\n            std::vector<string> BBAAoutVec = info.makeOutVec(trio, fIncluded,KStestPossible, info.BBAAarrangement);\n            print_vector(BBAAoutVec,*outFileBBAA);\n           \n            // Find Dmin:\n            info.assignDminArrangement();\n            std::vector<string> DminOutVec = info.makeOutVec(trio, fIncluded, KStestPossible,info.DminArrangement);\n            print_vector(DminOutVec,*outFileDmin);\n            \n            if (opt::treeFile != \"\") {\n                int loc1 = treeTaxonNamesToLoc[s1][0]; int loc2 = treeTaxonNamesToLoc[s2][0]; int loc3 = treeTaxonNamesToLoc[s3][0];\n                info.treeArrangement = info.assignTreeArrangement(treeLevels, loc1, loc2, loc3);\n                std::vector<string> treeOutVec = info.makeOutVec(trio, fIncluded, KStestPossible, info.treeArrangement);\n                print_vector(treeOutVec,*outFileTree);\n            }\n        }\n \n    } while(!allDone);\n    \n    if (exceptionCount > 10) {\n        std::cerr << \"...\" << std::endl;\n        std::cerr << \"p-value could not be calculated for \" << exceptionCount << \" trios\" << std::endl;\n        std::cerr << \"You should definitely decrease the jackknife block size!!!\" << std::endl;\n        std::cerr << std::endl;\n    }\n    \n    return 0;\n    \n}\n\n\n\nvoid parseDminCombineOptions(int argc, char** argv) {\n    bool die = false; string subsetArgString; std::vector<string> subsetArgs;\n    for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)\n    {\n        std::istringstream arg(optarg != NULL ? 
optarg : \"\");\n        switch (c)\n        {\n            case '?': die = true; break;\n            case 'n': arg >> opt::runName; break;\n            case 't': arg >> opt::treeFile; break;\n            case 'o': arg >> opt::providedOutPrefix; break;\n            case 's': arg >> subsetArgString; subsetArgs = split(subsetArgString, ',');\n                opt::subsetStart = (int)stringToDouble(subsetArgs[0]); opt::subsetLength = (int)stringToDouble(subsetArgs[1]);  break;\n            case 'h':\n                std::cout << DMINCOMBINE_USAGE_MESSAGE;\n                exit(EXIT_SUCCESS);\n        }\n    }\n    \n    \n    int nFilenames = argc - optind;\n    if (nFilenames < 1) {\n        std::cerr << \"missing arguments\\n\";\n        die = true;\n    }\n    \n    if (die) {\n        std::cout << \"\\n\" << DMINCOMBINE_USAGE_MESSAGE;\n        exit(EXIT_FAILURE);\n    }\n    \n    // Parse the input filenames\n    while (optind < argc) {\n        opt::dminFiles.push_back(argv[optind++]);\n    }\n}\n"
  },
  {
    "path": "Dmin_combine.h",
    "content": "//\n//  Dmin_combine.h\n//  Dsuite\n//\n//  Created by Milan Malinsky on 11/04/2019.\n//\n\n#ifndef Dmin_combine_h\n#define Dmin_combine_h\n\n#include \"Dsuite_utils.h\"\n\nvoid parseDminCombineOptions(int argc, char** argv);\nint DminCombineMain(int argc, char** argv);\n\n#endif /* Dmin_combine_h */\n"
  },
  {
    "path": "Dquartets.cpp",
    "content": "//\n//  Dquartets.cpp\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 14/07/2020.\n//\n\n#include \"Dquartets.h\"\n#include \"Dsuite_common.h\"\n\n#define SUBPROGRAM \"Dquartets\"\n\n#define DEBUG 0\n#define MIN_SETS 4\n\nstatic const char *DQUARTS_USAGE_MESSAGE =\n\"Usage: \" PROGRAM_BIN \" \" SUBPROGRAM \" [OPTIONS] INPUT_FILE.vcf SETS.txt\\n\"\n\"Calculate the D (ABBA/BABA) and f4-ratio (f_G) statistics for all quartets of species in the dataset (there is no outgroup)\\n\"\n\"The results are as defined in Patterson et al. 2012\\n\"\n\"The SETS.txt should have two columns: SAMPLE_ID    SPECIES_ID\\n\"\n\"\\n\"\nstdInInfo\n\"       -h, --help                              display this help and exit\\n\"\n\"       -k, --JKnum                             (default=20) the number of Jackknife blocks to divide the dataset into; should be at least 20 for the whole dataset\\n\"\n\"       -j, --JKwindow                          (default=NA) Jackknife block size in number of informative SNPs (as used in v0.2)\\n\"\n\"                                               when specified, this is used in place of the --JKnum option\\n\"\nregionOption    // -r\ntreeOption      // -t\noutOption       // -o\n\"       -n, --run-name                          (optional; default=quartets) run-name will be included in the output file name after the PREFIX\\n\"\n\"       --no-f4-ratio                           (optional) don't calculate the f4-ratio\\n\"\n\"       -l NUMLINES                             (optional) the number of lines in the VCF input - required if reading the VCF via a unix pipe\\n\"\n\"       -a, --allF4-ratios                      (optional) output F4 ratios for all possible arrangements\\n\"\n\"\\n\"\n\"\\nReport bugs to \" PACKAGE_BUGREPORT \"\\n\\n\";\n\nenum { OPT_NO_F4 };\nstatic const char* shortopts = \"hr:n:t:j:fpk:l:o:a\";\n\nstatic const struct option longopts[] = {\n    { \"run-name\",   required_argument, NULL, 'n' },\n    { 
\"out-prefix\",   required_argument, NULL, 'o' },\n    { \"region\",   required_argument, NULL, 'r' },\n    { \"tree\",   required_argument, NULL, 't' },\n    { \"JKwindow\",   required_argument, NULL, 'j' },\n    { \"JKnum\",   required_argument, NULL, 'k' },\n    { \"help\",   no_argument, NULL, 'h' },\n    { \"no-f4-ratio\",   no_argument, NULL, OPT_NO_F4 },\n    { \"allF4-ratios\",   no_argument, NULL, 'a' },\n    { NULL, 0, NULL, 0 }\n};\n\nnamespace opt\n{\n    static string vcfFile;\n    static string setsFile;\n    static string treeFile = \"\";\n    static string runName = \"quartets\";\n    static string providedOutPrefix = \"\";\n    static int jkWindowSize = 0;\n    static int jkNum = 20;\n    static int regionStart = -1;\n    static int regionLength = -1;\n    static int providedNumLines = -1;\n    static bool fStats = true;\n    static bool allF4 = false;\n}\n\n\nint DquartetsMain(int argc, char** argv) {\n    parseDquartetsOptions(argc, argv);\n    string line; // for reading the input files\n    string outFileRoot = prepareOutFileRootString(opt::providedOutPrefix, opt::runName, opt::setsFile, opt::regionStart, opt::regionLength);\n    \n    std::istream* treeFile; std::ofstream* outFileTree;\n    std::map<string,std::vector<int>> treeTaxonNamesToLoc; std::vector<int> treeLevels;\n    if (opt::treeFile != \"\") {\n        treeFile = new std::ifstream(opt::treeFile.c_str());\n        if (!treeFile->good()) { std::cerr << \"The file \" << opt::treeFile << \" could not be opened. 
Exiting...\" << std::endl; exit(1);}\n        outFileTree = new std::ofstream(outFileRoot+ \"_\" + opt::runName + \"_tree.txt\");\n        getline(*treeFile, line);\n        assignTreeLevelsAndLinkToTaxa(line,treeTaxonNamesToLoc,treeLevels);\n        //for (std::map<string,std::vector<int>>::iterator it = treeTaxonNamesToLoc.begin(); it != treeTaxonNamesToLoc.end(); ++it) {\n        //    std::cout << \"{\" << it->first << \"}\\n\";\n        // }\n    }\n    \n    int VCFlineCount = assignNumLinesToAnalyse(opt::providedNumLines, opt::regionLength, opt::vcfFile);\n    \n    std::istream* vcfFile;\n    if (opt::vcfFile == \"stdin\") { vcfFile = &std::cin; }\n    else { vcfFile = createReader(opt::vcfFile.c_str()); }\n    \n    // Get the sample sets\n    SetInformation setInfo(opt::setsFile, MIN_SETS, OutgroupNotRequired);\n    \n    std::ofstream* outFileBBAA = new std::ofstream(outFileRoot+\"_BBAA.txt\"); assertFileOpen(*outFileBBAA, outFileRoot+\"_BBAA.txt\");\n    std::ofstream* outFileDmin = new std::ofstream(outFileRoot+\"_Dmin.txt\"); assertFileOpen(*outFileDmin, outFileRoot+\"_Dmin.txt\");\n    std::ofstream* outFileCombine = new std::ofstream(outFileRoot+\"_combine.txt\"); assertFileOpen(*outFileCombine, outFileRoot+\"_combine.txt\");\n    std::ofstream* outFileCombineStdErr = new std::ofstream(outFileRoot+\"_combine_stderr.txt\");\n    assertFileOpen(*outFileCombineStdErr, outFileRoot+\"_combine_stderr.txt\");\n\n    \n    int nCombinations = nChoosek((int)setInfo.populations.size(),4);\n    if (opt::fStats) std::cerr << \"Going to calculate D and f4-ratio values for \" << nCombinations << \" quartets\" << std::endl;\n    else std::cerr << \"Going to calculate D values for \" << nCombinations << \" quartets\" << std::endl;\n    \n    if (opt::treeFile != \"\") { // Check that the tree contains all the populations/species\n        setInfo.checkIfTreeNamesMatch(treeTaxonNamesToLoc);\n    }\n    \n    // first, get all combinations of four sets (species):\n    
std::vector<std::vector<string>> quartets; quartets.resize(nCombinations);\n    std::vector<std::vector<int>> quartetsInt; quartetsInt.resize(nCombinations);\n    std::vector<bool> v(setInfo.populations.size()); std::fill(v.begin(), v.begin() + 4, true); // prepare a selection vector\n    int pNum = 0;\n    do {\n        for (int i = 0; i < v.size(); ++i) {\n            if (v[i]) { quartets[pNum].push_back(setInfo.populations[i]); quartetsInt[pNum].push_back(i); }\n        } pNum++;\n    } while (std::prev_permutation(v.begin(), v.end())); // Getting all permutations of the selection vector - so it selects all combinations\n    std::cerr << \"Done permutations\" << std::endl;\n    \n    // Create objects to hold the results for each quartet\n    std::vector<QuartetDinfo> quartetInfos(nCombinations); for (int i = 0; i < nCombinations; i++) {\n        QuartetDinfo info; quartetInfos[i] = info;\n    }\n    \n    // If a tree was supplied, check the tree arrangement for each trio...\n    if (opt::treeFile != \"\") {\n        for (int i = 0; i != quartets.size(); i++) {\n            int loc1 = treeTaxonNamesToLoc[quartets[i][0]][0];\n            int loc2 = treeTaxonNamesToLoc[quartets[i][1]][0];\n            int loc3 = treeTaxonNamesToLoc[quartets[i][2]][0];\n            int loc4 = treeTaxonNamesToLoc[quartets[i][3]][0];\n            quartetInfos[i].treeArrangement = quartetInfos[i].assignQuartetTreeArrangement(treeLevels, loc1, loc2, loc3,loc4);\n        }\n    }\n    \n    // And need to prepare the vectors to hold allele frequency values:\n    std::vector<double> allPs(setInfo.populations.size(),0.0);\n    std::vector<double> allSplit1Ps(setInfo.populations.size(),0.0); std::vector<int> allSplit1Counts(setInfo.populations.size(),0);\n    std::vector<double> allSplit2Ps(setInfo.populations.size(),0.0); std::vector<int> allSplit2Counts(setInfo.populations.size(),0);\n    \n    int totalVariantNumber = 0;\n    std::vector<string> sampleNames; std::vector<std::string> 
fields;\n    // Find out how often to report progress, based on the number of trios\n    int reportProgressEvery; if (nCombinations < 1000) reportProgressEvery = 100000;\n    else if (nCombinations < 100000) reportProgressEvery = 10000;\n    else reportProgressEvery = 1000;\n    clock_t start; clock_t startGettingCounts; clock_t startCalculation;\n    double durationOverall; double durationGettingCounts; double durationCalculation;\n    int JKblockSizeBasedOnNum = 0;\n    \n    while (getline(*vcfFile, line)) {\n        line.erase(std::remove(line.begin(), line.end(), '\\r'), line.end()); // Deal with any left over \\r from files prepared on Windows\n        if (line[0] == '#' && line[1] == '#') {\n            VCFlineCount--; continue;\n        } else if (line[0] == '#' && line[1] == 'C') {\n            VCFlineCount--; JKblockSizeBasedOnNum = (VCFlineCount/opt::jkNum)-1;\n            printInitialMessageTriosQuartets(opt::regionLength, VCFlineCount, JKblockSizeBasedOnNum, opt::jkWindowSize, opt::jkNum);\n            fields = split(line, '\\t');\n            std::vector<std::string> sampleNames(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());\n            setInfo.linkSetsAndVCFpositions(sampleNames);\n            start = clock();\n            //  std::cerr << \" \" << std::endl;\n            //  std::cerr << \"Outgroup at pos: \"; print_vector_stream(speciesToPosMap[\"Outgroup\"], std::cerr);\n        } else {\n            totalVariantNumber++;\n            if (opt::regionStart != -1) {\n                if (totalVariantNumber < opt::regionStart)\n                    continue;\n                if (totalVariantNumber > (opt::regionStart+opt::regionLength)) {\n                    std::cerr << \"DONE\" << std::endl; break;\n                }\n            }\n            if (totalVariantNumber % JKblockSizeBasedOnNum == 0 && opt::jkWindowSize == 0) {\n                for (int i = 0; i != quartets.size(); i++) {\n                    
quartetInfos[i].addRegionDs(P3isTrios2); quartetInfos[i].addRegionDs(P3isTrios1); quartetInfos[i].addRegionDs(P3isTrios0);\n                }\n            }\n            if (totalVariantNumber % reportProgressEvery == 0) {\n                durationOverall = ( clock() - start ) / (double) CLOCKS_PER_SEC;\n                std::cerr << \"Processed \" << totalVariantNumber << \" variants (\" << ((double)totalVariantNumber/VCFlineCount)*100 << \"%) in \" << durationOverall << \"secs\" << std::endl;\n                //std::cerr << \"GettingCounts \" << durationGettingCounts << \" calculation \" << durationCalculation << \"secs\" << std::endl;\n            }\n            fields = split(line, '\\t');\n            std::vector<std::string> genotypes(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());\n\n            // Only consider biallelic SNPs\n            string refAllele = fields[3]; string altAllele = fields[4];\n            if (refAllele.length() > 1 || altAllele.length() > 1 || altAllele == \"*\") {\n                refAllele.clear(); refAllele.shrink_to_fit(); altAllele.clear(); altAllele.shrink_to_fit();\n                genotypes.clear(); genotypes.shrink_to_fit(); continue;\n            }\n            \n            startGettingCounts = clock();\n            if (opt::fStats)  {\n                GeneralSetCountsWithSplits* c = new GeneralSetCountsWithSplits(setInfo.popToPosMap, (int)genotypes.size());\n                c->getSplitCountsNew(genotypes, setInfo.posToPopMap);\n                for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {\n                    try {\n                        allPs[i] = c->setAAFs.at(setInfo.populations[i]);\n                        allSplit1Ps[i] = c->setAAFsplit1.at(setInfo.populations[i]);\n                        allSplit2Ps[i] = c->setAAFsplit2.at(setInfo.populations[i]);\n                        allSplit1Counts[i] = c->setAlleleCountsSplit1.at(setInfo.populations[i]);\n                      
  allSplit2Counts[i] = c->setAlleleCountsSplit2.at(setInfo.populations[i]);\n                       // std::cerr << \"species[i] \" << species[i] << \"; allPs[i] \" << allPs[i] << \" ; c->setDAFs[species[i]] \" << c->setDAFs[0] << std::endl;\n                    } catch (const std::out_of_range& oor) {\n                        std::cerr << \"Counts are missing some info for \" << setInfo.populations[i] << std::endl;\n                    }\n                }\n                delete c;\n            } else {\n                GeneralSetCounts* c = (GeneralSetCountsWithSplits*) new GeneralSetCounts(setInfo.popToPosMap, (int)genotypes.size());\n                c->getSetVariantCounts(genotypes, setInfo.posToPopMap);\n                for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {\n                    allPs[i] = c->setAAFs.at(setInfo.populations[i]);\n                 //   std::cerr << \"species[i] \" << species[i] << \"; allPs[i] \" << allPs[i] << std::endl;\n                }\n                delete c;\n            }\n            genotypes.clear(); genotypes.shrink_to_fit();\n            durationGettingCounts = ( clock() - startGettingCounts ) / (double) CLOCKS_PER_SEC;\n            \n            startCalculation = clock();\n            // Now calculate the D stats:\n            double p_S1; double p_S2; double p_S3; double p_S4; double ABBA; double BABA; double BBAA; double BAAB; double ABAB; double AABB;\n            for (int i = 0; i != quartets.size(); i++) {\n                p_S1 = allPs[quartetsInt[i][0]];\n             //   std::cerr << \"p_S1 \" << p_S1 << std::endl;\n                if (p_S1 == -1) continue;  // If any member of the trio has entirely missing data, just move on to the next trio\n                p_S2 = allPs[quartetsInt[i][1]];\n             //   std::cerr << \"p_S2 \" << p_S2 << std::endl;\n                if (p_S2 == -1) continue;\n                p_S3 = allPs[quartetsInt[i][2]];\n             //   
std::cerr << \"p_S3 \" << p_S3 << std::endl;\n                if (p_S3 == -1) continue;\n                p_S4 = allPs[quartetsInt[i][3]];\n             //   std::cerr << \"p_S4 \" << p_S4 << std::endl;\n                if (p_S4 == -1) continue;\n                \n                if (p_S1 == 0 && p_S2 == 0 && p_S3 == 0) continue; // Checking if the SNP is variable in the trio\n                if (p_S1 == 0 && p_S2 == 0 && p_S4 == 0) continue; // Checking if the SNP is variable in the trio\n                if (p_S1 == 0 && p_S3 == 0 && p_S4 == 0) continue; // Checking if the SNP is variable in the trio\n                if (p_S2 == 0 && p_S3 == 0 && p_S4 == 0) continue; // Checking if the SNP is variable in the trio\n                \n                if (p_S1 == 1 && p_S2 == 1 && p_S3 == 1) continue; // Checking if the SNP is variable in the trio\n                if (p_S1 == 1 && p_S2 == 1 && p_S4 == 1) continue; // Checking if the SNP is variable in the trio\n                if (p_S1 == 1 && p_S3 == 1 && p_S4 == 1) continue; // Checking if the SNP is variable in the trio\n                if (p_S2 == 1 && p_S3 == 1 && p_S4 == 1) continue; // Checking if the SNP is variable in the trio\n                \n                if (p_S4 != 1) {\n                    ABBA = (1-p_S1)*p_S2*p_S3*(1-p_S4); quartetInfos[i].ABBAtotal += ABBA;\n                    BABA = p_S1*(1-p_S2)*p_S3*(1-p_S4); quartetInfos[i].BABAtotal += BABA;\n                    BBAA = p_S1*p_S2*(1-p_S3)*(1-p_S4); quartetInfos[i].BBAAtotal += BBAA;\n                    if ((ABBA + BABA) != 0) { quartetInfos[i].usedVars[0]++; quartetInfos[i].localD1num += ABBA - BABA; quartetInfos[i].localD1denom += ABBA + BABA; }\n                    if ((ABBA + BBAA) != 0) { quartetInfos[i].usedVars[1]++; quartetInfos[i].localD2num += ABBA - BBAA; quartetInfos[i].localD2denom += ABBA + BBAA; }\n                    if ((BBAA + BABA) != 0) { quartetInfos[i].usedVars[2]++; quartetInfos[i].localD3num += BBAA - BABA; 
quartetInfos[i].localD3denom += BBAA + BABA; }\n                }\n                if (p_S4 != 0) {\n                    BAAB = p_S1*(1-p_S2)*(1-p_S3)*p_S4; quartetInfos[i].ABBAtotal += BAAB;\n                    ABAB = (1-p_S1)*p_S2*(1-p_S3)*p_S4; quartetInfos[i].BABAtotal += ABAB;\n                    AABB = (1-p_S1)*(1-p_S2)*p_S3*p_S4; quartetInfos[i].BBAAtotal += AABB;\n                    if (BAAB + ABAB != 0)  { quartetInfos[i].localD1num += BAAB - ABAB; quartetInfos[i].localD1denom += BAAB + ABAB; }\n                    if (BAAB + AABB != 0)  { quartetInfos[i].localD2num += BAAB - AABB; quartetInfos[i].localD2denom += BAAB + AABB; }\n                    if (AABB + ABAB != 0)  { quartetInfos[i].localD3num += AABB - ABAB; quartetInfos[i].localD3denom += AABB + ABAB; }\n                }\n                \n                if (opt::fStats) {\n                    \n                    double p_S1a = allSplit1Ps[quartetsInt[i][0]]; double p_S1b = allSplit2Ps[quartetsInt[i][0]];\n                    double p_S2a = allSplit1Ps[quartetsInt[i][1]]; double p_S2b = allSplit2Ps[quartetsInt[i][1]];\n                    double p_S3a = allSplit1Ps[quartetsInt[i][2]]; double p_S3b = allSplit2Ps[quartetsInt[i][2]];\n                    double p_S4a = allSplit1Ps[quartetsInt[i][3]]; double p_S4b = allSplit2Ps[quartetsInt[i][3]];\n                    \n                    /* Orientation 1: F4(P1, P2; P3, P4)\n                     ----------------------------------\n                     These are the different denominators with 'a' and 'b' being the subsamples\n                     1) F4(P1, P3a; P3b, P4) ----- F_G_denom1 --- (p_S1,p_S3a,p_S3b,p_S4)\n                     2) F4(P1, P2a; P2b, P4) ----- F_G_denom2 --- (p_S1,p_S2a,p_S2b,p_S4)\n                     3) F4(P1a, P2; P3, P1b)\n                     4) F4(P4a, P2; P3, P4b) */\n                    quartetInfos[i].F_G_denoms[0] += f4_perVariant(p_S1,p_S3a,p_S3b,p_S4);\n                    quartetInfos[i].F_G_denoms[1] += 
f4_perVariant(p_S1,p_S2a,p_S2b,p_S4);\n                    quartetInfos[i].F_G_denoms[2] += f4_perVariant(p_S1a,p_S2,p_S3,p_S1b);\n                    quartetInfos[i].F_G_denoms[3] += f4_perVariant(p_S4a,p_S2,p_S3,p_S4b);\n                    \n                    /* Orientation 1b: F4(P2, P1; P3, P4)\n                    ----------------------------------   Same as Orientation 3\n                    5) F4(P2, P1a; P1b, P4)\n                    6) F4(P2, P3a; P3b, P4)\n                    7) F4(P2a, P1; P3, P2b)\n                    8) F4(P4a, P1; P3, P4b) */\n                    quartetInfos[i].F_G_denoms[4] += f4_perVariant(p_S2,p_S1a,p_S1b,p_S4);\n                    quartetInfos[i].F_G_denoms[5] += f4_perVariant(p_S2,p_S3a,p_S3b,p_S4);\n                    quartetInfos[i].F_G_denoms[6] += f4_perVariant(p_S2a,p_S1,p_S3,p_S2b);\n                    quartetInfos[i].F_G_denoms[7] += f4_perVariant(p_S4a,p_S1,p_S3,p_S4b);\n                    \n                    /* Orientation 2: F4(P1, P3; P2, P4)\n                     ----------------------------------\n                     9) F4(P1, P3a; P3b, P4) - a duplicate of 1)\n                     10) F4(P1, P2a; P2b, P4) - a duplicate of 2)\n                     11) F4(P1a, P3; P2, P1b)\n                     12) F4(P4a, P3; P2, P4b) */\n                    quartetInfos[i].F_G_denoms[8] += f4_perVariant(p_S1,p_S3a,p_S3b,p_S4);\n                    quartetInfos[i].F_G_denoms[9] += f4_perVariant(p_S1,p_S2a,p_S2b,p_S4);\n                    quartetInfos[i].F_G_denoms[10] += f4_perVariant(p_S1a,p_S3,p_S2,p_S1b);\n                    quartetInfos[i].F_G_denoms[11] += f4_perVariant(p_S4a,p_S3,p_S2,p_S4b);\n                    \n                    /* Orientation 2b: F4(P3, P1; P2, P4)\n                     ----------------------------------\n                     13) F4(P3, P1a; P1b, P4) ---- F_G_denom3 ---   (p_S3,p_S1a,p_S1b,p_S4)\n                     14) F4(P3, P2a; P2b, P4) ---- F_G_denom2_reversed --- 
(p_S3,p_S2a,p_S2b,p_S4)\n                     15) F4(P3a, P1; P2, P3b) ----\n                     16) F4(P4a, P1; P2, P4b) ---- */\n                    quartetInfos[i].F_G_denoms[12] += f4_perVariant(p_S3,p_S1a,p_S1b,p_S4);\n                    quartetInfos[i].F_G_denoms[13] += f4_perVariant(p_S3,p_S2a,p_S2b,p_S4);\n                    quartetInfos[i].F_G_denoms[14] += f4_perVariant(p_S3a,p_S1,p_S2,p_S3b);\n                    quartetInfos[i].F_G_denoms[15] += f4_perVariant(p_S4a,p_S1,p_S2,p_S4b);\n                    \n                    /* Orientation 3: F4(P1, P4; P2, P3)\n                     ---------------------------------- Same as Orientation 1b\n                     17) F4(P1, P4a; P4b, P3) - a duplicate of 8)\n                     18) F4(P1a, P4; P2, P1b) - a duplicate of 5) ---- F_G_denom3_reversed --- (p_S2,p_S1a,p_S1b,p_S4)\n                     19) F4(P1, P2a; P2b, P3) - a duplicate of 7)\n                     20) F4(P3a, P4; P2, P3b) - a duplicate of 6) ---- F_G_denom1_reversed --- (p_S2,p_S3a,p_S3b,p_S4) */\n                    quartetInfos[i].F_G_denoms[16] += f4_perVariant(p_S1,p_S4a,p_S4b,p_S3);\n                    quartetInfos[i].F_G_denoms[17] += f4_perVariant(p_S1a,p_S4,p_S2,p_S1b);\n                    quartetInfos[i].F_G_denoms[18] += f4_perVariant(p_S1,p_S2a,p_S2b,p_S3);\n                    quartetInfos[i].F_G_denoms[19] += f4_perVariant(p_S3a,p_S4,p_S2,p_S3b);\n                    \n                    /* Orientation 3b: F4(P4, P1; P2, P3)\n                     ----------------------------------\n                     21) F4(P4, P1a; P1b, P3) - a duplicate of 13)\n                     22) F4(P4, P2a; P2b, P3) - a duplicate of 14)\n                     23) F4(P3a, P1; P2, P3b) - a duplicate of 15)\n                     24) F4(P4a, P1; P2, P4b) - a duplicate of 16) */\n                    quartetInfos[i].F_G_denoms[20] += f4_perVariant(p_S4,p_S1a,p_S1b,p_S3);\n                    quartetInfos[i].F_G_denoms[21] += 
f4_perVariant(p_S4,p_S2a,p_S2b,p_S3);\n                    quartetInfos[i].F_G_denoms[22] += f4_perVariant(p_S3a,p_S1,p_S2,p_S3b);\n                    quartetInfos[i].F_G_denoms[23] += f4_perVariant(p_S4a,p_S1,p_S2,p_S4b);\n                    \n                   // Original version\n                    quartetInfos[i].F_G_denom1 += f4_perVariant(p_S1,p_S3a,p_S3b,p_S4);\n                    quartetInfos[i].F_G_denom1_reversed += f4_perVariant(p_S2,p_S3a,p_S3b,p_S4);\n                    quartetInfos[i].F_G_denom2 += f4_perVariant(p_S1,p_S2a,p_S2b,p_S4);\n                    quartetInfos[i].F_G_denom2_reversed += f4_perVariant(p_S3,p_S2a,p_S2b,p_S4);\n                    quartetInfos[i].F_G_denom3 += f4_perVariant(p_S3,p_S1a,p_S1b,p_S4);\n                    quartetInfos[i].F_G_denom3_reversed += f4_perVariant(p_S2,p_S1a,p_S1b,p_S4);\n                }\n                \n                // std::cerr << \"trioInfos[i].localD1num\" << trioInfos[i].localD1denom << std::endl;\n                if (opt::jkWindowSize > 0) {\n                    if (quartetInfos[i].usedVars[0] == opt::jkWindowSize) { quartetInfos[i].addRegionDs(P3isTrios2); }\n                    if (quartetInfos[i].usedVars[1] == opt::jkWindowSize) { quartetInfos[i].addRegionDs(P3isTrios1); }\n                    if (quartetInfos[i].usedVars[2] == opt::jkWindowSize) { quartetInfos[i].addRegionDs(P3isTrios0); }\n                }\n                // } */\n            }\n            durationCalculation = ( clock() - startCalculation ) / (double) CLOCKS_PER_SEC;\n        }\n    }\n    std::cerr << \"Done processing VCF. 
Preparing output files...\" << '\\n';\n    \n    string header = makeHeader(true, opt::fStats,false);\n    *outFileDmin << header << std::endl;\n    if(opt::allF4) {\n        header += \"\\tF_G_denom1\\tF_G_denom2\\tF_G_denom3\\tF_G_denom4\";\n    }\n    if (opt::treeFile != \"\") *outFileTree << header << std::endl;\n    *outFileBBAA << header << std::endl;\n    \n    int exceptionCount = 0;\n    for (int i = 0; i != quartets.size(); i++) { //\n        // Get the D values\n        try {\n            /*std::cerr << \"Here...\" << '\\n';\n            std::cerr << \"quartetInfos[i].\" << quartetInfos[i].ABBAtotal << '\\n';\n            std::cerr << \"quartetInfos[i].\" << quartetInfos[i].BBAAtotal << '\\n';\n            std::cerr << \"quartetInfos[i].\" << quartetInfos[i].BABAtotal << '\\n'; */\n            quartetInfos[i].calculateFinalDs();\n        } catch (const char* msg) {\n            exceptionCount++;\n            if (exceptionCount <= 10) {\n                std::cerr << msg << std::endl;\n                std::cerr << \"Could not calculate p-values for the quartet: \" << quartets[i][0] << \" \" << quartets[i][1] << \" \" << quartets[i][2] << \" \" << quartets[i][3]<< std::endl;\n                if (opt::jkWindowSize > 0) std::cerr << \"You should probably decrease the jackknife block size (-j option)\" << std::endl;\n                else std::cerr << \"it looks like there aren't enough ABBA-BABA informative variants for this quartet\" << std::endl;\n                std::cerr << std::endl;\n            }\n            quartetInfos[i].D1_p = nan(\"\"); quartetInfos[i].D2_p = nan(\"\"); quartetInfos[i].D3_p = nan(\"\");\n        }\n       // std::cerr << \"Here...\" << '\\n';\n        \n        // Find which topology is in agreement with the counts of BBAA, BABA, and ABBA\n        quartetInfos[i].assignBBAAarrangement();\n        std::vector<string> BBAAoutVec = quartetInfos[i].makeOutVec(quartets[i], opt::fStats, quartetInfos[i].BBAAarrangement, 
opt::allF4);\n       // std::cerr << \"quartetInfos[i].BBAAarrangement: \" << quartetInfos[i].BBAAarrangement << std::endl;\n        print_vector(BBAAoutVec,*outFileBBAA);\n        \n        // Find Dmin:\n        quartetInfos[i].assignDminArrangement();\n       // std::cerr << \"quartetInfos[i].DminArrangement \" << quartetInfos[i].DminArrangement << '\\n';\n        std::vector<string> DminOutVec = quartetInfos[i].makeOutVec(quartets[i], opt::fStats, quartetInfos[i].DminArrangement);\n        print_vector(DminOutVec,*outFileDmin);\n        \n        // Find which arrangement of trios is consistent with the input tree (if provided):\n        if (opt::treeFile != \"\") {\n       //     std::cerr << \"quartetInfos[i].treeArrangement \" << quartetInfos[i].treeArrangement << '\\n';\n            std::vector<string> treeOutVec = quartetInfos[i].makeOutVec(quartets[i], opt::fStats, quartetInfos[i].treeArrangement, opt::allF4);\n            print_vector(treeOutVec,*outFileTree);\n        }\n        \n        // Output a simple file that can be used for combining multiple local runs:\n        *outFileCombine << quartets[i][0] << \"\\t\" << quartets[i][1] << \"\\t\" << quartets[i][2] << \"\\t\" << quartetInfos[i].BBAAtotal << \"\\t\" << quartetInfos[i].BABAtotal << \"\\t\" << quartetInfos[i].ABBAtotal;\n        if (opt::fStats) {\n            *outFileCombine << \"\\t\" << quartetInfos[i].F_G_denom1 << \"\\t\" << quartetInfos[i].F_G_denom2 << \"\\t\" << quartetInfos[i].F_G_denom3;\n            *outFileCombine << \"\\t\" << quartetInfos[i].F_G_denom1_reversed << \"\\t\" << quartetInfos[i].F_G_denom2_reversed << \"\\t\" << quartetInfos[i].F_G_denom3_reversed;\n            *outFileCombine << std::endl;\n        } else {\n            *outFileCombine << std::endl;\n        }\n        print_vector(quartetInfos[i].regionDs[0], *outFileCombineStdErr, ',', false); *outFileCombineStdErr << \"\\t\"; print_vector(quartetInfos[i].regionDs[1], *outFileCombineStdErr, ',', false); 
*outFileCombineStdErr << \"\\t\";\n        print_vector(quartetInfos[i].regionDs[2], *outFileCombineStdErr, ',',false); *outFileCombineStdErr << std::endl;\n        \n        //std::cerr << trios[i][0] << \"\\t\" << trios[i][1] << \"\\t\" << trios[i][2] << \"\\t\" << D1 << \"\\t\" << D2 << \"\\t\" << D3 << \"\\t\" << BBAAtotals[i] << \"\\t\" << BABAtotals[i] << \"\\t\" << ABBAtotals[i] << std::endl;\n    }\n    if (exceptionCount > 10) {\n        std::cerr << \"...\" << std::endl;\n        std::cerr << \"p-value could not be calculated for \" << exceptionCount << \" quartets\" << std::endl;\n        if (opt::jkWindowSize > 0) std::cerr << \"You should probably decrease the jackknife block size (-j option)\" << std::endl;\n        else std::cerr << \"it looks like there aren't enough ABBA-BABA informative variants for these quartets\" << std::endl;\n       // std::cerr << \"If this was a run for a subset of the genome (e.g. one chromosome), you may still get p-values for these quartets from DtriosCombine\" << std::endl;\n        std::cerr << std::endl;\n    }\n    return 0;\n    \n}\n\n\nvoid parseDquartetsOptions(int argc, char** argv) {\n    bool die = false; string regionArgString; std::vector<string> regionArgs;\n    for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)\n    {\n        std::istringstream arg(optarg != NULL ? 
optarg : \"\");\n        switch (c)\n        {\n            case '?': die = true; break;\n            case 'n': arg >> opt::runName; break;\n            case 't': arg >> opt::treeFile; break;\n            case 'j': arg >> opt::jkWindowSize; break;\n            case 'k': arg >> opt::jkNum; break;\n            case 'o': arg >> opt::providedOutPrefix; break;\n            case OPT_NO_F4: opt::fStats = false; break;\n            case 'l': arg >> opt::providedNumLines; break;\n            case 'a': opt::allF4 = true; break;\n            case 'r': arg >> regionArgString; regionArgs = split(regionArgString, ',');\n                opt::regionStart = (int)stringToDouble(regionArgs[0]); opt::regionLength = (int)stringToDouble(regionArgs[1]);  break;\n            case 'h':\n                std::cout << DQUARTS_USAGE_MESSAGE;\n                exit(EXIT_SUCCESS);\n        }\n    }\n    \n    if (argc - optind < 2) {\n        std::cerr << \"missing arguments\\n\";\n        die = true;\n    }\n    else if (argc - optind > 2)\n    {\n        std::cerr << \"too many arguments\\n\";\n        die = true;\n    }\n    \n    if (die) {\n        std::cout << \"\\n\" << DQUARTS_USAGE_MESSAGE;\n        exit(EXIT_FAILURE);\n    }\n    \n    // Parse the input filenames\n    opt::vcfFile = argv[optind++];\n    opt::setsFile = argv[optind++];\n    \n    if (opt::vcfFile == \"stdin\" && opt::providedNumLines <= 0) {\n        std::cerr << \"If you want to read the VCF via a pipe, you need to specify the number of lines in the input via the -l option\\n\";\n        std::cerr << \"See the example above\\n\";\n        die = true;\n    }\n    \n    if (die) {\n        std::cout << \"\\n\" << DQUARTS_USAGE_MESSAGE;\n        exit(EXIT_FAILURE);\n    }\n}\n"
  },
  {
    "path": "Dquartets.h",
    "content": "//\n//  Dquartets.h\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 14/07/2020.\n//\n\n#ifndef Dquartets_h\n#define Dquartets_h\n#include \"Dsuite_utils.h\"\n\nvoid parseDquartetsOptions(int argc, char** argv);\nint DquartetsMain(int argc, char** argv);\n\n#endif /* Dquartets_h */\n"
  },
  {
    "path": "Dsuite.cpp",
    "content": "//\n//  Dsuite.cpp\n//  Dsuite\n//\n//  Created by Milan Malinsky on 02/04/2019.\n//\n\n#include <iostream>\n#include \"Dsuite_utils.h\"\n#include \"Dmin.h\"\n#include \"D.h\"\n#include \"Dmin_combine.h\"\n#include \"Dsuite_fBranch.h\"\n#include \"Dquartets.h\"\n\n#define AUTHOR \"Milan Malinsky\"\n#define PACKAGE_VERSION \"0.5 r58\"\n\n\nstatic const char *VERSION_MESSAGE =\n\"Dsuite software Version \" PACKAGE_VERSION \"\\n\"\n\"Written by Milan Malinsky.\\n\"\n\"\\n\";\n\nstatic const char *USAGE_MESSAGE =\n\"Program: \" PROGRAM_BIN \"\\n\"\n\"Version: \" PACKAGE_VERSION \"\\n\"\n\"Contact: \" AUTHOR \" [\" PACKAGE_BUGREPORT \"]\\n\"\n\"Usage: \" PROGRAM_BIN \" <command> [options]\\n\\n\"\n\"Commands:\\n\"\n\"           Dtrios                  Calculate D (ABBA-BABA) and f4-ratio statistics for all possible trios of populations/species\\n\"\n\"           DtriosCombine           Combine results from Dtrios runs across genomic regions (e.g. per-chromosome)\\n\"\n\"           Dinvestigate            Follow up analyses for trios with significantly elevated D:\\n\"\n\"                                   calculates f_d, f_dM, and d_f in windows along the genome\\n\"\n\"           Fbranch                 Calculate D and f statistics for branches on a tree that relates the populations/species\\n\"\n\"\\n\"\n\"Experimental:\\n\"\n\"           Dquartets               Calculate D (ABBA-BABA) and f4-ratio statistics for all possible quartets of populations/species\\n\"\n\"                                   (no outgroup specified)\\n\"\n\"\\nReport bugs to \" PACKAGE_BUGREPORT \"\\n\\n\";\n\nint main(int argc, char **argv) {\n    \n    if(argc <= 1)\n    {\n        std::cout << USAGE_MESSAGE;\n        return 0;\n    }\n    else\n    {\n        std::string command(argv[1]);\n        if(command == \"help\" || command == \"--help\" || command == \"-h\")\n        {\n            std::cout << USAGE_MESSAGE;\n            return 0;\n        }\n        else if(command 
== \"version\" || command == \"--version\")\n        {\n            std::cout << VERSION_MESSAGE;\n            return 0;\n        }\n        \n        if(command == \"Dinvestigate\")\n            abbaBabaMain(argc - 1, argv + 1);\n        else if (command == \"Dtrios\")\n            DminMain(argc - 1, argv + 1);\n        else if (command == \"DtriosCombine\")\n            DminCombineMain(argc - 1, argv + 1);\n        else if (command == \"Fbranch\")\n            fBranchMain(argc - 1, argv + 1);\n        else if (command == \"Dquartets\")\n            DquartetsMain(argc - 1, argv + 1);\n        else\n        {\n            std::cerr << \"Unrecognized command: \" << command << \"\\n\";\n            return 1;\n        }\n        return 0;\n    }\n}\n\n"
  },
  {
    "path": "Dsuite_common.cpp",
    "content": "//\n//  Dsuite_common.cpp\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 21/07/2020.\n//\n\n#include \"Dsuite_common.h\"\n\n\n\nvoid SetInformation::linkSetsAndVCFpositions(const std::vector<std::string>& sampleNames) {\n    // print_vector_stream(sampleNames, std::cerr);\n    for (std::vector<std::string>::size_type i = 0; i != sampleNames.size(); i++) {\n        try { posToPopMap[i] = IDsToPopMap.at(sampleNames[i]); } catch (const std::out_of_range& oor) {\n            std::cerr << \"WARNING: The sample \" << sampleNames[i] << \" is in the VCF but not assigned in the SETS.txt file\" << std::endl;\n        }\n    }\n    // Iterate over all the keys in the map to find the samples in the VCF:\n    // Give an error if no sample is found for a species:\n    for(std::map<string, std::vector<string>>::const_iterator it = popToIDsMap.begin(); it != popToIDsMap.end(); ++it) {\n        string sp =  it->first;\n        //std::cerr << \"sp \" << sp << std::endl;\n        std::vector<string> IDs = it->second;\n        std::vector<size_t> spPos = locateSet(sampleNames, IDs); \n        if (spPos.empty()) {\n            std::cerr << \"Did not find any samples in the VCF for \\\"\" << sp << \"\\\"\" << std::endl;\n            assert(!spPos.empty());\n        }\n        popToPosMap[sp] = spPos;\n    }\n}\n\nvoid SetInformation::checkIfTreeNamesMatch(std::map<string,std::vector<int>>& treeTaxonNamesToLoc) {\n        for (int i = 0; i != populations.size(); i++) {\n            try { treeTaxonNamesToLoc.at(populations[i]);\n            } catch (const std::out_of_range& oor) {\n                std::cerr << \"Out of Range error: \" << oor.what() << '\\n';\n                std::cerr << \"species[i]: \" << populations[i] << '\\n';\n                std::cerr << CHECK_TREE_ERROR_MSG << '\\n';\n                exit(1);\n    }}\n}\n\n\nstring makeHeader(bool quartet, bool includeFstats, bool includeKSstats) {\n    string header = \"P1\\tP2\\tP3\"; if (quartet) header 
+= \"\\tP4\";\n    header += \"\\tDstatistic\\tZ-score\\tp-value\";\n    if (includeFstats) { header += \"\\t\"; header += F4HEADER; }\n    if (includeKSstats) { header += \"\\t\"; header += \"clustering_sensitive\"; header += \"\\t\"; header += \"clustering_robust\";}\n    header += \"\\tBBAA\\tABBA\\tBABA\";\n    return header;\n}\n\nstring prepareOutFileRootString(const string& providedPrefix, const string& runName, const string& setsFileName, const int regionStart, const int regionLength) {\n    string fileNameRootString; string outRoot; if (providedPrefix == \"\") { outRoot = stripExtension(setsFileName);} else { outRoot = providedPrefix; }\n    if (regionStart == -1) { if (runName != \"\") fileNameRootString = outRoot + \"_\" + runName; else fileNameRootString = outRoot; }\n    else fileNameRootString = outRoot+\"_\"+runName+\"_\"+numToString(regionStart)+\"_\"+numToString(regionStart+regionLength);\n    return fileNameRootString;\n}\n\nvoid printMissingLikelihoodsWarning(const string& chr, const string& pos) {\n    std::cerr << \"WARNING: Could not find genotype likelihoods/probabilities (GP, PL, or GL fields) for variant at \" << chr << \" \" << pos << std::endl;\n    std::cerr << \"WARNING: Did you really mean to use the -g option? 
Reverting to using called genotypes.\" << std::endl;\n}\n\nvoid duplicateTreeValueError(const string& duplicate) {\n    std::cerr << \"ERROR: Duplicate value in the tree \\\"\" << duplicate << \"\\\"\\n\";\n    std::cerr << \"Exiting\\n\";\n    exit(1);\n}\n\nvoid printInitialMessageTriosQuartets(const int regionLengthOpt, const int VCFlineCount, const int JKblockSizeBasedOnNum, const int jkWindowSizeOpt, const int jkNumOpt) {\n    if (regionLengthOpt > 0) { std::cerr << \"The VCF region to be analysed contains \" << VCFlineCount << \" variants\\n\"; }\n    else { std::cerr << \"The VCF contains \" << VCFlineCount << \" variants\\n\"; }\n    if (jkWindowSizeOpt == 0) std::cerr << \"Going to use block size of \" << JKblockSizeBasedOnNum << \" variants to get \" << jkNumOpt << \" Jackknife blocks\\n\";\n}\n\nvoid assignTreeLevelsAndLinkToTaxa(string& treeLine, std::map<string,std::vector<int>>& taxaToLoc, std::vector<int>& levels) {\n    // First take care of any branch lengths\n    std::regex branchLengths(\":.*?(?=,|\\\\))\");\n    treeLine = std::regex_replace(treeLine,branchLengths,\"\");\n    //std::cerr << line << std::endl;\n\n    // Now process the tree\n    levels.assign(treeLine.length(),0); int currentLevel = 0;\n    std::vector<string> treeTaxonNames;\n    string currentTaxonName = \"\";\n    int lastBegin = 0;\n    for (int i = 0; i < treeLine.length(); ++i) {\n        if (treeLine[i] == '(') {\n            currentLevel++; levels[i] = currentLevel;\n        } else if (treeLine[i] == ')') {\n            currentLevel--; levels[i] = currentLevel;\n            if (currentTaxonName != \"\") {\n                if (taxaToLoc.count(currentTaxonName) == 1) { duplicateTreeValueError(currentTaxonName); }\n                treeTaxonNames.push_back(currentTaxonName);\n                taxaToLoc[currentTaxonName].push_back(lastBegin);\n                taxaToLoc[currentTaxonName].push_back(i-1);\n                currentTaxonName = \"\";\n            }\n        } else if 
(treeLine[i] == ',') {\n            levels[i] = currentLevel;\n            if (currentTaxonName != \"\") {\n                treeTaxonNames.push_back(currentTaxonName);\n                taxaToLoc[currentTaxonName].push_back(lastBegin);\n                taxaToLoc[currentTaxonName].push_back(i-1);\n                currentTaxonName = \"\";\n            }\n        } else {\n            if (currentTaxonName == \"\")\n                lastBegin = i;\n            levels[i] = currentLevel;\n            currentTaxonName += treeLine[i];\n        }\n    }\n    //print_vector(treeTaxonNames, std::cout,'\\n');\n    //print_vector(treeLevels, std::cout,' ');\n    //for (std::map<string,std::vector<int>>::iterator i = treeTaxonNamesToLoc.begin(); i != treeTaxonNamesToLoc.end(); i++) {\n    //    std::cout << i->first << \"\\t\" << i->second[0] << \"\\t\" << i->second[1] << \"\\t\" << treeLevels[i->second[0]] << \"\\t\" << treeLevels[i->second[1]] << std::endl;\n    //}\n}\n\nint assignNumLinesToAnalyse(const int providedNumLinesOpt, const int regionLengthOpt,const string& vcfFileOpt) {\n    int VCFlineCount;\n    if (providedNumLinesOpt > 0) {\n        VCFlineCount = providedNumLinesOpt;\n    } else if (regionLengthOpt > 0) {\n        VCFlineCount = regionLengthOpt;\n    } else { // Block to find the number of lines in the VCF file\n        std::istream* vcfFile = createReader(vcfFileOpt.c_str());\n        // See how big is the VCF file\n        vcfFile->unsetf(std::ios_base::skipws); // new lines will be skipped unless we stop it from happening:\n        // count the newlines with an algorithm specialized for counting:\n        VCFlineCount = (int)std::count(std::istream_iterator<char>(*vcfFile),std::istream_iterator<char>(),'\\n');\n        //std::cout << \"VCF Lines: \" << VCFlineCount << \"\\n\";\n    }\n    return VCFlineCount;\n}\n"
  },
  {
    "path": "Dsuite_common.h",
    "content": "//\n//  Dsuite_common.h\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 21/07/2020.\n//\n\n#ifndef Dsuite_common_h\n#define Dsuite_common_h\n\n#define stdInInfo   \"Use 'stdin' for the VCF file when piping from another program into Dsuite via standard input\\n\" \\\n                    \"in this case it is necessary to provide the number of lines in the filtered VCF via the -l option\\n\" \\\n                    \"For example, to filter the VCF for overall minimum depth of at least 1000 across all samples:\\n\" \\\n                    \"NUMLINES=$(bcftools view -i 'INFO/DP>1000' INPUT_FILE.vcf | wc -l)  # to get NUMLINES\\n\" \\\n                    \"bcftools view -i 'INFO/DP>1000' INPUT_FILE.vcf | Dsuite Dtrios -l $NUMLINES stdin SETS.txt\\n\" \\\n                    \"\\n\"\n\n#define regionOption    \"       -r, --region=start,length               (optional) only process a subset of the VCF file; both \\\"start\\\" and \\\"length\\\" indicate variant numbers\\n\" \\\n                        \"                                               e.g. 
--region=20001,10000 will process variants from 20001 to 30000\\n\"\n\n#define treeOption      \"       -t, --tree=TREE_FILE.nwk                (optional) a file with a tree in the newick format specifying the relationships between populations/species\\n\" \\\n                        \"                                               D and f4-ratio values for trios arranged according to the tree will be output in a file with _tree.txt suffix\\n\"\n\n#define outOption       \"       -o, --out-prefix=OUT_FILE_PREFIX        (optional) the prefix for the files where the results should be written\\n\" \\\n                        \"                                               output will be put in OUT_FILE_PREFIX_BBAA.txt, OUT_FILE_PREFIX_Dmin.txt, OUT_FILE_PREFIX_tree.txt etc.\\n\" \\\n                        \"                                               by default, the prefix is taken from the name of the SETS.txt file\\n\"\n\n#include \"Dsuite_utils.h\"\n\ninline void notEnoughPopulationsError(const int minPopulations) {\n    std::cerr << \"ERROR: You need at least \" << minPopulations << \" sets (populations/species) for this analysis.\" << std::endl;\n    exit(EXIT_FAILURE);\n}\n\ninline void outgroupNeededError(const string& setsFileName) {\n    std::cerr << \"ERROR: The file \" << setsFileName << \" needs to specify the \\\"Outgroup\\\"\" << std::endl;\n    exit(EXIT_FAILURE);\n}\n\ninline void outgroupNotUsedInQuartetsWarning(const string& setsFileName) {\n    std::cerr << \"WARNING: You specified the \\\"Outgroup\\\" in \" << setsFileName << \". This is needed in Dtrios, but will be ignored in Dquartets - the \\\"Outgroup\\\" will be treated as any other population. 
It must also be present in the tree if you are supplying one.\" << std::endl;\n}\n\ninline void wrongNumberOfColumnsError(const string& setsFileName, int lineNum) {\n    std::cerr << \"ERROR: Please fix the format of the \" << setsFileName << \" file.\" << std::endl;\n    std::cerr << \"Line \" << lineNum << \" does not have two columns separated by a tab.\" << std::endl;\n    exit(EXIT_FAILURE);\n}\n\ninline void lineEmptyError(const string& setsFileName, int lineNum) {\n    std::cerr << \"ERROR: Please fix the format of the \" << setsFileName << \" file.\" << std::endl;\n    std::cerr << \"Line \" << lineNum << \" is empty.\" << std::endl;\n    exit(EXIT_FAILURE);\n}\n\nclass SetInformation {\npublic:\n    \n    SetInformation(const string& setsFileName, const int minPopulations, const int outgroupRequirement) {\n        \n        std::ifstream* setsFile = new std::ifstream(setsFileName.c_str());\n        assertFileOpen(*setsFile, setsFileName);\n        \n        string line; int l = 0; bool outgroupSpecified = false;\n        while (getline(*setsFile, line)) {\n            line.erase(std::remove(line.begin(), line.end(), '\\r'), line.end()); // Deal with any left over \\r from files prepared on Windows\n            \n            l++; if (line == \"\") lineEmptyError(setsFileName,l);\n            \n            std::vector<string> ID_Pop = split(line, '\\t');\n            \n            if (ID_Pop.size() != 2) wrongNumberOfColumnsError(setsFileName,l);\n            if (ID_Pop[1] == \"Outgroup\") { outgroupSpecified = true; }\n            \n            popToIDsMap[ID_Pop[1]].push_back(ID_Pop[0]);\n            IDsToPopMap[ID_Pop[0]] = ID_Pop[1];\n        }\n        \n        for(std::map<string,std::vector<string>>::iterator it = popToIDsMap.begin(); it != popToIDsMap.end(); ++it) {\n            if ((it->first) != \"Outgroup\" && it->first != \"xxx\") {\n                populations.push_back(it->first);\n            }\n        } std::cout << \"There are \" << 
populations.size() << \" sets (populations/species) excluding the Outgroup\" << std::endl;\n        \n        if (populations.size() < minPopulations) notEnoughPopulationsError(minPopulations);\n        \n        // Provide error/warning messages depending on which analysis is run and the presence/absence of Outgroup in the SETS file\n        if (outgroupRequirement == OutgroupNotRequired && outgroupSpecified) outgroupNotUsedInQuartetsWarning(setsFileName);\n        if (outgroupRequirement == OutgroupRequired && !outgroupSpecified) outgroupNeededError(setsFileName);\n    };\n    \n    \n    \n    std::vector<string> populations;\n    std::map<string, string> IDsToPopMap;\n    std::map<string, std::vector<string>> popToIDsMap;\n    std::map<string, std::vector<size_t>> popToPosMap;\n    std::map<size_t, string> posToPopMap;\n\n    void linkSetsAndVCFpositions(const std::vector<std::string>& sampleNames);\n    void checkIfTreeNamesMatch(std::map<string,std::vector<int>>& treeTaxonNamesToLoc);\n};\n\n\nvoid process_SETS_file(std::ifstream* setsFile, const string fName, std::map<string, std::vector<string>>& speciesToIDsMap, std::map<string, string>& IDsToSpeciesMap, int outgroupRequirement);\nstring makeHeader(bool quartet, bool includeFstats, bool includeKSstats);\nstring prepareOutFileRootString(const string& providedPrefix, const string& runName, const string& setsFileName, const int regionStart, const int regionLength);\nvoid printMissingLikelihoodsWarning(const string& chr, const string& pos);\nvoid printInitialMessageTriosQuartets(const int regionLengthOpt, const int VCFlineCount, const int JKblockSizeBasedOnNum, const int jkWindowSizeOpt, const int jkNumOpt);\nvoid duplicateTreeValueError(const string& duplicate);\nvoid assignTreeLevelsAndLinkToTaxa(string& treeLine, std::map<string,std::vector<int>>& taxaToLoc, std::vector<int>& levels);\nint assignNumLinesToAnalyse(const int providedNumLinesOpt, const int regionLengthOpt,const string& vcfFileOpt);\n\ninline 
void reportProgessVCF(const int variantsProcessed, const std::clock_t startTime) {\n    double durationOverall = ( std::clock() - startTime ) / (double) CLOCKS_PER_SEC;\n    std::cout << \"Processed \" << variantsProcessed << \" variants in \" << durationOverall << \"secs\" << std::endl;\n}\n\ninline void reportProgessVCF(const int variantsProcessed, const int VCFlineCount, const std::clock_t startTime) {\n    double durationOverall = ( std::clock() - startTime ) / (double) CLOCKS_PER_SEC;\n    std::cerr << \"Processed \" << variantsProcessed << \" variants (\" << ((double)variantsProcessed/VCFlineCount)*100 << \"%) in \" << durationOverall << \"secs\" << std::endl;\n}\n\n#endif /* Dsuite_common_h */\n"
  },
  {
    "path": "Dsuite_fBranch.cpp",
    "content": "//\n//  Dsuite_fBranch.cpp\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 11/11/2019.\n//\n\n#include \"Dsuite_fBranch.h\"\n#define SUBPROGRAM \"Fbranch\"\n\n#define DEBUG 0\n\nstatic const char *BRANCHSCORE_USAGE_MESSAGE =\n\"Usage: \" PROGRAM_BIN \" \" SUBPROGRAM \" [OPTIONS] TREE_FILE.nwk FVALS_tree.txt\\n\"\n\"Implements the 'f-branch' type calculations developed by Hannes Svardal for Malinsky et al., 2018, Nat. Ecol. Evo.\\n\"\n\"Uses the f4-ratio (f_G) values produced by Dsuite Dtrios (or DtriosCombine) with the --tree option; this is the output of Dtrios with the \\\"_tree.txt\\\" suffix\\n\"\n\"To use  Fbranch, the tree in TREE_FILE.nwk must be rooted with the Outgroup.\\n\"\n\"Output to stdout\\n\"\n\"\\n\"\n\"       -p, --pthresh                           (default=0.01) fb scores whose associated p-value is less than \\n\"\n\"       -Z, --Zb-matrix                         (optional)  output the equivalent of fb-statistic, but with Z-scores to assess statistical significance\\n\"\n\"                                               this will be printed below the f-branch matrix\\n\"\n\"       -P, --Pb-matrix                         (optional)  output the equivalent of fb-statistic, but with p-values to assess statistical significance\\n\"\n\"                                               this will be printed below the f-branch matrix\\n\"\n\"       -h, --help                              display this help and exit\\n\"\n\"\\n\"\n\"\\nReport bugs to \" PACKAGE_BUGREPORT \"\\n\\n\";\n\n//enum { OPT_F_JK };\n\nstatic const char* shortopts = \"hp:Z\";\n\n//static const int JK_WINDOW = 5000;\n\nstatic const struct option longopts[] = {\n    { \"Zb-matrix\",   no_argument, NULL, 'Z' },\n    { \"Pb-matrix\",   no_argument, NULL, 'P' },\n    { \"pthresh\",   required_argument, NULL, 'p' },\n    { \"help\",   no_argument, NULL, 'h' },\n    { NULL, 0, NULL, 0 }\n};\n\nnamespace opt\n{\n    static string treeFile;\n    static string DvalsFile;\n  
  static bool printZb = false;\n    static bool printPb = false;\n    static double pthresh = 0.01;\n}\n\n\nint fBranchMain(int argc, char** argv) {\n    parseFbranchOptions(argc, argv);\n    std::istream* treeFile = new std::ifstream(opt::treeFile.c_str());\n    if (!treeFile->good()) { std::cerr << \"The file \" << opt::treeFile << \" could not be opened. Exiting...\" << std::endl; exit(EXIT_FAILURE);}\n    std::istream* DvalsFile = new std::ifstream(opt::DvalsFile.c_str());\n    if (!DvalsFile->good()) { std::cerr << \"The file \" << opt::DvalsFile << \" could not be opened. Exiting...\" << std::endl; exit(EXIT_FAILURE);}\n    if (opt::DvalsFile.substr(opt::DvalsFile.size()-9) != \"_tree.txt\") { std::cerr << \"The name of the input file with the f4-ratio values should end in \\\"_tree.txt\\\".\\nPlease make sure you run Dtrios with the --tree option and then feed the correct file into Fbranch. Exiting...\" << std::endl; exit(EXIT_FAILURE); }\n    std::map<string,std::vector<std::vector<string>>> acToBmap;\n    string line; int l = 0;\n    getline(*DvalsFile, line); // get the header\n    std::vector<string> headerVec = split(line, '\\t');\n    int indexFg = -1; int indexZ = -1;\n    if (headerVec[4] == \"Z-score\") { indexZ = 4; }\n    if (headerVec[5] == F4HEADER || headerVec[5] == \"f_G\") { indexFg = 5; } else if (headerVec[6] == F4HEADER || headerVec[6] == \"f_G\") { indexFg = 6; }\n    while (getline(*DvalsFile, line)) {\n        line.erase(std::remove(line.begin(), line.end(), '\\r'), line.end()); // Deal with any left over \\r from files prepared on Windows\n        l++; if (line == \"\") { std::cerr << \"Please fix the format of the \" << opt::DvalsFile << \" file.\\nLine \" << l << \" is empty. 
Exiting...\" << std::endl; exit(EXIT_FAILURE); }\n        std::vector<string> speciesAndVals = split(line, '\\t');\n        if (speciesAndVals.size() < 6 || indexFg == -1) {\n            std::cerr << \"Please fix the format of the \" << opt::DvalsFile << \" file.\" << std::endl;\n            std::cerr << \"Looks like the file does not contain f4-ratio statistics. Exiting...\" << std::endl;\n            exit(EXIT_FAILURE);\n        }\n        double f4ratio = stringToDouble(speciesAndVals[indexFg]); double Zscore = stringToDouble(speciesAndVals[indexZ]);\n        double pval = 2 * (1 - normalCDF(Zscore));\n        std::vector<string> bAndValLine;  bAndValLine.push_back(speciesAndVals[1]);\n        if (pval < opt::pthresh) bAndValLine.push_back(speciesAndVals[indexFg]); else bAndValLine.push_back(\"0\"); // Set non-significant f4-ratio statistics to 0\n        if (indexZ != -1) bAndValLine.push_back(speciesAndVals[indexZ]);\n        std::vector<string> aAndValLine;  aAndValLine.push_back(speciesAndVals[0]); aAndValLine.push_back(\"0\");\n        if (indexZ != -1) aAndValLine.push_back(\"0\");\n        acToBmap[speciesAndVals[0]+\",\"+speciesAndVals[2]].push_back(bAndValLine);\n        acToBmap[speciesAndVals[1]+\",\"+speciesAndVals[2]].push_back(aAndValLine);\n    }\n    string treeString; getline(*treeFile, treeString);\n    Tree* testTree = new Tree(treeString);\n    testTree->updateProgenyIds();\n    testTree->fillSisterBranches();\n    for (std::vector<Branch*>::iterator b = testTree->branches.begin(); b != testTree->branches.end(); b++) {\n        if ((*b)->parentId != \"treeOrigin\") {\n            std::vector<string> Bs = (*b)->progenyIds;\n            std::vector<string> As = (*b)->sisterBranch->progenyIds;\n            //if((*b)->id == \"b5\") { print_vector(Bs, std::cout); }\n            std::vector<double> Bmins; std::vector<double> vals;\n            std::vector<double> ZBmins; std::vector<double> Zvals;\n            for (std::vector<string>::iterator C = 
testTree->allSpecies.begin(); C != testTree->allSpecies.end(); C++) {\n                for (std::vector<string>::iterator A = As.begin(); A != As.end(); A++) {\n                    std::vector<std::vector<string>> bAndVal; std::vector<std::vector<string>> aAndVal;\n                    try { bAndVal = acToBmap.at(*A+\",\"+*C); } catch (const std::out_of_range& oor) {}\n                    for (int i = 0; i < bAndVal.size(); i++) {\n                        if (std::count(Bs.begin(), Bs.end(), bAndVal[i][0])) {\n                            vals.push_back(stringToDouble(bAndVal[i][1]));\n                            if (indexZ != -1) {\n                                Zvals.push_back(stringToDouble(bAndVal[i][2]));\n                               // std::cerr << \"bAndVal[i]: \"; print_vector(bAndVal[i],std::cerr);\n                            }\n                        }\n                        //if((*b)->id == \"b5\") { std::cout << *A << \"\\t\" << bAndVal[i][0] << \"\\t\" << bAndVal[i][1] << \"\\tbAndVal.size():\\t\" << bAndVal.size() << \"\\ti:\\t\" << i << std::endl;\n                            //\n                        //}\n                    }\n                    if (!vals.empty()) { Bmins.push_back(*std::min_element(vals.begin(),vals.end())); vals.clear(); }\n                    if (!Zvals.empty()) { ZBmins.push_back(*std::min_element(Zvals.begin(),Zvals.end())); Zvals.clear(); }\n                    //\n                }\n                double fbC = NAN; double ZfbC = NAN;\n                if (!Bmins.empty()) { fbC = median(Bmins.begin(),Bmins.end()); Bmins.clear(); }\n                if (!ZBmins.empty()) { ZfbC = median(ZBmins.begin(),ZBmins.end()); ZBmins.clear(); }\n                /* else { // There is no positive value; just find if any value is possible for this ABC combination\n                    bool ACpossible = false;\n                    for (std::vector<string>::iterator B = Bs.begin(); B != Bs.end(); B++) {\n                        
std::vector<std::vector<string>> bAndVal; std::vector<std::vector<string>> aAndVal;\n                        try { bAndVal = acToBmap.at(*B+\",\"+*C); } catch (const std::out_of_range& oor) {}\n                        for (int i = 0; i < bAndVal.size(); i++) {\n                            if (std::count(As.begin(), As.end(), bAndVal[i][0])) {\n                                ACpossible = true; break;\n                            }\n                        }\n                    }\n                    if (ACpossible) fbC = 0;\n                } */\n                (*b)->fbCvals.push_back(fbC);\n                (*b)->ZfbCvals.push_back(ZfbC);\n                (*b)->PfbCvals.push_back(2 * (1 - normalCDF(ZfbC)));\n               // std::cerr << \"Here: (*b)->progenyIds: \"; print_vector((*b)->progenyIds,std::cerr);\n               // std::cerr << \"Here: (*b)->ZfbCvals.size() \" << (*b)->ZfbCvals.size() << std::endl;\n               // std::cerr << \"Here: (*b)->ZfbCvals: \"; print_vector((*b)->ZfbCvals,std::cerr);\n            }\n        }\n    }\n    \n    // Generate output\n    std::cout << \"branch\\tbranch_descendants\\t\"; print_vector(testTree->allSpecies, std::cout);\n    for (std::vector<Branch*>::iterator b = testTree->branches.begin(); b != testTree->branches.end(); b++) {\n        if ((*b)->parentId != \"treeOrigin\") {\n            std::cout << (*b)->id << \"\\t\"; print_vector((*b)->progenyIds, std::cout, ',', false);\n            std::cout << \"\\t\"; print_vector((*b)->fbCvals, std::cout);\n            //std::cout << \"Sister branch:\\t\" <<  (*b)->sisterBranch->id << std::endl;\n            //std::cout << \"This branch progeny:\\t\"; print_vector((*b)->progenyIds, std::cout);\n            //std::cout << \"Sister branch progeny:\\t\"; print_vector((*b)->sisterBranch->progenyIds, std::cout);\n            //std::cout << \"fbCs:\\t\"; print_vector((*b)->fbCvals, std::cout);\n            //std::cout << std::endl;\n        }\n    }\n    if (indexZ != -1 && 
opt::printZb) {\n        std::cout << \"\\n\";\n        std::cout << \"# Z-scores:\\n\";\n        std::cout << \"branch\\tbranch_descendants\\t\"; print_vector(testTree->allSpecies, std::cout);\n        for (std::vector<Branch*>::iterator b = testTree->branches.begin(); b != testTree->branches.end(); b++) {\n            if ((*b)->parentId != \"treeOrigin\") {\n                std::cout << (*b)->id << \"\\t\"; print_vector((*b)->progenyIds, std::cout, ',', false);\n                std::cout << \"\\t\"; print_vector((*b)->ZfbCvals, std::cout);\n            }\n        }\n    }\n    if (indexZ != -1 && opt::printPb) {\n        std::cout << \"\\n\";\n        std::cout << \"# p-values:\\n\";\n        std::cout << \"branch\\tbranch_descendants\\t\"; print_vector(testTree->allSpecies, std::cout);\n        for (std::vector<Branch*>::iterator b = testTree->branches.begin(); b != testTree->branches.end(); b++) {\n            if ((*b)->parentId != \"treeOrigin\") {\n                std::cout << (*b)->id << \"\\t\"; print_vector((*b)->progenyIds, std::cout, ',', false);\n                std::cout << \"\\t\"; print_vector((*b)->PfbCvals, std::cout);\n            }\n        }\n    }\n    return 0;\n    \n}\n\nvoid parseFbranchOptions(int argc, char** argv) {\n    bool die = false;\n    std::vector<string> windowSizeStep;\n    for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)\n    {\n        std::istringstream arg(optarg != NULL ? 
optarg : \"\");\n        switch (c)\n        {\n            case '?': die = true; break;\n            case 'p': arg >> opt::pthresh; break;\n            case 'Z': opt::printZb = true; break;\n            case 'P': opt::printPb = true; break;\n            case 'h':\n                std::cout << BRANCHSCORE_USAGE_MESSAGE;\n                exit(EXIT_SUCCESS);\n        }\n    }\n    \n    if (argc - optind < 2) {\n        std::cerr << \"missing arguments\\n\";\n        die = true;\n    }\n    else if (argc - optind > 2)\n    {\n        std::cerr << \"too many arguments\\n\";\n        die = true;\n    }\n    \n    if (die) {\n        std::cout << \"\\n\" << BRANCHSCORE_USAGE_MESSAGE;\n        exit(EXIT_FAILURE);\n    }\n    \n    // Parse the input filenames\n    opt::treeFile = argv[optind++];\n    opt::DvalsFile = argv[optind++]; \n}\n\n\nvoid Tree::updateProgenyIds() {\n    // Determine the progeny of each branch (needed to know whether conditions are met, and for fossil constraints).\n    // First of all, set progeniesComplete to 2 for all extinct and present branches.\n    for (std::vector<Branch*>::iterator b = branches.begin(); b != branches.end(); b++) {\n        if ((*b)->daughterId1 == \"none\") {\n            (*b)->progeniesComplete = 2;\n            (*b)->progenyIds.push_back((*b)->terminalSpeciesId);\n        }\n        // Set progenyPassedOn to true for the two root branches.\n        if ((*b)->parentId == \"treeOrigin\") (*b)->progenyPassedOn = true;\n    }\n    bool allProgeniesComplete = false;\n    while(!allProgeniesComplete) {\n        std::vector<Branch*> newlyCompleted;\n        for (std::vector<Branch*>::iterator b = branches.begin(); b != branches.end(); b++) {\n            // Determine if the progeny of this branch is clear but has not been passed on to the parent yet.\n            if ((*b)->progeniesComplete == 2 && (*b)->progenyPassedOn == false) {\n                newlyCompleted.push_back(*b);\n            }\n        }\n        if 
(newlyCompleted.size() == 0) allProgeniesComplete = true;\n        for (std::vector<Branch*>::iterator b = newlyCompleted.begin(); b != newlyCompleted.end(); b++) {\n            // Find parent, pass progeny+self on to parents progeny, add parent.progeniesComplete += 1, and change own progenyPassedOn to true.\n            for (std::vector<Branch*>::iterator bb = branches.begin(); bb != branches.end(); bb++) {\n                if ((*bb)->id == (*b)->parentId) {\n                    (*b)->parentBranch = *bb;\n                    (*bb)->progenyIds.insert((*bb)->progenyIds.end(), (*b)->progenyIds.begin(), (*b)->progenyIds.end() );\n                    (*bb)->progeniesComplete++;\n                    (*b)->progenyPassedOn = true;\n                    break;\n                }\n            }\n        }\n    }\n}\n\nvoid Tree::fillSisterBranches() {\n    for (std::vector<Branch*>::iterator b = branches.begin(); b != branches.end(); b++) {\n        if ((*b)->parentId != \"treeOrigin\") {\n            string sisterId;\n            if ((*b)->parentBranch->daughterId1 != (*b)->id)\n                sisterId = (*b)->parentBranch->daughterId1;\n            else\n                sisterId = (*b)->parentBranch->daughterId2;\n            for (std::vector<Branch*>::iterator bb = branches.begin(); bb != branches.end(); bb++) {\n                if ((*bb)->id == sisterId) {\n                    (*b)->sisterBranch = *bb;\n                    break;\n                }\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "Dsuite_fBranch.h",
    "content": "//\n//  Dsuite_fBranch.h\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 11/11/2019.\n//\n\n#ifndef Dsuite_fBranch_h\n#define Dsuite_fBranch_h\n\n#include <stdio.h>\n#include \"Dsuite_utils.h\"\n#include \"Dsuite_common.h\"\n\n\nint fBranchMain(int argc, char** argv);\nvoid parseFbranchOptions(int argc, char** argv);\n\n\nclass Branch {\npublic:\n    Branch(string inId, string inParentId, string inDaughterId1, string inDaughterId2, string inTerminalSpeciesId) {\n        id = inId;\n        parentId = inParentId;\n        if (inTerminalSpeciesId == \"unknown\") {\n            daughterId1 = inDaughterId1;\n            daughterId2 = inDaughterId2;\n            terminalSpeciesId = \"\";\n        } else {\n            //assert(inDaughterIds.size() == 0);\n            terminalSpeciesId = inTerminalSpeciesId;\n            daughterId1 = \"none\";\n            daughterId2 = \"none\";\n        }\n        progeniesComplete = 0;\n        progenyPassedOn = false;\n    };\n    \n    \n    string id;\n    string parentId;\n    string daughterId1;\n    string daughterId2;\n    std::vector<string> progenyIds;\n    string terminalSpeciesId;\n    \n    Branch* parentBranch;\n    Branch* sisterBranch;\n    std::vector<double> fbCvals;\n    std::vector<double> ZfbCvals;\n    std::vector<double> PfbCvals;\n    \n    int progeniesComplete;\n    bool progenyPassedOn;\n};\n\n\nclass Tree {\npublic:\n    Tree(string treeString) {\n        // First take care of any branch lengths\n        std::regex branchLengths(\":.*?(?=,|\\\\))\");\n        string treeNoBranchLengths = std::regex_replace(treeString,branchLengths,\"\");\n        std::vector<string> tmpBranchEndNodeId;\n        std::vector<string> tmpBranchStartNodeId;\n        int numberOfInternalNodes = 0;\n        std::regex sistersRegEx(\"\\\\(([a-zA-Z0-9.[:s:]_-]+),([a-zA-Z0-9.[:s:]_-]+)\\\\)\");\n        std::regex sistersRegExNoGroups(\"\\\\([a-zA-Z0-9.[:s:]_-]+,[a-zA-Z0-9.[:s:]_-]+\\\\)\");\n        std::regex 
comma(\",\");\n        std::smatch match;\n        string workingTreeCopy = treeNoBranchLengths;\n        while (std::regex_search(workingTreeCopy,match,sistersRegEx)) {\n            assert(match.size() == 3);\n            // for (auto x:match) std::cout << x << \" \"; std::cout << std::endl;\n            string nodeId = \"internalNode\"+numToString(numberOfInternalNodes)+\"X\";\n            tmpBranchStartNodeId.push_back(nodeId);\n            tmpBranchStartNodeId.push_back(nodeId);\n            if (std::count(tmpBranchEndNodeId.begin(),tmpBranchEndNodeId.end(),match[1])) duplicateTreeValueError(match[1]);\n            else tmpBranchEndNodeId.push_back(match[1]);\n            if (std::count(tmpBranchEndNodeId.begin(),tmpBranchEndNodeId.end(),match[2])) duplicateTreeValueError(match[2]);\n            else tmpBranchEndNodeId.push_back(match[2]);\n            \n            workingTreeCopy = std::regex_replace(workingTreeCopy, sistersRegExNoGroups, nodeId, std::regex_constants::format_first_only);\n            // std::cout << workingTreeCopy << std::endl;\n            numberOfInternalNodes++;\n        }\n        if (std::regex_search(workingTreeCopy,comma)) {\n            std::cerr << \"ERROR: The tree string could not be parsed correctly! 
The remaining unparsed tree string is:\"  << std::endl;\n            std::cerr << workingTreeCopy << std::endl;\n            exit(1);\n        }\n        \n        // Prepare arrays for temporary branch format.\n        std::vector<string> tmp2BranchID;\n        std::vector<string> tmp2BranchParentId;\n        std::vector<string> tmp2BranchDaughterId1;\n        std::vector<string> tmp2BranchDaughterId2;\n        std::vector<string> tmp2BranchEndNodeId;\n        \n        // Prepare the first two branches in temporary format (tmpBranchEndNodeId[-1] and tmpBranchEndNodeId[-2] are the two oldest branches).\n        // Test if the first root branch ends in an internal node.\n        std::regex internalNodeRegEx(\"internalNode[0-9]+X\");\n        tmp2BranchID.push_back(\"b0\");\n        tmp2BranchParentId.push_back(\"treeOrigin\");\n        tmp2BranchEndNodeId.push_back(tmpBranchEndNodeId[tmpBranchEndNodeId.size()-1]);\n        if (std::regex_match(tmpBranchEndNodeId[tmpBranchEndNodeId.size()-1],internalNodeRegEx)) {\n            tmp2BranchDaughterId1.push_back(\"unborn\"); tmp2BranchDaughterId2.push_back(\"unborn\");\n        } else {\n            tmp2BranchDaughterId1.push_back(\"none\"); tmp2BranchDaughterId2.push_back(\"none\");\n        }\n        // Repeat the above for the second branch.\n        // Test if the second root branch ends in an internal node.\n        tmp2BranchID.push_back(\"b1\");\n        tmp2BranchParentId.push_back(\"treeOrigin\");\n        tmp2BranchEndNodeId.push_back(tmpBranchEndNodeId[tmpBranchEndNodeId.size()-2]);\n        if (std::regex_match(tmpBranchEndNodeId[tmpBranchEndNodeId.size()-2],internalNodeRegEx)) {\n            tmp2BranchDaughterId1.push_back(\"unborn\"); tmp2BranchDaughterId2.push_back(\"unborn\");\n        } else {\n            tmp2BranchDaughterId1.push_back(\"none\"); tmp2BranchDaughterId2.push_back(\"none\");\n        }\n        \n        // Find out about all remaining branches until either all branches end with 
extinctions, or all branches have reached the present.\n        int branchIdCounter = 2;\n        bool treeComplete = false;\n        while (!treeComplete) {\n            bool change = false;\n            //std::cout << \"tmp2BranchID.size(): \" << tmp2BranchID.size() << std::endl;\n            for (int i = 0; i < tmp2BranchID.size(); i++) {\n                // if a branch terminated with a speciation event in the past, then add the two daughter branches\n                if (tmp2BranchDaughterId1[i] == \"unborn\" && tmp2BranchDaughterId2[i] == \"unborn\") {\n                    //std::cout << \"tmp2BranchEndNodeId.size(): \" << tmp2BranchEndNodeId.size() << std::endl;\n                    // Find the two branches that have the same start node as this branch's end node.\n                    for (int j = 0; j < tmpBranchStartNodeId.size(); j++) {\n                       // std::cout << \"j: \" << j << \" i: \" << i << std::endl;\n                        if (tmpBranchStartNodeId[j] == tmp2BranchEndNodeId[i]) {\n                            tmp2BranchID.push_back(\"b\"+numToString(branchIdCounter));\n                            //std::cout << \"tmp2BranchID.size(): \" << tmp2BranchID.size() << \" i: \" << i << std::endl;\n                            tmp2BranchParentId.push_back(tmp2BranchID[i]);\n                            //std::cout << \"tmpBranchEndNodeId.size(): \" << tmpBranchEndNodeId.size() << \" j: \" << j << std::endl;\n                            tmp2BranchEndNodeId.push_back(tmpBranchEndNodeId[j]);\n                            if (std::regex_match(tmpBranchEndNodeId[j],internalNodeRegEx)) {\n                                tmp2BranchDaughterId1.push_back(\"unborn\");\n                                tmp2BranchDaughterId2.push_back(\"unborn\");\n                            } else {\n                                tmp2BranchDaughterId1.push_back(\"none\");\n                                tmp2BranchDaughterId2.push_back(\"none\");\n                            
}\n                            // Update daughter ids of temporary parent.\n                            //std::cout << \"tmp2BranchDaughterId1.size(): \" << tmp2BranchDaughterId1.size() << \" i: \" << i << std::endl;\n                           // std::cout << \"tmp2BranchDaughterId2.size(): \" << tmp2BranchDaughterId2.size() << \" i: \" << i << std::endl;\n                            if (tmp2BranchDaughterId1[i] == \"unborn\") {\n                                tmp2BranchDaughterId1[i] = \"b\"+numToString(branchIdCounter);\n                            } else {\n                                tmp2BranchDaughterId2[i] = \"b\"+numToString(branchIdCounter);\n                            }\n                            // Increase the branchIdCounter\n                            branchIdCounter += 1;\n                            change = true;\n                        }\n                    }\n                }\n            }\n            if (change == false) treeComplete = true;\n        }\n        \n        // Fill array @branch, and at the same time, add species for terminal branches.\n        std::vector<string> species;\n        for (int i = 0; i < tmp2BranchID.size(); i++) {\n            string speciesId;\n            if (std::regex_match(tmp2BranchEndNodeId[i], internalNodeRegEx)) {\n                speciesId = \"unknown\";\n            } else {\n                speciesId = tmp2BranchEndNodeId[i];\n                //if (tmp2BranchParentId[i] != \"treeOrigin\")\n                allSpecies.push_back(speciesId);\n            }\n            branches.push_back(new Branch(tmp2BranchID[i], tmp2BranchParentId[i], tmp2BranchDaughterId1[i], tmp2BranchDaughterId2[i], speciesId));\n            \n        }\n    };\n    \n    std::vector<string> allSpecies;\n    std::vector<Branch*> branches;\n    void updateProgenyIds();\n    void fillSisterBranches();\n    \n};\n\n \n\n\n#endif /* Dsuite_fBranch_h */\n"
  },
  {
    "path": "Dsuite_utils.cpp",
    "content": "//\n//  Dsuite_utils.cpp\n//  Dsuite\n//\n//  Created by Milan Malinsky on 02/04/2019.\n//\n\n#include \"Dsuite_utils.h\"\n\nlong double normalCDF(double x) // Phi(-∞, x) aka N(x)\n{\n    return erfcl(-x/std::sqrt(2))/2;\n}\n\ndouble Fd_Denom_perVariant(double p1, double p2, double p3, double pO) {\n    double Fd_Denom = 0;\n    if (p2 > p3) Fd_Denom = ((1-p1)*p2*p2*(1-pO)) - (p1*(1-p2)*p2*(1-pO));\n    else Fd_Denom = ((1-p1)*p3*p3*(1-pO)) - (p1*(1-p3)*p3*(1-pO));\n    return Fd_Denom;\n}\n\ndouble fG_Denom_perVariant(double p1, double p3a, double p3b, double pO) {\n    double fG_Denom = ((1-p1)*p3a*p3b*(1-pO)) - (p1*(1-p3a)*p3b*(1-pO));\n    return fG_Denom;\n}\n\n// As per Patterson et al. (2012)\ndouble f4_perVariant(double p1, double p2, double p3, double p4) {\n    double f4 = (p2-p1)*(p3-p4);\n    return f4;\n}\n\ndouble FdM_Denom_perVariant(double p1, double p2, double p3, double pO) {\n    double FdM_Denom = 0;\n    if (p1 <= p2) {\n        if (p2 > p3) FdM_Denom = ((1-p1) * p2 * p2 * (1-pO)) - (p1 * (1-p2) * p2 * (1-pO));\n        else FdM_Denom = ((1-p1) * p3 * p3 * (1-pO)) - (p1 * (1-p3) * p3 * (1-pO));\n    } else {\n        if (p1 > p3) FdM_Denom = -(((1-p1)*p2*p1*(1-pO)) - (p1*(1-p2)*p1*(1-pO)));\n        else FdM_Denom = -(((1-p3)*p2*p3*(1-pO)) - (p3*(1-p2)*p3*(1-pO)));\n    }\n    return FdM_Denom;\n}\n\n\n\n// Works only on biallelic markers\nvoid GeneralSetCounts::getSetVariantCounts(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap) {\n    \n    getBasicCounts(genotypes, posToSpeciesMap);\n    \n    // If at least one of the outgroup individuals has non-missing data\n    // Find out what is the \"ancestral allele\" - i.e. 
the one more common in the outgroup\n    try {\n        if (setAlleleCounts.at(\"Outgroup\") > 0) {\n            if ((double)setAltCounts.at(\"Outgroup\")/setAlleleCounts.at(\"Outgroup\") < 0.5) { AAint = AncestralAlleleRef; }\n            else { AAint = AncestralAlleleAlt; }\n        }\n    } catch (std::out_of_range& e) { AAint = AncestralAlleleMissing; }\n    \n    // Now fill in the allele frequencies\n    double totalAAF = 0; double totalDAF = 0; int numNonZeroCounts = 0;\n    for(std::map<string,int>::iterator it = setAltCounts.begin(); it != setAltCounts.end(); ++it) {\n        if (setAlleleCounts.at(it->first) > 0) {\n            numNonZeroCounts++;\n            double thisAAF = (double)setAltCounts.at(it->first)/setAlleleCounts.at(it->first);\n            setAAFs[it->first] = thisAAF; totalAAF += thisAAF;\n            if (AAint == 0) { // Ancestral allele seems to be the ref, so derived is alt\n                setDAFs[it->first] = thisAAF; totalDAF += thisAAF;\n            } else if (AAint == 1) { // Ancestral allele seems to be alt, so derived is ref\n                setDAFs[it->first] = (1 - thisAAF); totalDAF += (1 - thisAAF);\n            }\n        }\n    }\n    averageAAF = totalAAF/numNonZeroCounts; averageDAF = totalDAF/numNonZeroCounts;\n}\n\nint GeneralSetCounts::returnFormatTagPosition(std::vector<std::string>& format, const std::string& tag) {\n    // Find the position of GQ (genotype quality) in the genotypeData vector below\n    std::vector<std::string>::iterator TAGit; int TAGi = std::numeric_limits<int>::min();\n    TAGit = find (format.begin(), format.end(), tag);\n    if (TAGit == format.end()) {\n        // std::cerr << \"This variant hasn't got associated per-sample GQ info\" << std::endl;\n    } else {\n        TAGi = (int)std::distance( format.begin(), TAGit );\n        //hasGQ = true;\n    }\n    return TAGi;\n}\n\n\nint GeneralSetCounts::checkForGenotypeLikelihoodsOrProbabilities(const std::vector<std::string>& vcfLineFields) {\n    
std::vector<std::string> format = split(vcfLineFields[8], ':');\n    if (format.size() == 1) return LikelihoodsProbabilitiesAbsent; // The GT tag must be present in the first place\n    \n    int likelihoodsOrProbabilitiesTagPosition = returnFormatTagPosition(format, \"GP\");\n    if (likelihoodsOrProbabilitiesTagPosition != std::numeric_limits<int>::min()) { likelihoodsProbabilitiesType = LikelihoodsProbabilitiesGP; }\n    else {\n        likelihoodsOrProbabilitiesTagPosition = returnFormatTagPosition(format, \"GL\");\n        if (likelihoodsOrProbabilitiesTagPosition != std::numeric_limits<int>::min()) { likelihoodsProbabilitiesType = LikelihoodsProbabilitiesGL; }\n        else {\n            likelihoodsOrProbabilitiesTagPosition = returnFormatTagPosition(format, \"PL\");\n            if (likelihoodsOrProbabilitiesTagPosition != std::numeric_limits<int>::min()) { likelihoodsProbabilitiesType = LikelihoodsProbabilitiesPL; }\n        }\n    }\n    return likelihoodsOrProbabilitiesTagPosition;\n}\n\ndouble getExpectedGenotype(const std::vector<double>& thisProbabilities) {\n    double Egenotype = thisProbabilities[1] + 2*thisProbabilities[2];\n    return Egenotype;\n}\n\nvoid transformFromPhred(std::vector<double>& thisLikelihoods) {\n\n    thisLikelihoods[0] = pow(10,-(thisLikelihoods[0]/10.0));\n    thisLikelihoods[1] = pow(10,-(thisLikelihoods[1]/10.0));\n    thisLikelihoods[2] = pow(10,-(thisLikelihoods[2]/10.0));\n}\n\nvoid transformFromGL(std::vector<double>& thisLikelihoods) {\n\n    thisLikelihoods[0] = pow(10,(thisLikelihoods[0]/10.0));\n    thisLikelihoods[1] = pow(10,(thisLikelihoods[1]/10.0));\n    thisLikelihoods[2] = pow(10,(thisLikelihoods[2]/10.0));\n}\n\nstd::vector<double> GeneralSetCounts::probabilitiesFromLikelihoods(const std::vector<double>& thisLikelihoods, const string& species) {\n    std::vector<double> thisProbabilities; thisProbabilities.assign(3, 0.0);\n    double multiple0 = thisLikelihoods[0]*setHWEpriorsFromAAFfromGT[species][0];\n    
double multiple1 = thisLikelihoods[1]*setHWEpriorsFromAAFfromGT[species][1];\n    double multiple2 = thisLikelihoods[2]*setHWEpriorsFromAAFfromGT[species][2];\n    double sum = multiple0 + multiple1 + multiple2;\n    \n    thisProbabilities[0] = multiple0/sum;\n    thisProbabilities[1] = multiple1/sum;\n    thisProbabilities[2] = multiple2/sum;\n    \n    return thisProbabilities;\n}\n \nvoid GeneralSetCounts::setHWEpriorsFromAFfromGT() {\n    double AF;\n    // Alternative allele frequencies\n    for(std::map<string,double>::iterator it = setAAFs.begin(); it != setAAFs.end(); ++it) {\n        if (it->second >= 0) AF = it->second; else AF = averageAAF; // This should be average of AFs across populations where it is known\n        setHWEpriorsFromAAFfromGT[it->first][0] = pow((1-AF),2);\n        setHWEpriorsFromAAFfromGT[it->first][1] = AF*(1-AF);\n        setHWEpriorsFromAAFfromGT[it->first][2] = pow(AF,2);\n    }\n    // Derived allele frequencies\n    for(std::map<string,double>::iterator it = setDAFs.begin(); it != setDAFs.end(); ++it) {\n        if (it->second >= 0) AF = it->second; else AF = averageDAF; // This should be average of AFs across populations\n        setHWEpriorsFromDAFfromGT[it->first][0] = pow((1-AF),2);\n        setHWEpriorsFromDAFfromGT[it->first][1] = AF*(1-AF);\n        setHWEpriorsFromDAFfromGT[it->first][2] = pow(AF,2);\n    }\n} \n\n\n\n\nvoid GeneralSetCounts::getAFsFromGenotypeLikelihoodsOrProbabilities(const std::vector<std::string>& genotypeFields, const std::map<size_t, string>& posToSpeciesMap, const int likelihoodsOrProbabilitiesTagPosition) {\n    if (likelihoodsProbabilitiesType == LikelihoodsProbabilitiesPL || likelihoodsProbabilitiesType == LikelihoodsProbabilitiesGL) {\n        setHWEpriorsFromAFfromGT();\n    }\n    \n    for (std::vector<std::string>::size_type i = 0; i < genotypeFields.size(); i++) {\n        std::string species; try { species = posToSpeciesMap.at(i); } catch (const std::out_of_range& oor) {\n            
continue;\n        }\n       // std::cerr << genotypeFields[i] << std::endl;\n        std::string thisLikelihoodsOrProbabilitiesString = split(genotypeFields[i], ':')[likelihoodsOrProbabilitiesTagPosition];\n        if (thisLikelihoodsOrProbabilitiesString == \".\") continue;\n        \n        else {\n            setAlleleProbCounts.at(species) += 2;\n            std::vector<double> thisLikelihoodsOrProbabilities = splitToDouble(thisLikelihoodsOrProbabilitiesString,',');\n            std::vector<double> thisProbabilities;\n            switch (likelihoodsProbabilitiesType)\n            {\n                case LikelihoodsProbabilitiesPL:\n                    transformFromPhred(thisLikelihoodsOrProbabilities);\n                   // print_vector(thisLikelihoodsOrProbabilities, std::cerr);\n                    thisProbabilities = probabilitiesFromLikelihoods(thisLikelihoodsOrProbabilities,species);\n                    break;\n                case LikelihoodsProbabilitiesGL: transformFromGL(thisLikelihoodsOrProbabilities);\n                    thisProbabilities = probabilitiesFromLikelihoods(thisLikelihoodsOrProbabilities,species);\n                    break;\n                case LikelihoodsProbabilitiesGP:\n                    thisProbabilities = thisLikelihoodsOrProbabilities;\n                    break;\n            }\n            if (setAAFsFromLikelihoods.at(species) == -1) setAAFsFromLikelihoods.at(species) = 0;\n            setAAFsFromLikelihoods.at(species) += getExpectedGenotype(thisProbabilities);\n        }\n    }\n    \n    for(std::map<string,double>::iterator it = setAAFsFromLikelihoods.begin(); it != setAAFsFromLikelihoods.end(); ++it) {\n        if (setAAFsFromLikelihoods.at(it->first) != -1) {\n            double AF = it->second/setAlleleProbCounts.at(it->first);\n            it->second = AF;\n            if (AAint == AncestralAlleleRef) {\n                setDAFsFromLikelihoods.at(it->first) = AF;\n            } else if (AAint == AncestralAlleleAlt) 
{\n                setDAFsFromLikelihoods.at(it->first) = (1 - AF);\n            }\n        }\n    }\n     \n}\n\nvoid GeneralSetCounts::getAFsFromADtag(const std::vector<std::string>& genotypeFields, const std::map<string, std::vector<size_t>>& setsToPosMap, const int ADTagPosition, const int minDepth) {\n    for (std::vector<std::string>::size_type i = 0; i < genotypeFields.size(); i++) {\n          // std::cerr << genotypeFields[i] << std::endl;\n           std::string thisADstring = split(genotypeFields[i], ':')[ADTagPosition];\n           if (thisADstring == \".\") {\n               std::cerr << \"The AD tag info appears to be missing: \" << thisADstring << \" ; Exiting ...\" << std::endl;\n               exit(1);\n           }\n           \n           else {\n               std::vector<double> ADs = splitToDouble(thisADstring,',');\n               if (ADs.size() != 2) {\n                   std::cerr << \"This AD tag appears malformed: \" << thisADstring << \" ; Exiting ...\" << std::endl;\n                   exit(1);\n               }\n               \n               int overallDepth = ADs[0] + ADs[1];\n               if (overallDepth >= minDepth) {\n                    individualPoolAAFs[i] = ADs[0]/(overallDepth);\n               }\n           }\n       }\n       \n       for(std::map<string, std::vector<size_t>>::const_iterator it = setsToPosMap.begin(); it != setsToPosMap.end(); ++it) {\n           int individualsInThisSet = (int) it->second.size();\n           assert(individualsInThisSet > 0);\n           if (individualsInThisSet == 1) {\n               int pos = (int) it->second[0];\n               setPoolAAFs.at(it->first) = individualPoolAAFs[pos];\n           } else {\n               std::vector<double> thisSetAFs;\n               for (int i = 0; i < individualsInThisSet; i++) {\n                   int pos = (int) it->second[i];\n                   if (individualPoolAAFs[pos] != -1.0) thisSetAFs.push_back(individualPoolAAFs[pos]);\n               }\n 
              setPoolAAFs.at(it->first) = vector_average(thisSetAFs);\n               \n           }\n           \n           \n           if (AAint == AncestralAlleleRef) {\n               setPoolDAFs.at(it->first) = setPoolAAFs.at(it->first);\n           } else if (AAint == AncestralAlleleAlt && setPoolAAFs.at(it->first) != -1.0) {\n               setPoolDAFs.at(it->first) = (1 - setPoolAAFs.at(it->first));\n           }\n           \n               \n       }\n}\n\n\nvoid GeneralSetCountsWithSplits::getAFsFromADtagWithSplits(const std::vector<std::string>& genotypeFields, const std::map<string, std::vector<size_t>>& setsToPosMap, const int ADTagPosition, const int minDepth) {\n    \n    \n    for (std::vector<std::string>::size_type i = 0; i < genotypeFields.size(); i++) {\n       // std::cerr << genotypeFields[i] << std::endl;\n        std::string thisADstring = split(genotypeFields[i], ':')[ADTagPosition];\n        if (thisADstring == \".\") {\n            std::cerr << \"The AD tag info appears to be missing: \" << thisADstring << \" ; Exiting ...\" << std::endl;\n            exit(1);\n        }\n        \n        else {\n            std::vector<double> ADs = splitToDouble(thisADstring,',');\n            if (ADs.size() != 2) {\n                std::cerr << \"This AD tag appears malformed: \" << thisADstring << \" ; Exiting ...\" << std::endl;\n                exit(1);\n            }\n            \n            int overallDepth = ADs[0] + ADs[1];\n            if (overallDepth >= minDepth) {\n                 individualPoolAAFs[i] = ADs[0]/(overallDepth);\n            }\n        }\n    }\n    \n    for(std::map<string, std::vector<size_t>>::const_iterator it = setsToPosMap.begin(); it != setsToPosMap.end(); ++it) {\n        int individualsInThisSet = (int) it->second.size();\n        assert(individualsInThisSet > 0);\n        if (individualsInThisSet == 1) {\n            int pos = (int) it->second[0];\n            setPoolAAFs.at(it->first) = 
individualPoolAAFs[pos];\n            setPoolAAFsplit1.at(it->first) = individualPoolAAFs[pos];\n            setPoolAAFsplit2.at(it->first) = individualPoolAAFs[pos];\n        } else {\n            std::vector<double> thisSetAFs;\n            for (int i = 0; i < individualsInThisSet; i++) {\n                int pos = (int) it->second[i];\n                thisSetAFs.push_back(individualPoolAAFs[pos]);\n            }\n            setPoolAAFs.at(it->first) = vector_average(thisSetAFs);\n            \n            // Take care of the splits by random sampling with replacement:\n            std::random_device rd;     // only used once to initialise (seed) engine\n            std::mt19937 rng(rd());    // random-number engine used (Mersenne-Twister in this case)\n            std::uniform_int_distribution<int> uni(0,(individualsInThisSet - 1)); // guaranteed unbiased\n            \n            std::vector<double> thisSetAFsplit1; std::vector<double> thisSetAFsplit2;\n            for (int i = 0; i < individualsInThisSet; i++) {\n                int random_pos_s1 = uni(rng);\n                int random_pos_s2 = uni(rng);\n                thisSetAFsplit1.push_back(individualPoolAAFs[random_pos_s1]);\n                thisSetAFsplit2.push_back(individualPoolAAFs[random_pos_s2]);\n            }\n            setPoolAAFsplit1.at(it->first) = vector_average(thisSetAFsplit1);\n            setPoolAAFsplit2.at(it->first) = vector_average(thisSetAFsplit2);\n            \n        }\n        \n        if (AAint == AncestralAlleleRef) {\n            setPoolDAFs.at(it->first) = setPoolAAFs.at(it->first);\n            setPoolDAFsplit1.at(it->first) = setPoolAAFsplit1.at(it->first);\n            setPoolDAFsplit2.at(it->first) = setPoolAAFsplit2.at(it->first);\n        } else if (AAint == AncestralAlleleAlt && setPoolAAFs.at(it->first) != -1.0) {\n            setPoolDAFs.at(it->first) = (1 - setPoolAAFs.at(it->first));\n            setPoolDAFsplit1.at(it->first) = (1 - 
setPoolAAFsplit1.at(it->first));\n            setPoolDAFsplit2.at(it->first) = (1 - setPoolAAFsplit2.at(it->first));\n        }\n            \n    }\n}\n\n\n// Only works for diploids for now!!!\nvoid GeneralSetCountsWithSplits::getAFsFromGenotypeLikelihoodsOrProbabilitiesWithSplits(const std::vector<std::string>& genotypeFields, const std::map<size_t, string>& posToSpeciesMap, const int likelihoodsOrProbabilitiesTagPosition, const int pos) {\n   \n    \n    if (likelihoodsProbabilitiesType == LikelihoodsProbabilitiesPL || likelihoodsProbabilitiesType == LikelihoodsProbabilitiesGL) {\n        setHWEpriorsFromAFfromGT();\n    }\n    \n    getBasicCountsFromLikelihoodsOrProbabilities(genotypeFields, posToSpeciesMap, likelihoodsOrProbabilitiesTagPosition);\n    \n     \n    // Now fill in the allele frequencies\n    for(std::map<string,std::vector<double>>::iterator it = setIndividualExpectedGenotypes.begin(); it != setIndividualExpectedGenotypes.end(); ++it) {\n        if (it->first == \"\") {\n            std::cerr << \"it->first \" << it->first << \"\\t\"; print_vector(it->second, std::cerr); std::cerr << std::endl;\n        }\n        std::vector<double> thisSetExpectedGenotypes = it->second;\n        \n        \n        if (thisSetExpectedGenotypes.size() > 0) {\n            double thisAAF = (double)vector_sum(thisSetExpectedGenotypes)/(2*thisSetExpectedGenotypes.size());\n          /* Debug stuff\n           if(pos == 1180 || pos == 1046) {\n                std::cerr << \"pos: \" << pos << std::endl;\n                std::cerr << \"it->first: \" << it->first << std::endl;\n                print_vector(thisSetExpectedGenotypes, std::cerr);\n                std::cerr << \"thisAAF: \" << thisAAF << std::endl;\n            }\n           */\n            //std::cerr << \"species: \" << it->first << std::endl;\n            // print_vector(thisSetExpectedGenotypes, std::cerr);\n            // std::cerr << \"thisAAF: \" << thisAAF << std::endl;\n            
setAAFsFromLikelihoods.at(it->first) = thisAAF;\n            \n            // Take care of the splits by random sampling with replacement:\n            std::random_device rd;     // only used once to initialise (seed) engine\n            std::mt19937 rng(rd());    // random-number engine used (Mersenne-Twister in this case)\n            std::uniform_int_distribution<int> uniAFs(0,((int)thisSetExpectedGenotypes.size() - 1)); // guaranteed unbiased\n            \n            \n            std::vector<double> thisSetIndividualExpectedGenotypesSampledSplit1;\n            std::vector<double> thisSetIndividualExpectedGenotypesSampledSplit2;\n            for (int i = 0; i < thisSetExpectedGenotypes.size(); i++) {\n                int random_pos_s1 = uniAFs(rng);\n                int random_pos_s2 = uniAFs(rng);\n                thisSetIndividualExpectedGenotypesSampledSplit1.push_back(thisSetExpectedGenotypes[random_pos_s1]);\n                thisSetIndividualExpectedGenotypesSampledSplit2.push_back(thisSetExpectedGenotypes[random_pos_s2]);\n            }\n            \n            double thisAAFsplit1 = (double)vector_sum(thisSetIndividualExpectedGenotypesSampledSplit1)/(2*thisSetExpectedGenotypes.size());\n           // std::cerr << \"thisAAFsplit1: \" << thisAAFsplit1 << std::endl;\n            double thisAAFsplit2 = (double)vector_sum(thisSetIndividualExpectedGenotypesSampledSplit2)/(2*thisSetExpectedGenotypes.size());\n           // std::cerr << \"thisAAFsplit2: \" << thisAAFsplit2 << std::endl;\n\n            \n           // std::cerr << \"it->first \" << it->first << std::endl;\n            try {\n            setAAFsplit1fromLikelihoods.at(it->first) = thisAAFsplit1; setAAFsplit2fromLikelihoods.at(it->first) = thisAAFsplit2;\n                \n            if (AAint == AncestralAlleleRef) { // Ancestral allele seems to be the ref, so derived is alt\n                setDAFsFromLikelihoods.at(it->first) = thisAAF;\n                
setDAFsplit1fromLikelihoods.at(it->first) = thisAAFsplit1;\n                setDAFsplit2fromLikelihoods.at(it->first) = thisAAFsplit2;\n            } else if (AAint == AncestralAlleleAlt) { // Ancestral allele seems to be alt, so derived is ref\n                setDAFsFromLikelihoods.at(it->first) = (1 - thisAAF);\n                setDAFsplit1fromLikelihoods.at(it->first) = 1 - thisAAFsplit1;\n                setDAFsplit2fromLikelihoods.at(it->first) = 1 - thisAAFsplit2;\n            }\n            } catch (std::out_of_range& e) { std::cerr << \"The trouble was here\" << it->first << std::endl; }\n        }\n    }\n}\n\n\n\n// Works only on biallelic markers\nvoid GeneralSetCounts::getSetVariantCountsSimple(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap) {\n    // std::cerr << fields[0] << \"\\t\" << fields[1] << std::endl;\n    getBasicCounts(genotypes, posToSpeciesMap);\n    \n    // Now fill in the allele frequencies\n    for(std::map<string,int>::iterator it = setAltCounts.begin(); it != setAltCounts.end(); ++it) {\n        if (setAlleleCounts.at(it->first) > 0) {\n            setAAFs[it->first] = (double)setAltCounts.at(it->first)/setAlleleCounts.at(it->first);\n        }\n    }\n}\n\nvoid GeneralSetCounts::getBasicCounts(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap) {\n    // Go through the genotypes - only biallelic markers are allowed\n    for (std::vector<std::string>::size_type i = 0; i != genotypes.size(); i++) {\n        bool speciesDefined = true;\n        std::string species; try { species = posToSpeciesMap.at(i); } catch (const std::out_of_range& oor) {\n            speciesDefined = false;\n        }\n        // The first allele in this individual\n        if (genotypes[i][0] == '1') { overall++; individualsWithVariant[i]++; }\n        if (genotypes[i][2] == '1') { overall++; individualsWithVariant[i]++; }\n        if (speciesDefined) {\n            if 
(genotypes[i][0] == '1') {\n                setAltCounts[species]++; setAlleleCounts[species]++;\n            } else if (genotypes[i][0] == '0') {\n                setAlleleCounts[species]++;\n            }\n            // The second allele in this individual\n            if (genotypes[i][2] == '1') {\n                setAltCounts[species]++; setAlleleCounts[species]++;\n            } else if (genotypes[i][2] == '0') {\n                setAlleleCounts[species]++;\n            }\n        }\n    }\n}\n\nvoid GeneralSetCountsWithSplits::getBasicCountsWithSplitsNew(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap) {\n    \n    // Go through the genotypes - only biallelic markers are allowed\n    for (std::vector<std::string>::size_type i = 0; i != genotypes.size(); i++) {\n        bool speciesDefined = true;\n        std::string species; try { species = posToSpeciesMap.at(i); } catch (const std::out_of_range& oor) {\n            speciesDefined = false;\n        }\n        \n        if (speciesDefined) {\n            string onlyGenotypeCalls = split(genotypes[i], ':')[0];   // The string with 0/0, 0/1, 1/0, 1/1, or e.g. 
0/0/1/1 for a tetraploid\n            if (onlyGenotypeCalls[0] == '.') {\n                continue;   // Ignore missing data\n            }\n            // Find ploidy\n            int l = (int)onlyGenotypeCalls.length();\n            int numGTs = (l/2)+1;\n            setAlleleCounts[species] += numGTs;\n            \n            // Go through the genotypes and fill in the data structure \"GeneralSetCountsWithSplits\"\n            for (std::vector<std::string>::size_type j = 0; j <= l; j = j+2) {\n               // std::cerr << \"genotypes[i][j]: \" << genotypes[i][j] << std::endl;\n                setGenotypes[species].push_back(genotypes[i][j] - '0');\n                if (genotypes[i][j] == '1') {\n                    overall++; individualsWithVariant[i]++;\n                    setAltCounts[species]++;\n                }\n            }\n            double individualAF = (double)individualsWithVariant[i]/numGTs;\n            \n            /* std::cerr << \"onlyGenotypeCalls: \" << onlyGenotypeCalls << std::endl;\n            std::cerr << \"individualsWithVariant[i]: \" << individualsWithVariant[i] << std::endl;\n            std::cerr << \"numGTs: \" << numGTs << std::endl;\n            std::cerr << \"individualAF: \" << individualAF << std::endl;\n            */\n            setIndividualAFs[species].push_back(individualAF);\n        }\n    }\n}\n\nvoid GeneralSetCountsWithSplits::getBasicCountsFromLikelihoodsOrProbabilities(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap, const int likelihoodsOrProbabilitiesTagPosition) {\n    \n    // Go through the genotypes - only biallelic markers are allowed\n    for (std::vector<string>::size_type i = 0; i != genotypes.size(); i++) {\n        bool speciesDefined = true;\n        string species; try { species = posToSpeciesMap.at(i); } catch (const std::out_of_range& oor) {\n            speciesDefined = false;\n        }\n        \n        if (speciesDefined) {\n            string 
thisLikelihoodsOrProbabilitiesString = split(genotypes[i], ':')[likelihoodsOrProbabilitiesTagPosition];\n            if (thisLikelihoodsOrProbabilitiesString == \".\") continue;\n            else {\n                setAlleleProbCounts.at(species) += 2;\n                std::vector<double> thisLikelihoodsOrProbabilities = splitToDouble(thisLikelihoodsOrProbabilitiesString,',');\n                std::vector<double> thisProbabilities;\n                switch (likelihoodsProbabilitiesType)\n                {\n                    case LikelihoodsProbabilitiesPL:\n                        transformFromPhred(thisLikelihoodsOrProbabilities);\n                     // print_vector(thisLikelihoodsOrProbabilities, std::cerr);\n                        thisProbabilities = probabilitiesFromLikelihoods(thisLikelihoodsOrProbabilities,species);\n                        break;\n                    case LikelihoodsProbabilitiesGL: break;\n                    case LikelihoodsProbabilitiesGP:\n                        thisProbabilities = thisLikelihoodsOrProbabilities;\n                        break;\n                }\n                setIndividualExpectedGenotypes[species].push_back(getExpectedGenotype(thisProbabilities));\n            }\n        }\n    }\n}\n\nvoid GeneralSetCountsWithSplits::getSplitCountsNew(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap) {\n    \n    getBasicCountsWithSplitsNew(genotypes, posToSpeciesMap);\n    \n    // If at least one of the outgroup individuals has non-missing data\n    // Find out what is the \"ancestral allele\" - i.e. 
the one more common in the outgroup\n    try {\n        if (setAlleleCounts.at(\"Outgroup\") > 0) {\n            if ((double)vector_sum(setGenotypes.at(\"Outgroup\"))/setGenotypes.at(\"Outgroup\").size() < 0.5) { AAint = AncestralAlleleRef; }\n            else { AAint = AncestralAlleleAlt; } \n        }\n    } catch (std::out_of_range& e) { AAint = -1; }\n    \n    // Now fill in the allele frequencies\n    double totalAAF = 0; int numNonZeroCounts = 0;\n    for(std::map<string,std::vector<int>>::iterator it = setGenotypes.begin(); it != setGenotypes.end(); ++it) {\n        if (it->first == \"\") {\n            std::cerr << \"it->first \" << it->first << \"\\t\"; print_vector(it->second, std::cerr); std::cerr << std::endl;\n        }\n        std::vector<int> thisSetGenotypes = setGenotypes.at(it->first);\n        std::vector<double> thisSetIndividualAFs = setIndividualAFs.at(it->first);\n        \n        if (thisSetGenotypes.size() > 0) {\n            numNonZeroCounts++;\n            double thisAAF = (double)vector_sum(thisSetGenotypes)/thisSetGenotypes.size();\n           // print_vector(thisSetGenotypes, std::cerr);\n           // std::cerr << \"thisAAF: \" << thisAAF << std::endl;\n            setAAFs[it->first] = thisAAF; totalAAF += thisAAF;\n            \n            // Take care of the splits by random sampling with replacement:\n            std::random_device rd;     // only used once to initialise (seed) engine\n            std::mt19937 rng(rd());    // random-number engine used (Mersenne-Twister in this case)\n            std::uniform_int_distribution<int> uni(0,((int)thisSetGenotypes.size() - 1)); // guaranteed unbiased\n            std::uniform_int_distribution<int> uniAFs(0,((int)thisSetIndividualAFs.size() - 1)); // guaranteed unbiased\n            \n         /*   std::vector<int> thisSetGenotypesSampledSplit1; std::vector<int> thisSetGenotypesSampledSplit2;\n            for (int i = 0; i < thisSetGenotypes.size(); i++) {\n                int 
random_pos_s1 = uni(rng);\n                int random_pos_s2 = uni(rng);\n                thisSetGenotypesSampledSplit1.push_back(thisSetGenotypes[random_pos_s1]);\n                thisSetGenotypesSampledSplit2.push_back(thisSetGenotypes[random_pos_s2]);\n            }\n          */\n            \n            std::vector<double> thisSetIndividualAFsSampledSplit1; std::vector<double> thisSetIndividualAFsSampledSplit2;\n            for (int i = 0; i < thisSetIndividualAFs.size(); i++) {\n                int random_pos_s1 = uniAFs(rng);\n                int random_pos_s2 = uniAFs(rng);\n                thisSetIndividualAFsSampledSplit1.push_back(thisSetIndividualAFs[random_pos_s1]);\n                thisSetIndividualAFsSampledSplit2.push_back(thisSetIndividualAFs[random_pos_s2]);\n            }\n            \n          //  double thisAAFsplit1 = vector_average(thisSetGenotypesSampledSplit1);\n          //  double thisAAFsplit2 = vector_average(thisSetGenotypesSampledSplit2);\n            double thisAAFsplit1 = vector_average(thisSetIndividualAFsSampledSplit1);\n            double thisAAFsplit2 = vector_average(thisSetIndividualAFsSampledSplit2);\n            setAAFsplit1[it->first] = thisAAFsplit1; setAAFsplit2[it->first] = thisAAFsplit2;\n            \n            // Count correction as in admixtools\n         //   double ya = vector_sum(thisSetGenotypes); double yb = thisSetGenotypes.size() - vector_sum(thisSetGenotypes);\n         //   double yt = (double)thisSetGenotypes.size();\n         //   double h = ya * yb / (yt * (yt - 1.0));\n            //std::cerr << \"it->first: \" << it->first << std::endl;\n            //std::cerr << \"ya: \" << ya << \" ; yb: \" << yb << \" ; yt: \" << yt << std::endl;\n            //std::cerr << \"h: \" << h << \" ; h / yt: \" << h / yt << std::endl;\n            \n          //  setCorrectionFactors[it->first] = h / yt;\n            \n           // std::cerr << \"it->first \" << it->first << std::endl;\n            try {\n           
 if (AAint == AncestralAlleleRef) { // Ancestral allele seems to be the ref, so derived is alt\n                setDAFs[it->first] = thisAAF;\n                setDAFsplit1[it->first] = thisAAFsplit1; setDAFsplit2[it->first] = thisAAFsplit2;\n            } else if (AAint == AncestralAlleleAlt) { // Ancestral allele seems to be alt, so derived is ref\n                setDAFs[it->first] = (1 - thisAAF);\n                setDAFsplit1[it->first] = 1 - thisAAFsplit1;\n                setDAFsplit2[it->first] = 1 - thisAAFsplit2;\n            }\n                } catch (std::out_of_range& e) { std::cerr << \"The trouble was here\" << it->first << std::endl; }\n        }\n    }\n    averageAAF = totalAAF/numNonZeroCounts;\n    if (AAint == AncestralAlleleRef) averageDAF = averageAAF;\n    else if (AAint == AncestralAlleleAlt) averageDAF = (1 - averageAAF);\n}\n\n\n\nint GeneralSetCounts::findADtagPosition(const std::vector<std::string>& vcfLineFields) {\n    \n    std::vector<std::string> format = split(vcfLineFields[8], ':');\n    if (format.size() == 1) return LikelihoodsProbabilitiesAbsent; // The GT tag must be present in the first place\n    \n    int ADTagPosition = returnFormatTagPosition(format, \"AD\");\n    if (ADTagPosition == std::numeric_limits<int>::min()) {\n        std::cerr << \"Could not find the AD tag in the VCF file. This tag is required to use the pool-seq option. 
Exiting ....\" << std::endl;\n        exit(1);\n    }\n    return ADTagPosition;\n}\n\n\ndouble calculateOneDs(double ABBAtotal, double BABAtotal) {\n    // Get the D values\n    double Dnum1 = ABBAtotal - BABAtotal;\n    \n    double Ddenom1 = ABBAtotal + BABAtotal;\n    double D = Dnum1/Ddenom1;\n    return D;\n}\n\n\n\ndouble* calculateThreeDs(double ABBAtotal, double BABAtotal, double BBAAtotal) {\n    // Get the D values\n    double Dnum1 = ABBAtotal - BABAtotal;\n    double Dnum2 = ABBAtotal - BBAAtotal;\n    double Dnum3 = BBAAtotal - BABAtotal;\n    \n    double Ddenom1 = ABBAtotal + BABAtotal;\n    double Ddenom2 = ABBAtotal + BBAAtotal;\n    double Ddenom3 = BBAAtotal + BABAtotal;\n    static double Ds[3]; Ds[0] = Dnum1/Ddenom1; Ds[1] = Dnum2/Ddenom2; Ds[2] = Dnum3/Ddenom3;\n    return Ds;\n}\n\n\ndouble stringToDouble(std::string s) {\n    double d;\n    std::stringstream ss(s); //turn the string into a stream\n    ss >> d; //convert\n    return d;\n}\n\n\n// Remove a single file extension from the filename\nstd::string stripExtension(const std::string& filename)\n{\n    size_t suffixPos = filename.find_last_of('.');\n    if(suffixPos == std::string::npos)\n        return filename; // no suffix\n    else\n        return filename.substr(0, suffixPos);\n}\n\n\nvoid split(const std::string &s, char delim, std::vector<std::string> &elems) {\n    std::stringstream ss(s);\n    std::string item;\n    while (std::getline(ss, item, delim)) {\n        elems.push_back(item);\n    }\n}\n\nstd::vector<std::string> split(const std::string &s, char delim) {\n    std::vector<std::string> elems;\n    split(s, delim, elems);\n    return elems;\n}\n    \n    \nvoid splitToDouble(const std::string &s, char delim, std::vector<double> &elems) {\n    std::stringstream ss(s);\n    std::string item;\n    while (std::getline(ss, item, delim)) {\n        elems.push_back(stringToDouble(item));\n    }\n}\n\nstd::vector<double> splitToDouble(const std::string &s, char delim) {\n    
std::vector<double> elems;\n    splitToDouble(s, delim, elems);\n    return elems;\n}\n\nstd::vector<std::string> split2(std::string s, string delim) {\n    std::vector<std::string> elems;\n    size_t pos = 0;\n    std::string token;\n    while ((pos = s.find(delim)) != std::string::npos) {\n        token = s.substr(0, pos);\n        elems.push_back(token);\n        s.erase(0, pos + delim.length());\n    }\n    elems.push_back(s);\n    return elems;\n}\n\n\nstd::vector<size_t> locateSet(const std::vector<std::string>& sample_names, const std::vector<std::string>& set) {\n    std::vector<size_t> setLocs;\n    for (std::vector<std::string>::size_type i = 0; i != set.size(); i++) {\n        std::vector<std::string>::const_iterator it = std::find(sample_names.begin(), sample_names.end(), set[i]);\n        if (it == sample_names.end()) {\n            std::cerr << \"Did not find the sample: \\\"\" << set[i] << \"\\\"\" << std::endl;\n            print_vector(sample_names, std::cerr,',');\n        } else {\n            size_t loc = std::distance(sample_names.begin(), it);\n            setLocs.push_back(loc);\n        }\n    }\n    return setLocs;\n}\n\n\n//\nstd::string suffix(const std::string& seq, size_t len)\n{\n    assert(seq.length() >= len);\n    return seq.substr(seq.length() - len);\n}\n\n// Returns true if the filename has an extension indicating it is compressed\nbool isGzip(const std::string& filename)\n{\n    size_t suffix_length = sizeof(GZIP_EXT) - 1;\n    \n    // Assume files without an extension are not compressed\n    if(filename.length() < suffix_length)\n        return false;\n    \n    std::string extension = suffix(filename, suffix_length);\n    return extension == GZIP_EXT;\n}\n\n// Ensure a filehandle is open\nvoid assertFileOpen(std::ifstream& fh, const std::string& fn)\n{\n    if(!fh.is_open())\n    {\n        std::cerr << \"ERROR: Could not open \" << fn << \" for read\\n\";\n        exit(EXIT_FAILURE);\n    }\n}\n// Ensure a filehandle is 
open\nvoid assertFileOpen(std::ofstream& fh, const std::string& fn)\n{\n    if(!fh.is_open())\n    {\n        std::cerr << \"ERROR: Could not open \" << fn << \" for write\\n\";\n        exit(EXIT_FAILURE);\n    }\n}\n\n\nvoid assertGZOpen(gzstreambase& gh, const std::string& fn)\n{\n    if(!gh.good())\n    {\n        std::cerr << \"ERROR: Could not open \" << fn << std::endl;\n        exit(EXIT_FAILURE);\n    }\n}\n\nvoid checkGenotypesExist(const std::vector<std::string>& fields, const int variantNum) {\n    if (fields.size() <= NUM_NON_GENOTYPE_COLUMNS) {\n        std::cerr << \"ERROR: Variant \" << variantNum << \" in the VCF appears to be truncated.\"  << std::endl;\n        print_vector(fields, std::cerr);\n        std::cerr << \"Exiting...\" << std::endl; exit(1);\n    }\n}\n\n// Open a file that may or may not be gzipped for reading\n// The caller is responsible for freeing the handle\nstd::istream* createReader(const std::string& filename, std::ios_base::openmode mode)\n{\n    if(isGzip(filename))\n    {\n        igzstream* pGZ = new igzstream(filename.c_str(), mode);\n        assertGZOpen(*pGZ, filename);\n        return pGZ;\n    }\n    else\n    {\n        std::ifstream* pReader = new std::ifstream(filename.c_str(), mode);\n        assertFileOpen(*pReader, filename);\n        return pReader;\n    }\n}\n\n// Open a file that may or may not be gzipped for writing\n// The caller is responsible for freeing the handle\nstd::ostream* createWriter(const std::string& filename,\n                           std::ios_base::openmode mode)\n{\n    if(isGzip(filename))\n    {\n        ogzstream* pGZ = new ogzstream(filename.c_str(), mode);\n        assertGZOpen(*pGZ, filename);\n        return pGZ;\n    }\n    else\n    {\n        std::ofstream* pWriter = new std::ofstream(filename.c_str(), mode);\n        assertFileOpen(*pWriter, filename);\n        return pWriter;\n    }\n}\n\nbool file_exists(const std::string& name) {\n    std::ifstream f(name.c_str());\n    
return f.good();\n}\n\n\nvoid assignSplits01FromAlleleFrequency(const double p, double& splitA, double& splitB) {\n    double r = ((double) rand() / (RAND_MAX));\n    if (r <= p) { splitA = 1; }\n    double r2 = ((double) rand() / (RAND_MAX));\n    if (r2 <= p) { splitB = 1; }\n}\n"
  },
  {
    "path": "Dsuite_utils.h",
    "content": "//\n//  Dsuite_utils.h\n//  Dsuite\n//\n//  Created by Milan Malinsky on 02/04/2019.\n//\n\n#ifndef Dsuite_utils_h\n#define Dsuite_utils_h\n#include <getopt.h>\n#include <stdio.h>\n#include <map>\n#include <vector>\n#include <sstream>\n#include <fstream>\n#include <cmath>\n#include <algorithm>\n#include <assert.h>\n#include <time.h>\n#include <regex>\n#include <iterator>\n#include <algorithm>\n#include <limits>\n#include <random>\n#include <list>\n#include <cstdint>\n#include <iterator>\n#include \"gzstream.h\"\n#include \"kstest.h\"\n\n#define PROGRAM_BIN \"Dsuite\"\n#define PACKAGE_BUGREPORT \"milan.malinsky@iee.unibe.ch\"\n#define GZIP_EXT \".gz\"\n#define F4HEADER \"f4-ratio\"\n#define ploidy 2\n\n#define CHECK_TREE_ERROR_MSG \"It seems that this species is in the SETS.txt file but can't be found in the tree. Please check the spelling and completeness of your tree file.\"\n\n#define P3isTrios2_Dpositive 1      // 1 - trios[i][0] and trios[i][1] are P1 and P2; D >= 0\n#define P3isTrios2_Dnegative 2      // 2 - trios[i][0] and trios[i][1] are P1 and P2; D < 0\n#define P3isTrios1_Dpositive 3      // 3 - trios[i][0] and trios[i][2] are P1 and P2; D >= 0\n#define P3isTrios1_Dnegative 4      // 4 - trios[i][0] and trios[i][2] are P1 and P2; D < 0\n#define P3isTrios0_Dpositive 5      // 5 - trios[i][2] and trios[i][1] are P1 and P2; D >= 0\n#define P3isTrios0_Dnegative 6      // 6 - trios[i][2] and trios[i][1] are P1 and P2; D < 0\n\n#define P3isTrios2 7    // 7 - trios[i][0] and trios[i][1] are P1 and P2;\n#define P3isTrios1 8    // 8 - trios[i][0] and trios[i][2] are P1 and P2;\n#define P3isTrios0 9    // 9 - trios[i][1] and trios[i][2] are P1 and P2;\n\n#define ABBAvector 0\n#define BABAvector 1\n#define BBAAvector 2\n\n\n#define OutgroupNotRequired 0\n#define OutgroupRequired 1\n\n#define LikelihoodsProbabilitiesAbsent 0\n#define LikelihoodsProbabilitiesGP 1\n#define LikelihoodsProbabilitiesGL 2\n#define LikelihoodsProbabilitiesPL 3\n\n#define 
AncestralAlleleMissing -1\n#define AncestralAlleleRef 0\n#define AncestralAlleleAlt 1\n\nusing std::string;\n// VCF format constant\nstatic const int NUM_NON_GENOTYPE_COLUMNS=9;  // 8 mandatory columns + 1 column with definition of the genotype columns\n\nvoid assertFileOpen(std::ifstream& fh, const std::string& fn);\nvoid assertFileOpen(std::ofstream& fh, const std::string& fn);\nvoid checkGenotypesExist(const std::vector<std::string>& fields, const int variantNum);\ndouble calculateOneDs(double ABBAtotal, double BABAtotal);\ndouble* calculateThreeDs(double ABBAtotal, double BABAtotal, double BBAAtotal);\ndouble f4_perVariant(double p1, double p2, double p3, double p4);\ndouble Fd_Denom_perVariant(double p1, double p2, double p3, double pO);\ndouble fG_Denom_perVariant(double p1, double p3a, double p3b, double pO);\ndouble FdM_Denom_perVariant(double p1, double p2, double p3, double pO);\nlong double normalCDF(double x);\ndouble stringToDouble(std::string s);\nstd::string stripExtension(const std::string& filename);\nstd::vector<std::string> split2(std::string s, string delim);\nstd::vector<std::string> split(const std::string &s, char delim);\nstd::vector<double> splitToDouble(const std::string &s, char delim);\nstd::vector<size_t> locateSet(const std::vector<std::string>& sample_names, const std::vector<std::string>& set);\nstd::istream* createReader(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in);\nstd::ostream* createWriter(const std::string& filename, std::ios_base::openmode mode = std::ios_base::out);\nbool file_exists(const std::string& name);\nvoid assignSplits01FromAlleleFrequency(const double p, double& splitA, double& splitB);\n\n// Converting numbers (int, double, size_t, and char) to string\ntemplate <typename T> std::string numToString(T i) {\n    std::string ret;\n    std::stringstream out;\n    out << i;\n    ret = out.str();\n    return ret;\n}\n\n///Represents the exception for taking the median of an empty 
list\nclass median_of_empty_list_exception:public std::exception{\n  virtual const char* what() const throw() {\n    return \"Attempt to take the median of an empty list of numbers.  \"\n      \"The median of an empty list is undefined.\";\n  }\n};\n\n///Return the median of a sequence of numbers defined by the random\n///access iterators begin and end.  The sequence must not be empty\n///(median is undefined for an empty set).\n///\n///The numbers must be convertible to double.\ntemplate<class RandAccessIter> double median(RandAccessIter begin, RandAccessIter end) {\n  if(begin == end){ throw median_of_empty_list_exception(); }\n  std::size_t size = end - begin;\n  std::size_t middleIdx = size/2;\n  RandAccessIter target = begin + middleIdx;\n  std::nth_element(begin, target, end);\n\n  if(size % 2 != 0){ //Odd number of elements\n    return *target;\n  }else{            //Even number of elements\n    double a = *target;\n    RandAccessIter targetNeighbor= target-1;\n    std::nth_element(begin, targetNeighbor, end);\n    return (a+*targetNeighbor)/2.0;\n  }\n}\n\n\n\n// Print an arbitrary vector to a file\ntemplate <class T> void print_vector(T vector, std::ostream& outFile, char delim = '\\t', bool endLine = true) {\n    for (int i = 0; i < vector.size(); i++) {\n        if (i == (vector.size()-1)) {\n            if (endLine) outFile << vector[i] << std::endl;\n            else outFile << vector[i];\n        } else {\n            outFile << vector[i] << delim;\n        }\n    }\n}\n\ntemplate <class T> double vector_average(T vector) {\n    double sum = 0;\n    for (int i = 0; i < vector.size(); i++) {\n        sum += vector[i];\n    }\n    double average = (double)sum / (double)vector.size();\n    return average;\n}\n\ntemplate <class T> double vector_sum(T vector) {\n    double sum = 0;\n    for (int i = 0; i < vector.size(); i++) {\n        sum += vector[i];\n    }\n    return sum;\n}\n\ninline void copy_except(int i, std::vector<double>& inVec, 
std::vector<double>& outVec) {\n    std::copy(inVec.begin(), inVec.begin() + i, outVec.begin());\n    std::copy(inVec.begin() + i + 1, inVec.end(), outVec.begin()+i);\n    //std::cerr << \"copying:\" << i << \" \"; print_vector_stream(inVec, std::cerr);\n    //std::cerr << \"copied: \" << i << \" \"; print_vector_stream(outVec, std::cerr);\n}\n\ninline unsigned nChoosek( unsigned n, unsigned k )\n{\n    if (k > n) return 0;\n    if (k * 2 > n) k = n-k;\n    if (k == 0) return 1;\n    \n    int result = n;\n    for( int i = 2; i <= k; ++i ) {\n        result *= (n-i+1);\n        result /= i;\n    }\n    return result;\n}\n\n// jackknive standard error\ntemplate <class T> double jackknive_std_err(T& vector) {\n    if (vector.size() < 5) {\n        throw \"WARNING: Fewer than five blocks to calculate jackknife!!\";\n    }\n    std::vector<double> jackkniveAverages;\n    std::vector<double> JregionDs; JregionDs.resize(vector.size()-1);\n    for (std::vector<double>::size_type i = 0; i != vector.size(); i++) {\n        // std::cerr << \"copying \" << i << std::endl;\n        copy_except(i, vector, JregionDs);\n        jackkniveAverages.push_back(vector_average(JregionDs));\n        JregionDs.clear(); JregionDs.resize(vector.size()-1);\n    }\n    double jackkniveOverallMean = vector_average(jackkniveAverages);\n    double sum = 0;\n    for (int i = 0; i < jackkniveAverages.size(); i++) {\n        sum += std::pow((jackkniveAverages[i] - jackkniveOverallMean), 2.0);\n    }\n    double var = ((double)(jackkniveAverages.size()-1)/(double)jackkniveAverages.size()) * sum;\n    double Dstd_err = std::sqrt(var);\n    return Dstd_err;\n}\n\nclass GeneralSetCounts {\npublic:\n    GeneralSetCounts(const std::map<string, std::vector<size_t>>& setsToPosMap, const int nSamples) : overall(0), averageAAF(-1.0), averageDAF(-1.0),  likelihoodsProbabilitiesType(LikelihoodsProbabilitiesAbsent), AAint(AncestralAlleleMissing) {\n        for(std::map<string, 
std::vector<size_t>>::const_iterator it = setsToPosMap.begin(); it != setsToPosMap.end(); ++it) {\n            setAltCounts[it->first] = 0; setAlleleCounts[it->first] = 0; setAlleleProbCounts[it->first] = 0;\n            setAAFs[it->first] = -1.0; setDAFs[it->first] = -1.0;\n            setAAFsFromLikelihoods[it->first] = -1.0; setDAFsFromLikelihoods[it->first] = -1.0;\n            setPoolAAFs[it->first] = -1.0; setPoolDAFs[it->first] = -1.0;\n            setSizes.push_back(it->second.size()); setCorrectionFactors[it->first] = -1.0;\n            setHWEpriorsFromAAFfromGT[it->first].assign(3, -1.0);\n            setHWEpriorsFromDAFfromGT[it->first].assign(3, -1.0);\n            std::vector<int> thisSetGenotypes; setGenotypes[it->first] = thisSetGenotypes;\n            std::vector<double> thisSetIndividualAFs; setIndividualAFs[it->first] = thisSetIndividualAFs;\n            std::vector<double> thisSetIndividualExpGenotypes; setIndividualExpectedGenotypes[it->first] = thisSetIndividualExpGenotypes;\n        }\n        individualsWithVariant.assign(nSamples, 0);\n        individualPoolAAFs.assign(nSamples, -1.0);\n    };\n    \n    void getSetVariantCountsSimple(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap);\n    void getSetVariantCounts(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap);\n    \n    int checkForGenotypeLikelihoodsOrProbabilities(const std::vector<std::string>& vcfLineFields);\n    int findADtagPosition(const std::vector<std::string>& vcfLineFields);\n    \n    void getAFsFromGenotypeLikelihoodsOrProbabilities(const std::vector<std::string>& genotypeFields, const std::map<size_t, string>& posToSpeciesMap, const int likelihoodsOrProbabilitiesTagPosition);\n    void getAFsFromADtag(const std::vector<std::string>& genotypeFields, const std::map<string, std::vector<size_t>>& setsToPosMap, const int ADTagPosition, const int minDepth);\n    \n    int overall; int AAint;\n   
 std::map<string,std::vector<int>> setGenotypes;\n    std::map<string,std::vector<double>> setIndividualAFs;\n    std::map<string,std::vector<double>> setIndividualExpectedGenotypes;\n    std::map<string,int> setAltCounts;\n    std::map<string,int> setAlleleCounts; // The number of non-missing alleles for this set\n    std::map<string,int> setAlleleProbCounts; // The number of non-missing alleles for this set in terms of likelihoods/probabilities\n    std::vector<double> individualPoolAAFs;  // Allele frequency for each individual pool estimated from Allelic Depth (AD tag in VCF) - for pool-seq data\n    std::map<string,double> setPoolAAFs; // The above individual pool values are then averaged if multiple pools form a set (i.e., a population or species)\n    std::map<string,double> setPoolDAFs;\n    \n    std::vector<size_t> setSizes; std::map<string,double> setCorrectionFactors;\n    std::map<string,double> setAAFs; double averageAAF;     // Allele frequencies - alternative allele\n    std::map<string,double> setDAFs; double averageDAF;     // Allele frequencies - derived allele\n    std::map<string,double> setAAFsFromLikelihoods; double averageAAFFromLikelihoods;   // Allele frequencies - alternative allele\n    std::map<string,double> setDAFsFromLikelihoods; double averageDAFFromLikelihoods;   // Allele frequencies - derived allele\n    std::vector<int> individualsWithVariant; // 0 homRef, 1 het, 2 homAlt\n    int likelihoodsProbabilitiesType;\n    // std::vector<int> set1individualsWithVariant; std::vector<int> set2individualsWithVariant;\n    // std::vector<int> set3individualsWithVariant; std::vector<int> set4individualsWithVariant;\n    \n\n    int returnFormatTagPosition(std::vector<std::string>& format, const std::string& tag);\n    void setHWEpriorsFromAFfromGT();\n    std::vector<double> probabilitiesFromLikelihoods(const std::vector<double>& thisLikelihoods, const string& species);\n    std::map<string,std::vector<double> > setHWEpriorsFromAAFfromGT;\n  
  std::map<string,std::vector<double> > setHWEpriorsFromDAFfromGT;\n    \nprivate:\n    void getBasicCounts(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap);\n    \n    \n};\n\n// Split sets for the f_G statistic\nclass GeneralSetCountsWithSplits : public GeneralSetCounts {\npublic:\n    GeneralSetCountsWithSplits(const std::map<string, std::vector<size_t>>& setsToPosMap, const int nSamples) : GeneralSetCounts(setsToPosMap,nSamples) {\n        for(std::map<string, std::vector<size_t>>::const_iterator it = setsToPosMap.begin(); it != setsToPosMap.end(); ++it) {\n            setAAFsplit1[it->first] = -1.0; setAAFsplit2[it->first] = -1.0; setDAFsplit1[it->first] = -1.0; setDAFsplit2[it->first] = -1.0;\n            setAlleleCountsSplit1[it->first] = 0; setAlleleCountsSplit2[it->first] = 0; setAltCountsSplit1[it->first] = 0; setAltCountsSplit2[it->first] = 0;\n            \n            setAAFsplit1fromLikelihoods[it->first] = -1.0; setAAFsplit2fromLikelihoods[it->first] = -1.0; setDAFsplit1fromLikelihoods[it->first] = -1.0;\n            setDAFsplit2fromLikelihoods[it->first] = -1.0; setAlleleCountsSplit1fromLikelihoods[it->first] = 0;\n            setAlleleCountsSplit2fromLikelihoods[it->first] = 0;\n            \n            setPoolAAFsplit1[it->first] = -1.0; setPoolAAFsplit2[it->first] = -1.0;\n            setPoolDAFsplit1[it->first] = -1.0; setPoolDAFsplit2[it->first] = -1.0;\n\n        }\n    }\n    std::map<string,int> setAltCountsSplit1;\n    std::map<string,int> setAltCountsSplit2;\n    std::map<string,double> setAAFsplit1; // Allele frequencies - alternative allele\n    std::map<string,double> setAAFsplit2; //\n    std::map<string,double> setDAFsplit1; // Allele frequencies - derived allele, in the complement of the set\n    std::map<string,double> setDAFsplit2;\n    std::map<string,int> setAlleleCountsSplit1; // The number of non-missing alleles for the complement of this set\n    std::map<string,int> 
setAlleleCountsSplit2;\n    \n    \n\n    std::map<string,double> setAAFsplit1fromLikelihoods; // Allele frequencies - alternative allele\n    std::map<string,double> setAAFsplit2fromLikelihoods; //\n    std::map<string,double> setDAFsplit1fromLikelihoods; // Allele frequencies - derived allele, in the complement of the set\n    std::map<string,double> setDAFsplit2fromLikelihoods;\n    std::map<string,int> setAlleleCountsSplit1fromLikelihoods; // The number of non-missing alleles for the complement of this set\n    std::map<string,int> setAlleleCountsSplit2fromLikelihoods;\n    \n    std::map<string,double> setPoolAAFsplit1;\n    std::map<string,double> setPoolDAFsplit1;\n    std::map<string,double> setPoolAAFsplit2; \n    std::map<string,double> setPoolDAFsplit2;\n    \n    \n    \n    void getSplitCountsNew(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap);\n    \n    void getAFsFromADtagWithSplits(const std::vector<std::string>& genotypeFields, const std::map<string, std::vector<size_t>>& setsToPosMap, const int ADTagPosition, const int minDepth);\n    void getAFsFromGenotypeLikelihoodsOrProbabilitiesWithSplits(const std::vector<std::string>& genotypeFields, const std::map<size_t, string>& posToSpeciesMap, const int likelihoodsOrProbabilitiesTagPosition, const int pos);\n    \n\nprivate:\n    void getBasicCountsWithSplitsNew(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap);\n    void getBasicCountsFromLikelihoodsOrProbabilities(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap, const int likelihoodsOrProbabilitiesTagPosition);\n};\n\nclass TrioDinfo {\npublic:\n    TrioDinfo() {\n        ABBAtotal = 0;\n        BABAtotal = 0;\n        BBAAtotal = 0;\n        treeArrangement = 0; BBAAarrangement = 0; DminArrangement = 0;\n        regionDs.resize(3); usedVars.resize(3); totalUsedVars.resize(3); numStrongVars.resize(3);\n        
linearStrongABBApos.resize(3);  linearStrongBABApos.resize(3);\n        linearStrongABBAposStrongSitesOnly.resize(3); linearStrongBABAposStrongSitesOnly.resize(3);\n        usedVars[0] = 0; usedVars[1] = 0; usedVars[2] = 0;\n        totalUsedVars[0] = 0; totalUsedVars[1] = 0; totalUsedVars[2] = 0;\n        numStrongVars[0] = 0; numStrongVars[1] = 0; numStrongVars[2] = 0;\n        localD1num = 0; localD2num = 0; localD3num = 0;\n        localD1denom = 0; localD2denom = 0; localD3denom = 0;\n        F_d_denom1 = 0; F_d_denom1_reversed = 0; F_dM_denom1 = 0; F_dM_denom1_reversed = 0; F_G_denom1 = 0; F_G_denom1_reversed = 0;\n        F_d_denom2 = 0; F_d_denom2_reversed = 0; F_dM_denom2 = 0; F_dM_denom2_reversed = 0; F_G_denom2 = 0; F_G_denom2_reversed = 0;\n        F_d_denom3 = 0; F_d_denom3_reversed = 0; F_dM_denom3 = 0; F_dM_denom3_reversed = 0; F_G_denom3 = 0; F_G_denom3_reversed = 0;\n    };\n    \n    // string P1; string P2; string P3;\n    double ABBAtotal; double BABAtotal; double BBAAtotal;\n    double D1; double D2; double D3; long double D1_p; long double D2_p; long double D3_p; double D1_Z; double D2_Z; double D3_Z;\n    double F_d_denom1; double F_d_denom1_reversed; double F_dM_denom1; double F_dM_denom1_reversed; double F_G_denom1; double F_G_denom1_reversed;\n    double F_d_denom2; double F_d_denom2_reversed; double F_dM_denom2; double F_dM_denom2_reversed; double F_G_denom2; double F_G_denom2_reversed;\n    double F_d_denom3; double F_d_denom3_reversed; double F_dM_denom3; double F_dM_denom3_reversed; double F_G_denom3; double F_G_denom3_reversed;\n    \n    std::vector<std::vector<int>> linearStrongABBApos; // positions of strong (> 0.5) ABBA for the three tree orientations\n    std::vector<std::vector<int>> linearStrongBABApos; // positions of strong (> 0.5) BABA for the three tree orientations\n    \n    std::vector<std::vector<int>> linearStrongABBAposStrongSitesOnly; // positions of strong (> 0.5) ABBA in a vector of only ABBA and BABA sites\n    
std::vector<std::vector<int>> linearStrongBABAposStrongSitesOnly; // positions of strong (> 0.5) BABA in a vector of only ABBA and BABA sites\n    \n    double localD1num; double localD2num; double localD3num;\n    double localD1denom; double localD2denom; double localD3denom;\n    std::vector<std::vector<double>> regionDs; // vector with three empty (double) vectors\n    std::vector<int> usedVars; // Used vars for local windows\n    std::vector<int> numStrongVars;\n    std::vector<int> totalUsedVars; // Used vars for local windows\n    \n    \n    // 1 - trios[i][0] and trios[i][1] are P1 and P2; D >= 0\n    // 2 - trios[i][0] and trios[i][1] are P1 and P2; D < 0\n    // 3 - trios[i][0] and trios[i][2] are P1 and P2; D >= 0\n    // 4 - trios[i][0] and trios[i][2] are P1 and P2; D < 0\n    // 5 - trios[i][1] and trios[i][2] are P1 and P2; D >= 0\n    // 6 - trios[i][1] and trios[i][2] are P1 and P2; D < 0\n    int treeArrangement; int BBAAarrangement; int DminArrangement;    \n    \n    \n    int assignTreeArrangement(const std::vector<int>& treeLevels, const int loc1, const int loc2, const int loc3) {\n        int midLoc = std::max(std::min(loc1,loc2), std::min(std::max(loc1,loc2),loc3));\n        int arrangement = 0;\n        if (midLoc == loc1) {\n            if (loc2 < loc1) {\n                int m1 = *std::min_element(treeLevels.begin()+loc2, treeLevels.begin()+loc1);\n                int m2 = *std::min_element(treeLevels.begin()+loc1, treeLevels.begin()+loc3);\n                if (m1 < m2) { if (D2 >= 0) { arrangement = P3isTrios1_Dpositive; } else { arrangement = P3isTrios1_Dnegative;}}\n                else { if (D1 >= 0) { arrangement = P3isTrios2_Dpositive; } else { arrangement = P3isTrios2_Dnegative; } }\n            } else {\n                int m1 = *std::min_element(treeLevels.begin()+loc3, treeLevels.begin()+loc1);\n                int m2 = *std::min_element(treeLevels.begin()+loc1, treeLevels.begin()+loc2);\n                if (m1 < m2) { if (D1 >= 
0) { arrangement = P3isTrios2_Dpositive; } else { arrangement = P3isTrios2_Dnegative; }}\n                else { if (D2 >= 0) arrangement = P3isTrios1_Dpositive; else arrangement = P3isTrios1_Dnegative;}\n            }\n        } else if (midLoc == loc2) {\n            if (loc1 < loc2) {\n                int m1 = *std::min_element(treeLevels.begin()+loc1, treeLevels.begin()+loc2);\n                int m2 = *std::min_element(treeLevels.begin()+loc2, treeLevels.begin()+loc3);\n                if (m1 < m2) { if (D3 >= 0) arrangement = P3isTrios0_Dpositive; else arrangement = P3isTrios0_Dnegative; }\n                else { if (D1 >= 0) { arrangement = P3isTrios2_Dpositive; } else { arrangement = P3isTrios2_Dnegative; }}\n            } else {\n                int m1 = *std::min_element(treeLevels.begin()+loc3, treeLevels.begin()+loc2);\n                int m2 = *std::min_element(treeLevels.begin()+loc2, treeLevels.begin()+loc1);\n                if (m1 < m2) { if (D1 >= 0) { arrangement = P3isTrios2_Dpositive;} else {arrangement = P3isTrios2_Dnegative;} }\n                else { if (D3 >= 0) arrangement = P3isTrios0_Dpositive; else arrangement = P3isTrios0_Dnegative; }\n            }\n        } else if (midLoc == loc3) {\n            if (loc1 < loc3) {\n                int m1 = *std::min_element(treeLevels.begin()+loc1, treeLevels.begin()+loc3);\n                int m2 = *std::min_element(treeLevels.begin()+loc3, treeLevels.begin()+loc2);\n                if (m1 < m2) { if (D3 >= 0) arrangement = P3isTrios0_Dpositive; else arrangement = P3isTrios0_Dnegative; }\n                else { if (D2 >= 0) arrangement = P3isTrios1_Dpositive; else arrangement = P3isTrios1_Dnegative;}\n                \n            } else {\n                int m1 = *std::min_element(treeLevels.begin()+loc2, treeLevels.begin()+loc3);\n                int m2 = *std::min_element(treeLevels.begin()+loc3, treeLevels.begin()+loc1);\n                if (m1 < m2) { if (D2 >= 0) arrangement = 
P3isTrios1_Dpositive; else arrangement = P3isTrios1_Dnegative;}\n                else { if (D3 >= 0) arrangement = P3isTrios0_Dpositive; else arrangement = P3isTrios0_Dnegative; }\n            }\n        }\n        return arrangement;\n    }\n    \n    void assignBBAAarrangement() {\n        // Find which topology is in agreement with the counts of the BBAA, BABA, and ABBA patterns\n        if (BBAAtotal >= BABAtotal && BBAAtotal >= ABBAtotal) {\n            if (D1 >= 0) BBAAarrangement = P3isTrios2_Dpositive; else BBAAarrangement = P3isTrios2_Dnegative;\n        } else if (BABAtotal >= BBAAtotal && BABAtotal >= ABBAtotal) {\n            if (D2 >= 0) BBAAarrangement = P3isTrios1_Dpositive; else BBAAarrangement = P3isTrios1_Dnegative;\n        } else if (ABBAtotal >= BBAAtotal && ABBAtotal >= BABAtotal) {\n            if (D3 >= 0) BBAAarrangement = P3isTrios0_Dpositive; else BBAAarrangement = P3isTrios0_Dnegative;\n        }\n    }\n    \n    \n    std::vector<string> makeOutVec(const std::vector<string>& trio, const bool fStats, const bool KS_test, const int arrangement) {\n        \n        int numCols = 9; if (fStats) numCols++; if(KS_test) numCols = numCols + 2;\n        std::vector<string> outVec; outVec.resize(numCols);\n        int patternsI = 6; if (fStats) patternsI++; if(KS_test) patternsI = patternsI + 2; // Where will the BBAA, ABBA, etc. 
counts be put\n        \n        double KSpval; double KSpvalStrongSitesOnly;\n        if (KS_test) {\n            KSpval = testIfStrongSitesUniformlyDistributed(arrangement);\n            if (KSpval < 2.3e-16) KSpval = 2.3e-16;\n            KSpvalStrongSitesOnly = testIfStrongSitesUniformlyDistributed(arrangement, \"strongSitesOnly\");\n            if (KSpvalStrongSitesOnly < 2.3e-16) KSpvalStrongSitesOnly = 2.3e-16;\n        }\n        \n        if (std::fabs(D1_p) < 2.3e-16) { D1_p = 2.3e-16; }\n        if (std::fabs(D2_p) < 2.3e-16) { D2_p = 2.3e-16; }\n        if (std::fabs(D3_p) < 2.3e-16) { D3_p = 2.3e-16; }\n        \n        double f4ratio; double Dnum;\n        \n        switch (arrangement) {\n                \n        case P3isTrios2_Dpositive:\n            outVec[0] = trio[0]; outVec[1] = trio[1]; outVec[2] = trio[2];\n            outVec[3] = numToString(D1); outVec[4] = numToString(D1_Z);\n            outVec[5] = numToString(D1_p); outVec[patternsI] = numToString(BBAAtotal);\n            outVec[patternsI+1] = numToString(ABBAtotal); outVec[patternsI+2] = numToString(BABAtotal);\n            if (fStats) { Dnum = ABBAtotal-BABAtotal; f4ratio = Dnum/F_G_denom1; }\n            break;\n                \n        case P3isTrios2_Dnegative:\n            outVec[0] = trio[1]; outVec[1] = trio[0]; outVec[2] = trio[2];\n            outVec[3] = numToString(std::fabs(D1)); outVec[4] = numToString(D1_Z);\n            outVec[5] = numToString(D1_p); outVec[patternsI] = numToString(BBAAtotal);\n            outVec[patternsI+1] = numToString(BABAtotal); outVec[patternsI+2] = numToString(ABBAtotal);\n            if (fStats) { Dnum = BABAtotal-ABBAtotal; f4ratio = Dnum/F_G_denom1_reversed; }\n            break;\n                \n        case P3isTrios1_Dpositive:\n            outVec[0] = trio[0]; outVec[1] = trio[2];\n            outVec[2] = trio[1]; outVec[3] = numToString(std::fabs(D2)); outVec[4] = numToString(D2_Z);\n            outVec[5] = numToString(D2_p); 
outVec[patternsI] = numToString(BABAtotal);\n            outVec[patternsI+1] = numToString(ABBAtotal); outVec[patternsI+2] = numToString(BBAAtotal);\n            if (fStats) { Dnum = ABBAtotal - BBAAtotal; f4ratio = Dnum/F_G_denom2; }\n            break;\n        \n        case P3isTrios1_Dnegative:\n            outVec[0] = trio[2]; outVec[1] = trio[0];\n            outVec[2] = trio[1]; outVec[3] = numToString(std::fabs(D2)); outVec[4] = numToString(D2_Z);\n            outVec[5] = numToString(D2_p); outVec[patternsI] = numToString(BABAtotal);\n            outVec[patternsI+1] = numToString(BBAAtotal); outVec[patternsI+2] = numToString(ABBAtotal);\n            if (fStats) { Dnum = BBAAtotal - ABBAtotal; f4ratio = Dnum/F_G_denom2_reversed; }\n            break;\n                \n        case P3isTrios0_Dpositive:\n            outVec[0] = trio[2]; outVec[1] = trio[1];\n            outVec[2] = trio[0]; outVec[3] = numToString(std::fabs(D3)); outVec[4] = numToString(D3_Z);\n            outVec[5] = numToString(D3_p); outVec[patternsI] = numToString(ABBAtotal);\n            outVec[patternsI+1] = numToString(BBAAtotal); outVec[patternsI+2] = numToString(BABAtotal);\n            if (fStats) { Dnum = BBAAtotal - BABAtotal; f4ratio = Dnum/F_G_denom3; }\n            break;\n        \n        case P3isTrios0_Dnegative:\n            outVec[0] = trio[1]; outVec[1] = trio[2];\n            outVec[2] = trio[0]; outVec[3] = numToString(std::fabs(D3)); outVec[4] = numToString(D3_Z);\n            outVec[5] = numToString(D3_p); outVec[patternsI] = numToString(ABBAtotal);\n            outVec[patternsI+1] = numToString(BABAtotal); outVec[patternsI+2] = numToString(BBAAtotal);\n            if (fStats) { Dnum = BABAtotal - BBAAtotal; f4ratio = Dnum/F_G_denom3_reversed; }\n            break;\n                \n        }\n        \n        /* investigating rare cases of unexpected f4-ratio values\n        if (f4ratio < 0 || f4ratio > 1) {\n            std::cerr << \"trio[0]: \" << trio[0] << 
\"trio[1]: \" << trio[1] << \"trio[2]: \" << trio[2] << std::endl;\n            std::cerr << \"Arrangement: \" << arrangement << std::endl;\n            std::cerr << \"D1: \" << D1 << \" D2: \" << D2  << \" D3: \" << D3 << std::endl;\n            std::cerr << \"ABBAtotal: \" << ABBAtotal << \"BBAAtotal: \" << BBAAtotal  << \"BABAtotal: \" << BABAtotal << std::endl;\n            std::cerr << \"ABBAtotal-BABAtotal: \" << ABBAtotal-BABAtotal << \"ABBAtotal-BBAAtotal: \" << ABBAtotal- BBAAtotal  << \"BBAAtotal-BABAtotal: \" << BBAAtotal-BABAtotal << std::endl;\n            std::cerr << \"F_G_denom1: \" << F_G_denom1 << \"; F_G_denom1_reversed: \" << F_G_denom1_reversed << std::endl;\n            std::cerr << \"F_G_denom2: \" << F_G_denom2 << \"; F_G_denom2_reversed: \" << F_G_denom2_reversed << std::endl;\n            std::cerr << \"F_G_denom3: \" << F_G_denom3 << \"; F_G_denom3_reversed: \" << F_G_denom3_reversed << std::endl;\n            std::cerr << \"f4ratio: \" << f4ratio << std::endl;\n        }\n        */\n        \n        \n        if (fStats) { // For now just bounding those unexpected cases to the 0 to 1 range\n            if (f4ratio < 0) { f4ratio = 0; }\n            if (f4ratio > 1) { f4ratio = 1; }\n            outVec[6] = numToString(f4ratio);\n        }\n        if (KS_test) {\n            if (fStats) { outVec[7] = numToString(KSpval); outVec[8] = numToString(KSpvalStrongSitesOnly); }\n            else  { outVec[6] = numToString(KSpval); outVec[7] = numToString(KSpvalStrongSitesOnly); }\n        }\n        \n        return outVec;\n    }\n    \n    void assignDminArrangement() {\n        \n        if (std::fabs(D1) <= std::fabs(D2) && std::fabs(D1) <= std::fabs(D3)) {\n            if (D1 >= 0) DminArrangement = P3isTrios2_Dpositive; else DminArrangement = P3isTrios2_Dnegative;\n        } else if (std::fabs(D2) <= std::fabs(D1) && std::fabs(D2) <= std::fabs(D3)) { // (P3 == S2)\n            if (D2 >= 0) DminArrangement = P3isTrios1_Dpositive; else 
DminArrangement = P3isTrios1_Dnegative;\n        } else if (std::fabs(D3) <= std::fabs(D1) && std::fabs(D3) <= std::fabs(D2)) { // (P3 == S1)\n            if (D3 >= 0) DminArrangement = P3isTrios0_Dpositive; else DminArrangement = P3isTrios0_Dnegative;\n        }\n        \n    }\n    \n    \n    \n    void calculateFinalDs() {\n        double* Ds = calculateThreeDs(ABBAtotal, BABAtotal, BBAAtotal);\n        D1 = Ds[0]; D2 = Ds[1]; D3 = Ds[2];\n        \n        // Get the standard error values:\n        double D1stdErr = jackknive_std_err(regionDs[0]); double D2stdErr = jackknive_std_err(regionDs[1]);\n        double D3stdErr = jackknive_std_err(regionDs[2]);\n        //std::cerr << \"Here: \" << regionDs[2][0] << std::endl;\n        // Get the Z-scores\n        D1_Z = std::fabs(D1)/D1stdErr; D2_Z = std::fabs(D2)/D2stdErr; D3_Z = std::fabs(D3)/D3stdErr;\n        // And p-values\n        D1_p = 2 * (1 - normalCDF(D1_Z)); D2_p = 2 * (1 - normalCDF(D2_Z));\n        D3_p = 2 * (1 - normalCDF(D3_Z));\n    }\n    \n    double testIfStrongSitesUniformlyDistributed(int arrangement, string type = \"allSites\") {\n        std::vector<int> linearStrongPosVector;\n        \n        switch (arrangement) {\n            case P3isTrios2_Dpositive:\n                if (type == \"allSites\") {\n                    linearStrongPosVector.assign(linearStrongABBApos[0].begin(),linearStrongABBApos[0].end());\n                } else if (type == \"strongSitesOnly\") {\n                    linearStrongPosVector.assign(linearStrongABBAposStrongSitesOnly[0].begin(),linearStrongABBAposStrongSitesOnly[0].end());\n                }\n                break;\n            case P3isTrios2_Dnegative:\n                if (type == \"allSites\") {\n                    linearStrongPosVector.assign(linearStrongBABApos[0].begin(),linearStrongBABApos[0].end());\n                } else if (type == \"strongSitesOnly\") {\n                    
linearStrongPosVector.assign(linearStrongBABAposStrongSitesOnly[0].begin(),linearStrongBABAposStrongSitesOnly[0].end());\n                }\n                break;\n            case P3isTrios1_Dpositive:\n                if (type == \"allSites\") {\n                    linearStrongPosVector.assign(linearStrongABBApos[1].begin(),linearStrongABBApos[1].end());\n                } else if (type == \"strongSitesOnly\") {\n                    linearStrongPosVector.assign(linearStrongABBAposStrongSitesOnly[1].begin(),linearStrongABBAposStrongSitesOnly[1].end());\n                }\n                break;\n            case P3isTrios1_Dnegative:\n                if (type == \"allSites\") {\n                    linearStrongPosVector.assign(linearStrongBABApos[1].begin(),linearStrongBABApos[1].end());\n                } else if (type == \"strongSitesOnly\") {\n                    linearStrongPosVector.assign(linearStrongBABAposStrongSitesOnly[1].begin(),linearStrongBABAposStrongSitesOnly[1].end());\n                }\n                break;\n            case P3isTrios0_Dpositive:\n                if (type == \"allSites\") {\n                    linearStrongPosVector.assign(linearStrongABBApos[2].begin(),linearStrongABBApos[2].end());\n                } else if (type == \"strongSitesOnly\") {\n                    linearStrongPosVector.assign(linearStrongABBAposStrongSitesOnly[2].begin(),linearStrongABBAposStrongSitesOnly[2].end());\n                }\n                break;\n            case P3isTrios0_Dnegative:\n                if (type == \"allSites\") {\n                    linearStrongPosVector.assign(linearStrongBABApos[2].begin(),linearStrongBABApos[2].end());\n                } else if (type == \"strongSitesOnly\") {\n                    linearStrongPosVector.assign(linearStrongBABAposStrongSitesOnly[2].begin(),linearStrongBABAposStrongSitesOnly[2].end());\n                }\n                break;\n        }\n        \n        if(linearStrongPosVector.size() < 2) {\n  
         // KSpvalForStrongSites = 1;\n            return 1.0;\n        } else {\n        \n            std::vector<double> linearStrongPosVector0to1(linearStrongPosVector.size(),0.0);\n            for (int i = 0; i < linearStrongPosVector0to1.size(); i++) {\n                linearStrongPosVector0to1[i] = (double)linearStrongPosVector[i]/(double)linearStrongPosVector.back();\n            }\n           // print_vector(linearStrongPosVector0to1, std::cout, ',');\n            \n            double KSpvalForStrongSitesOneSample = ks_test_of_uniformity(linearStrongPosVector0to1, std::cerr, false);\n            return KSpvalForStrongSitesOneSample;\n        }\n    }\n    \n    \n    \n    void addRegionDs(const int arrangement) {\n        switch (arrangement) {\n            case P3isTrios2: if(localD1denom > 0) regionDs[0].push_back(localD1num/localD1denom); localD1num = 0; localD1denom = 0; usedVars[0] = 0; break;\n            case P3isTrios1: if(localD2denom > 0) regionDs[1].push_back(localD2num/localD2denom); localD2num = 0; localD2denom = 0; usedVars[1] = 0; break;\n            case P3isTrios0: if(localD3denom > 0) regionDs[2].push_back(localD3num/localD3denom); localD3num = 0; localD3denom = 0; usedVars[2] = 0; break;\n        }\n    }\n    \n};\n\n//  P3 \\     / P2\n//      -----\n//  P4 /     \\ P1\n\n\n/*\n Quartets are unrooted, so there are three arrangements\n (P1,P2)(P3,P4)   // BBAA (AABB) highest\n (P1,P3)(P2,P4)   // BABA (ABAB) highest\n (P1,P4)(P2,P3)   // ABBA (BAAB) highest\n \n */\nclass QuartetDinfo: public TrioDinfo {\npublic:\n    \n    QuartetDinfo() {\n        TrioDinfo();\n        F_G_denoms.resize(24);\n    }\n    \n    std::vector<double> F_G_denoms;\n    \n    int assignQuartetTreeArrangement(const std::vector<int>& treeLevels, const int loc1, const int loc2, const int loc3, const int loc4) {\n        int firstThreeArranged = assignTreeArrangement(treeLevels, loc1, loc2, loc3);\n        int withFourthArranged = 0; int overallTreeArrangment = 
0;\n        switch (firstThreeArranged) {\n            case P3isTrios2_Dpositive:\n            case P3isTrios2_Dnegative:\n                withFourthArranged = assignTreeArrangement(treeLevels, loc1, loc2, loc4);\n                switch (withFourthArranged) {\n                    case P3isTrios2_Dpositive:\n                    case P3isTrios2_Dnegative:\n                        overallTreeArrangment = P3isTrios2; break;\n                    case P3isTrios1_Dpositive:\n                    case P3isTrios1_Dnegative:\n                        overallTreeArrangment = P3isTrios0; break;\n                    case P3isTrios0_Dpositive:\n                    case P3isTrios0_Dnegative:\n                        overallTreeArrangment = P3isTrios1; break;\n                } break;\n            case P3isTrios1_Dpositive:\n            case P3isTrios1_Dnegative:\n                withFourthArranged = assignTreeArrangement(treeLevels, loc1, loc3, loc4);\n                switch (withFourthArranged) {\n                    case P3isTrios2_Dpositive:\n                    case P3isTrios2_Dnegative:\n                        overallTreeArrangment = P3isTrios1; break;\n                    case P3isTrios1_Dpositive:\n                    case P3isTrios1_Dnegative:\n                        overallTreeArrangment = P3isTrios0; break;\n                    case P3isTrios0_Dpositive:\n                    case P3isTrios0_Dnegative:\n                        overallTreeArrangment = P3isTrios2; break;\n                } break;\n            case P3isTrios0_Dpositive:\n            case P3isTrios0_Dnegative:\n            withFourthArranged = assignTreeArrangement(treeLevels, loc2, loc3, loc4);\n            switch (withFourthArranged) {\n                case P3isTrios2_Dpositive:\n                case P3isTrios2_Dnegative:\n                    overallTreeArrangment = P3isTrios0; break;\n                case P3isTrios1_Dpositive:\n                case P3isTrios1_Dnegative:\n                    
overallTreeArrangment = P3isTrios1; break;\n                case P3isTrios0_Dpositive:\n                case P3isTrios0_Dnegative:\n                    overallTreeArrangment = P3isTrios2; break;\n            } break;\n        }\n        return overallTreeArrangment;\n    }\n    \n    \n    std::vector<string> makeOutVec(const std::vector<string>& quartet, const bool fStats, const int arrangement, bool allF4 = false) {\n        \n        int vecSize = 10; if (fStats) vecSize++; if (allF4) vecSize += 4;\n        std::vector<string> outVec; outVec.resize(vecSize);\n        int patternsI = 7; if (fStats) patternsI++; // Where will be put the BBAA, ABBA, etc. counts\n        int allF4Pos = patternsI + 3;\n        \n        switch (arrangement) {\n            \n        case P3isTrios2: // Orientation 1\n        case P3isTrios2_Dpositive:\n        case P3isTrios2_Dnegative:\n            outVec[2] = quartet[2]; outVec[3] = quartet[3];\n            outVec[4] = numToString(std::fabs(D1)); outVec[5] = numToString(D1_Z);\n            outVec[6] = numToString(D1_p); outVec[patternsI] = numToString(BBAAtotal);\n            if (D1 >= 0) {\n                outVec[0] = quartet[0]; outVec[1] = quartet[1];\n                outVec[patternsI+1] = numToString(ABBAtotal); outVec[patternsI+2] = numToString(BABAtotal);\n                if (fStats) {\n                    double Dnum = ABBAtotal-BABAtotal;\n                    outVec[7] = numToString(Dnum/F_G_denom1);\n                   \n                }\n                if (allF4) {\n                    outVec[allF4Pos] = numToString(F_G_denoms[0]); outVec[allF4Pos+1] = numToString(F_G_denoms[1]);\n                    outVec[allF4Pos+2] = numToString(F_G_denoms[2]); outVec[allF4Pos+3] = numToString(F_G_denoms[3]);\n                }\n            } else {\n                outVec[0] = quartet[1]; outVec[1] = quartet[0];\n                outVec[patternsI+1] = numToString(BABAtotal); outVec[patternsI+2] = numToString(ABBAtotal);\n             
   if (fStats) {\n                    double Dnum = BABAtotal-ABBAtotal;\n                    outVec[7] = numToString(Dnum/F_G_denom1_reversed);\n                }\n                if (allF4) {\n                    outVec[allF4Pos] = numToString(F_G_denoms[4]); outVec[allF4Pos+1] = numToString(F_G_denoms[5]);\n                    outVec[allF4Pos+2] = numToString(F_G_denoms[6]); outVec[allF4Pos+3] = numToString(F_G_denoms[7]);\n                }\n            }\n        break;\n                \n        case P3isTrios1: // Orientation 2\n        case P3isTrios1_Dpositive:\n        case P3isTrios1_Dnegative:\n            outVec[2] = quartet[1]; outVec[3] = quartet[3];\n            outVec[4] = numToString(std::fabs(D2)); outVec[5] = numToString(D2_Z);\n            outVec[6] = numToString(D2_p); outVec[patternsI] = numToString(BABAtotal);\n            if (D2 >= 0) {\n                outVec[0] = quartet[0]; outVec[1] = quartet[2];\n                outVec[patternsI+1] = numToString(ABBAtotal); outVec[patternsI+2] = numToString(BBAAtotal);\n                if (fStats) {\n                    double Dnum = ABBAtotal - BBAAtotal;\n                    outVec[7] = numToString(Dnum/F_G_denom2);\n                }\n                if (allF4) {\n                    outVec[allF4Pos] = numToString(F_G_denoms[8]); outVec[allF4Pos+1] = numToString(F_G_denoms[9]);\n                    outVec[allF4Pos+2] = numToString(F_G_denoms[10]); outVec[allF4Pos+3] = numToString(F_G_denoms[11]);\n                }\n            } else {\n                outVec[0] = quartet[2]; outVec[1] = quartet[0];\n                outVec[patternsI+1] = numToString(BBAAtotal); outVec[patternsI+2] = numToString(ABBAtotal);\n                if (fStats) {\n                    double Dnum = BBAAtotal - ABBAtotal;\n                    outVec[7] = numToString(Dnum/F_G_denom2_reversed);\n                }\n                if (allF4) {\n                    outVec[allF4Pos] = numToString(F_G_denoms[12]); outVec[allF4Pos+1] 
= numToString(F_G_denoms[13]);\n                    outVec[allF4Pos+2] = numToString(F_G_denoms[14]); outVec[allF4Pos+3] = numToString(F_G_denoms[15]);\n                }\n            }\n        break;\n                \n        case P3isTrios0: // Orientation 3\n        case P3isTrios0_Dpositive:\n        case P3isTrios0_Dnegative:\n            outVec[2] = quartet[0]; outVec[3] = quartet[3];\n            outVec[4] = numToString(std::fabs(D3)); outVec[5] = numToString(D3_Z);\n            outVec[6] = numToString(D3_p); outVec[patternsI] = numToString(ABBAtotal);\n            if (D3 >= 0) {\n                outVec[0] = quartet[2]; outVec[1] = quartet[1];\n                outVec[patternsI+1] = numToString(BBAAtotal); outVec[patternsI+2] = numToString(BABAtotal);\n                if (fStats) {\n                    double Dnum = BBAAtotal - BABAtotal;\n                    outVec[7] = numToString(Dnum/F_G_denom3);\n                }\n                if (allF4) {\n                    outVec[allF4Pos] = numToString(F_G_denoms[16]); outVec[allF4Pos+1] = numToString(F_G_denoms[17]);\n                    outVec[allF4Pos+2] = numToString(F_G_denoms[18]); outVec[allF4Pos+3] = numToString(F_G_denoms[19]);\n                }\n            } else {\n                outVec[0] = quartet[1]; outVec[1] = quartet[2];\n                outVec[patternsI+1] = numToString(BABAtotal); outVec[patternsI+2] = numToString(BBAAtotal);\n                if (fStats) {\n                    double Dnum = BABAtotal - BBAAtotal;\n                    outVec[7] = numToString(Dnum/F_G_denom3_reversed);\n                }\n            }\n            if (allF4) {\n                outVec[allF4Pos] = numToString(F_G_denoms[20]); outVec[allF4Pos+1] = numToString(F_G_denoms[21]);\n                outVec[allF4Pos+2] = numToString(F_G_denoms[22]); outVec[allF4Pos+3] = numToString(F_G_denoms[23]);\n            }\n        break;\n        }\n        return outVec;\n    }\n    \n};\n\n\n#endif /* Dsuite_utils_h */\n\n"
  },
  {
    "path": "KolmogorovSmirnovDist.cpp",
    "content": "//\n//  KolmogorovSmirnovDist.cpp\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 30.10.22.\n//\n\n/********************************************************************\n *\n * File:          KolmogorovSmirnovDist.c\n * Environment:   ISO C99 or ANSI C89\n * Author:        Richard Simard\n * Organization:  DIRO, Université de Montréal\n * Version:       10 december 2010\n\n * Copyright 1 march 2010 by Université de Montréal,\n                             Richard Simard and Pierre L'Ecuyer\n =====================================================================\n\n    This program is free software: you can redistribute it and/or modify\n    it under the terms of the GNU General Public License as published by\n    the Free Software Foundation, version 3 of the License.\n\n    This program is distributed in the hope that it will be useful,\n    but WITHOUT ANY WARRANTY; without even the implied warranty of\n    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n    GNU General Public License for more details.\n\n    You should have received a copy of the GNU General Public License\n    along with this program.  If not, see <http://www.gnu.org/licenses/>.\n\n =====================================================================*/\n\n#include \"KolmogorovSmirnovDist.hpp\"\n#include <math.h>\n#include <stdlib.h>\n\n#define num_Pi     3.14159265358979323846 /* PI */\n#define num_Ln2    0.69314718055994530941 /* log(2) */\n\n/* For x close to 0 or 1, we use the exact formulae of Ruben-Gambino in all\n   cases. For n <= NEXACT, we use exact algorithms: the Durbin matrix and\n   the Pomeranz algorithms. For n > NEXACT, we use asymptotic methods\n   except for x close to 0 where we still use the method of Durbin\n   for n <= NKOLMO. 
For n > NKOLMO, we use asymptotic methods only and\n   so the precision is less for x close to 0.\n   We could increase the limit NKOLMO to 10^6 to get better precision\n   for x close to 0, but at the price of a slower speed. */\n#define NEXACT 140\n#define NKOLMO 100000\n\n/* The Durbin matrix algorithm for the Kolmogorov-Smirnov distribution */\nstatic double DurbinMatrix (int n, double d);\n\n\n/*========================================================================*/\n#if 0\n\n/* For ANSI C89 only, not for ISO C99 */\n#define MAXI 50\n#define EPSILON 1.0e-15\n\ndouble log1p(double x)\n{\n   /* returns a value equivalent to log(1 + x) accurate also for small x. */\n   if (fabs (x) > 0.1) {\n      return log (1.0 + x);\n   } else {\n      double term = x;\n      double sum = x;\n      int s = 2;\n      while ((fabs (term) > EPSILON * fabs (sum)) && (s < MAXI)) {\n         term *= -x;\n         sum += term / s;\n         s++;\n      }\n      return sum;\n   }\n}\n\n#undef MAXI\n#undef EPSILON\n\n#endif\n\n/*========================================================================*/\n#define NFACT 20\n\n/* The factorial n! for  0 <= n <= NFACT */\nstatic double Factorial[NFACT + 1] = {\n   1,\n   1,\n   2,\n   6,\n   24,\n   120,\n   720,\n   5040,\n   40320,\n   362880,\n   3628800,\n   39916800,\n   479001600.,\n   6227020800.,\n   87178291200.,\n   1307674368000.,\n   20922789888000.,\n   355687428096000.,\n   6402373705728000.,\n   1.21645100408832e+17,\n   2.43290200817664e+18\n};\n\n\n/*========================================================================*/\n#define MFACT 30\n\n/* The natural logarithm of factorial n! 
for  0 <= n <= MFACT */\nstatic double LnFactorial[MFACT + 1] = {\n   0.,\n   0.,\n   0.6931471805599453,\n   1.791759469228055,\n   3.178053830347946,\n   4.787491742782046,\n   6.579251212010101,\n   8.525161361065415,\n   10.60460290274525,\n   12.80182748008147,\n   15.10441257307552,\n   17.50230784587389,\n   19.98721449566188,\n   22.55216385312342,\n   25.19122118273868,\n   27.89927138384088,\n   30.67186010608066,\n   33.50507345013688,\n   36.39544520803305,\n   39.33988418719949,\n   42.33561646075348,\n   45.3801388984769,\n   48.47118135183522,\n   51.60667556776437,\n   54.7847293981123,\n   58.00360522298051,\n   61.26170176100199,\n   64.55753862700632,\n   67.88974313718154,\n   71.257038967168,\n   74.65823634883016\n};\n\n/*------------------------------------------------------------------------*/\n\nstatic double getLogFactorial (int n)\n{\n   /* Returns the natural logarithm of factorial n! */\n   if (n <= MFACT) {\n      return LnFactorial[n];\n\n   } else {\n      double x = (double) (n + 1);\n      double y = 1.0 / (x * x);\n      double z = ((-(5.95238095238E-4 * y) + 7.936500793651E-4) * y -\n         2.7777777777778E-3) * y + 8.3333333333333E-2;\n      z = ((x - 0.5) * log (x) - x) + 9.1893853320467E-1 + z / x;\n      return z;\n   }\n}\n\n\n/*========================================================================*/\n\nstatic double **CreateMatrixD (int N, int M)\n{\n   int i;\n   double **T2;\n\n   T2 = (double **) malloc (N * sizeof (double *));\n   T2[0] = (double *) malloc (N * M * sizeof (double));\n   for (i = 1; i < N; i++)\n      T2[i] = T2[0] + i * M;\n   return T2;\n}\n\n\nstatic void DeleteMatrixD (double **T)\n{\n   free (T[0]);\n   free (T);\n}\n\n\n/*========================================================================*/\n\nstatic double KSPlusbarAsymp (int n, double x)\n{\n   /* Compute the probability of the KS+ distribution using an asymptotic\n      formula */\n   double t = (6.0 * n * x + 1);\n   double z = t * t / 
(18.0 * n);\n   double v = 1.0 - (2.0 * z * z - 4.0 * z - 1.0) / (18.0 * n);\n   if (v <= 0.0)\n      return 0.0;\n   v = v * exp (-z);\n   if (v >= 1.0)\n      return 1.0;\n   return v;\n}\n\n\n/*-------------------------------------------------------------------------*/\n\nstatic double KSPlusbarUpper (int n, double x)\n{\n   /* Compute the probability of the KS+ distribution in the upper tail\n      using Smirnov's stable formula */\n   const double EPSILON = 1.0E-12;\n   double q;\n   double Sum = 0.0;\n   double term;\n   double t;\n   double LogCom;\n   double LOGJMAX;\n   int j;\n   int jdiv;\n   int jmax = (int) (n * (1.0 - x));\n\n   if (n > 200000)\n      return KSPlusbarAsymp (n, x);\n\n   /* Avoid log(0) for j = jmax and q ~ 1.0 */\n   if ((1.0 - x - (double) jmax / n) <= 0.0)\n      jmax--;\n\n   if (n > 3000)\n      jdiv = 2;\n   else\n      jdiv = 3;\n\n   j = jmax / jdiv + 1;\n   LogCom = getLogFactorial (n) - getLogFactorial (j) -\n            getLogFactorial (n - j);\n   LOGJMAX = LogCom;\n\n   while (j <= jmax) {\n      q = (double) j / n + x;\n      term = LogCom + (j - 1) * log (q) + (n - j) * log1p (-q);\n      t = exp (term);\n      Sum += t;\n      LogCom += log ((double) (n - j) / (j + 1));\n      if (t <= Sum * EPSILON)\n         break;\n      j++;\n   }\n\n   j = jmax / jdiv;\n   LogCom = LOGJMAX + log ((double) (j + 1) / (n - j));\n\n   while (j > 0) {\n      q = (double) j / n + x;\n      term = LogCom + (j - 1) * log (q) + (n - j) * log1p (-q);\n      t = exp (term);\n      Sum += t;\n      LogCom += log ((double) j / (n - j + 1));\n      if (t <= Sum * EPSILON)\n         break;\n      j--;\n   }\n\n   Sum *= x;\n   /* add the term j = 0 */\n   Sum += exp (n * log1p (-x));\n   return Sum;\n}\n\n\n/*========================================================================*/\n\nstatic double Pelz (int n, double x)\n{\n   /* Approximating the Lower Tail-Areas of the Kolmogorov-Smirnov One-Sample\n      Statistic,\n      Wolfgang Pelz and 
I. J. Good,\n      Journal of the Royal Statistical Society, Series B.\n      Vol. 38, No. 2 (1976), pp. 152-156\n   */\n\n   const int JMAX = 20;\n   const double EPS = 1.0e-10;\n   const double C = 2.506628274631001;         /* sqrt(2*Pi) */\n   const double C2 = 1.2533141373155001;       /* sqrt(Pi/2) */\n   const double PI2 = num_Pi * num_Pi;\n   const double PI4 = PI2 * PI2;\n   const double RACN = sqrt ((double) n);\n   const double z = RACN * x;\n   const double z2 = z * z;\n   const double z4 = z2 * z2;\n   const double z6 = z4 * z2;\n   const double w = PI2 / (2.0 * z * z);\n   double ti, term, tom;\n   double sum;\n   int j;\n\n   term = 1;\n   j = 0;\n   sum = 0;\n   while (j <= JMAX && term > EPS * sum) {\n      ti = j + 0.5;\n      term = exp (-ti * ti * w);\n      sum += term;\n      j++;\n   }\n   sum *= C / z;\n\n   term = 1;\n   tom = 0;\n   j = 0;\n   while (j <= JMAX && fabs (term) > EPS * fabs (tom)) {\n      ti = j + 0.5;\n      term = (PI2 * ti * ti - z2) * exp (-ti * ti * w);\n      tom += term;\n      j++;\n   }\n   sum += tom * C2 / (RACN * 3.0 * z4);\n\n   term = 1;\n   tom = 0;\n   j = 0;\n   while (j <= JMAX && fabs (term) > EPS * fabs (tom)) {\n      ti = j + 0.5;\n      term = 6 * z6 + 2 * z4 + PI2 * (2 * z4 - 5 * z2) * ti * ti +\n         PI4 * (1 - 2 * z2) * ti * ti * ti * ti;\n      term *= exp (-ti * ti * w);\n      tom += term;\n      j++;\n   }\n   sum += tom * C2 / (n * 36.0 * z * z6);\n\n   term = 1;\n   tom = 0;\n   j = 1;\n   while (j <= JMAX && term > EPS * tom) {\n      ti = j;\n      term = PI2 * ti * ti * exp (-ti * ti * w);\n      tom += term;\n      j++;\n   }\n   sum -= tom * C2 / (n * 18.0 * z * z2);\n\n   term = 1;\n   tom = 0;\n   j = 0;\n   while (j <= JMAX && fabs (term) > EPS * fabs (tom)) {\n      ti = j + 0.5;\n      ti = ti * ti;\n      term = -30 * z6 - 90 * z6 * z2 + PI2 * (135 * z4 - 96 * z6) * ti +\n         PI4 * (212 * z4 - 60 * z2) * ti * ti + PI2 * PI4 * ti * ti * ti * (5 -\n         30 * z2);\n      
term *= exp (-ti * w);\n      tom += term;\n      j++;\n   }\n   sum += tom * C2 / (RACN * n * 3240.0 * z4 * z6);\n\n   term = 1;\n   tom = 0;\n   j = 1;\n   while (j <= JMAX && fabs (term) > EPS * fabs (tom)) {\n      ti = j * j;\n      term = (3 * PI2 * ti * z2 - PI4 * ti * ti) * exp (-ti * w);\n      tom += term;\n      j++;\n   }\n   sum += tom * C2 / (RACN * n * 108.0 * z6);\n\n   return sum;\n}\n\n\n/*=========================================================================*/\n\nstatic void CalcFloorCeil (\n   int n,                         /* sample size */\n   double t,                      /* = nx */\n   double *A,                     /* A_i */\n   double *Atflo,                 /* floor (A_i - t) */\n   double *Atcei                  /* ceiling (A_i + t) */\n   )\n{\n   /* Precompute A_i, floors, and ceilings for limits of sums in the\n      Pomeranz algorithm */\n   int i;\n   int ell = (int) t;             /* floor (t) */\n   double z = t - ell;            /* t - floor (t) */\n   double w = ceil (t) - t;\n\n   if (z > 0.5) {\n      for (i = 2; i <= 2 * n + 2; i += 2)\n         Atflo[i] = i / 2 - 2 - ell;\n      for (i = 1; i <= 2 * n + 2; i += 2)\n         Atflo[i] = i / 2 - 1 - ell;\n\n      for (i = 2; i <= 2 * n + 2; i += 2)\n         Atcei[i] = i / 2 + ell;\n      for (i = 1; i <= 2 * n + 2; i += 2)\n         Atcei[i] = i / 2 + 1 + ell;\n\n   } else if (z > 0.0) {\n      for (i = 1; i <= 2 * n + 2; i++)\n         Atflo[i] = i / 2 - 1 - ell;\n\n      for (i = 2; i <= 2 * n + 2; i++)\n         Atcei[i] = i / 2 + ell;\n      Atcei[1] = 1 + ell;\n\n   } else {                       /* z == 0 */\n      for (i = 2; i <= 2 * n + 2; i += 2)\n         Atflo[i] = i / 2 - 1 - ell;\n      for (i = 1; i <= 2 * n + 2; i += 2)\n         Atflo[i] = i / 2 - ell;\n\n      for (i = 2; i <= 2 * n + 2; i += 2)\n         Atcei[i] = i / 2 - 1 + ell;\n      for (i = 1; i <= 2 * n + 2; i += 2)\n         Atcei[i] = i / 2 + ell;\n   }\n\n   if (w < z)\n      z = w;\n   A[0] = 
A[1] = 0;\n   A[2] = z;\n   A[3] = 1 - A[2];\n   for (i = 4; i <= 2 * n + 1; i++)\n      A[i] = A[i - 2] + 1;\n   A[2 * n + 2] = n;\n}\n\n\n/*========================================================================*/\n\nstatic double Pomeranz (int n, double x)\n{\n   /* The Pomeranz algorithm to compute the KS distribution */\n   const double EPS = 1.0e-15;\n   const int ENO = 350;\n   const double RENO = ldexp (1.0, ENO); /* for renormalization of V */\n   int coreno;                    /* counter: how many renormalizations */\n   const double t = n * x;\n   double w, sum, minsum;\n   int i, j, k, s;\n   int r1, r2;                    /* Indices i and i-1 for V[i][] */\n   int jlow, jup, klow, kup, kup0;\n   double *A;\n   double *Atflo;\n   double *Atcei;\n   double **V;\n   double **H;                    /* = pow(w, j) / Factorial(j) */\n\n   A = (double *) calloc ((size_t) (2 * n + 3), sizeof (double));\n   Atflo = (double *) calloc ((size_t) (2 * n + 3), sizeof (double));\n   Atcei = (double *) calloc ((size_t) (2 * n + 3), sizeof (double));\n   V = (double **) CreateMatrixD (2, n + 2);\n   H = (double **) CreateMatrixD (4, n + 2);\n\n   CalcFloorCeil (n, t, A, Atflo, Atcei);\n\n   for (j = 1; j <= n + 1; j++)\n      V[0][j] = 0;\n   for (j = 2; j <= n + 1; j++)\n      V[1][j] = 0;\n   V[1][1] = RENO;\n   coreno = 1;\n\n   /* Precompute H[][] = (A[j] - A[j-1])^k / k! 
for speed */\n   H[0][0] = 1;\n   w = 2.0 * A[2] / n;\n   for (j = 1; j <= n + 1; j++)\n      H[0][j] = w * H[0][j - 1] / j;\n\n   H[1][0] = 1;\n   w = (1.0 - 2.0 * A[2]) / n;\n   for (j = 1; j <= n + 1; j++)\n      H[1][j] = w * H[1][j - 1] / j;\n\n   H[2][0] = 1;\n   w = A[2] / n;\n   for (j = 1; j <= n + 1; j++)\n      H[2][j] = w * H[2][j - 1] / j;\n\n   H[3][0] = 1;\n   for (j = 1; j <= n + 1; j++)\n      H[3][j] = 0;\n\n   r1 = 0;\n   r2 = 1;\n   for (i = 2; i <= 2 * n + 2; i++) {\n      jlow = 2 + Atflo[i];\n      if (jlow < 1)\n         jlow = 1;\n      jup = Atcei[i];\n      if (jup > n + 1)\n         jup = n + 1;\n\n      klow = 2 + Atflo[i - 1];\n      if (klow < 1)\n         klow = 1;\n      kup0 = Atcei[i - 1];\n\n      /* Find to which case it corresponds */\n      w = (A[i] - A[i - 1]) / n;\n      s = -1;\n      for (j = 0; j < 4; j++) {\n         if (fabs (w - H[j][1]) <= EPS) {\n            s = j;\n            break;\n         }\n      }\n      /* assert (s >= 0, \"Pomeranz: s < 0\"); */\n\n      minsum = RENO;\n      r1 = (r1 + 1) & 1;          /* i - 1 */\n      r2 = (r2 + 1) & 1;          /* i */\n\n      for (j = jlow; j <= jup; j++) {\n         kup = kup0;\n         if (kup > j)\n            kup = j;\n         sum = 0;\n         for (k = kup; k >= klow; k--)\n            sum += V[r1][k] * H[s][j - k];\n         V[r2][j] = sum;\n         if (sum < minsum)\n            minsum = sum;\n      }\n\n      if (minsum < 1.0e-280) {\n         /* V is too small: renormalize to avoid underflow of probabilities */\n         for (j = jlow; j <= jup; j++)\n            V[r2][j] *= RENO;\n         coreno++;                /* keep track of log of RENO */\n      }\n   }\n\n   sum = V[r2][n + 1];\n   free (A);\n   free (Atflo);\n   free (Atcei);\n   DeleteMatrixD (H);\n   DeleteMatrixD (V);\n   w = getLogFactorial (n) - coreno * ENO * num_Ln2 + log (sum);\n   if (w >= 0.)\n      return 1.;\n   return exp 
(w);\n}\n\n\n/*========================================================================*/\n\nstatic double cdfSpecial (int n, double x)\n{\n   /* The KS distribution is known exactly for these cases */\n\n   /* For nx^2 > 18, KSfbar(n, x) is smaller than 5e-16 */\n   if ((n * x * x >= 18.0) || (x >= 1.0))\n      return 1.0;\n\n   if (x <= 0.5 / n)\n      return 0.0;\n\n   if (n == 1)\n      return 2.0 * x - 1.0;\n\n   if (x <= 1.0 / n) {\n      double t = 2.0 * x - 1.0 / n;\n      double w;\n      if (n <= NFACT) {\n         w = Factorial[n];\n         return w * pow (t, (double) n);\n      }\n      w = getLogFactorial (n) + n * log (t);\n      return exp (w);\n   }\n\n   if (x >= 1.0 - 1.0 / n) {\n      return 1.0 - 2.0 * pow (1.0 - x, (double) n);\n   }\n\n   return -1.0;\n}\n\n\n/*========================================================================*/\n\ndouble KScdf (int n, double x)\n{\n   const double w = n * x * x;\n   double u = cdfSpecial (n, x);\n   if (u >= 0.0)\n      return u;\n\n   if (n <= NEXACT) {\n      if (w < 0.754693)\n         return DurbinMatrix (n, x);\n      if (w < 4.0)\n         return Pomeranz (n, x);\n      return 1.0 - KSfbar (n, x);\n   }\n\n   /* if (n * x * sqrt(x) <= 1.4) */\n   if ((w * x * n <= 2.0) && (n <= NKOLMO))\n      return DurbinMatrix(n, x);\n\n   return Pelz(n, x);\n}\n\n\n/*=========================================================================*/\n\nstatic double fbarSpecial (int n, double x)\n{\n   const double w = n * x * x;\n\n   if ((w >= 370.0) || (x >= 1.0))\n      return 0.0;\n   if ((w <= 0.0274) || (x <= 0.5 / n))\n      return 1.0;\n   if (n == 1)\n      return 2.0 - 2.0 * x;\n\n   if (x <= 1.0 / n) {\n      double z;\n      double t = 2.0 * x - 1.0 / n;\n      if (n <= NFACT) {\n         z = Factorial[n];\n         return 1.0 - z * pow (t, (double) n);\n      }\n      z = getLogFactorial (n) + n * log (t);\n      return 1.0 - exp (z);\n   }\n\n   if (x >= 1.0 - 1.0 / n) {\n      return 2.0 * pow (1.0 - 
x, (double) n);\n   }\n   return -1.0;\n}\n\n\n/*========================================================================*/\n\ndouble KSfbar (int n, double x)\n{\n   double w = n * x * x;\n   double v = fbarSpecial (n, x);\n   if (v >= 0.0)\n      return v;\n\n   if (n <= NEXACT) {\n      if (w < 4.0)\n         return 1.0 - KScdf (n, x);\n      else\n         return 2.0 * KSPlusbarUpper (n, x);\n   }\n\n   if (w >= 2.2)\n      return 2.0 * KSPlusbarUpper (n, x);\n\n   return 1.0 - KScdf (n, x);\n}\n\n\n/*=========================================================================\n\nThe following implements the Durbin matrix algorithm and was programmed by\nG. Marsaglia, Wai Wan Tsang and Jingbo Wong.\n\nI have made small modifications in their program. (Richard Simard)\n\n\n\n=========================================================================*/\n\n/*\n The C program to compute Kolmogorov's distribution\n\n             K(n,d) = Prob(D_n < d),         where\n\n      D_n = max(x_1-0/n,x_2-1/n...,x_n-(n-1)/n,1/n-x_1,2/n-x_2,...,n/n-x_n)\n\n    with  x_1<x_2,...<x_n  a purported set of n independent uniform [0,1)\n    random variables sorted into increasing order.\n    See G. 
Marsaglia, Wai Wan Tsang and Jingbo Wong,\n       J.Stat.Software, 8, 18, pp 1--4, (2003).\n*/\n\n#define NORM 1.0e140\n#define INORM 1.0e-140\n#define LOGNORM 140\n\n\n/* Matrix product */\nstatic void mMultiply (double *A, double *B, double *C, int m);\n\n/* Matrix power */\nstatic void mPower (double *A, int eA, double *V, int *eV, int m, int n);\n\n\nstatic double DurbinMatrix (int n, double d)\n{\n   int k, m, i, j, g, eH, eQ;\n   double h, s, *H, *Q;\n   /* OMIT NEXT TWO LINES IF YOU REQUIRE >7 DIGIT ACCURACY IN THE RIGHT TAIL */\n#if 0\n   s = d * d * n;\n   if (s > 7.24 || (s > 3.76 && n > 99))\n      return 1 - 2 * exp (-(2.000071 + .331 / sqrt (n) + 1.409 / n) * s);\n#endif\n   k = (int) (n * d) + 1;\n   m = 2 * k - 1;\n   h = k - n * d;\n   H = (double *) malloc ((m * m) * sizeof (double));\n   Q = (double *) malloc ((m * m) * sizeof (double));\n   for (i = 0; i < m; i++)\n      for (j = 0; j < m; j++)\n         if (i - j + 1 < 0)\n            H[i * m + j] = 0;\n         else\n            H[i * m + j] = 1;\n   for (i = 0; i < m; i++) {\n      H[i * m] -= pow (h, (double)(i + 1));\n      H[(m - 1) * m + i] -= pow (h, (double)(m - i));\n   }\n   H[(m - 1) * m] += (2 * h - 1 > 0 ? 
pow (2 * h - 1, (double) m) : 0);\n   for (i = 0; i < m; i++)\n      for (j = 0; j < m; j++)\n         if (i - j + 1 > 0)\n            for (g = 1; g <= i - j + 1; g++)\n               H[i * m + j] /= g;\n   eH = 0;\n   mPower (H, eH, Q, &eQ, m, n);\n   s = Q[(k - 1) * m + k - 1];\n\n   for (i = 1; i <= n; i++) {\n      s = s * (double) i / n;\n      if (s < INORM) {\n         s *= NORM;\n         eQ -= LOGNORM;\n      }\n   }\n   s *= pow (10., (double) eQ);\n   free (H);\n   free (Q);\n   return s;\n}\n\n\nstatic void mMultiply (double *A, double *B, double *C, int m)\n{\n   int i, j, k;\n   double s;\n   for (i = 0; i < m; i++)\n      for (j = 0; j < m; j++) {\n         s = 0.;\n         for (k = 0; k < m; k++)\n            s += A[i * m + k] * B[k * m + j];\n         C[i * m + j] = s;\n      }\n}\n\n\nstatic void renormalize (double *V, int m, int *p)\n{\n   int i;\n   for (i = 0; i < m * m; i++)\n      V[i] *= INORM;\n   *p += LOGNORM;\n}\n\n\nstatic void mPower (double *A, int eA, double *V, int *eV, int m, int n)\n{\n   double *B;\n   int eB, i;\n   if (n == 1) {\n      for (i = 0; i < m * m; i++)\n         V[i] = A[i];\n      *eV = eA;\n      return;\n   }\n   mPower (A, eA, V, eV, m, n / 2);\n   B = (double *) malloc ((m * m) * sizeof (double));\n   mMultiply (V, V, B, m);\n   eB = 2 * (*eV);\n   if (B[(m / 2) * m + (m / 2)] > NORM)\n      renormalize (B, m, &eB);\n\n   if (n % 2 == 0) {\n      for (i = 0; i < m * m; i++)\n         V[i] = B[i];\n      *eV = eB;\n   } else {\n      mMultiply (A, B, V, m);\n      *eV = eA + eB;\n   }\n\n   if (V[(m / 2) * m + (m / 2)] > NORM)\n      renormalize (V, m, eV);\n   free (B);\n}\n\n\n/*=========================================================================*/\n#if 0\n#include <stdio.h>\n\nint main(void)\n{\n   double x, y, z;\n   const int K = 100;\n   int n = 60;\n   int j;\n   printf (\"n = %5d\\n\\n\", n);\n   printf (\"      x                    cdf                     fbar\\n\");\n\n   for (j = 0; j <= K; j++) 
{\n      x = (double) j / K;\n      y = KScdf (n, x);\n      z = KSfbar (n, x);\n      printf (\"%8.3g     %22.15g      %22.15g\\n\", x, y, z);\n   }\n   return 0;\n}\n#endif\n"
  },
  {
    "path": "KolmogorovSmirnovDist.hpp",
    "content": "//\n//  KolmogorovSmirnovDist.hpp\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 30.10.22.\n//  Copyright © 2022 Milan Malinsky. All rights reserved.\n//\n\n#ifndef KolmogorovSmirnovDist_hpp\n#define KolmogorovSmirnovDist_hpp\n\n/********************************************************************\n *\n * File:          KolmogorovSmirnovDist.h\n * Environment:   ISO C99 or ANSI C89\n * Author:        Richard Simard\n * Organization:  DIRO, Université de Montréal\n * Date:          1 March 2010\n *\n * Copyright March 2010 by Université de Montréal,\n                           Richard Simard and Pierre L'Ecuyer\n =====================================================================\n\n    This program is free software: you can redistribute it and/or modify\n    it under the terms of the GNU General Public License as published by\n    the Free Software Foundation, version 3 of the License.\n\n    This program is distributed in the hope that it will be useful,\n    but WITHOUT ANY WARRANTY; without even the implied warranty of\n    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n    GNU General Public License for more details.\n\n    You should have received a copy of the GNU General Public License\n    along with this program.  
If not, see <http://www.gnu.org/licenses/>.\n\n =====================================================================*/\n/*\n *\n * The Kolmogorov-Smirnov test statistic D_n is defined by\n *\n *        D_n = sup_x |F(x) - S_n(x)|\n *\n * where n is the sample size, F(x) is a completely specified theoretical\n * distribution, and S_n(x) is an empirical distribution function.\n *\n *\n * The function\n *\n *        double KScdf (int n, double x);\n *\n * computes the cumulative probability P[D_n <= x] of the 2-sided 1-sample\n * Kolmogorov-Smirnov distribution with sample size n at x.\n * It returns at least 13 decimal digits of precision for n <= 140,\n * at least 5 decimal digits of precision for 140 < n <= 100000,\n * and a few correct decimal digits for n > 100000.\n *\n */\n\ndouble KScdf (int n, double x);\n\n\n/*\n * The function\n *\n *        double KSfbar (int n, double x);\n *\n * computes the complementary cumulative probability P[D_n >= x] of the\n * 2-sided 1-sample Kolmogorov-Smirnov distribution with sample size n at x.\n * It returns at least 10 decimal digits of precision for n <= 140,\n * at least 5 decimal digits of precision for 140 < n <= 200000,\n * and a few correct decimal digits for n > 200000.\n *\n */\n\ndouble KSfbar (int n, double x);\n\n\n/*\n * NOTE:\n * The ISO C99 function log1p of the standard math library does not exist in\n * ANSI C89. Here, it is programmed explicitly in KolmogorovSmirnovDist.c.\n\n * For ANSI C89 compilers, change the preprocessor condition to make it\n * available.\n */\n\n#endif /* KolmogorovSmirnovDist_hpp */\n"
  },
  {
    "path": "Makefile",
    "content": "\nCXXFLAGS=-std=c++11\nCXX=g++\nBIN := Build\nLDFLAGS=-lz\n\nall: $(BIN)/Dsuite\n\n$(BIN)/Dsuite: $(BIN)/Dsuite.o $(BIN)/Dsuite_utils.o $(BIN)/D.o $(BIN)/gzstream.o $(BIN)/Dmin.o $(BIN)/Dmin_combine.o $(BIN)/Dsuite_fBranch.o $(BIN)/Dquartets.o $(BIN)/Dsuite_common.o $(BIN)/kstest.o $(BIN)/KolmogorovSmirnovDist.o\n\t$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)\n\n$(BIN)/%.o: %.cpp\n\t$(CXX) -c $(CXXFLAGS) $< -o $@\n\n$(BIN):\n\tmkdir -p $@\n\n# Dependencies\n$(BIN)/Dsuite: $(BIN)/Dsuite.o $(BIN)/Dsuite_utils.o $(BIN)/D.o $(BIN)/gzstream.o $(BIN)/Dmin.o $(BIN)/Dmin_combine.o $(BIN)/Dsuite_fBranch.o $(BIN)/Dquartets.o $(BIN)/Dsuite_common.o $(BIN)/kstest.o $(BIN)/KolmogorovSmirnovDist.o | $(BIN)\n\nclean:\n\trm $(BIN)/*.o $(BIN)/Dsuite\n"
  },
  {
    "path": "README.md",
    "content": "#  Dsuite\nPublication:  \nMalinsky, M., Matschiner, M. and Svardal, H. (2021) Dsuite ‐ fast D‐statistics and related admixture evidence from VCF files. Molecular Ecology Resources 21, 584–595. doi: [https://doi.org/10.1111/1755-0998.13265](https://doi.org/10.1111/1755-0998.13265)  \nFree to view author [link](https://onlinelibrary.wiley.com/share/author/QNEE6JI7DGUSBA4Y8ZGU?target=10.1111/1755-0998.13265)  \nMalawi cichlid data used in the manuscript:\n[VCF file](https://github.com/millanek/tutorials/blob/master/DsuiteData/Malinsky_et_al_2018_LakeMalawiCichlids_scaffold_0.vcf.gz); [SETS.txt file](https://github.com/millanek/tutorials/blob/master/DsuiteData/sets.txt)<br>\nSimulated 20-species data used in the manuscript: [VCF file](https://github.com/millanek/tutorials/blob/master/DsuiteData/with_geneflow.vcf.gz); [SETS.txt file](https://github.com/millanek/tutorials/blob/master/DsuiteData/species_sets.txt); [TREE_FILE.nwk](https://github.com/millanek/tutorials/blob/master/DsuiteData/simulated_tree_with_geneflow.nwk) (input tree)\n\nThere is also a very detailed [tutorial](https://github.com/millanek/tutorials/tree/master/analysis_of_introgression_with_snp_data) that I prepared with input from [@mmatschiner](https://github.com/mmatschiner).\n\nA manuscript describing the `--ABBAclustering` option for analyses of gene-flow among divergent species:  \nKoppetsch, T., Malinsky, M. & Matschiner, M. (2024) Towards Reliable Detection of Introgression in the Presence of Among-Species Rate Variation. Systematic Biology, syae028; doi: [https://doi.org/10.1093/sysbio/syae028](https://doi.org/10.1093/sysbio/syae028) \n\n## Quickstart:\n```\nCommands:\n           Dtrios                  Calculate D (ABBA-BABA) and f4-ratio statistics for all possible trios of populations/species\n           DtriosCombine           Combine results from Dtrios runs across genomic regions (e.g. 
per-chromosome)\n           Dinvestigate            Follow up analyses for trios with significantly elevated D:\n                                   calculates f_d, f_dM, and d_f in windows along the genome\n           Fbranch                 Calculate D and f statistics for branches on a tree that relates the populations/species\n\nExperimental:\n           Dquartets               Calculate D (ABBA-BABA) and f4-ratio statistics for all possible quartets of populations/species\n                                   (no outgroup specified)\n\nUsage:\na) Dsuite Dtrios [OPTIONS] INPUT_FILE.vcf SETS.txt\nb) Dsuite Dquartets [OPTIONS] INPUT_FILE.vcf SETS.txt\nc) Dsuite Dinvestigate [OPTIONS] INPUT_FILE.vcf.gz SETS.txt test_trios.txt\nd) Dsuite Fbranch [OPTIONS] TREE_FILE.nwk FVALS_tree.txt\n```\n\n## Input files:\n### Required files:\n1. A [VCF](http://www.internationalgenome.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40/) file, which can be compressed with gzip or bgzip. It can contain multiallelic loci and indels, but only biallelic SNPs will be used.\n2. Population/species map `SETS.txt`: a text file with one individual per row and a tab separating the individual’s name from the name of the species/population it belongs to, as shown below:\n```\nInd1    Species1\nInd2    Species1\nInd3    Species2\nInd4    Species2\nInd5    Species3\nInd6    Outgroup\nInd7    Outgroup\nInd8    xxx\n...     ...\nIndN    Species_n\n```\nIf you want some individuals to be ignored, use the `xxx` keyword. Therefore, you don't have to subset your VCF file if you want to use only a subset of the samples in it.\n\nFor `Dtrios`, at least one individual needs to be specified to be the outgroup by using the `Outgroup` keyword as shown above.\n\nAll species/populations are treated equally in `Dquartets` - there should not be any outgroup.\n### Optional files:\n3. A tree in Newick format. The tree should have leaf labels corresponding to the species/population names. 
Branch lengths can be present but are not used. To use  `Fbranch`, the tree must be rooted using the Outgroup. \nValid examples:  \n`(Species2,(Species1,(Species3,Species4)));`  \n`(Species2:6.0,(Species1:5.0,(Species3:3.0,Species4:4.0)));`\n4. The `test_trios.txt` file for `Dinvestigate`. One trio of populations/species per line, separated by a tab in the order `P1  P2  P3`:\n```\nSpecies1    Species2    Species3\nSpecies1    Species4    Species2\n...         ...         ...\n```\n### Piped VCF input:\nIt is possible to 'pipe' the genotype data into  `Dsuite Dtrios` or `Dsuite Dquartets` from another program, such as bcftools, allowing custom filtering of the VCF file.  Just use the `stdin` keyword in place of the VCF file name. It is also necessary to provide the number of lines in the filtered VCF via the  `-l` option to the Dsuite programs. For example, to filter a VCF for overall minimum depth of at least 1000 across all samples, you would use the following commands:\n```\nNUMLINES=$(bcftools view -i 'INFO/DP>1000' INPUT_FILE.vcf.gz | wc -l)  # to get NUMLINES\nbcftools view -i 'INFO/DP>1000' INPUT_FILE.vcf.gz | Dsuite Dtrios -l $NUMLINES stdin SETS.txt\n```\n\n## Installation\n### Main program:\nTo compile you must have a reasonably recent GCC (>=4.9.0) or clang compiler (on mac OS this comes with Command Line Tools) and the zlib compression library (https://www.zlib.net). Both should already be present on most systems. \n\n```console\n$ git clone https://github.com/millanek/Dsuite.git\n$ cd Dsuite\n$ make\n```\n\nThe Dsuite executable will be in the Build folder, so to run it type e.g. `./Build/Dsuite`; this will show the available commands. To execute e.g. the Dtrios command, type `./Build/Dsuite Dtrios`.\n\n### [Optional] Installing the python3 Fbranch plotting script\n\nIf you want to plot the results of the f-branch calculation (see below), you will need to install the python script for this using setuptools. 
You need an internet connection as some python dependencies will be downloaded from `pypi.org`. It may be necessary to exit python or conda virtual environments for this to work correctly.\n\n```console\n$ cd utils\n$ python3 setup.py install --user --prefix=\n```\n\nThe above should work on both mac and linux. Note that there is no text (not even whitespace) after the `=` above. If you want to use your own virtual environments, you can alternatively not run setup.py and just install the dependencies with `pip` or `conda`.\n\n\n## Commands (v0.5 r53):\n### Dsuite Dtrios - Calculate the D (ABBA-BABA) and f4-ratio statistics for all possible trios of populations/species\n```\nUsage: Dsuite Dtrios [OPTIONS] INPUT_FILE.vcf SETS.txt\nCalculate the D (ABBA/BABA) and f4-ratio statistics for all trios of species in the dataset (the outgroup being fixed)\nThe results are as defined in Patterson et al. 2012 (equivalent to Durand et al. 2011 when the Outgroup is fixed for the ancestral allele)\nThe SETS.txt should have two columns: SAMPLE_ID    SPECIES_ID\nThe outgroup (can be multiple samples) should be specified by using the keyword Outgroup in place of the SPECIES_ID\n\nUse 'stdin' for the VCF file when piping from another program into Dsuite via standard input\nin this case it is necessary to provide the number of lines in the filtered VCF via the -l option\nFor example, to filter the VCF for overall minimum depth of at least 1000 across all samples:\nNUMLINES=$(bcftools view -i 'INFO/DP>1000' INPUT_FILE.vcf | wc -l)  # to get NUMLINES\nbcftools view -i 'INFO/DP>1000' INPUT_FILE.vcf | Dsuite Dtrios -l $NUMLINES stdin SETS.txt\n\n       -h, --help                              display this help and exit\n       -k, --JKnum                             (default=20) the number of Jackknife blocks to divide the dataset into; should be at least 20 for the whole dataset\n       -j, --JKwindow                          (default=NA) Jackknife block size in number of informative 
SNPs (as used in v0.2)\n                                               when specified, this is used in place of the --JKnum option\n       -r, --region=start,length               (optional) only process a subset of the VCF file; both \"start\" and \"length\" indicate variant numbers\n                                               e.g. --region=20001,10000 will process variants from 20001 to 30000\n       -t, --tree=TREE_FILE.nwk                (optional) a file with a tree in the newick format specifying the relationships between populations/species\n                                               D and f4-ratio values for trios arranged according to the tree will be output in a file with _tree.txt suffix\n       -o, --out-prefix=OUT_FILE_PREFIX        (optional) the prefix for the files where the results should be written\n                                               output will be put in OUT_FILE_PREFIX_BBAA.txt, OUT_FILE_PREFIX_Dmin.txt, OUT_FILE_PREFIX_tree.txt etc.\n                                               by default, the prefix is taken from the name of the SETS.txt file\n       -n, --run-name                          (optional) run-name will be included in the output file name after the PREFIX\n       --no-f4-ratio                           (optional) don't calculate the f4-ratio\n       -l NUMLINES                             (optional) the number of lines in the VCF input - required if reading the VCF via a unix pipe\n       -g, --use-genotype-probabilities        (optional) use probabilities (GP tag) or calculate them from likelihoods (GL or PL tags) using a Hardy-Weinberg prior\n                                               the probabilities are used to estimate allele frequencies in each population/species\n       -p, --pool-seq=MIN_DEPTH                (optional) VCF contains pool-seq data; i.e., each 'individual' is a population\n                                               allele frequencies are then estimated from the AD (Allelic Depth) 
field, as long as there are MIN_DEPTH reads\n                                               e.g MIN_DEPTH=5 may be reasonable; when there are fewer reads, the allele frequency is set to missing\n       -c, --no-combine                        (optional) do not output the \"_combine.txt\" and \"_combine_stderr.txt\" files\n       --ABBAclustering                        (optional) Test whether strong ABBA-informative sites cluster along the genome\n```\n#### Output:\nThe output files with suffixes  `BBAA.txt`, `Dmin.txt`, and optionally `tree.txt` (if the `-t` option was used) contain the results: the D statistics, Zscore, unadjusted p-values, the f4-ratios, and counts of the BBAA, BABA, and ABBA patterns. Please read the [manuscript](https://doi.org/10.1111/1755-0998.13265) for more details. \n\nThe output files with suffixes  `combine.txt` and  `combine_stderr.txt` are used as input to DtriosCombine. If you don't need to use DtriosCombine, you can safely delete these files.\n\n#### ABBA-clustering test:\nWhen testing for introgression among highly divergent species and/or in cases where introgression happened a long time ago (rule of thumb - millions of generations), there is a risk that the Dstatistic could show a false signal of introgression due to substitution rate variation among different branches on the phylogeny. Such substitution rate variation can lead to homoplasies appearing as ABBA sites.    \n\nImportantly, ABBA sites introduced by introgression would substantially cluster along the genome, while homoplasies would appear one by one. To test this, I have added the `--ABBAclustering` option. The more significant clustering of ABBA sites the more confidence you can have that a gene-flow event is real and not a false positive caused by homoplasies. Two p-values are produced by this test: the `clustering_sensitive` value and the `clustering_robust-val1` value. 
As the names suggest, the \"sensitive\" test has greater power but can produce some false positives due to mutation rate variation along the genome. The \"robust\" test has lower statistical power, but it is robust to mutation rate variation along the genome.      \n\nFor additional details please see: \\\nKoppetsch, T., Malinsky, M. & Matschiner, M. (2024) Towards Reliable Detection of Introgression in the Presence of Among-Species Rate Variation. Systematic Biology, syae028; doi: [https://doi.org/10.1093/sysbio/syae028](https://doi.org/10.1093/sysbio/syae028)\n\n### DtriosParallel\n\nWe provide a python script for parallel execution at `<Dsuite_path>/utils/DtriosParallel`. The usage is analogous to Dsuite Dtrios, except that the order of `SETS.txt` and `INPUT_FILE.vcf` is swapped in the command line so that the user can optionally provide multiple VCF files (whitespace separated). The script will autmatically call `DtriosCombine` to combine results of all VCF files into a single set of results files.\n\n```\n$ ./utils/DtriosParallel --help\nusage: DtriosParallel [-h] [-k JKNUM] [-j JKWINDOW] [-t TREE] [-n RUN_NAME]\n                      [-l NUMLINES] [-g] [-p --pool-seq=MIN_DEPTH] [-c]\n                      [--cores CORES] [--keep-intermediate]\n                      [--logging_level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]\n                      [--dsuite-path DSUITE_PATH]\n                      [--environment-setup ENVIRONMENT_SETUP]\n                      SETS.txt INPUT_FILE.vcf [INPUT_FILE.vcf ...]\n\nThis python script automates parallelisation of Dsuite Dtrios/ Dsuite\nDtriosCombine. The usage is analogous to Dsuite Dtrios but computation is\nperformed on multiple cores (default: number of available CPUs). ATTENTION:\nThe order of SETS.txt and INPUT_FILE.vcf is swapped compared to Dsuite Dtrios.\nThis is so that multiple VCF input files can be provided. 
All vcf files should\nhave the same samples, (e.g., different chromosomes of the same callset).\nOutput_files are placed in the same folder as as the the SETS.txt file and\nnamed DTParallel_<SETS_basename>_<run_name>_combined_BBAA.txt etc. This script\nshould run on most systems with a standard python installation (tested with\npython 2.7 and 3.6).\n\n\npositional arguments:\n  SETS.txt              The SETS.txt should have two columns: SAMPLE_ID\n                        SPECIES_ID The outgroup (can be multiple samples)\n                        should be specified by using the keyword Outgroup in\n                        place of the SPECIES_ID\n  INPUT_FILE.vcf        One or more whitespace separated SNP vcf files.\n\noptional arguments:\n  -h, --help            show this help message and exit\n  -k JKNUM, --JKnum JKNUM\n                        (default=20) the number of Jackknife blocks to divide\n                        the dataset into; should be at least 20 for the whole\n                        dataset\n  -j JKWINDOW, --JKwindow JKWINDOW\n                        Jackknife block size in number of informative SNPs (as\n                        used in v0.2) when specified, this is used in place of\n                        the --JKnum option\n  -t TREE, --tree TREE  a file with a tree in the newick format specifying the\n                        relationships between populations/species D and\n                        f4-ratio values for trios arranged according to the\n                        tree will be output in a file with _tree.txt suffix\n  -n RUN_NAME, --run-name RUN_NAME\n                        run-name will be included in the output file name\n  -l NUMLINES           (optional) the number of lines (SNPs) in the VCF\n                        input(s) - speeds up operation if known. 
If N\n                        INPUT_FILE.vcf files are provided, there must be N\n                        comma-separated integers provided without whitespace\n                        between them.\n  -g, --use-genotype-probabilities\n                        (optional) use probabilities (GP tag) or calculate\n                        them from likelihoods (GL or PL tags) using a Hardy-\n                        Weinberg prior\n  -p --pool-seq=MIN_DEPTH, --pool-seq --pool-seq=MIN_DEPTH\n                        (default=20) the number of Jackknife blocks to divide\n                        the dataset into; should be at least 20 for the whole\n                        dataset\n  -c, --no-combine      (optional) do not run DtriosCombine to obtain a single\n                        combined results file\n  --cores CORES         (default=CPU count) Number of Dsuite Dtrios processes\n                        run in parallel.\n  --keep-intermediate   Keep region-wise Dsuite Dtrios results.\n  --logging_level {DEBUG,INFO,WARNING,ERROR,CRITICAL}, -v {DEBUG,INFO,WARNING,ERROR,CRITICAL}\n                        Minimun level of logging.\n  --dsuite-path DSUITE_PATH\n                        Explicitly set the path to the directory in which\n                        Dsuite is located. By default the script will first\n                        check whether Dsuite is accessible from $PATH. If not\n                        it will try to locate Dsuite at ../Build/Dsuite.\n  --environment-setup ENVIRONMENT_SETUP\n                        Command that should be run to setup the environment\n                        for Dsuite. E.g., 'module load GCC' or 'conda\n\n\n```\n\n#### Output:\n\nOutput are \\_BBAA, \\_Dmin files analogus to Dtrios/DtriosCombine and are placed in the same folder as as the the `SETS.txt` file and\nnamed `DTParallel_<SETS_basename>_<run_name>_combined_BBAA.txt` etc.\n\n### DtriosCombine - Combine results from Dtrios runs across genomic regions (e.g. 
per chromosome)\n```\nUsage: Dsuite DtriosCombine [OPTIONS] DminFile1 DminFile2 DminFile3 ....\n\nCombine the BBAA, ABBA, and BABA counts from multiple files (e.g per-chromosome) and output the overall D stats,\np-values and f4-ratio values\n\n       -h, --help                              display this help and exit\n       -o, --out-prefix=OUT_FILE_PREFIX        (optional) the prefix for the files where the results should be written\n                                               output will be put in OUT_FILE_PREFIX_combined_BBAA.txt, OUT_FILE_PREFIX_combined_Dmin.txt, OUT_FILE_PREFIX_combined_tree.txt etc.\n                                               by default, the prefix is \"out\"\n       -n, --run-name                          (optional) run-name will be included in the output file name after the PREFIX\n       -t , --tree=TREE_FILE.nwk               (optional) a file with a tree in the newick format specifying the relationships between populations/species\n                                               D and f4-ratio values for trios arranged according to the tree will be output in a file with _tree.txt suffix\n       -s , --subset=start,length              (optional) only process a subset of the trios\n```\n#### Output:\nAs for `Dtrios`, there are output files with suffixes  `BBAA.txt`, `Dmin.txt`, and optionally `tree.txt` (if the `-t` option was used). They contain the overall combined results: the D statistics, Zscore, unadjusted p-values, the f4-ratios, and counts of the BBAA, BABA, and ABBA patterns.\n\n###  Dinvestigate - Follow up analyses for trios with significantly elevated D: calculates D, f_d and f_dM in windows along the genome\n```\nUsage: Dsuite Dinvestigate [OPTIONS] INPUT_FILE.vcf.gz SETS.txt test_trios.txt\n\nOutputs D, f_d (Martin et al. 
2014 MBE), f_dM (Malinsky et al., 2015), and d_f (Pfeifer & Kapan, 2019) in genomic windows\nThe SETS.txt file should have two columns: SAMPLE_ID    POPULATION_ID\nThe test_trios.txt should contain names of three populations for which the statistics will be calculated:\nPOP1   POP2    POP3\nThere can be multiple lines and then the program generates multiple ouput files, named like POP1_POP2_POP3_localFstats_SIZE_STEP.txt\n\n       -h, --help                              display this help and exit\n       -w SIZE,STEP --window=SIZE,STEP         (required) D, f_D, f_dM, and d_f statistics for windows containing SIZE useable SNPs, moving by STEP (default: 50,25)\n       -n, --run-name                          run-name will be included in the output file name\n```\n\n###  Fbranch - A heuristic approach designed to aid the interpretation of many correlated f4-ratio results \n```\nUsage: Dsuite Fbranch [OPTIONS] TREE_FILE.nwk FVALS_tree.txt\nImplements the 'f-branch' type calculations developed by Hannes Svardal for Malinsky et al., 2018, Nat. Ecol. Evo.\nUses the f4-ratio (f_G) values produced by Dsuite Dtrios (or DtriosCombine) with the --tree option; this is the output of Dtrios with the \"_tree.txt\" suffix\nTo use  Fbranch, the tree in TREE_FILE.nwk must be rooted with the Outgroup.\nOutput to stdout\n\n      -p, --pthresh                           (default=0.01) fb scores whose associated p-value is less than \n      -Z, --Zb-matrix                         (optional)  output the equivalent of fb-statistic, but with Z-scores to assess statistical significance\n                                              this will be printed below the f-branch matrix\n      -h, --help                              display this help and exit\n```\n\n#### Output:\nThe f-branch statistic in matrix-like format. 
Use the plotting function below to display the f-branch statistic.\n\n###  Plotting Fbranch\nThe output of `Dsuite Fbranch` can be plotted with `./utils/dtools.py` (see installation instructions above).\n\n```\nusage: dtools.py [-h] [-n RUN_NAME] [--outgroup OUTGROUP] [--use_distances]\n                 [--ladderize]\n                 fbranch.txt tree.newick\n\nPlot f-branch statistic as produced by Dsuite. Produces .png and .svg files.\n\npositional arguments:\n  fbranch               Path to file containing f-branch matrix as produced by\n                        Dsuite Fbranch.\n  tree                  Path to .newick tree file as given to Dsuite Fbranch.\n\noptional arguments:\n  -h, --help            show this help message and exit\n  -n RUN_NAME, --run-name RUN_NAME\n                        Base file name for output plots. (default: fbranch)\n  --outgroup OUTGROUP   Outgroup name in newick file. (default: Outgroup)\n  --use_distances       Use actual node distances from newick file when\n                        plotting tree. (default: False)\n  --ladderize           Ladderize the input tree before plotting. (default:\n                        False)\n```\n\nRunning `dtools.py` yields a .png and an .svg file of the f-branch statistic along the input tree. You can edit the .svg file in a vector graphics editor (e.g., [inkscape](https://inkscape.org/)) to your liking. See [Malinsky et al. 2018](https://www.nature.com/articles/s41559-018-0717-x) Fig. 3 and the Dsuite [paper](https://doi.org/10.1111/1755-0998.13265) for examples and interpretation of f-branch plots.\n\n### (experimental) Dsuite Dquartets - Calculate the D (ABBA-BABA) and f4-ratio statistics for all possible quartets of populations/species (no outgroup)\n```\nUsage: Dsuite Dquartets [OPTIONS] INPUT_FILE.vcf SETS.txt\nCalculate the D (ABBA/BABA) and f4-ratio (f_G) statistics for all quartets of species in the dataset (there is no outgroup)\nThe results are as definded in Patterson et al. 
2012\nThe SETS.txt should have two columns: SAMPLE_ID    SPECIES_ID\n\n-h, --help                              display this help and exit\n-k, --JKnum                             (default=20) the number of Jackknife blocks to divide the dataset into; should be at least 20 for the whole dataset\n-j, --JKwindow                          (default=NA) Jackknife block size in number of informative SNPs (as used in v0.2)\n                                        when specified, this is used in place of the --JKnum option\n-r, --region=start,length               (optional) only process a subset of the VCF file\n-t, --tree=TREE_FILE.nwk                (optional) a file with a tree in the newick format specifying the relationships between populations/species\n                                        D and f4-ratio values for trios arranged according to the tree will be output in a file with _tree.txt suffix\n-o, --out-prefix=OUT_FILE_PREFIX        (optional) the prefix for the files where the results should be written\n                                        output will be put in OUT_FILE_PREFIX_BBAA.txt, OUT_FILE_PREFIX_Dmin.txt, OUT_FILE_PREFIX_tree.txt etc.\n                                        by default, the prefix is taken from the name of the SETS.txt file\n-n, --run-name                          (optional; default=quartets) run-name will be included in the output file name after the PREFIX\n--no-f4-ratio                           (optional) don't calculate the f4-ratio\n-l NUMLINES                             (optional) the number of lines in the VCF input - required if reading the VCF via a unix pipe\n```\n\n### Parallelisation with DtriosParallel\n\nThis python script, included in the `./utils/` subfolder, automates parallel runs of `Dsuite Dtrios` across multiple cores on one computer and automatically combines the results using `Dsuite DtriosCombine`. 
The usage is analogous to `Dsuite Dtrios` (currently with more limited options) but computation is performed on multiple cores (default: number of available CPUs). It should run on most systems with a standard python installation (tested with python 2.7 and 3.6).\n\n```\nDtriosParallel [-h] [--cores CORES] [-k JKNUM] [-j JKWINDOW] [-t TREE]\n                      [-n RUN_NAME] [--keep-intermediate]\n                      [--logging_level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]\n                      [--dsuite-path DSUITE_PATH]\n                      [--environment-setup ENVIRONMENT_SETUP]\n                      INPUT_FILE.vcf SETS.txt\n\n\npositional arguments:\n  INPUT_FILE.vcf\n  SETS.txt              The SETS.txt should have two columns: SAMPLE_ID\n                        SPECIES_ID The outgroup (can be multiple samples)\n                        should be specified by using the keyword Outgroup in\n                        place of the SPECIES_ID\n\noptional arguments:\n  -h, --help            show this help message and exit\n  --cores CORES         (default=CPU count) Number of Dsuite Dtrios processes\n                        run in parallel.\n  -k JKNUM, --JKnum JKNUM\n                        (default=20) the number of Jackknife blocks to divide\n                        the dataset into; should be at least 20 for the whole\n                        dataset\n  -j JKWINDOW, --JKwindow JKWINDOW\n                        Jackknife block size in number of informative SNPs (as\n                        used in v0.2) when specified, this is used in place of\n                        the --JKnum option\n  -t TREE, --tree TREE  a file with a tree in the newick format specifying the\n                        relationships between populations/species D and\n                        f4-ratio values for trios arranged according to the\n                        tree will be output in a file with _tree.txt suffix\n  -n RUN_NAME, --run-name RUN_NAME\n                        run-name 
will be included in the output file name\n  --keep-intermediate   Keep region-wise Dsuite Dtrios results.\n  --logging_level {DEBUG,INFO,WARNING,ERROR,CRITICAL}, -l {DEBUG,INFO,WARNING,ERROR,CRITICAL}\n                        Minimun level of logging.\n  --dsuite-path DSUITE_PATH\n                        Explicitly set the path to the directory in which\n                        Dsuite is located. By default the script will first\n                        check whether Dsuite is accessible from $PATH. If not\n                        it will try to locate Dsuite at ../Build/Dsuite.\n  --environment-setup ENVIRONMENT_SETUP\n                        Command that should be run to setup the environment\n                        for Dsuite. E.g., 'module load GCC' or 'conda\n                        activate'\n```\n\n## Change log:\n\n```\nSelected updates (full update history is accessible on gitHub):\nv0.5 r53:   Changed --ABBAclustering and provide propoer documentation for this test  \nv0.5 r48:   --KS-test-for-homoplasy now works accurately for sample sizes >= 16,000  \nv0.5 r47:   --KS-test-for-homoplasy output now has more accurate p-values (one-sample KS-test for uniformity)\nv0.5 r46:   Support for arbitrary ploidy in Dtrios\nv0.5 r45:   BUG FIX for r44 where the trio orientation in the \"_tree.txt\" output files was wrong (P1 and P2 swapped). 
This is fixed now.\nv0.5 r44:   Major update:   - code re-factoring, including proper subsampling for f4-ratio calculations\n                            - First implementation of the Kolgomorov-Sminov test for homoplasy (--KS-test-for-homoplasy) in Dtrios; still somewhat experimental and works only in the \"_BBAA.txt\" output\n                            - Very small p-values now don't get rounded to 0 but are bounded at 2.3e-16\nv0.4 r43:   First implementation of the pool-seq (-p) option in in Dtrios \nv0.4 r28:   Merged DtriosParallel from https://github.com/feilchenfeldt and refreshed documentation\nv0.3 r27:   Added the -o (--out-prefix) option to allow more flexibility in naming output files\nv0.3 r25:   Added the Dquartets program - D and f4-ratio calculation without any outgroup, for all quartets of populations/species\nv0.3 r24:   Z-scores and site pattern counts (BBAA, ABBA, BABA) are now in the output of Dtrios, DtriosCombine, and Dquartets \nv0.3 r23:   Allow piped (stdin) VCF format input to Dtrios; this facilitates e.g. pre-filtering and/or bcf input using bcftools \nv0.3 r22:   Adding the d_f statistic (Pfeifer & Kapan, 2019) to Dinvestigate\nv0.3 r21:   Automatic estimation of Jackknife window size to get a desired number of blocks\n            Progress update in %\n            f4-ratios are calculated by default by Dtrios \n            Updated documentation\nv0.2 r20:   Subset option returns to DtriosCombine\nv0.2 r19:   Fixed a bug in Fbranch where P1 and P2 positions where A in P2 positions and B in P1 positions were not considered\nv0.2 r18:   Full implementation of D and f4-ratio in line with the Patterson et al. 2012 definitions \n            (affects only analyses where the outgroup allele is not fixed)\nv0.2 r15:   Ironed bugs in Dinvestigate, truly useable from this point  \nv0.2 r6:    First Fbranch version\nv0.1 r1:    First workable Dsuite release 8th May 2019    \n\n```\n"
  },
  {
    "path": "gzstream.cpp",
    "content": "// ============================================================================\n// gzstream, C++ iostream classes wrapping the zlib compression library.\n// Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner\n//\n// This library is free software; you can redistribute it and/or\n// modify it under the terms of the GNU General Public\n// License as published by the Free Software Foundation; either\n// version 3 of the License, or (at your option) any later version.\n//\n// This library is distributed in the hope that it will be useful,\n// but WITHOUT ANY WARRANTY; without even the implied warranty of\n// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n// General Public License for more details.\n//\n//\n// See src/COPYING for the full terms of the license.\n//\n// You should have received a copy of the GNU Lesser General Public\n// License along with this library; if not, write to the Free Software\n// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA\n// ============================================================================\n//\n// File          : gzstream.C\n// Revision      : $Revision: 1.7 $\n// Revision_date : $Date: 2003/01/08 14:41:27 $\n// Author(s)     : Deepak Bandyopadhyay, Lutz Kettner\n// Modified      : Jared Simpson 2010/04/16. Converted license notification to\n//                 GPLv3 to be compatible with the rest of the code base as\n//                 allowed by LGPLv2.1. No other changes were made.\n//\n// Standard streambuf implementation following Nicolai Josuttis, \"The\n// Standard C++ Library\".\n// ============================================================================\n\n\n#include \"gzstream.h\"\n#include <iostream>\n#include <string.h>  // for memcpy\n\n#ifdef GZSTREAM_NAMESPACE\nnamespace GZSTREAM_NAMESPACE {\n#endif\n    \n    // ----------------------------------------------------------------------------\n    // Internal classes to implement gzstream. 
See header file for user classes.\n    // ----------------------------------------------------------------------------\n    \n    // --------------------------------------\n    // class gzstreambuf:\n    // --------------------------------------\n    \n    gzstreambuf* gzstreambuf::open( const char* name, int open_mode) {\n        if ( is_open())\n            return (gzstreambuf*)0;\n        mode = open_mode;\n        // no append nor read/write mode\n        if ((mode & std::ios::ate) || (mode & std::ios::app)\n            || ((mode & std::ios::in) && (mode & std::ios::out)))\n            return (gzstreambuf*)0;\n        char  fmode[10];\n        char* fmodeptr = fmode;\n        if ( mode & std::ios::in)\n            *fmodeptr++ = 'r';\n        else if ( mode & std::ios::out)\n            *fmodeptr++ = 'w';\n        *fmodeptr++ = 'b';\n        *fmodeptr = '\\0';\n        file = gzopen( name, fmode);\n        if (file == 0)\n            return (gzstreambuf*)0;\n        opened = 1;\n        return this;\n    }\n    \n    gzstreambuf * gzstreambuf::close() {\n        if ( is_open()) {\n            sync();\n            opened = 0;\n            if ( gzclose( file) == Z_OK)\n                return this;\n        }\n        return (gzstreambuf*)0;\n    }\n    \n    int gzstreambuf::underflow() { // used for input buffer only\n        if ( gptr() && ( gptr() < egptr()))\n            return * reinterpret_cast<unsigned char *>( gptr());\n        \n        if ( ! (mode & std::ios::in) || ! 
opened)\n            return EOF;\n        // Josuttis' implementation of inbuf\n        int n_putback = gptr() - eback();\n        if ( n_putback > 4)\n            n_putback = 4;\n        memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback);\n        \n        int num = gzread( file, buffer+4, bufferSize-4);\n        if (num <= 0) // ERROR or EOF\n            return EOF;\n        \n        // reset buffer pointers\n        setg( buffer + (4 - n_putback),   // beginning of putback area\n             buffer + 4,                 // read position\n             buffer + 4 + num);          // end of buffer\n        \n        // return next character\n        return * reinterpret_cast<unsigned char *>( gptr());\n    }\n    \n    int gzstreambuf::flush_buffer() {\n        // Separate the writing of the buffer from overflow() and\n        // sync() operation.\n        int w = pptr() - pbase();\n        if ( gzwrite( file, pbase(), w) != w)\n            return EOF;\n        pbump( -w);\n        return w;\n    }\n    \n    int gzstreambuf::overflow( int c) { // used for output buffer only\n        if ( ! ( mode & std::ios::out) || ! 
opened)\n            return EOF;\n        if (c != EOF) {\n            *pptr() = c;\n            pbump(1);\n        }\n        if ( flush_buffer() == EOF)\n            return EOF;\n        return c;\n    }\n    \n    int gzstreambuf::sync() {\n        // Changed to use flush_buffer() instead of overflow( EOF)\n        // which caused improper behavior with std::endl and flush(),\n        // bug reported by Vincent Ricard.\n        if ( pptr() && pptr() > pbase()) {\n            if ( flush_buffer() == EOF)\n                return -1;\n        }\n        return 0;\n    }\n    \n    // --------------------------------------\n    // class gzstreambase:\n    // --------------------------------------\n    \n    gzstreambase::gzstreambase( const char* name, int mode) {\n        init( &buf);\n        open( name, mode);\n    }\n    \n    gzstreambase::~gzstreambase() {\n        buf.close();\n    }\n    \n    void gzstreambase::open( const char* name, int open_mode) {\n        if ( ! buf.open( name, open_mode))\n            clear( rdstate() | std::ios::badbit);\n    }\n    \n    void gzstreambase::close() {\n        if ( buf.is_open())\n            if ( ! buf.close())\n                clear( rdstate() | std::ios::badbit);\n    }\n    \n#ifdef GZSTREAM_NAMESPACE\n} // namespace GZSTREAM_NAMESPACE\n#endif\n// ============================================================================\n// EOF //\n"
  },
  {
    "path": "gzstream.h",
    "content": "// ============================================================================\n// gzstream, C++ iostream classes wrapping the zlib compression library.\n// Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner\n//\n// This library is free software; you can redistribute it and/or\n// modify it under the terms of the GNU General Public\n// License as published by the Free Software Foundation; either\n// version 3 of the License, or (at your option) any later version.\n//\n// This library is distributed in the hope that it will be useful,\n// but WITHOUT ANY WARRANTY; without even the implied warranty of\n// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n// General Public License for more details.\n//\n//\n// See src/COPYING for the full terms of the license.\n//\n// You should have received a copy of the GNU Lesser General Public\n// License along with this library; if not, write to the Free Software\n// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA\n// ============================================================================\n//\n// File          : gzstream.h\n// Revision      : $Revision: 1.5 $\n// Revision_date : $Date: 2002/04/26 23:30:15 $\n// Author(s)     : Deepak Bandyopadhyay, Lutz Kettner\n// Modified      : Jared Simpson 2010/04/16. Converted license notification to\n//                 GPLv3 to be compatible with the rest of the code base as\n//                 allowed by LGPLv2.1. 
No other changes were made.\n//\n//\n// Standard streambuf implementation following Nicolai Josuttis, \"The\n// Standard C++ Library\".\n// ============================================================================\n\n#ifndef GZSTREAM_H\n#define GZSTREAM_H 1\n// standard C++ with new header file names and std:: namespace\n#include <iostream>\n#include <fstream>\n#include <zlib.h>\n\n\n\n\n// ----------------------------------------------------------------------------\n// Internal classes to implement gzstream. See below for user classes.\n// ----------------------------------------------------------------------------\n\nclass gzstreambuf : public std::streambuf {\nprivate:\n    static const int bufferSize = 47+256;    // size of data buff\n    // totals 512 bytes under g++ for igzstream at the end.\n    \n    gzFile           file;               // file handle for compressed file\n    char             buffer[bufferSize]; // data buffer\n    char             opened;             // open/close state of stream\n    int              mode;               // I/O mode\n    \n    int flush_buffer();\npublic:\n    gzstreambuf() : opened(0) {\n        setp( buffer, buffer + (bufferSize-1));\n        setg( buffer + 4,     // beginning of putback area\n             buffer + 4,     // read position\n             buffer + 4);    // end position\n        // ASSERT: both input & output capabilities will not be used together\n    }\n    int is_open() { return opened; }\n    gzstreambuf* open( const char* name, int open_mode);\n    gzstreambuf* close();\n    ~gzstreambuf() { close(); }\n    \n    virtual int     overflow( int c = EOF);\n    virtual int     underflow();\n    virtual int     sync();\n};\n\nclass gzstreambase : virtual public std::ios {\nprotected:\n    gzstreambuf buf;\npublic:\n    gzstreambase() { init(&buf); }\n    gzstreambase( const char* name, int open_mode);\n    ~gzstreambase();\n    void open( const char* name, int open_mode);\n    void close();\n    
gzstreambuf* rdbuf() { return &buf; }\n};\n\n// ----------------------------------------------------------------------------\n// User classes. Use igzstream and ogzstream analogously to ifstream and\n// ofstream respectively. They read and write files based on the gz*\n// function interface of the zlib. Files are compatible with gzip compression.\n// ----------------------------------------------------------------------------\n\nclass igzstream : public gzstreambase, public std::istream {\npublic:\n    igzstream() : std::istream( &buf) {}\n    igzstream( const char* name, int open_mode = std::ios::in)\n    : gzstreambase( name, open_mode), std::istream( &buf) {}\n    gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }\n    void open( const char* name, int open_mode = std::ios::in) {\n        gzstreambase::open( name, open_mode);\n    }\n};\n\nclass ogzstream : public gzstreambase, public std::ostream {\npublic:\n    ogzstream() : std::ostream( &buf) {}\n    ogzstream( const char* name, int mode = std::ios::out)\n    : gzstreambase( name, mode), std::ostream( &buf) {}\n    gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }\n    void open( const char* name, int open_mode = std::ios::out) {\n        gzstreambase::open( name, open_mode);\n    }\n};\n\n#endif // GZSTREAM_H\n// ============================================================================\n// EOF //\n\n"
  },
  {
    "path": "kstest.cpp",
    "content": "/**************************************************************************/\n/*    Copyright (C) 2006 Romain Michalec                                  */\n/*                                                                        */\n/*    This library is free software; you can redistribute it and/or       */\n/*    modify it under the terms of the GNU Lesser General Public          */\n/*    License as published by the Free Software Foundation; either        */\n/*    version 2.1 of the License, or (at your option) any later version.  */\n/*                                                                        */\n/*    This library is distributed in the hope that it will be useful,     */\n/*    but WITHOUT ANY WARRANTY; without even the implied warranty of      */\n/*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU   */\n/*    Lesser General Public License for more details.                     */\n/*                                                                        */\n/*    You should have received a copy of the GNU Lesser General Public    */\n/*    License along with this library; if not, write to the Free Software   */\n/*    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,          */\n/*    MA  02110-1301, USA                                                 */\n/*                                                                        */\n/**************************************************************************/\n\n#include \"kstest.h\"\n#include \"KolmogorovSmirnovDist.hpp\"\n#include \"Dsuite_utils.h\"\n#include <cmath>\n\n\n\n/* KOLMOGOROV-SMIRNOV TEST OF HOMOGENEITY\n *\n * This long explaination provides you with everything you must know to\n * understand totally the Kolmogorov-Smirnov test of homogeneity.\n *\n * When x = (x_1, ..., x_n) are the observed values (the \"realisation\")\n * of a random sample X = (X_1, ..., X_n), we define the \"sample cumulative\n * distribution function\", or 
\"empirical cumulative distribution function\"\n * of X, and denote it X_emp_cdf(x), as the proportion of observed values\n * in X that are less or equal to x. In other words, if k observed values\n * in the sample are less than or equal to x, then X_emp_cdf(x) = k/n.\n *\n * Thus X_emp_cdf is a step function, defined on R and with values\n * from 0 to 1, with jumps of magnitude 1/n at each observation.\n *\n * The (non-empirical) cumulative distribution function of X, X_cdf,\n * defined on R, with values from 0 to 1, is X_cdf(x) = Pr (X_k <= x),\n * with k in 1:n (it doesn't matter which one since they all have the\n * same distribution).\n *\n * The empirical c.d.f is of course an approximation of the c.d.f:\n *\n *     X_emp_cdf(x) --> X_cdf(x)   when n --> infty\n *\n * In fact, a much stronger result, known as the Glivenko-Cantelli theorem,\n * states that when sample size is large, X_emp_cdf converges uniformly\n * to X_cdf:\n *\n *     D_n = sup |X_emp_cdf(x) - X_cdf(x)| --> 0   when n --> infty\n *\n * the sup function is over all real numbers (i.e. sup_{x \\in R}).\n *\n * We consider a problem in which random samples are taken from two\n * populations: sample1, or X_1, ..., X_n, from population 1; sample2,\n * or Y_1, ..., Y_m, from population 2.\n * We must determine, on the basis of the observed values in the\n * samples, whether they come from the same distribution or not\n * (hence the name \"test of homogeneity\").\n *\n * The hypotheses to be tested are as follow:\n *\n *     H0: sample1 and sample2 are taken from distributions having\n *         the same distribution function\n *     H1: sample1 and sample2 are taken from distributions having\n *         different distribution functions\n *\n * To determine which hypothesis is the most likely, we must shape\n * a test statistic. 
Since the sample c.d.f.'s appeared to be estimators\n * of the non-empirical c.d.f.'s, and since c.d.f.'s are caracteristic of\n * the distribution of their random variable (i.e., if two random variables\n * have same c.d.f., then they also have same d.f.), it seems a good idea\n * to base a test on the difference between the two sample c.d.f.'s, and take\n * large sample sizes to approximate the difference between the corresponding\n * distribution functions.\n *\n * Hence we consider the statistic D_nm defined as follows:\n *\n *     D_nm = sup | X_emp_cdf(x) - Y_emp_cdf(x) |\n *\n * When the null hypothesis H0 is true, X_cdf and Y_cdf are identical functions\n * and we can easily deduce from the Glivenko-Cantelli theorem, by bounding\n * |X_emp_cdf(x) - Y_emp_cdf(x)|, that:\n *\n *     D_nm --> 0 when n,m --> infty\n *\n * It seems therefore reasonable to use a test of the form:\n *\n *     Reject H0 if D_nm > some critical value c\n *\n * but in fact, we will use the following test:\n *\n *     Reject H0 if sqrt(n*m/(n+m))*D_nm > c\n *\n * because of the following result, established in 1933 by Andreï Nikolaevitch\n * Kolmogorov and Vladimir Ivanovitch Smirnov:\n *\n *     For any t > 0,\n *     Pr (sqrt(n)*D_n <= t) -->\n *               1 - 2*Sum_{i=1:infty} (-1)^(i-1) exp(-2 i^2 t^2)\n *     when n --> infty\n *\n * a result that has been adapted to two-sample testing:\n *\n *     For any t > 0, if the null hypothesis H0 is true, then\n *     Pr (sqrt(n_approx)*D_nm <= t) -->\n *               1 - 2*Sum_{i=1:infty} (-1)^(i-1) exp(-2 i^2 t^2)\n *     when n,m --> infty and with n_approx = n*m/(n+m)\n *\n * We call \"Kolmogorov's statistic\" the statistic D_nm, and \"limiting form\n * of Kolmogorov's statistic\" the function\n *\n *     L(t) = 1 - 2*Sum_{i=1:infty} (-1)^(i-1) exp(-2 i^2 t^2)\n *\n * The result of Kolmogorov and Smirnov is of crucial importance, because\n * it enables the computation of Pr ( sqrt(nm/n+m)*D_nm <= t ) through the\n * (much 
easier) computation of the limiting form L(t), for large samples.\n *\n * As for Pearson chi-square test or Wilcoxon-Mann-Whitney ranks test,\n * we are interested in computing p-values for this test.\n *\n * For instance, let's suppose the observed values of sample1 and sample2\n * yield the value d for Kolmogorov's statistic D_nm, i.e. the value\n * sqrt(nm/n+m)*d for the statistic sqrt(nm/n+m)*D_nm. The very definition\n * of p as the significance level alpha corresponding to the critical value\n * c = sqrt(nm/n+m)*d reads:\n *\n *     p = Pr (H0 rejected | H0 true)\n *       = Pr (sqrt(nm/n+m)*D_nm > c = sqrt(nm/n+m)*d | H0 true)\n *       = Pr (D_nm > d | H0 true)\n *       = 1 - Pr (D_nm < d | H0 true)\n *\n * Moreover, we know that when H0 is true and sample sizes are large,\n *\n *     Pr (sqrt(nm/n+m)*D_nm < t) ~= L(t)\n *\n * i.e. with t = sqrt(nm/n+m)*d\n *\n *     Pr (D_nm < d) ~= L(sqrt(nm/n+m)*d)\n *\n * (we use ~= for \"approximately equal to\").\n *\n * Summary: once we have the value d of the test statistic D_nm (a value\n * that is really easy to obtain from the values in the samples),\n * we compute the p-value of the observed sample thanks to the formula\n * p = 1 - L(sqrt(nm/n+m)*d) (we stop the Sum in L when we think a\n * sufficient accuracy has been reached).\n *\n * In fact, in this implementation of Kolmogorov-Smirnov test of homogeneity,\n * we don't use the limiting form, but a small C procedure provided by\n * Marsaglia et al., K(int n,double d) (source code below), that computes\n * very quickly K(n,d) = Pr(D_n < d) with 13-15 digit accuracy -- much more\n * that what we need. 
The fact that it computes Pr(D_n < d) rather than\n * Pr(D_nm < d) is not a problem as we learned previously that both had the\n * same limiting forms, that samples sizes are large and that we do not\n * require a really high accuracy...\n *\n * Hence we have at last:\n *\n *     p = 1 - K(n_approx,d)    with  n_approx = nm/n+m\n *\n * Last thing to know: the differences between the one-sided and two-sided\n * versions of this test.\n *\n * \"Reject H0 if sqrt(n*m/(n+m))*D_nm > c\" is clearly one-sided; however,\n * because of the |.| in D_nm, the _meaning_ of the test is in fact two-sided.\n * Indeed, as D_nm = sup |X_emp_cdf(x) - Y_emp_cdf(x)|, rejecting H0 when\n * sqrt(nm/n+m)*D_nm > c means rejecting it when the \"gap\" between\n * X_emp_cdf(x) and Y_emp_cdf(x) is too large, in a direction or in the other,\n * i.e. X_emp_cdf(x) \"above\" or \"under\" Y_emp_cdf(x). Which translates\n * respectively into the X_k being globally smaller or larger than the Y_l.\n *\n * It is quite easy to make a two-sided test (with the meaning of a one-sided\n * test) from this one-sided test (with the meaning of a two-sided). This test\n * would be \"Reject H0 if sqrt(nm/n+m)*D_nm != 0\", or, better, the approximated\n * two-sided test: \"Reject H_0 if |sqrt(nm/n+m)*D_nm| > epsilon\", with epsilon\n * a very small number. However, this would not make much sense if D_nm was\n * still defined as sup |X_emp_cdf(x) - Y_emp_cdf(x)|; we should better\n * redefine it as D_nm = sup (X_emp_cdf(x) - Y_emp_cdf(x)).\n *\n * That would mean that, instead of rejecting H0 when the \"gap\" between\n * the c.d.f. X_emp_cdf(x) and Y_emp_cdf(x) is too large, in a direction or\n * in the other, we would reject it only if the \"gap\" is too large in the\n * sense of X_emp_cdf(x) being \"above\" Y_emp_cdf(x). Translation: when the\n * X_k are globally smaller than the Y_l. 
For instance, the X_k could be\n * the \"old\" traffic samples, and the Y_l the \"new\" ones, and we would\n * consider as anomalies only those that are more likely to be attacks\n * (increases in network traffic).\n *\n * As the p-value depends on the form of the test, we have to find another\n * formula for the p-value of this two-sided test, \"Reject H_0 if\n * |sqrt(nm/n+m)*D_nm| > epsilon\". Let's denote d the value of the\n * statistic D_nm yielded by the values in the sample, i.e. the value\n * of the statistic sqrt(nm/n+m)*D_nm is sqrt(nm/n+m)*d. Note that D_nm\n * is no longer Kolmogorov's statistic, but a modified Kolmogorov's statistic:\n * D_nm = sup (X_emp_cdf(x) - Y_emp_cdf(x)). As previously:\n *\n *     p = Pr (H0 rejected | H0 true)\n *       = Pr (|sqrt(nm/n+m)*D_nm| > sqrt(nm/n+m)*d | H0 true)\n *\n * Here we need a little drawing representing the distribution function of\n * the random variable sqrt(nm/n+m)*D_nm (let's assume it is something like\n * a regular curve from -infty to +infty, somewhat symetrical around 0 --\n * wishful thinking: there are no reasons it should be so):\n *\n *     p is the sum of the areas under the curve that are located further\n *     away from 0 than the value sqrt(nm/n+m)*d, i.e. from -infty to\n *     -sqrt(nm/n+m)*d and from sqrt(nm/n+m)*d to infty\n *\n * We suppose these areas are symetrical, each equals p/2 and we have:\n *\n *     1 - p/2 is the area under the curve from -infty to sqrt(nm/n+m)*d\n *\n * i.e.:\n *\n *     1 - p/2 = Pr (sqrt(nm/n+m)*D_nm < sqrt(nm/n+m)*d)\n *             = Pr (D_nm < d)\n *     p = 2 * (1 - Pr (D_nm < d))\n *\n * The difficulty resides in computing Pr (D_nm < d) now that D_nm is no\n * longer Kolmogorov's statistic... Neither the limiting form nor Marsaglia's\n * procedure are able to compute it, and although there exists a two-sample\n * version for Kolmogorov-Smirnov c.d.f. 
test, we are limited to the\n * one-sample version (unless we find one day a method to compute this\n * probability).\n */\n\n\n/* The following three functions are copied from\n * G. Marsaglia, W. W. Tsang, J. Wang: \"Evaluating  Kolmogorov's Distribution\"\n *\n * The third one compute K(n,d) = Pr(D_n < d), where D_n is Kolmogorov's\n * goodness-of-fit measure for a sample c.d.f., n being the size of the\n * random sample: D_n = sup |X_emp_cdf(x) - X_cdf(x)| with the notations\n * of the previous text.\n *\n * The results correspond more or less to the tables found in\n * Hartung: \"Statistik\", 13rd Edition, pp. 521-523\n */\nvoid mMultiply(double *A,double *B,double *C,int m)\n{\n    int i,j,k; double s;\n    for(i=0;i<m;i++) for(j=0; j<m; j++)\n    {s=0.; for(k=0;k<m;k++) s+=A[i*m+k]*B[k*m+j]; C[i*m+j]=s;}\n}\n\nvoid mPower(double *A,int eA,double *V,int *eV,int m,int n)\n{\n    double *B;int eB,i;\n    if(n==1) {for(i=0;i<m*m;i++) V[i]=A[i];*eV=eA; return;}\n    mPower(A,eA,V,eV,m,n/2);\n    B=(double*)malloc((m*m)*sizeof(double));\n    mMultiply(V,V,B,m); eB=2*(*eV);\n    if(n%2==0){for(i=0;i<m*m;i++) V[i]=B[i]; *eV=eB;}\n    else {mMultiply(A,B,V,m); *eV=eA+eB;}\n    if(V[(m/2)*m+(m/2)]>1e140) {for(i=0;i<m*m;i++) V[i]=V[i]*1e-140;*eV+=140;}\n    free(B);\n}\n\ndouble K(int n,double d)\n{\n   int k,m,i,j,g,eH,eQ;\n   double h,s,*H,*Q;\n    /* OMIT NEXT TWO LINES IF YOU REQUIRE >7 DIGIT ACCURACY IN THE RIGHT TAIL*/\ns=d*d*n;\nif(s>7.24||(s>3.76&&n>99) || n > 15000) return 1-2*exp(-(2.000071+.331/sqrt(n)+1.409/n)*s);\n   k=(int)(n*d)+1;\n   m=2*k-1;\n   h=k-n*d;\n   H=(double*)malloc((m*m)*sizeof(double));\n   Q=(double*)malloc((m*m)*sizeof(double));\n       for(i=0;i<m;i++)\n         for(j=0;j<m;j++)\n           if(i-j+1<0) H[i*m+j]=0;\n          else     H[i*m+j]=1;\n    for(i=0;i<m;i++)\n    {\n    H[i*m]-=pow(h,i+1);\n    H[(m-1)*m+i]-=pow(h,(m-i));\n    }\n    H[(m-1)*m]+=(2*h-1>0?pow(2*h-1,m):0);\n    for(i=0;i<m;i++)\n    for(j=0;j<m;j++)\n    
if(i-j+1>0)\n        for(g=1;g<=i-j+1;g++) H[i*m+j]/=g;\n    eH=0;\n    mPower(H,eH,Q,&eQ,m,n);\n    s=Q[(k-1)*m+k-1];\n    for(i=1;i<=n;i++)\n    {\n    s=s*i/n;\n    if(s<1e-140){s*=1e140; eQ-=140;}\n    }\n    s*=pow(10.,eQ);\n    std::cerr << \"s: \" << s << std::endl;\n    \n    free(H);\n    free(Q);\n    return s;\n}\n\n\n/* This function returns an approximated p-value of the Kolmogorov-Smirnov\n * c.d.f. test of homogeneity.\n *\n * The hypothesis to be tested are:\n *\n *     H0: the samples are drawn from populations having same distribution\n *     H1: they come from populations with different distributions\n *\n * The test statistic is (notations from the long explanation at the\n * beginning of this file):\n *\n *     D_nm = sup | X_emp_cdf(x) - Y_emp_cdf(x) |\n *\n * The p-value when the observed value for D_nm is d is computed through\n * (proof in the text at the beginning of this file):\n *\n *     p = 1 - Pr (D_nm < d)\n *       = 1 - K(n_approx,d)    with  n_approx = nm/n+m\n *\n * We reject H0 to significance level alpha if p-value < alpha,\n * i.e. 
considering the data we have, we reject H0 to any significance\n * level alpha > p-value.\n *\n * As explained at the end of the \"opening text\", Kolmogorov-Smirnov test\n * is a one-sided test (with a meaning of a two-sided), and the corresponding\n * two-sided test (with the meaning of a one-sided), although it exists and\n * makes sense, cannot be computed easily with our current means (Marsaglia's\n * procedure or the limiting form of D_nm).\n */\ndouble ks_test (std::list<int64_t> sample1, std::list<int64_t> sample2,\n        std::ostream& outfile, bool printDebug) {\n\n  unsigned int n1, n2, n_approx;\n    // sample sizes\n  float d;\n    // the value of Kolmogorov's statistic\n    // for the particular values of sample1 and sample2\n  int D, Dmin, Dmax, s;\n    // used in computing this value d\n\n  std::list<int64_t>::iterator it1, it2;\n\n  // Determine sample sizes\n  n1 = sample1.size();\n  n2 = sample2.size();\n\n  // Calculate a conservative n approximation\n  n_approx = (unsigned) ceil(float(n1*n2)/(n1+n2));\n  if (printDebug) outfile << \"n_approx=\" << n_approx << std::endl;\n\n  // Sort samples\n  sample1.sort(); //outfile << \"sorted sample1: \" << sample1 << std::endl;\n  sample2.sort(); //outfile << \"sorted sample2: \" << sample2 << std::endl;\n\n  // We divide the range 0..1 into n1*n2 intervals of equal size 1/(n1*n2).\n  //\n  // Each item in sample1 makes the sample c.d.f of sample1\n  // jump by a step of n2 intervals.\n  // Each item in sample2 makes the sample c.d.f of sample2\n  // jump by a step of n1 intervals.\n  //\n  // For each item we compute D, related to the distance between the two\n  // sample c.d.f., s_cdf_1 - s_cdf_2, by:\n  //\n  //    D/(n1*n2) = s_cdf_1 - s_cdf_2\n  //\n  // We want to determine:\n  //\n  //    Dmin/(n1*n2) = min [s_cdf_1 - s_cdf_2] <= 0\n  //    Dmax/(n1*n2) = max [s_cdf_1 - s_cdf_2] >= 0\n  //\n  // And then the value of Kolmorogov's statistic D_n1n2 is just:\n  //\n  //    D_n1n2 = sup |s_cdf_1 - 
s_cdf_2|\n  //           = max [ |Dmin/(n1*n2)| ; |Dmax/(n1*n2)| ]\n\n  D = 0; Dmin = 0; Dmax = 0;\n  it1 = sample1.begin();\n  it2 = sample2.begin();\n\n  while ( (it1 != sample1.end()) && (it2 != sample2.end()) ) {\n\n    if (*it1 == *it2) {\n\n        if (printDebug) outfile << *it1 << \" tie!\";\n      // steps in both sample c.d.f., we need to perform all steps\n      // in this point before comparing D to Dmin and Dmax\n\n      s = *it1;\n      // perform all steps in s_cdf_1 first\n      do {\n    D += n2;\n    it1++;\n      }\n      while ( (*it1 == s) && (it1 != sample1.end()) );\n      // perform all steps in s_cdf_2 now\n      do {\n    D -= n1;\n    it2++;\n      }\n      while ( (*it2 == s) && (it2 != sample2.end()) );\n\n      // now adapt Dmin, Dmax if necessary\n      if (D > Dmax)\n    Dmax = D;\n      else if (D < Dmin)\n    Dmin = D;\n\n    }\n\n    else if (*it1 < *it2) {\n\n    if (printDebug) outfile << *it1;\n      // step in s_cdf_1, increase D by n2\n      D += n2;\n      it1++;\n\n      if (D > Dmax)\n    Dmax = D;\n\n    }\n\n    else {\n\n      if (printDebug) outfile << *it2;\n      // step in F2, decrease D by n1\n      D -= n1;\n      it2++;\n\n      if (D < Dmin)\n    Dmin = D;\n\n    }\n\n      if (printDebug) outfile << \" D=\" << D << \" Dmin=\" << Dmin << \" Dmax=\" << Dmax << std::endl;\n\n  }\n\n  // For two-sided test, take D = max (|Dmax|, |Dmin|) and compute\n  // the value d of Kolmogorov's statistic (two-sided only)\n\n  if (-Dmin > Dmax)\n    D = -Dmin;\n  else\n    D = Dmax;\n\n  // Hence the observed value of Kolmogorov's statistic:\n  d = float(D)/(n1*n2);\n\n  // Return p-value\n  return 1 - K(n_approx,d);\n\n}\n\n\n/*\n D+ = max{(i/N)-Ri}, 1<=i<=N\n D- = max{(Ri-((i-1)/N)}, 1<=i<=N\n\n Step4: Compute calculated D:\n D= max(D+, D-):\n \n \n \n */\n\ndouble ks_test_of_uniformity(std::vector<double> sampleVect0to1, std::ostream& outfile, bool printDebug) {\n\n    unsigned int N = sampleVect0to1.size();\n    \n    double 
d, Dplusmax, Dminusmax;\n    std::vector<double> DplusVals(N,0.0);\n    std::vector<double> DminusVals(N,0.0);\n    \n    for (int i = 0; i < N; i++) {\n        int j = i+1;\n        double ratio = (double)j/N;\n        double ratiominus = (double)i/N;\n        DplusVals[i] = ratio - sampleVect0to1[i];\n        DminusVals[i] = sampleVect0to1[i] - ratiominus;\n    }\n    \n    Dplusmax = *max_element(DplusVals.begin(), DplusVals.end());\n    Dminusmax = *max_element(DminusVals.begin(), DminusVals.end());\n    \n    \n  //  std::cerr << \"Dplusmax: \" << Dplusmax << std::endl;\n  //  std::cerr << \"Dminusmax: \" << Dminusmax << std::endl;\n    \n  //  print_vector(DplusVals, std::cerr, ',');\n  //  print_vector(DminusVals, std::cerr, ',');\n    \n    \n    if (Dplusmax > Dminusmax) {\n        d = Dplusmax;\n    } else {\n        d = Dminusmax;\n    }\n    \n   // std::cerr << \"d: \" << d << std::endl;\n    \n    // Return p-value\n    return 1 - KScdf(N,d);\n\n}\n"
  },
  {
    "path": "kstest.h",
    "content": "/**************************************************************************/\n/*    Copyright (C) 2006 Romain Michalec                                  */\n/*                                                                        */\n/*    This library is free software; you can redistribute it and/or       */\n/*    modify it under the terms of the GNU Lesser General Public          */\n/*    License as published by the Free Software Foundation; either        */\n/*    version 2.1 of the License, or (at your option) any later version.  */\n/*                                                                        */\n/*    This library is distributed in the hope that it will be useful,     */\n/*    but WITHOUT ANY WARRANTY; without even the implied warranty of      */\n/*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU   */\n/*    Lesser General Public License for more details.                     */\n/*                                                                        */\n/*    You should have received a copy of the GNU Lesser General Public    */\n/*    License along with this library; if not, write to the Free Software   */\n/*    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,          */\n/*    MA  02110-1301, USA                                                 */\n/*                                                                        */\n/**************************************************************************/\n\n#ifndef kstest_h\n#define kstest_h\n#include <list>\n#include <cstdlib>\n#include <stdio.h>\n#include <sstream>\n#include <fstream>\n#include <vector>\n\nvoid mMultiply(double *A,double *B,double *C,int m);\nvoid mPower(double *A,int eA,double *V,int *eV,int m,int n);\ndouble K(int n,double d);\n\ndouble ks_test(std::list<int64_t> s1, std::list<int64_t> s2, std::ostream& output, bool printDebug);\ndouble ks_test_of_uniformity(std::vector<double> sampleVect0to1, std::ostream& outfile, bool 
printDebug);\n\n\n#endif /* kstest_h */\n"
  },
  {
    "path": "utils/DtriosParallel",
    "content": "#!/usr/bin/env python\n\n\"\"\"\nThis script automates parallelisation of Dsuite Dtrios/ Dsuite DtriosCombine.\nThis script was tested with python 2.7 and 3.6. It only uses standard \npython libraries. Hence it should run on most systems with a standard \npython installation.\n\"\"\"\n\nfrom __future__ import (print_function, unicode_literals, division)\n\nimport os, sys\nimport subprocess\nfrom multiprocessing import Pool\nimport argparse\nimport logging\n\n\n\nlogger = logging.getLogger()\nlogging.basicConfig(format='%(levelname)-8s %(asctime)s  %(message)s')\n#logging.basicConfig(format='%(levelname)-8s %(asctime)s %(funcName)20s()  %(message)s')\nlogger.setLevel(logging.DEBUG)\n\n\nout_prefix = 'DTparallel'\n\n\ndef get_n_snps(vcf_fn):\n    \"\"\"\n    Gets the number of variants in the VCF file.\n    \"\"\"\n    vcf_ext = os.path.splitext(vcf_fn)[-1]\n    catfun = 'gzip -dc' if vcf_ext in ['.gz','.bz'] else 'cat'\n    \n\n\n    command = '{catfun} {vcf_fn} | grep -v \"^#\" | wc -l'.format(catfun=catfun, vcf_fn=vcf_fn)\n    p = subprocess.Popen(command,\n                     shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n    n_snps, err = p.communicate(b\"input data that is passed to subprocess' stdin\")\n    if p.returncode:\n        #logging.error('{e}'.format(e=e))\n        raise subprocess.CalledProcessError(p.returncode, command)\n\n    n_snps = int(n_snps)\n\n\n    return n_snps\n        \n\n\ndef run_command(command_str):\n    p = subprocess.Popen(command_str,\n                         shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n    o, e = p.communicate()\n    rc = p.returncode\n    return rc, o, e\n\n\ndef dsuite_dtrios(vcf_fns, sets_fn, run_name, n_cores, tree_fn=None, JKnum=None,\n                  JKwindow=None,\n                  n_snps=None, dsuite_path='', environment_setup='',\n                  use_genotype_probabilities=False,\n                  pool_seq=None\n                  ):\n\n    if 
run_name is None:\n        run_str = ''\n    else:\n        run_str = \"--run-name {}\".format(run_name)\n\n\n    if tree_fn is None:\n        tree_str = ''\n    else:\n        tree_str = \"--tree {}\".format(tree_fn)\n\n    if JKnum is None:\n        jknum_str = \"\"\n    else:\n        jknum_str = \"--JKnum {}\".format(JKnum)\n\n    if JKwindow is None:\n        jkwindow_str = \"\"\n    else:\n        jkwindow_str = \"--JKwindow {}\".format(JKwindow)\n\n    if use_genotype_probabilities:\n        gt_prob_str = \"--use-genotype-probabilities\"\n    else:\n        gt_prob_str = \"\"\n\n    if pool_seq is not None:\n        pool_seq_str = \"--pool-seq {}\".format(pool_seq)\n    else:\n        pool_seq_str = \"\"\n\n    sets_base = os.path.splitext(sets_fn)[0]\n\n    pool = Pool(n_cores)\n\n\n    if n_snps is None:\n        logging.info(\"Checking number of variants in vcf file(s)\")\n        snp_map = pool.map_async(get_n_snps, vcf_fns)\n        n_snps = snp_map.get()\n        logging.info(\"Lines in in vcf file(s): {n_snps}\".format(n_snps=n_snps))\n\n    assert len(n_snps) == len(vcf_fns)\n\n    if dsuite_path and dsuite_path[-1] != '/':\n        dsuite_path += '/'\n\n    #determine number of chunks per vcf file based on cpus per file and line numbers\n    chunks_per_file = [1] * len(vcf_fns)\n    for i in range(len(vcf_fns), n_cores):\n        lines_per_chunk = [1. * a / b for a, b in zip(n_snps, chunks_per_file)]\n        most_lines_per_core = lines_per_chunk.index(max(lines_per_chunk))\n        chunks_per_file[most_lines_per_core] += 1\n\n    lines_per_chunk = [int(1. 
* a / b) + 1 for a, b in zip(n_snps, chunks_per_file)]\n\n    params = []\n    i = 0\n    for fn, n_snp, n_lines, n_chunks in zip(vcf_fns, n_snps, lines_per_chunk, chunks_per_file):\n        if n_chunks == 1:\n            params.append((i, fn, 0, 'all'))\n            i += 1\n        else:\n            for start in range(1, n_snp, n_lines):\n                params.append( (i, fn, start, n_lines))\n                i += 1\n\n    def get_command_str(i, vcf_fn, start, n_lines):\n\n        out_base = sets_base + '_' + str(i)\n\n        if n_lines == 'all':\n            region_str = \"\"\n        else:\n            region_str = \"--region={start},{n_lines}\".format(start=start, n_lines=n_lines)\n\n        command_str = (\n        '{environment_setup} {dsuite_path}Dsuite Dtrios --out-prefix {out_base} {run_str} {tree_str} {jknum_str}'\n        '{jkwindow_str} {gt_prob_str} {pool_seq_str} {region_str} {vcf_fn} {sets_fn}'.format(\n            environment_setup=environment_setup, dsuite_path=dsuite_path,\n            run_str=run_str, tree_str=tree_str,\n            jknum_str=jknum_str, jkwindow_str=jkwindow_str, region_str=region_str,\n            vcf_fn=vcf_fn, sets_fn=sets_fn,\n            gt_prob_str=gt_prob_str, pool_seq_str=pool_seq_str, out_base=out_base))\n        return command_str\n\n\n    logging.info(\"Parallelizing {n_vcfs} vcf files across {n_cores} cores. 
\"\n                 \"Using the following numbers of chunks per file based on line-numbers: {chunks_per_file}.\".format(n_vcfs=len(vcf_fns),\n                                                                                                    n_cores=n_cores,\n                                                                                                    chunks_per_file=chunks_per_file))\n\n    dtrios_commnands = [get_command_str(*p) for p in params]\n\n    for c in dtrios_commnands:\n        logging.info('Starting process: {command_str}'.format(command_str=c))\n\n    \n    map_dsuite = pool.map_async(run_command, dtrios_commnands)\n\n    results = map_dsuite.get()\n\n    for (rc, o, e), c, (run_id, fn, start, n_lines) in zip(results, dtrios_commnands, params):\n        o = o.decode('utf-8')\n        e = e.decode('utf-8')\n        if o:\n            logging.debug(o)\n        if rc:\n            logging.error(e)\n            raise subprocess.CalledProcessError(rc, c)\n        else:\n            if e:\n                logging.debug('{e}'.format(e=e))\n            logging.info('Successfully finished process for parameters: fn={fn},'\n                         ' start={start}, n_lines={n_lines}'.format(fn=fn, start=start, n_lines=n_lines))\n            #print('------------------------------------------------------', file=sys.stderr)\n    \n    return params\n    \ndef dsuite_combine(params, run_name, sets_fn,  tree_fn=None,\n                   remove_intermediate_files=True, dsuite_path='',  environment_setup=''):\n    \"\"\"\n    Combine Dsuite Dtrios runs run with dsuite_dtrios()\n    :param params: This should be a list of tuples of length three,\n                    each containing combinations of (n_run, vcf_fn, start, n_lines)\n    :param run_name:\n    :param sets_fn:\n    :param tree_fn:\n    :param remove_intermediate_files:\n    :param dsuite_path:\n    :param environment_setup:\n    :return: Returncode of the Dsuite DtriosCombine command. 
(0 if no error)\n    \"\"\"\n    \n    extensions = ['_combine.txt', '_combine_stderr.txt', '_BBAA.txt', '_Dmin.txt']\n    \n    sets_dir = os.path.dirname(os.path.abspath(sets_fn))\n    sets_fn1 = os.path.basename(sets_fn)\n    sets_base = os.path.splitext(sets_fn1)[0]\n\n    if dsuite_path and dsuite_path[-1] != '/':\n        dsuite_path += '/'\n\n    if run_name is None:\n        run_str = \"\"\n        run_name = \"\"\n    else:\n        run_str = \"--run-name \" + run_name\n\n    if tree_fn is None:\n        tree_str = ''\n    else:\n        tree_str = \"--tree {}\".format(tree_fn)\n\n    outbases = []\n\n\n    for (n_run, vcf_fn, start, n_lines) in params:\n        if n_lines == 'all':\n            region_str = ''\n        else:\n            region_str = \"_{s}_{e}\".format(s=start,e=start + n_lines)\n\n        outbase =  '{sets_base}_{n_run}_{run_name}{region_str}'.format(sets_base=sets_base,\n                                                                   n_run=n_run,run_name=run_name,\n                                                                       region_str=region_str)\n        outbases.append(outbase)\n\n\n    combine_command = 'cd {sets_dir}; {environment_setup} {dsuite_path}Dsuite DtriosCombine --out-prefix {out_prefix}_{sets_base} {run_str} {tree_str} '.format(\n                        out_prefix = out_prefix,\n                        sets_base=sets_base,\n                        sets_dir=sets_dir,\n                        environment_setup=environment_setup, dsuite_path=dsuite_path, run_str=run_str, tree_str=tree_str) \\\n                        + ' '.join(outbases)\n\n\n    #print('------------------------------------------------------', file=sys.stderr)\n    logging.info('Combining output from {} runs with: '.format(len(params)) + combine_command)\n\n    p = subprocess.Popen(combine_command,\n                         shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n\n    o, e = p.communicate()\n    o = o.decode('utf-8')\n    
e = e.decode('utf-8')\n    if o:\n        print(o, file=sys.stdout)\n    if p.returncode:\n        if e:\n            logging.error(e)\n        raise subprocess.CalledProcessError(p.returncode, combine_command)\n    else:\n        if e:\n            logging.debug(e)\n            logging.info('Successfully combined {} runs into output files'\n              ' with base {}'.format(len(params),\n                                                os.path.join(sets_dir,out_prefix + '_' + sets_base + '_' +\n                                                            run_name+'_combined_*')))\n\n\n\n    #print('------------------------------------------------------', file=sys.stderr)\n\n    if remove_intermediate_files:\n        \n        logging.info('Removing intermediate files.')\n        intermediate_files = []\n        for b in outbases:\n            for ex in extensions:\n                intermediate_files.append(b+ex)\n\n        p = subprocess.Popen('cd {}; rm {}'.format(sets_dir, ' '.join(intermediate_files)),\n                             shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n\n        o, e = p.communicate()\n        o = o.decode('utf-8')\n        e = e.decode('utf-8')\n        if o:\n            print(o, file=sys.stdout)\n        if p.returncode:\n            logging.error(e)\n\n    return p.returncode \n\n\n\n\ndef main():\n\n\n    class SplitArgs(argparse.Action):\n        def __call__(self, parser, namespace, values, option_string=None):\n            setattr(namespace, self.dest, [int(s) for s in values.split(',')])\n\n\n    parser = argparse.ArgumentParser(description=(\"This python script automates parallelisation of Dsuite Dtrios/ Dsuite DtriosCombine. \"\n                                                  \"The usage is analogous to Dsuite Dtrios but computation is performed \"\n                                                  \" on multiple cores (default: number of available CPUs). 
\"\n                                                  \"[ATTENTION: The order of SETS.txt and INPUT_FILE.vcf is swapped compared \"\n                                                  \" to Dsuite Dtrios. This is so that multiple VCF intput files can be provided.]\"\n                                                  \" Output_files are  placed in the same folder as as the the SETS.txt file \"\n                                                  \" and named DTParallel_<SETS_basename>_<run_name>_combined_BBAA.txt etc. .\"\n                                                  \"This script should run on most systems with a standard python installation \"\n                                                  \"(tested with python 2.7 and 3.6).\"))\n    parser.add_argument(\"sets_fn\", metavar=\"SETS.txt\",\n                        help=(\"The SETS.txt should have two columns: SAMPLE_ID    SPECIES_ID\\n\"\n                        \"The outgroup (can be multiple samples) should be specified by using the \"\n                        \"keyword Outgroup in place of the SPECIES_ID\"))\n\n\n\n    parser.add_argument(\"vcf_fns\", metavar=\"INPUT_FILE.vcf\", nargs='+', help=\"One or more whitespace separated SNP vcf files.\")\n\n\n\n    #DSUITE options\n    parser.add_argument(\"-k\", \"--JKnum\", type=int,\n                                help=(\"(default=20) the number of Jackknife blocks to divide the dataset into;\"\n                                      \" should be at least 20 for the whole dataset\"),\n                                                        default=None)\n    parser.add_argument(\"-j\", \"--JKwindow\", type=int,\n                                help=(\"Jackknife block size in number of informative SNPs (as used in v0.2)\"\n                                      \" when specified, this is used in place of the --JKnum option\"),\n                                                        required=False)\n\n    parser.add_argument(\"-t\", \"--tree\", type=str,\n         
               help=(\"a file with a tree in the newick format specifying the relationships between populations/species\"\n                             \" D and f4-ratio values for trios arranged according to the tree will be output in a file with _tree.txt suffix\"),\n                        required=False)\n\n    parser.add_argument(\"-n\", \"--run-name\", type=str,\n                        help=\"run-name will be included in the output file name\",\n                        required=False)\n\n    parser.add_argument(\"-l\",metavar='NUMLINES',\n                                help=(\"(optional) the number of lines (SNPs) in the VCF input(s) - speeds up operation if known. \"\n                                      \" If N INPUT_FILE.vcf files are provided, there must be N comma-separated integers provided \"\n                                      \"without whitespace between them.\"\n                                      ),\n                                                        default=None,action=SplitArgs)\n\n    parser.add_argument(\"-g\", \"--use-genotype-probabilities\", action='store_true',\n                        help=(\"(optional) use probabilities (GP tag) or calculate them from \"\n                              \"likelihoods (GL or PL tags) using a Hardy-Weinberg prior\"))\n\n    parser.add_argument(\"-p\", \"--pool-seq\", metavar=\"--pool-seq=MIN_DEPTH\", type=int,\n                                help=(\"(default=20) the number of Jackknife blocks to divide the dataset into;\"\n                                      \" should be at least 20 for the whole dataset\"),\n                                                        default=None)\n    parser.add_argument(\"-c\", \"--no-combine\", action='store_true',\n                        help=(\"(optional) do not run DtriosCombine to obtain a single combined results file\"))\n\n\n    # DtriosParallel specific options\n    parser.add_argument(\"--cores\", type=int,\n                                
help=(\"(default=CPU count) Number of Dsuite Dtrios processes run in parallel.\"),\n                                                        default=None)\n    parser.add_argument( \"--keep-intermediate\", action='store_true',\n                        help=\"Keep region-wise Dsuite Dtrios results.\")\n    parser.add_argument('--logging_level','-v',\n                                    choices=['DEBUG','INFO','WARNING','ERROR','CRITICAL'],\n                                                            default='INFO',\n                                                            help='Minimun level of logging.')\n    parser.add_argument('--dsuite-path',type=str, required=False,\n                        help=\"Explicitly set the path to the directory in which Dsuite is located. By default the script will first check\"\n                            \" whether Dsuite is accessible from $PATH. \"\n                             \" If not it will try to locate Dsuite at ../Build/Dsuite.\")\n    parser.add_argument('--environment-setup',type=str, required=False,\n                        help=\"Command that should be run to setup the environment for Dsuite. E.g., 'module load GCC' or 'conda activate'\")\n    \n\n\n    args, unknown = parser.parse_known_args()\n\n\n\n    logger.setLevel(getattr(logging, args.logging_level))\n\n\n    if unknown:\n        logger.warning(\"The following unrecognized arguments are not used: {}\".format(unknown))\n\n    if args.sets_fn.endswith('vcf.gz') or args.sets_fn.endswith('vcf'):\n        parser.print_usage()\n        print('\\n')\n        raise Exception(\"SETS.txt file seems to have a vcf extension. \"\n                        \"Note that the SETS.txt file should be in the command line before the INPUT_FILE.vcf file(s).\"\n                        \"The order is swapped compared to Dsuite Dtrios to allow for multiple vcf files. 
\"\n                        \"\\nThis is the sets filename: {}\".format(args.sets_fn))\n\n\n    def which(program):\n        def is_exe(fpath):\n            return os.path.isfile(fpath) and os.access(fpath, os.X_OK)\n\n        fpath, fname = os.path.split(program)\n        if fpath:\n            if is_exe(program):\n                return program\n        else:\n            for path in os.environ[\"PATH\"].split(os.pathsep):\n                exe_file = os.path.join(path, program)\n                if is_exe(exe_file):\n                    return exe_file\n\n        return None\n\n\n    #check whether Dsuite is accessible\n    if args.dsuite_path is None:\n        dsuite_path = which(\"Dsuite\")\n        if dsuite_path is not None:\n            args.dsuite_path = ''\n        else: \n            args.dsuite_path = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'Build'))\n\n    if args.environment_setup is None:\n        args.environment_setup = \"\"\n    else:\n        args.environment_setup += ';'\n    \n    if args.cores is None:\n\n        if sys.version_info[0] < 3:\n            import multiprocessing\n            args.cores = multiprocessing.cpu_count()\n        else:\n            args.cores = os.cpu_count()\n    if args.JKwindow is not None:\n        args.JKnum = None\n\n    vcf_fns = [os.path.abspath(vcf_fn) for vcf_fn in args.vcf_fns]\n    sets_fn = os.path.abspath(args.sets_fn)\n    if args.tree is not None:\n        args.tree = os.path.abspath(args.tree)\n\n\n    if args.l is not None:\n        assert len(args.l) == len(vcf_fns), (\"Comma separated line numbers provided do \"\n                                            \"not match number of INPUT_FILE.vcf files provided. 
Line numbers: {}; Vcf files: {}\".format(args.l, args.vcf_fns))\n\n\n\n\n    params  = dsuite_dtrios(vcf_fns, sets_fn, args.run_name, args.cores, tree_fn=args.tree,\n                                            n_snps=args.l, JKnum=args.JKnum, JKwindow=args.JKwindow,dsuite_path=args.dsuite_path,\n                                            environment_setup=args.environment_setup,\n                                           use_genotype_probabilities=args.use_genotype_probabilities,\n                                            pool_seq=args.pool_seq)\n\n    #lines_per_core = int(n_snps * 1. / args.cores) + 1\n    #starts = range(1, n_snps, lines_per_core)\n\n    if not args.no_combine:\n        rc = dsuite_combine(params , args.run_name, sets_fn, tree_fn=args.tree,\n                            remove_intermediate_files=not args.keep_intermediate, dsuite_path=args.dsuite_path,\n                                                                        environment_setup=args.environment_setup)\n\n    else:\n        rc = 0\n\n    return rc\n\n\nif __name__ == \"__main__\":\n        sys.exit(main())\n\n"
  },
  {
    "path": "utils/dtools.py",
    "content": "#!/usr/bin/env python3\n\n# #START_LICENSE###########################################################\n#\n# Parts of the code used here is modified from the Environment for Tree\n# Exploration program (ETE).  http://etetoolkit.org\n#\n# Both this code and ETE is free software:\n# you can redistribute it and/or modify it\n# under the terms of the GNU General Public License as published by\n# the Free Software Foundation, either version 3 of the License, or\n# (at your option) any later version.\n#\n# ETE is distributed in the hope that it will be useful, but WITHOUT\n# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY\n# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public\n# License for more details.\n#\n# <http://www.gnu.org/licenses/>.\n#\n#\n# #END_LICENSE#############################################################\n\nimport copy, os, sys\nimport argparse\n\nimport numpy as np\nimport pandas as pd\nimport matplotlib as mpl\nfrom matplotlib import pyplot as plt\n\nimport random\nimport copy\nimport itertools\nfrom collections import deque\n#from hashlib import md5\nfrom functools import cmp_to_key\n\n#import six\n#from six.moves import (cPickle, map, range, zip)\n\n\nimport re\n\nITERABLE_TYPES = set([list, set, tuple, frozenset])\n\n# Regular expressions used for reading newick format\n_ILEGAL_NEWICK_CHARS = \":;(),\\[\\]\\t\\n\\r=\"\n_NON_PRINTABLE_CHARS_RE = \"[\\x00-\\x1f]+\"\n\n_NHX_RE = \"\\[&&NHX:[^\\]]*\\]\"\n_FLOAT_RE = \"\\s*[+-]?\\d+\\.?\\d*(?:[eE][-+]?\\d+)?\\s*\"\n# _FLOAT_RE = \"[+-]?\\d+\\.?\\d*\"\n# _NAME_RE = \"[^():,;\\[\\]]+\"\n_NAME_RE = \"[^():,;]+?\"\n\n# thanks to: http://stackoverflow.com/a/29452781/1006828\n_QUOTED_TEXT_RE = r\"\"\"((?=[\"'])(?:\"[^\"\\\\]*(?:\\\\[\\s\\S][^\"\\\\]*)*\"|'[^'\\\\]*(?:\\\\[\\s\\S][^'\\\\]*)*'))\"\"\"\n# _QUOTED_TEXT_RE = r\"\"\"[\"'](?:(?<=\")[^\"\\\\]*(?s:\\\\.[^\"\\\\]*)*\"|(?<=')[^'\\\\]*(?s:\\\\.[^'\\\\]*)*')\"\"]\"]\"\"\"\n# _QUOTED_TEXT_RE = 
r\"\"\"(?=[\"'])(?:\"[^\"\\\\]*(?:\\\\[\\s\\S][^\"\\\\]*)*\"|'[^'\\\\]*(?:\\\\[\\s\\S][^'\\\\]*)*')]\"]\")\"]\"\"\"\n\n_QUOTED_TEXT_PREFIX = 'ete3_quotref_'\n\nDEFAULT_DIST = 1.0\nDEFAULT_NAME = ''\nDEFAULT_SUPPORT = 1.0\nFLOAT_FORMATTER = \"%0.6g\"\n# DIST_FORMATTER = \":\"+FLOAT_FORMATTER\nNAME_FORMATTER = \"%s\"\n\n\ndef set_float_format(formatter):\n    ''' Set the conversion format used to represent float distances and support\n    values in the newick representation of trees.\n\n    For example, use set_float_format('%0.32f') to specify 32 decimal numbers\n    when exporting node distances and bootstrap values.\n\n    Scientific notation (%e) or any other custom format is allowed. The\n    formatter string should not contain any character that may break newick\n    structure (i.e.: \":;,()\")\n\n    '''\n    global FLOAT_FORMATTER\n    FLOAT_FORMATTER = formatter\n    # DIST_FORMATTER = \":\"+FLOAT_FORMATTER\n\n\n# Allowed formats. This table is used to read and write newick using\n# different convenctions. You can also add your own formats in an easy way.\n#\n#\n# FORMAT: [[LeafAttr1, LeafAttr1Type, Strict?], [LeafAttr2, LeafAttr2Type, Strict?],\\\n#    [InternalAttr1, InternalAttr1Type, Strict?], [InternalAttr2, InternalAttr2Type, Strict?]]\n#\n# Attributes are placed in the newick as follows:\n#\n# .... 
,LeafAttr1:LeafAttr2)InternalAttr1:InternalAttr2 ...\n#\n#\n#           /-A\n# -NoName--|\n#          |          /-B\n#           \\C-------|\n#                    |          /-D\n#                     \\E-------|\n#                               \\-G\n#\n# Format 0 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737);\n# Format 1 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);\n# Format 2 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737);\n# Format 3 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);\n# Format 4 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)));\n# Format 5 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729):0.642905):0.567737);\n# Format 6 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E)C);\n# Format 7 = (A,(B,(D,G)E)C);\n# Format 8 = (A,(B,(D,G)));\n# Format 9 = (,(,(,)));\n\nNW_FORMAT = {\n    0: [['name', str, True], [\"dist\", float, True], ['support', float, True], [\"dist\", float, True]],\n    # Flexible with support\n    1: [['name', str, True], [\"dist\", float, True], ['name', str, True], [\"dist\", float, True]],\n    # Flexible with internal node names\n    2: [['name', str, False], [\"dist\", float, False], ['support', float, False], [\"dist\", float, False]],\n    # Strict with support values\n    3: [['name', str, False], [\"dist\", float, False], ['name', str, False], [\"dist\", float, False]],\n    # Strict with internal node names\n    4: [['name', str, False], [\"dist\", float, False], [None, None, False], [None, None, False]],\n    5: [['name', str, False], [\"dist\", float, False], [None, None, False], [\"dist\", float, False]],\n    6: [['name', str, False], [None, None, False], [None, None, False], [\"dist\", float, False]],\n    7: [['name', str, False], [\"dist\", float, False], [\"name\", str, False], [None, None, False]],\n    8: [['name', str, False], [None, None, False], [\"name\", str, False], [None, 
None, False]],\n    9: [['name', str, False], [None, None, False], [None, None, False], [None, None, False]],\n    # Only topology with node names\n    100: [[None, None, False], [None, None, False], [None, None, False], [None, None, False]]  # Only Topology\n}\n\n\ndef format_node(node, node_type, format, dist_formatter=None,\n                support_formatter=None, name_formatter=None,\n                quoted_names=False):\n    if dist_formatter is None: dist_formatter = FLOAT_FORMATTER\n    if support_formatter is None: support_formatter = FLOAT_FORMATTER\n    if name_formatter is None: name_formatter = NAME_FORMATTER\n\n    if node_type == \"leaf\":\n        container1 = NW_FORMAT[format][0][0]  # name\n        container2 = NW_FORMAT[format][1][0]  # dists\n        converterFn1 = NW_FORMAT[format][0][1]\n        converterFn2 = NW_FORMAT[format][1][1]\n        flexible1 = NW_FORMAT[format][0][2]\n    else:\n        container1 = NW_FORMAT[format][2][0]  # support/name\n        container2 = NW_FORMAT[format][3][0]  # dist\n        converterFn1 = NW_FORMAT[format][2][1]\n        converterFn2 = NW_FORMAT[format][3][1]\n        flexible1 = NW_FORMAT[format][2][2]\n\n    if converterFn1 == str:\n        try:\n            if not quoted_names:\n                FIRST_PART = re.sub(\"[\" + _ILEGAL_NEWICK_CHARS + \"]\", \"_\", \\\n                                    str(getattr(node, container1)))\n            else:\n                FIRST_PART = str(getattr(node, container1))\n            if not FIRST_PART and container1 == 'name' and not flexible1:\n                FIRST_PART = \"NoName\"\n\n        except (AttributeError, TypeError):\n            FIRST_PART = \"?\"\n\n        FIRST_PART = name_formatter % FIRST_PART\n        if quoted_names:\n            # FIRST_PART = '\"%s\"' %FIRST_PART.decode('string_escape').replace('\"', '\\\\\"')\n            FIRST_PART = '\"%s\"' % FIRST_PART\n\n    elif converterFn1 is None:\n        FIRST_PART = \"\"\n    else:\n        try:\n  
          FIRST_PART = support_formatter % (converterFn2(getattr(node, container1)))\n        except (ValueError, TypeError):\n            FIRST_PART = \"?\"\n\n    if converterFn2 == str:\n        try:\n            SECOND_PART = \":\" + re.sub(\"[\" + _ILEGAL_NEWICK_CHARS + \"]\", \"_\", \\\n                                       str(getattr(node, container2)))\n        except (ValueError, TypeError):\n            SECOND_PART = \":?\"\n    elif converterFn2 is None:\n        SECOND_PART = \"\"\n    else:\n        try:\n            # SECOND_PART = \":%0.6f\" %(converterFn2(getattr(node, container2)))\n            SECOND_PART = \":%s\" % (dist_formatter % (converterFn2(getattr(node, container2))))\n        except (ValueError, TypeError):\n            SECOND_PART = \":?\"\n\n    return \"%s%s\" % (FIRST_PART, SECOND_PART)\n\n\ndef print_supported_formats():\n    from ..coretype.tree import TreeNode\n    t = TreeNode()\n    t.populate(4, \"ABCDEFGHI\")\n    print(t)\n    for f in NW_FORMAT:\n        print(\"Format\", f, \"=\", write_newick(t, features=None, format=f))\n\n\nclass NewickError(Exception):\n    \"\"\"Exception class designed for NewickIO errors.\"\"\"\n\n    def __init__(self, value):\n        if value is None:\n            value = ''\n        value += \"\\nYou may want to check other newick loading flags like 'format' or 'quoted_node_names'.\"\n        Exception.__init__(self, value)\n\n\ndef read_newick(newick, root_node=None, format=0, quoted_names=False):\n    \"\"\" Reads a newick tree from either a string or a file, and returns\n    an ETE tree structure.\n\n    A previously existent node object can be passed as the root of the\n    tree, which means that all its new children will belong to the same\n    class as the root(This allows to work with custom TreeNode\n    objects).\n\n    You can also take advantage from this behaviour to concatenate\n    several tree structures.\n    \"\"\"\n\n    if root_node is None:\n        from ..coretype.tree 
import TreeNode\n        root_node = TreeNode()\n\n    if isinstance(newick, str):\n        if os.path.exists(newick):\n            if newick.endswith('.gz'):\n                import gzip\n                nw = gzip.open(newick).read()\n            else:\n                nw = open(newick, 'rU').read()\n        else:\n            nw = newick\n\n        matcher = compile_matchers(formatcode=format)\n        nw = nw.strip()\n        if not nw.startswith('(') and nw.endswith(';'):\n            # return _read_node_data(nw[:-1], root_node, \"single\", matcher, format)\n            return _read_newick_from_string(nw, root_node, matcher, format, quoted_names)\n        elif not nw.startswith('(') or not nw.endswith(';'):\n            raise NewickError('Unexisting tree file or Malformed newick tree structure.')\n        else:\n            return _read_newick_from_string(nw, root_node, matcher, format, quoted_names)\n\n    else:\n        raise NewickError(\"'newick' argument must be either a filename or a newick string.\")\n\n\ndef _read_newick_from_string(nw, root_node, matcher, formatcode, quoted_names):\n    \"\"\" Reads a newick string in the New Hampshire format. 
\"\"\"\n\n    if quoted_names:\n        # Quoted text is mapped to references\n        quoted_map = {}\n        unquoted_nw = ''\n        counter = 0\n        for token in re.split(_QUOTED_TEXT_RE, nw):\n            counter += 1\n            if counter % 2 == 1:  # normal newick tree structure data\n                unquoted_nw += token\n            else:  # quoted text, add to dictionary and replace with reference\n                quoted_ref_id = _QUOTED_TEXT_PREFIX + str(int(counter / 2))\n                unquoted_nw += quoted_ref_id\n                quoted_map[quoted_ref_id] = token[1:-1]  # without the quotes\n        nw = unquoted_nw\n\n    if not nw.startswith('(') and nw.endswith(';'):\n        _read_node_data(nw[:-1], root_node, \"single\", matcher, format)\n        if quoted_names:\n            if root_node.name.startswith(_QUOTED_TEXT_PREFIX):\n                root_node.name = quoted_map[root_node.name]\n        return root_node\n\n    if nw.count('(') != nw.count(')'):\n        raise NewickError('Parentheses do not match. 
Broken tree structure?')\n\n    # white spaces and separators are removed\n    nw = re.sub(\"[\\n\\r\\t]+\", \"\", nw)\n\n    current_parent = None\n    # Each chunk represents the content of a parent node, and it could contain\n    # leaves and closing parentheses.\n    # We may find:\n    # leaf, ..., leaf,\n    # leaf, ..., leaf))),\n    # leaf)), leaf, leaf))\n    # leaf))\n    # ) only if formatcode == 100\n\n    for chunk in nw.split(\"(\")[1:]:\n        # If no node has been created so far, this is the root, so use the node.\n        current_parent = root_node if current_parent is None else current_parent.add_child()\n\n        subchunks = [ch.strip() for ch in chunk.split(\",\")]\n        # We should expect that the chunk finished with a comma (if next chunk\n        # is an internal sister node) or a subchunk containing closing parenthesis until the end of the tree.\n        # [leaf, leaf, '']\n        # [leaf, leaf, ')))', leaf, leaf, '']\n        # [leaf, leaf, ')))', leaf, leaf, '']\n        # [leaf, leaf, ')))', leaf), leaf, 'leaf);']\n        if subchunks[-1] != '' and not subchunks[-1].endswith(';'):\n            raise NewickError('Broken newick structure at: %s' % chunk)\n\n        # lets process the subchunks. 
Every closing parenthesis will close a\n        # node and go up one level.\n        for i, leaf in enumerate(subchunks):\n            if leaf.strip() == '' and i == len(subchunks) - 1:\n                continue  # \"blah blah ,( blah blah\"\n            closing_nodes = leaf.split(\")\")\n\n            # first part after splitting by ) always contain leaf info\n            _read_node_data(closing_nodes[0], current_parent, \"leaf\", matcher, formatcode)\n\n            # next contain closing nodes and data about the internal nodes.\n            if len(closing_nodes) > 1:\n                for closing_internal in closing_nodes[1:]:\n                    closing_internal = closing_internal.rstrip(\";\")\n                    # read internal node data and go up one level\n                    _read_node_data(closing_internal, current_parent, \"internal\", matcher, formatcode)\n                    current_parent = current_parent.up\n\n    # references in node names are replaced with quoted text before returning\n    if quoted_names:\n        for node in root_node.traverse():\n            if node.name.startswith(_QUOTED_TEXT_PREFIX):\n                node.name = quoted_map[node.name]\n\n    return root_node\n\n\ndef _parse_extra_features(node, NHX_string):\n    \"\"\" Reads node's extra data form its NHX string. 
NHX uses this\n    format:  [&&NHX:prop1=value1:prop2=value2] \"\"\"\n    NHX_string = NHX_string.replace(\"[&&NHX:\", \"\")\n    NHX_string = NHX_string.replace(\"]\", \"\")\n    for field in NHX_string.split(\":\"):\n        try:\n            pname, pvalue = field.split(\"=\")\n        except ValueError as e:\n            raise NewickError('Invalid NHX format %s' % field)\n        node.add_feature(pname, pvalue)\n\n\ndef compile_matchers(formatcode):\n    matchers = {}\n    for node_type in [\"leaf\", \"single\", \"internal\"]:\n        if node_type == \"leaf\" or node_type == \"single\":\n            container1 = NW_FORMAT[formatcode][0][0]\n            container2 = NW_FORMAT[formatcode][1][0]\n            converterFn1 = NW_FORMAT[formatcode][0][1]\n            converterFn2 = NW_FORMAT[formatcode][1][1]\n            flexible1 = NW_FORMAT[formatcode][0][2]\n            flexible2 = NW_FORMAT[formatcode][1][2]\n        else:\n            container1 = NW_FORMAT[formatcode][2][0]\n            container2 = NW_FORMAT[formatcode][3][0]\n            converterFn1 = NW_FORMAT[formatcode][2][1]\n            converterFn2 = NW_FORMAT[formatcode][3][1]\n            flexible1 = NW_FORMAT[formatcode][2][2]\n            flexible2 = NW_FORMAT[formatcode][3][2]\n\n        if converterFn1 == str:\n            FIRST_MATCH = \"(\" + _NAME_RE + \")\"\n        elif converterFn1 == float:\n            FIRST_MATCH = \"(\" + _FLOAT_RE + \")\"\n        elif converterFn1 is None:\n            FIRST_MATCH = '()'\n\n        if converterFn2 == str:\n            SECOND_MATCH = \"(:\" + _NAME_RE + \")\"\n        elif converterFn2 == float:\n            SECOND_MATCH = \"(:\" + _FLOAT_RE + \")\"\n        elif converterFn2 is None:\n            SECOND_MATCH = '()'\n\n        if flexible1 and node_type != 'leaf':\n            FIRST_MATCH += \"?\"\n        if flexible2:\n            SECOND_MATCH += \"?\"\n\n        matcher_str = '^\\s*%s\\s*%s\\s*(%s)?\\s*$' % (FIRST_MATCH, SECOND_MATCH, _NHX_RE)\n    
    compiled_matcher = re.compile(matcher_str)\n        matchers[node_type] = [container1, container2, converterFn1, converterFn2, compiled_matcher]\n\n    return matchers\n\n\ndef _read_node_data(subnw, current_node, node_type, matcher, formatcode):\n    \"\"\" Reads a leaf node from a subpart of the original newick\n    tree \"\"\"\n\n    if node_type == \"leaf\" or node_type == \"single\":\n        if node_type == \"leaf\":\n            node = current_node.add_child()\n        else:\n            node = current_node\n    else:\n        node = current_node\n\n    subnw = subnw.strip()\n\n    if not subnw and node_type == 'leaf' and formatcode != 100:\n        raise NewickError('Empty leaf node found')\n    elif not subnw:\n        return\n\n    container1, container2, converterFn1, converterFn2, compiled_matcher = matcher[node_type]\n    data = re.match(compiled_matcher, subnw)\n    if data:\n        data = data.groups()\n        # This prevents ignoring errors even in flexible nodes:\n        if subnw and data[0] is None and data[1] is None and data[2] is None:\n            raise NewickError(\"Unexpected newick format '%s'\" % subnw)\n\n        if data[0] is not None and data[0] != '':\n            node.add_feature(container1, converterFn1(data[0].strip()))\n\n        if data[1] is not None and data[1] != '':\n            node.add_feature(container2, converterFn2(data[1][1:].strip()))\n\n        if data[2] is not None \\\n                and data[2].startswith(\"[&&NHX\"):\n            _parse_extra_features(node, data[2])\n    else:\n        raise NewickError(\"Unexpected newick format '%s' \" % subnw[0:50])\n    return\n\n\ndef write_newick(rootnode, features=None, format=1, format_root_node=True,\n                 is_leaf_fn=None, dist_formatter=None, support_formatter=None,\n                 name_formatter=None, quoted_names=False):\n    \"\"\" Iteratively export a tree structure and returns its NHX\n    representation. 
\"\"\"\n    newick = []\n    leaf = is_leaf_fn if is_leaf_fn else lambda n: not bool(n.children)\n    for postorder, node in rootnode.iter_prepostorder(is_leaf_fn=is_leaf_fn):\n        if postorder:\n            newick.append(\")\")\n            if node.up is not None or format_root_node:\n                newick.append(format_node(node, \"internal\", format,\n                                          dist_formatter=dist_formatter,\n                                          support_formatter=support_formatter,\n                                          name_formatter=name_formatter,\n                                          quoted_names=quoted_names))\n                newick.append(_get_features_string(node, features))\n        else:\n            if node is not rootnode and node != node.up.children[0]:\n                newick.append(\",\")\n\n            if leaf(node):\n                newick.append(format_node(node, \"leaf\", format,\n                                          dist_formatter=dist_formatter,\n                                          support_formatter=support_formatter,\n                                          name_formatter=name_formatter,\n                                          quoted_names=quoted_names))\n                newick.append(_get_features_string(node, features))\n            else:\n                newick.append(\"(\")\n\n    newick.append(\";\")\n    return ''.join(newick)\n\n\ndef _get_features_string(self, features=None):\n    \"\"\" Generates the extended newick string NHX with extra data about\n    a node. 
\"\"\"\n    string = \"\"\n    if features is None:\n        features = []\n    elif features == []:\n        features = sorted(self.features)\n\n    for pr in features:\n        if hasattr(self, pr):\n            raw = getattr(self, pr)\n            if type(raw) in ITERABLE_TYPES:\n                raw = '|'.join(map(str, raw))\n            elif type(raw) == dict:\n                raw = '|'.join(map(lambda x, y: \"%s-%s\" % (x, y), raw.items()))\n            elif type(raw) == str:\n                pass\n            else:\n                raw = str(raw)\n\n            value = re.sub(\"[\" + _ILEGAL_NEWICK_CHARS + \"]\", \"_\", \\\n                           raw)\n            if string != \"\":\n                string += \":\"\n            string += \"%s=%s\" % (pr, str(value))\n    if string != \"\":\n        string = \"[&&NHX:\" + string + \"]\"\n\n    return string\n\n####------------------------------------------------------######\n#### This is from ete3 coretype/tree.py\n\n\nDEFAULT_COMPACT = False\nDEFAULT_SHOWINTERNAL = False\nDEFAULT_DIST = 1.0\nDEFAULT_SUPPORT = 1.0\nDEFAULT_NAME = \"\"\n\n\n\n\n\nclass TreeError(Exception):\n    \"\"\"\n    A problem occurred during a TreeNode operation\n    \"\"\"\n\n    def __init__(self, value=''):\n        self.value = value\n\n    def __str__(self):\n        return repr(self.value)\n\n\nclass TreeNode(object):\n    \"\"\"\n    TreeNode (Tree) class is used to store a tree structure. A tree\n    consists of a collection of TreeNode instances connected in a\n    hierarchical way. Trees can be loaded from the New Hampshire Newick\n    format (newick).\n\n    :argument newick: Path to the file containing the tree or, alternatively,\n       the text string containing the same information.\n\n    :argument 0 format: subnewick format\n\n      .. 
table::\n\n          ======  ==============================================\n          FORMAT  DESCRIPTION\n          ======  ==============================================\n          0        flexible with support values\n          1        flexible with internal node names\n          2        all branches + leaf names + internal supports\n          3        all branches + all names\n          4        leaf branches + leaf names\n          5        internal and leaf branches + leaf names\n          6        internal branches + leaf names\n          7        leaf branches + all names\n          8        all names\n          9        leaf names\n          100      topology only\n          ======  ==============================================\n\n    :returns: a tree node object which represents the base of the tree.\n\n    **Examples:**\n\n    ::\n\n        t1 = Tree() # creates an empty tree\n        t2 = Tree('(A:1,(B:1,(C:1,D:1):0.5):0.5);')\n        t3 = Tree('/home/user/myNewickFile.txt')\n    \"\"\"\n\n    def _get_dist(self):\n        return self._dist\n\n    def _set_dist(self, value):\n        try:\n            self._dist = float(value)\n        except ValueError:\n            raise TreeError('node dist must be a float number')\n\n    def _get_support(self):\n        return self._support\n\n    def _set_support(self, value):\n        try:\n            self._support = float(value)\n        except ValueError:\n            raise TreeError('node support must be a float number')\n\n    def _get_up(self):\n        return self._up\n\n    def _set_up(self, value):\n        if type(value) == type(self) or value is None:\n            self._up = value\n        else:\n            raise TreeError(\"bad node_up type\")\n\n    def _get_children(self):\n        return self._children\n\n    def _set_children(self, value):\n        if type(value) == list and \\\n                len(set([type(n) == type(self) for n in value])) < 2:\n            self._children = value\n        
else:\n            raise TreeError(\"Incorrect children type\")\n\n    def _get_style(self):\n        if self._img_style is None:\n            self._set_style(None)\n\n        return self._img_style\n\n    def _set_style(self, value):\n        self.set_style(value)\n\n    #: Branch length distance to parent node. Default = 0.0\n    img_style = property(fget=_get_style, fset=_set_style)\n\n    #: Branch length distance to parent node. Default = 0.0\n    dist = property(fget=_get_dist, fset=_set_dist)\n    #: Branch support for current node\n    support = property(fget=_get_support, fset=_set_support)\n    #: Pointer to parent node\n    up = property(fget=_get_up, fset=_set_up)\n    #: A list of children nodes\n    children = property(fget=_get_children, fset=_set_children)\n\n    def _set_face_areas(self, value):\n        if isinstance(value, _FaceAreas):\n            self._faces = value\n        else:\n            raise ValueError(\"[%s] is not a valid FaceAreas instance\" % type(value))\n\n    def _get_face_areas(self):\n        if not hasattr(self, \"_faces\"):\n            self._faces = _FaceAreas()\n        return self._faces\n\n    faces = property(fget=_get_face_areas, \\\n                     fset=_set_face_areas)\n\n    def __init__(self, newick=None, format=0, dist=None, support=None,\n                 name=None, quoted_node_names=False):\n        self._children = []\n        self._up = None\n        self._dist = DEFAULT_DIST\n        self._support = DEFAULT_SUPPORT\n        self._img_style = None\n        self.features = set([])\n        # Add basic features\n        self.features.update([\"dist\", \"support\", \"name\"])\n        if dist is not None:\n            self.dist = dist\n        if support is not None:\n            self.support = support\n\n        self.name = name if name is not None else DEFAULT_NAME\n\n        # Initialize tree\n        if newick is not None:\n            self._dist = 0.0\n            read_newick(newick, root_node=self, 
format=format,\n                        quoted_names=quoted_node_names)\n\n    def __nonzero__(self):\n        return True\n\n    def __bool__(self):\n        \"\"\"\n        Python3's equivalent of __nonzero__\n        If this is not defined bool(class_instance) will call\n        __len__ in python3\n        \"\"\"\n        return True\n\n    def __repr__(self):\n        return \"Tree node '%s' (%s)\" % (self.name, hex(self.__hash__()))\n\n    def __and__(self, value):\n        \"\"\" This allows to execute tree&'A' to obtain the descendant node\n        whose name is A\"\"\"\n        value = str(value)\n        try:\n            first_match = next(self.iter_search_nodes(name=value))\n            return first_match\n        except StopIteration:\n            raise TreeError(\"Node not found\")\n\n    def __add__(self, value):\n        \"\"\" This allows to sum two trees.\"\"\"\n        # Should a make the sum with two copies of the original trees?\n        if type(value) == self.__class__:\n            new_root = self.__class__()\n            new_root.add_child(self)\n            new_root.add_child(value)\n            return new_root\n        else:\n            raise TreeError(\"Invalid node type\")\n\n    def __str__(self):\n        \"\"\" Print tree in newick format. \"\"\"\n        return self.get_ascii(compact=DEFAULT_COMPACT, \\\n                              show_internal=DEFAULT_SHOWINTERNAL)\n\n    def __contains__(self, item):\n        \"\"\" Check if item belongs to this node. 
The 'item' argument must\n        be a node instance or its associated name.\"\"\"\n        if isinstance(item, self.__class__):\n            return item in set(self.get_descendants())\n        elif type(item) == str:\n            return item in set([n.name for n in self.traverse()])\n\n    def __len__(self):\n        \"\"\"Node len returns number of children.\"\"\"\n        return len(self.get_leaves())\n\n    def __iter__(self):\n        \"\"\" Iterator over leaf nodes\"\"\"\n        return self.iter_leaves()\n\n    def add_feature(self, pr_name, pr_value):\n        \"\"\"\n        Add or update a node's feature.\n        \"\"\"\n        setattr(self, pr_name, pr_value)\n        self.features.add(pr_name)\n\n    def add_features(self, **features):\n        \"\"\"\n        Add or update several features. \"\"\"\n        for fname, fvalue in features.items():\n            setattr(self, fname, fvalue)\n            self.features.add(fname)\n\n    def del_feature(self, pr_name):\n        \"\"\"\n        Permanently deletes a node's feature.\n        \"\"\"\n        if hasattr(self, pr_name):\n            delattr(self, pr_name)\n            self.features.remove(pr_name)\n\n    # Topology management\n    def add_child(self, child=None, name=None, dist=None, support=None):\n        \"\"\"\n        Adds a new child to this node. 
If child node is not suplied\n        as an argument, a new node instance will be created.\n\n        :argument None child: the node instance to be added as a child.\n        :argument None name: the name that will be given to the child.\n        :argument None dist: the distance from the node to the child.\n        :argument None support: the support value of child partition.\n\n        :returns: The child node instance\n\n        \"\"\"\n        if child is None:\n            child = self.__class__()\n\n        if name is not None:\n            child.name = name\n        if dist is not None:\n            child.dist = dist\n        if support is not None:\n            child.support = support\n\n        self.children.append(child)\n        child.up = self\n        return child\n\n    def remove_child(self, child):\n        \"\"\"\n        Removes a child from this node (parent and child\n        nodes still exit but are no longer connected).\n        \"\"\"\n        try:\n            self.children.remove(child)\n        except ValueError as e:\n            raise TreeError(\"child not found\")\n        else:\n            child.up = None\n            return child\n\n    def add_sister(self, sister=None, name=None, dist=None):\n        \"\"\"\n        Adds a sister to this node. If sister node is not supplied\n        as an argument, a new TreeNode instance will be created and\n        returned.\n        \"\"\"\n        if self.up is None:\n            raise TreeError(\"A parent node is required to add a sister\")\n        else:\n            return self.up.add_child(child=sister, name=name, dist=dist)\n\n    def remove_sister(self, sister=None):\n        \"\"\"\n        Removes a sister node. 
It has the same effect as\n        **`TreeNode.up.remove_child(sister)`**\n\n        If a sister node is not supplied, the first sister will be deleted\n        and returned.\n\n        :argument sister: A node instance\n\n        :return: The node removed\n        \"\"\"\n        sisters = self.get_sisters()\n        if len(sisters) > 0:\n            if sister is None:\n                sister = sisters.pop(0)\n            return self.up.remove_child(sister)\n\n    def delete(self, prevent_nondicotomic=True, preserve_branch_length=False):\n        \"\"\"\n        Deletes node from the tree structure. Notice that this method\n        makes 'disappear' the node from the tree structure. This means\n        that children from the deleted node are transferred to the\n        next available parent.\n\n        :param True prevent_nondicotomic: When True (default), delete\n            function will be execute recursively to prevent\n            single-child nodes.\n\n        :param False preserve_branch_length: If True, branch lengths\n            of the deleted nodes are transferred (summed up) to its\n            parent's branch, thus keeping original distances among\n            nodes.\n\n        **Example:**\n\n        ::\n\n                / C\n          root-|\n               |        / B\n                \\--- H |\n                        \\ A\n\n          > H.delete() will produce this structure:\n\n                / C\n               |\n          root-|--B\n               |\n                \\ A\n\n        \"\"\"\n        parent = self.up\n        if parent:\n            if preserve_branch_length:\n                if len(self.children) == 1:\n                    self.children[0].dist += self.dist\n                elif len(self.children) > 1:\n                    parent.dist += self.dist\n\n            for ch in self.children:\n                parent.add_child(ch)\n\n            parent.remove_child(self)\n\n        # Avoids parents with only one child\n        if 
prevent_nondicotomic and parent and \\\n                len(parent.children) < 2:\n            parent.delete(prevent_nondicotomic=False,\n                          preserve_branch_length=preserve_branch_length)\n\n    def detach(self):\n        \"\"\"\n        Detachs this node (and all its descendants) from its parent\n        and returns the referent to itself.\n\n        Detached node conserves all its structure of descendants, and can\n        be attached to another node through the 'add_child' function. This\n        mechanism can be seen as a cut and paste.\n        \"\"\"\n\n        if self.up:\n            self.up.children.remove(self)\n            self.up = None\n        return self\n\n    def prune(self, nodes, preserve_branch_length=False):\n        \"\"\"Prunes the topology of a node to conserve only the selected list of leaf\n        internal nodes. The minimum number of nodes that conserve the\n        topological relationships among the requested nodes will be\n        retained. 
Root node is always conserved.\n\n        :var nodes: a list of node names or node objects that should be retained\n\n        :param False preserve_branch_length: If True, branch lengths\n          of the deleted nodes are transferred (summed up) to its\n          parent's branch, thus keeping original distances among\n          nodes.\n\n        **Examples:**\n\n        ::\n\n          t1 = Tree('(((((A,B)C)D,E)F,G)H,(I,J)K)root;', format=1)\n          t1.prune(['A', 'B'])\n\n\n          #                /-A\n          #          /D /C|\n          #       /F|      \\-B\n          #      |  |\n          #    /H|   \\-E\n          #   |  |                        /-A\n          #-root  \\-G                 -root\n          #   |                           \\-B\n          #   |   /-I\n          #    \\K|\n          #       \\-J\n\n\n\n          t1 = Tree('(((((A,B)C)D,E)F,G)H,(I,J)K)root;', format=1)\n          t1.prune(['A', 'B', 'C'])\n\n          #                /-A\n          #          /D /C|\n          #       /F|      \\-B\n          #      |  |\n          #    /H|   \\-E\n          #   |  |                              /-A\n          #-root  \\-G                  -root- C|\n          #   |                                 \\-B\n          #   |   /-I\n          #    \\K|\n          #       \\-J\n\n\n\n          t1 = Tree('(((((A,B)C)D,E)F,G)H,(I,J)K)root;', format=1)\n          t1.prune(['A', 'B', 'I'])\n\n\n          #                /-A\n          #          /D /C|\n          #       /F|      \\-B\n          #      |  |\n          #    /H|   \\-E                    /-I\n          #   |  |                      -root\n          #-root  \\-G                      |   /-A\n          #   |                             \\C|\n          #   |   /-I                          \\-B\n          #    \\K|\n          #       \\-J\n\n          t1 = Tree('(((((A,B)C)D,E)F,G)H,(I,J)K)root;', format=1)\n          t1.prune(['A', 'B', 'F', 'H'])\n\n          #                /-A\n    
      #          /D /C|\n          #       /F|      \\-B\n          #      |  |\n          #    /H|   \\-E\n          #   |  |                              /-A\n          #-root  \\-G                -root-H /F|\n          #   |                                 \\-B\n          #   |   /-I\n          #    \\K|\n          #       \\-J\n\n        \"\"\"\n\n        def cmp_nodes(x, y):\n            # if several nodes are in the same path of two kept nodes,\n            # only one should be maintained. This prioritize internal\n            # nodes that are already in the to_keep list and then\n            # deeper nodes (closer to the leaves).\n            if n2depth[x] > n2depth[y]:\n                return -1\n            elif n2depth[x] < n2depth[y]:\n                return 1\n            else:\n                return 0\n\n        to_keep = set(_translate_nodes(self, *nodes))\n        start, node2path = self.get_common_ancestor(to_keep, get_path=True)\n        to_keep.add(self)\n\n        # Calculate which kept nodes are visiting the same nodes in\n        # their path to the common ancestor.\n        n2count = {}\n        n2depth = {}\n        for seed, path in node2path.items():\n            for visited_node in path:\n                if visited_node not in n2depth:\n                    depth = visited_node.get_distance(start, topology_only=True)\n                    n2depth[visited_node] = depth\n                if visited_node is not seed:\n                    n2count.setdefault(visited_node, set()).add(seed)\n\n        # if several internal nodes are in the path of exactly the same kept\n        # nodes, only one (the deepest) should be maintain.\n        visitors2nodes = {}\n        for node, visitors in n2count.items():\n            # keep nodes connection at least two other nodes\n            if len(visitors) > 1:\n                visitor_key = frozenset(visitors)\n                visitors2nodes.setdefault(visitor_key, set()).add(node)\n\n        for visitors, 
nodes in visitors2nodes.items():\n            if not (to_keep & nodes):\n                sorted_nodes = sorted(nodes, key=cmp_to_key(cmp_nodes))\n                to_keep.add(sorted_nodes[0])\n\n        for n in self.get_descendants('postorder'):\n            if n not in to_keep:\n                if preserve_branch_length:\n                    if len(n.children) == 1:\n                        n.children[0].dist += n.dist\n                    elif len(n.children) > 1 and n.up:\n                        n.up.dist += n.dist\n\n                n.delete(prevent_nondicotomic=False)\n\n    def swap_children(self):\n        \"\"\"\n        Swaps current children order.\n        \"\"\"\n        if len(self.children) > 1:\n            self.children.reverse()\n\n    # #####################\n    # Tree traversing\n    # #####################\n\n    def get_children(self):\n        \"\"\"\n        Returns an independent list of node's children.\n        \"\"\"\n        return [ch for ch in self.children]\n\n    def get_sisters(self):\n        \"\"\"\n        Returns an independent list of sister nodes.\n        \"\"\"\n        if self.up is not None:\n            return [ch for ch in self.up.children if ch != self]\n        else:\n            return []\n\n    def iter_leaves(self, is_leaf_fn=None):\n        \"\"\"\n        Returns an iterator over the leaves under this node.\n\n        :argument None is_leaf_fn: See :func:`TreeNode.traverse` for\n          documentation.\n        \"\"\"\n        for n in self.traverse(strategy=\"preorder\", is_leaf_fn=is_leaf_fn):\n            if not is_leaf_fn:\n                if n.is_leaf():\n                    yield n\n            else:\n                if is_leaf_fn(n):\n                    yield n\n\n    def get_leaves(self, is_leaf_fn=None):\n        \"\"\"\n        Returns the list of terminal nodes (leaves) under this node.\n\n        :argument None is_leaf_fn: See :func:`TreeNode.traverse` for\n          documentation.\n        
\"\"\"\n        return [n for n in self.iter_leaves(is_leaf_fn=is_leaf_fn)]\n\n    def iter_leaf_names(self, is_leaf_fn=None):\n        \"\"\"\n        Returns an iterator over the leaf names under this node.\n\n        :argument None is_leaf_fn: See :func:`TreeNode.traverse` for\n          documentation.\n        \"\"\"\n        for n in self.iter_leaves(is_leaf_fn=is_leaf_fn):\n            yield n.name\n\n    def get_leaf_names(self, is_leaf_fn=None):\n        \"\"\"\n        Returns the list of terminal node names under the current\n        node.\n\n        :argument None is_leaf_fn: See :func:`TreeNode.traverse` for\n          documentation.\n        \"\"\"\n        return [name for name in self.iter_leaf_names(is_leaf_fn=is_leaf_fn)]\n\n    def iter_descendants(self, strategy=\"levelorder\", is_leaf_fn=None):\n        \"\"\"\n        Returns an iterator over all descendant nodes.\n\n        :argument None is_leaf_fn: See :func:`TreeNode.traverse` for\n          documentation.\n        \"\"\"\n        for n in self.traverse(strategy=strategy, is_leaf_fn=is_leaf_fn):\n            if n is not self:\n                yield n\n\n    def get_descendants(self, strategy=\"levelorder\", is_leaf_fn=None):\n        \"\"\"\n        Returns a list of all (leaves and internal) descendant nodes.\n\n        :argument None is_leaf_fn: See :func:`TreeNode.traverse` for\n          documentation.\n        \"\"\"\n        return [n for n in self.iter_descendants(strategy=strategy, \\\n                                                 is_leaf_fn=is_leaf_fn)]\n\n    def traverse(self, strategy=\"levelorder\", is_leaf_fn=None):\n        \"\"\"\n        Returns an iterator to traverse the tree structure under this\n        node.\n\n        :argument \"levelorder\" strategy: set the way in which tree\n           will be traversed. 
Possible values are: \"preorder\" (first\n           parent and then children) 'postorder' (first children and\n           the parent) and \"levelorder\" (nodes are visited in order\n           from root to leaves)\n\n        :argument None is_leaf_fn: If supplied, ``is_leaf_fn``\n           function will be used to interrogate nodes about if they\n           are terminal or internal. ``is_leaf_fn`` function should\n           receive a node instance as first argument and return True\n           or False. Use this argument to traverse a tree by\n           dynamically collapsing internal nodes matching\n           ``is_leaf_fn``.\n        \"\"\"\n        if strategy == \"preorder\":\n            return self._iter_descendants_preorder(is_leaf_fn=is_leaf_fn)\n        elif strategy == \"levelorder\":\n            return self._iter_descendants_levelorder(is_leaf_fn=is_leaf_fn)\n        elif strategy == \"postorder\":\n            return self._iter_descendants_postorder(is_leaf_fn=is_leaf_fn)\n\n    def iter_prepostorder(self, is_leaf_fn=None):\n        \"\"\"\n        Iterate over all nodes in a tree yielding every node in both\n        pre and post order. 
Each iteration returns a postorder flag\n        (True if node is being visited in postorder) and a node\n        instance.\n        \"\"\"\n        to_visit = [self]\n        if is_leaf_fn is not None:\n            _leaf = is_leaf_fn\n        else:\n            _leaf = self.__class__.is_leaf\n\n        while to_visit:\n            node = to_visit.pop(-1)\n            try:\n                node = node[1]\n            except TypeError:\n                # PREORDER ACTIONS\n                yield (False, node)\n                if not _leaf(node):\n                    # ADD CHILDREN\n                    to_visit.extend(reversed(node.children + [[1, node]]))\n            else:\n                # POSTORDER ACTIONS\n                yield (True, node)\n\n    def _iter_descendants_postorder(self, is_leaf_fn=None):\n        to_visit = [self]\n        if is_leaf_fn is not None:\n            _leaf = is_leaf_fn\n        else:\n            _leaf = self.__class__.is_leaf\n\n        while to_visit:\n            node = to_visit.pop(-1)\n            try:\n                node = node[1]\n            except TypeError:\n                # PREORDER ACTIONS\n                if not _leaf(node):\n                    # ADD CHILDREN\n                    to_visit.extend(reversed(node.children + [[1, node]]))\n                else:\n                    yield node\n            else:\n                # POSTORDER ACTIONS\n                yield node\n\n    def _iter_descendants_levelorder(self, is_leaf_fn=None):\n        \"\"\"\n        Iterate over all desdecendant nodes.\n        \"\"\"\n        tovisit = deque([self])\n        while len(tovisit) > 0:\n            node = tovisit.popleft()\n            yield node\n            if not is_leaf_fn or not is_leaf_fn(node):\n                tovisit.extend(node.children)\n\n    def _iter_descendants_preorder(self, is_leaf_fn=None):\n        \"\"\"\n        Iterator over all descendant nodes.\n        \"\"\"\n        to_visit = deque()\n        node = 
self\n        while node is not None:\n            yield node\n            if not is_leaf_fn or not is_leaf_fn(node):\n                to_visit.extendleft(reversed(node.children))\n            try:\n                node = to_visit.popleft()\n            except:\n                node = None\n\n    def iter_ancestors(self):\n        '''versionadded: 2.2\n\n        Iterates over the list of all ancestor nodes from current node\n        to the current tree root.\n\n        '''\n        node = self\n        while node.up is not None:\n            yield node.up\n            node = node.up\n\n    def get_ancestors(self):\n        '''versionadded: 2.2\n\n        Returns the list of all ancestor nodes from current node to\n        the current tree root.\n\n        '''\n        return [n for n in self.iter_ancestors()]\n\n    def describe(self):\n        \"\"\"\n        Prints general information about this node and its\n        connections.\n        \"\"\"\n        if len(self.get_tree_root().children) == 2:\n            rooting = \"Yes\"\n        elif len(self.get_tree_root().children) > 2:\n            rooting = \"No\"\n        else:\n            rooting = \"No children\"\n        max_node, max_dist = self.get_farthest_leaf()\n        cached_content = self.get_cached_content()\n        print(\"Number of leaf nodes:\\t%d\" % len(cached_content[self]))\n        print(\"Total number of nodes:\\t%d\" % len(cached_content))\n        print(\"Rooted:\\t%s\" % rooting)\n        print(\"Most distant node:\\t%s\" % max_node.name)\n        print(\"Max. distance:\\t%f\" % max_dist)\n\n    def write(self, features=None, outfile=None, format=0, is_leaf_fn=None,\n              format_root_node=False, dist_formatter=None, support_formatter=None,\n              name_formatter=None, quoted_node_names=False):\n        \"\"\"\n        Returns the newick representation of current node. 
Several\n        arguments control the way in which extra data is shown for\n        every node:\n\n        :argument features: a list of feature names to be exported\n          using the Extended Newick Format (i.e. features=[\"name\",\n          \"dist\"]). Use an empty list to export all available features\n          in each node (features=[])\n\n        :argument outfile: writes the output to a given file\n\n        :argument format: defines the newick standard used to encode the\n          tree. See tutorial for details.\n\n        :argument False format_root_node: If True, it allows features\n          and branch information from root node to be exported as a\n          part of the newick text string. For newick compatibility\n          reasons, this is False by default.\n\n        :argument is_leaf_fn: See :func:`TreeNode.traverse` for\n          documentation.\n\n        **Example:**\n\n        ::\n\n             t.get_newick(features=[\"species\",\"name\"], format=1)\n\n        \"\"\"\n\n        nw = write_newick(self, features=features, format=format,\n                          is_leaf_fn=is_leaf_fn,\n                          format_root_node=format_root_node,\n                          dist_formatter=dist_formatter,\n                          support_formatter=support_formatter,\n                          name_formatter=name_formatter,\n                          quoted_names=quoted_node_names)\n\n        if outfile is not None:\n            with open(outfile, \"w\") as OUT:\n                OUT.write(nw)\n        else:\n            return nw\n\n    def get_tree_root(self):\n        \"\"\"\n        Returns the absolute root node of current tree structure.\n        \"\"\"\n        root = self\n        while root.up is not None:\n            root = root.up\n        return root\n\n    def get_common_ancestor(self, *target_nodes, **kargs):\n        \"\"\"\n        Returns the first common ancestor between this node and a given\n        list of 
'target_nodes'.\n\n        **Examples:**\n\n        ::\n\n          t = tree.Tree(\"(((A:0.1, B:0.01):0.001, C:0.0001):1.0[&&NHX:name=common], (D:0.00001):0.000001):2.0[&&NHX:name=root];\")\n          A = t.get_descendants_by_name(\"A\")[0]\n          C = t.get_descendants_by_name(\"C\")[0]\n          common =  A.get_common_ancestor(C)\n          print common.name\n\n        \"\"\"\n\n        get_path = kargs.get(\"get_path\", False)\n\n        if len(target_nodes) == 1 and type(target_nodes[0]) \\\n                in set([set, tuple, list, frozenset]):\n            target_nodes = target_nodes[0]\n\n        # Convert node names into node instances\n        target_nodes = _translate_nodes(self, *target_nodes)\n\n        # If only one node is provided, use self as the second target\n        if type(target_nodes) != list:\n            target_nodes = [target_nodes, self]\n\n        n2path = {}\n        reference = []\n        ref_node = None\n        for n in target_nodes:\n            current = n\n            while current:\n                n2path.setdefault(n, set()).add(current)\n                if not ref_node:\n                    reference.append(current)\n                current = current.up\n            if not ref_node:\n                ref_node = n\n\n        common = None\n        for n in reference:\n            broken = False\n            for node, path in n2path.items():\n                if node is not ref_node and n not in path:\n                    broken = True\n                    break\n\n            if not broken:\n                common = n\n                break\n        if not common:\n            raise TreeError(\"Nodes are not connected!\")\n\n        if get_path:\n            return common, n2path\n        else:\n            return common\n\n    def iter_search_nodes(self, **conditions):\n        \"\"\"\n        Search nodes in an iterative way. Matches are yielded as they\n        are being found. 
This avoids needing to scan the full tree\n        topology before returning the first matches. Useful when\n        dealing with huge trees.\n        \"\"\"\n\n        for n in self.traverse():\n            conditions_passed = 0\n            for key, value in conditions.items():\n                if hasattr(n, key) and getattr(n, key) == value:\n                    conditions_passed += 1\n            if conditions_passed == len(conditions):\n                yield n\n\n    def search_nodes(self, **conditions):\n        \"\"\"\n        Returns the list of nodes matching a given set of conditions.\n\n        **Example:**\n\n        ::\n\n          tree.search_nodes(dist=0.0, name=\"human\")\n\n        \"\"\"\n        matching_nodes = []\n        for n in self.iter_search_nodes(**conditions):\n            matching_nodes.append(n)\n        return matching_nodes\n\n    def get_leaves_by_name(self, name):\n        \"\"\"\n        Returns a list of leaf nodes matching a given name.\n        \"\"\"\n        return self.search_nodes(name=name, children=[])\n\n    def is_leaf(self):\n        \"\"\"\n        Return True if current node is a leaf.\n        \"\"\"\n        return len(self.children) == 0\n\n    def is_root(self):\n        \"\"\"\n        Returns True if current node has no parent\n        \"\"\"\n        if self.up is None:\n            return True\n        else:\n            return False\n\n    # ###########################\n    # Distance related functions\n    # ###########################\n    def get_distance(self, target, target2=None, topology_only=False):\n        \"\"\"\n        Returns the distance between two nodes. If only one target is\n        specified, it returns the distance between the target and the\n        current node.\n\n        :argument target: a node within the same tree structure.\n\n        :argument target2: a node within the same tree structure. 
If\n          not specified, current node is used as target2.\n\n        :argument False topology_only: If set to True, distance will\n          refer to the number of nodes between target and target2.\n\n        :returns: branch length distance between target and\n          target2. If topology_only flag is True, returns the number\n          of nodes between target and target2.\n\n        \"\"\"\n\n        if target2 is None:\n            target2 = self\n            root = self.get_tree_root()\n        else:\n            # is target node under current node?\n            root = self\n\n        target, target2 = _translate_nodes(root, target, target2)\n        ancestor = root.get_common_ancestor(target, target2)\n\n        dist = 0.0\n        for n in [target2, target]:\n            current = n\n            while current != ancestor:\n                if topology_only:\n                    if current != target:\n                        dist += 1\n                else:\n                    dist += current.dist\n                current = current.up\n        return dist\n\n    def get_farthest_node(self, topology_only=False):\n        \"\"\"\n        Returns the node's farthest descendant or ancestor node, and the\n        distance to it.\n\n        :argument False topology_only: If set to True, distance\n          between nodes will be referred to the number of nodes\n          between them. 
In other words, topological distance will be\n          used instead of branch length distances.\n\n        :return: A tuple containing the farthest node referred to the\n          current node and the distance to it.\n\n        \"\"\"\n        # Init farthest node to current farthest leaf\n        farthest_node, farthest_dist = self.get_farthest_leaf(topology_only=topology_only)\n\n        prev = self\n        cdist = 0.0 if topology_only else prev.dist\n        current = prev.up\n        while current is not None:\n            for ch in current.children:\n                if ch != prev:\n                    if not ch.is_leaf():\n                        fnode, fdist = ch.get_farthest_leaf(topology_only=topology_only)\n                    else:\n                        fnode = ch\n                        fdist = 0\n                    if topology_only:\n                        fdist += 1.0\n                    else:\n                        fdist += ch.dist\n                    if cdist + fdist > farthest_dist:\n                        farthest_dist = cdist + fdist\n                        farthest_node = fnode\n            prev = current\n            if topology_only:\n                cdist += 1\n            else:\n                cdist += prev.dist\n            current = prev.up\n        return farthest_node, farthest_dist\n\n    def _get_farthest_and_closest_leaves(self, topology_only=False, is_leaf_fn=None):\n        # if called from a leaf node, no necessary to compute\n        if (is_leaf_fn and is_leaf_fn(self)) or self.is_leaf():\n            return self, 0.0, self, 0.0\n\n        min_dist = None\n        min_node = None\n        max_dist = None\n        max_node = None\n        d = 0.0\n        for post, n in self.iter_prepostorder(is_leaf_fn=is_leaf_fn):\n            if n is self:\n                continue\n            if post:\n                d -= n.dist if not topology_only else 1.0\n            else:\n                if (is_leaf_fn and is_leaf_fn(n)) 
or n.is_leaf():\n                    total_d = d + n.dist if not topology_only else d\n                    if min_dist is None or total_d < min_dist:\n                        min_dist = total_d\n                        min_node = n\n                    if max_dist is None or total_d > max_dist:\n                        max_dist = total_d\n                        max_node = n\n                else:\n                    d += n.dist if not topology_only else 1.0\n        return min_node, min_dist, max_node, max_dist\n\n    def get_farthest_leaf(self, topology_only=False, is_leaf_fn=None):\n        \"\"\"\n        Returns node's farthest descendant node (which is always a leaf), and the\n        distance to it.\n\n        :argument False topology_only: If set to True, distance\n          between nodes will be referred to the number of nodes\n          between them. In other words, topological distance will be\n          used instead of branch length distances.\n\n        :return: A tuple containing the farthest leaf referred to the\n          current node and the distance to it.\n        \"\"\"\n        min_node, min_dist, max_node, max_dist = self._get_farthest_and_closest_leaves(\n            topology_only=topology_only, is_leaf_fn=is_leaf_fn)\n        return max_node, max_dist\n\n    def get_closest_leaf(self, topology_only=False, is_leaf_fn=None):\n        \"\"\"Returns node's closest descendant leaf and the distance to\n        it.\n\n        :argument False topology_only: If set to True, distance\n          between nodes will be referred to the number of nodes\n          between them. 
In other words, topological distance will be\n          used instead of branch length distances.\n\n        :return: A tuple containing the closest leaf referred to the\n          current node and the distance to it.\n\n        \"\"\"\n        min_node, min_dist, max_node, max_dist = self._get_farthest_and_closest_leaves(\n            topology_only=topology_only, is_leaf_fn=is_leaf_fn)\n\n        return min_node, min_dist\n\n    def get_midpoint_outgroup(self):\n        \"\"\"\n        Returns the node that divides the current tree into two distance-balanced\n        partitions.\n        \"\"\"\n        # Gets the farthest node to the current root\n        root = self.get_tree_root()\n        nA, r2A_dist = root.get_farthest_leaf()\n        nB, A2B_dist = nA.get_farthest_node()\n\n        outgroup = nA\n        middist = A2B_dist / 2.0\n        cdist = 0\n        current = nA\n        while current is not None:\n            cdist += current.dist\n            if cdist > (middist):  # Deja de subir cuando se pasa del maximo\n                break\n            else:\n                current = current.up\n        return current\n\n    def populate(self, size, names_library=None, reuse_names=False,\n                 random_branches=False, branch_range=(0, 1),\n                 support_range=(0, 1)):\n        \"\"\"\n        Generates a random topology by populating current node.\n\n        :argument None names_library: If provided, names library\n          (list, set, dict, etc.) 
will be used to name nodes.\n\n        :argument False reuse_names: If True, node names will not be\n          necessarily unique, which makes the process a bit more\n          efficient.\n\n        :argument False random_branches: If True, branch distances and support\n          values will be randomized.\n\n        :argument (0,1) branch_range: If random_branches is True, this\n          range of values will be used to generate random distances.\n\n        :argument (0,1) support_range: If random_branches is True,\n          this range of values will be used to generate random branch\n          support values.\n\n        \"\"\"\n        NewNode = self.__class__\n\n        if len(self.children) > 1:\n            connector = NewNode()\n            for ch in self.get_children():\n                ch.detach()\n                connector.add_child(child=ch)\n            root = NewNode()\n            self.add_child(child=connector)\n            self.add_child(child=root)\n        else:\n            root = self\n\n        next_deq = deque([root])\n        for i in range(size - 1):\n            if random.randint(0, 1):\n                p = next_deq.pop()\n            else:\n                p = next_deq.popleft()\n\n            c1 = p.add_child()\n            c2 = p.add_child()\n            next_deq.extend([c1, c2])\n            if random_branches:\n                c1.dist = random.uniform(*branch_range)\n                c2.dist = random.uniform(*branch_range)\n                c1.support = random.uniform(*branch_range)\n                c2.support = random.uniform(*branch_range)\n            else:\n                c1.dist = 1.0\n                c2.dist = 1.0\n                c1.support = 1.0\n                c2.support = 1.0\n\n        # next contains leaf nodes\n        charset = \"abcdefghijklmnopqrstuvwxyz\"\n        if names_library:\n            names_library = deque(names_library)\n        else:\n            avail_names = 
itertools.combinations_with_replacement(charset, 10)\n        for n in next_deq:\n            if names_library:\n                if reuse_names:\n                    tname = random.sample(names_library, 1)[0]\n                else:\n                    tname = names_library.pop()\n            else:\n                tname = ''.join(next(avail_names))\n            n.name = tname\n\n    def set_outgroup(self, outgroup):\n        \"\"\"\n        Sets a descendant node as the outgroup of a tree.  This function\n        can be used to root a tree or even an internal node.\n\n        :argument outgroup: a node instance within the same tree\n          structure that will be used as a basal node.\n\n        \"\"\"\n\n        outgroup = _translate_nodes(self, outgroup)\n\n        if self == outgroup:\n            raise TreeError(\"Cannot set myself as outgroup\")\n\n        parent_outgroup = outgroup.up\n\n        # Detects (sub)tree root\n        n = outgroup\n        while n.up is not self:\n            n = n.up\n\n        # If outgroup is a child from root, but with more than one\n        # sister nodes, creates a new node to group them\n\n        self.children.remove(n)\n        if len(self.children) != 1:\n            down_branch_connector = self.__class__()\n            down_branch_connector.dist = 0.0\n            down_branch_connector.support = n.support\n            for ch in self.get_children():\n                down_branch_connector.children.append(ch)\n                ch.up = down_branch_connector\n                self.children.remove(ch)\n        else:\n            down_branch_connector = self.children[0]\n\n        # Connects down branch to myself or to outgroup\n        quien_va_ser_padre = parent_outgroup\n        if quien_va_ser_padre is not self:\n            # Parent-child swapping\n            quien_va_ser_hijo = quien_va_ser_padre.up\n            quien_fue_padre = None\n            buffered_dist = quien_va_ser_padre.dist\n            buffered_support = 
quien_va_ser_padre.support\n\n            while quien_va_ser_hijo is not self:\n                quien_va_ser_padre.children.append(quien_va_ser_hijo)\n                quien_va_ser_hijo.children.remove(quien_va_ser_padre)\n\n                buffered_dist2 = quien_va_ser_hijo.dist\n                buffered_support2 = quien_va_ser_hijo.support\n                quien_va_ser_hijo.dist = buffered_dist\n                quien_va_ser_hijo.support = buffered_support\n                buffered_dist = buffered_dist2\n                buffered_support = buffered_support2\n\n                quien_va_ser_padre.up = quien_fue_padre\n                quien_fue_padre = quien_va_ser_padre\n\n                quien_va_ser_padre = quien_va_ser_hijo\n                quien_va_ser_hijo = quien_va_ser_padre.up\n\n            quien_va_ser_padre.children.append(down_branch_connector)\n            down_branch_connector.up = quien_va_ser_padre\n            quien_va_ser_padre.up = quien_fue_padre\n\n            down_branch_connector.dist += buffered_dist\n            outgroup2 = parent_outgroup\n            parent_outgroup.children.remove(outgroup)\n            outgroup2.dist = 0\n\n        else:\n            outgroup2 = down_branch_connector\n\n        outgroup.up = self\n        outgroup2.up = self\n        # outgroup is always the first children. Some function my\n        # trust on this fact, so do no change this.\n        self.children = [outgroup, outgroup2]\n        middist = (outgroup2.dist + outgroup.dist) / 2\n        outgroup.dist = middist\n        outgroup2.dist = middist\n        outgroup2.support = outgroup.support\n\n    def unroot(self):\n        \"\"\"\n        Unroots current node. This function is expected to be used on\n        the absolute tree root node, but it can be also be applied to\n        any other internal node. 
It will convert a split into a\n        multifurcation.\n        \"\"\"\n        if len(self.children) == 2:\n            if not self.children[0].is_leaf():\n                self.children[0].delete()\n            elif not self.children[1].is_leaf():\n                self.children[1].delete()\n            else:\n                raise TreeError(\"Cannot unroot a tree with only two leaves\")\n\n    #\n    #\n    # def copy(self, method=\"cpickle\"):\n    #     \"\"\".. versionadded: 2.1\n    #\n    #     Returns a copy of the current node.\n    #\n    #     :var cpickle method: Protocol used to copy the node\n    #     structure. The following values are accepted:\n    #\n    #        - \"newick\": Tree topology, node names, branch lengths and\n    #          branch support values will be copied by as represented in\n    #          the newick string (copy by newick string serialisation).\n    #\n    #        - \"newick-extended\": Tree topology and all node features\n    #          will be copied based on the extended newick format\n    #          representation. Only node features will be copied, thus\n    #          excluding other node attributes. 
As this method is also\n    #          based on newick serialisation, features will be converted\n    #          into text strings when making the copy.\n    #\n    #        - \"cpickle\": The whole node structure and its content is\n    #          cloned based on cPickle object serialisation (slower, but\n    #          recommended for full tree copying)\n    #\n    #        - \"deepcopy\": The whole node structure and its content is\n    #          copied based on the standard \"copy\" Python functionality\n    #          (this is the slowest method but it allows to copy complex\n    #          objects even if attributes point to lambda functions,\n    #          etc.)\n    #\n    #     \"\"\"\n    #     method = method.lower()\n    #     if method == \"newick\":\n    #         new_node = self.__class__(self.write(features=[\"name\"], format_root_node=True))\n    #     elif method == \"newick-extended\":\n    #         self.write(features=[], format_root_node=True)\n    #         new_node = self.__class__(self.write(features=[]))\n    #     elif method == \"deepcopy\":\n    #         parent = self.up\n    #         self.up = None\n    #         new_node = copy.deepcopy(self)\n    #         self.up = parent\n    #     elif method == \"cpickle\":\n    #         parent = self.up\n    #         self.up = None\n    #         new_node = six.moves.cPickle.loads(six.moves.cPickle.dumps(self, 2))\n    #         self.up = parent\n    #     else:\n    #         raise TreeError(\"Invalid copy method\")\n    #\n    #     return new_node\n\n    def _asciiArt(self, char1='-', show_internal=True, compact=False, attributes=None):\n        \"\"\"\n        Returns the ASCII representation of the tree.\n\n        Code based on the PyCogent GPL project.\n        \"\"\"\n        if not attributes:\n            attributes = [\"name\"]\n        node_name = ', '.join(map(str, [getattr(self, v) for v in attributes if hasattr(self, v)]))\n\n        LEN = max(3, len(node_name) if not 
self.children or show_internal else 3)\n        PAD = ' ' * LEN\n        PA = ' ' * (LEN - 1)\n        if not self.is_leaf():\n            mids = []\n            result = []\n            for c in self.children:\n                if len(self.children) == 1:\n                    char2 = '/'\n                elif c is self.children[0]:\n                    char2 = '/'\n                elif c is self.children[-1]:\n                    char2 = '\\\\'\n                else:\n                    char2 = '-'\n                (clines, mid) = c._asciiArt(char2, show_internal, compact, attributes)\n                mids.append(mid + len(result))\n                result.extend(clines)\n                if not compact:\n                    result.append('')\n            if not compact:\n                result.pop()\n            (lo, hi, end) = (mids[0], mids[-1], len(result))\n            prefixes = [PAD] * (lo + 1) + [PA + '|'] * (hi - lo - 1) + [PAD] * (end - hi)\n            mid = int((lo + hi) / 2)\n            prefixes[mid] = char1 + '-' * (LEN - 2) + prefixes[mid][-1]\n            result = [p + l for (p, l) in zip(prefixes, result)]\n            if show_internal:\n                stem = result[mid]\n                result[mid] = stem[0] + node_name + stem[len(node_name) + 1:]\n            return (result, mid)\n        else:\n            return ([char1 + '-' + node_name], 0)\n\n    def get_ascii(self, show_internal=True, compact=False, attributes=None):\n        \"\"\"\n        Returns a string containing an ascii drawing of the tree.\n\n        :argument show_internal: includes internal edge names.\n        :argument compact: use exactly one line per tip.\n\n        :param attributes: A list of node attributes to shown in the\n            ASCII representation.\n\n        \"\"\"\n        (lines, mid) = self._asciiArt(show_internal=show_internal,\n                                      compact=compact, attributes=attributes)\n        return '\\n' + '\\n'.join(lines)\n\n    def 
ladderize(self, direction=0):\n        \"\"\"\n        .. versionadded: 2.1\n\n        Sort the branches of a given tree (swapping children nodes)\n        according to the size of each partition.\n\n        ::\n\n           t =  Tree(\"(f,((d, ((a,b),c)),e));\")\n\n           print t\n\n           #\n           #      /-f\n           #     |\n           #     |          /-d\n           # ----|         |\n           #     |     /---|          /-a\n           #     |    |    |     /---|\n           #     |    |     \\---|     \\-b\n           #      \\---|         |\n           #          |          \\-c\n           #          |\n           #           \\-e\n\n           t.ladderize()\n           print t\n\n           #      /-f\n           # ----|\n           #     |     /-e\n           #      \\---|\n           #          |     /-d\n           #           \\---|\n           #               |     /-c\n           #                \\---|\n           #                    |     /-a\n           #                     \\---|\n           #                          \\-b\n\n        \"\"\"\n\n        if not self.is_leaf():\n            n2s = {}\n            for n in self.get_children():\n                s = n.ladderize(direction=direction)\n                n2s[n] = s\n\n            self.children.sort(key=lambda x: n2s[x])\n            if direction == 1:\n                self.children.reverse()\n            size = sum(n2s.values())\n        else:\n            size = 1\n\n        return size\n\n    def sort_descendants(self, attr=\"name\"):\n        \"\"\"\n        .. versionadded: 2.1\n\n        Sort the branches of a given tree by node names. After the\n        tree is sorted, nodes are labeled in ascending order. This\n        can be used to ensure that nodes in a tree with the same node\n        names are always labeled in the same way. 
Note that if\n        duplicated names are present, extra criteria should be added\n        to sort nodes.\n\n        Unique id is stored as a node._nid attribute\n\n        \"\"\"\n\n        node2content = self.get_cached_content(store_attr=attr, container_type=list)\n\n        for n in self.traverse():\n            if not n.is_leaf():\n                n.children.sort(key=lambda x: str(sorted(node2content[x])))\n\n    def get_cached_content(self, store_attr=None, container_type=set, leaves_only=True, _store=None):\n        \"\"\"\n        .. versionadded: 2.2\n\n        Returns a dictionary pointing to the preloaded content of each\n        internal node under this tree. Such a dictionary is intended\n        to work as a cache for operations that require many traversal\n        operations.\n\n        :param None store_attr: Specifies the node attribute that\n            should be cached (i.e. name, distance, etc.). When none,\n            the whole node instance is cached.\n\n        :param _store: (internal use)\n\n        \"\"\"\n\n        if _store is None:\n            _store = {}\n\n        def get_value(_n):\n            if store_attr is None:\n                _val = [_n]\n            else:\n                if not isinstance(store_attr, str):\n                    _val = [tuple(getattr(_n, attr, None) for attr in store_attr)]\n\n                else:\n                    _val = [getattr(_n, store_attr, None)]\n\n            return _val\n\n        for ch in self.children:\n            ch.get_cached_content(store_attr=store_attr,\n                                  container_type=container_type,\n                                  leaves_only=leaves_only,\n                                  _store=_store)\n\n        if self.children:\n            if not leaves_only:\n                val = container_type(get_value(self))\n            else:\n                val = container_type()\n            for ch in self.children:\n                if type(val) == list:\n         
           val.extend(_store[ch])\n                if type(val) == set:\n                    val.update(_store[ch])\n\n                if not leaves_only:\n                    if type(val) == list:\n                        val.extend(get_value(ch))\n                    if type(val) == set:\n                        val.update(get_value(ch))\n\n            _store[self] = val\n        else:\n            _store[self] = container_type(get_value(self))\n\n        return _store\n\n    def robinson_foulds(self, t2, attr_t1=\"name\", attr_t2=\"name\",\n                        unrooted_trees=False, expand_polytomies=False,\n                        polytomy_size_limit=5, skip_large_polytomies=False,\n                        correct_by_polytomy_size=False, min_support_t1=0.0,\n                        min_support_t2=0.0):\n        \"\"\"\n        .. versionadded: 2.2\n\n        Returns the Robinson-Foulds symmetric distance between current\n        tree and a different tree instance.\n\n        :param t2: reference tree\n\n        :param name attr_t1: Compare trees using a custom node\n                              attribute as a node name.\n\n        :param name attr_t2: Compare trees using a custom node\n                              attribute as a node name in target tree.\n\n        :param False attr_t2: If True, consider trees as unrooted.\n\n        :param False expand_polytomies: If True, all polytomies in the reference\n           and target tree will be expanded into all possible binary\n           trees. 
Robinson-foulds distance will be calculated between all\n           tree combinations and the minimum value will be returned.\n           See also, :func:`NodeTree.expand_polytomy`.\n\n        :returns: (rf, rf_max, common_attrs, names, edges_t1, edges_t2,  discarded_edges_t1, discarded_edges_t2)\n\n        \"\"\"\n        ref_t = self\n        target_t = t2\n        if not unrooted_trees and (len(ref_t.children) > 2 or len(target_t.children) > 2):\n            raise TreeError(\"Unrooted tree found! You may want to activate the unrooted_trees flag.\")\n\n        if expand_polytomies and correct_by_polytomy_size:\n            raise TreeError(\"expand_polytomies and correct_by_polytomy_size are mutually exclusive.\")\n\n        if expand_polytomies and unrooted_trees:\n            raise TreeError(\"expand_polytomies and unrooted_trees arguments cannot be enabled at the same time\")\n\n        attrs_t1 = set([getattr(n, attr_t1) for n in ref_t.iter_leaves() if hasattr(n, attr_t1)])\n        attrs_t2 = set([getattr(n, attr_t2) for n in target_t.iter_leaves() if hasattr(n, attr_t2)])\n        common_attrs = attrs_t1 & attrs_t2\n        # release mem\n        attrs_t1, attrs_t2 = None, None\n\n        # Check for duplicated items (is it necessary? can we optimize? 
what's the impact in performance?')\n        size1 = len([True for n in ref_t.iter_leaves() if getattr(n, attr_t1, None) in common_attrs])\n        size2 = len([True for n in target_t.iter_leaves() if getattr(n, attr_t2, None) in common_attrs])\n        if size1 > len(common_attrs):\n            raise TreeError('Duplicated items found in source tree')\n        if size2 > len(common_attrs):\n            raise TreeError('Duplicated items found in reference tree')\n\n        if expand_polytomies:\n            ref_trees = [Tree(nw) for nw in\n                         ref_t.expand_polytomies(map_attr=attr_t1,\n                                                 polytomy_size_limit=polytomy_size_limit,\n                                                 skip_large_polytomies=skip_large_polytomies)]\n            target_trees = [Tree(nw) for nw in\n                            target_t.expand_polytomies(map_attr=attr_t2,\n                                                       polytomy_size_limit=polytomy_size_limit,\n                                                       skip_large_polytomies=skip_large_polytomies)]\n            attr_t1, attr_t2 = \"name\", \"name\"\n        else:\n            ref_trees = [ref_t]\n            target_trees = [target_t]\n\n        polytomy_correction = 0\n        if correct_by_polytomy_size:\n            corr1 = sum([0] + [len(n.children) - 2 for n in ref_t.traverse() if len(n.children) > 2])\n            corr2 = sum([0] + [len(n.children) - 2 for n in target_t.traverse() if len(n.children) > 2])\n            if corr1 and corr2:\n                raise TreeError(\"Both trees contain polytomies! 
Try expand_polytomies=True instead\")\n            else:\n                polytomy_correction = max([corr1, corr2])\n\n        min_comparison = None\n        for t1 in ref_trees:\n            t1_content = t1.get_cached_content()\n            t1_leaves = t1_content[t1]\n            if unrooted_trees:\n                edges1 = set([\n                    tuple(sorted([tuple(sorted([getattr(n, attr_t1) for n in content if\n                                                hasattr(n, attr_t1) and getattr(n, attr_t1) in common_attrs])),\n                                  tuple(sorted([getattr(n, attr_t1) for n in t1_leaves - content if\n                                                hasattr(n, attr_t1) and getattr(n, attr_t1) in common_attrs]))]))\n                    for content in t1_content.values()])\n                edges1.discard(((), ()))\n            else:\n                edges1 = set([\n                    tuple(sorted([getattr(n, attr_t1) for n in content if\n                                  hasattr(n, attr_t1) and getattr(n, attr_t1) in common_attrs]))\n                    for content in t1_content.values()])\n                edges1.discard(())\n\n            if min_support_t1:\n                support_t1 = dict([\n                    (tuple(sorted([getattr(n, attr_t1) for n in content if\n                                   hasattr(n, attr_t1) and getattr(n, attr_t1) in common_attrs])), branch.support)\n                    for branch, content in t1_content.values()])\n\n            for t2 in target_trees:\n                t2_content = t2.get_cached_content()\n                t2_leaves = t2_content[t2]\n                if unrooted_trees:\n                    edges2 = set([\n                        tuple(sorted([\n                            tuple(sorted([getattr(n, attr_t2) for n in content if\n                                          hasattr(n, attr_t2) and getattr(n, attr_t2) in common_attrs])),\n                            tuple(sorted([getattr(n, attr_t2) 
for n in t2_leaves - content if\n                                          hasattr(n, attr_t2) and getattr(n, attr_t2) in common_attrs]))]))\n                        for content in t2_content.values()])\n                    edges2.discard(((), ()))\n                else:\n                    edges2 = set([\n                        tuple(sorted([getattr(n, attr_t2) for n in content if\n                                      hasattr(n, attr_t2) and getattr(n, attr_t2) in common_attrs]))\n                        for content in t2_content.values()])\n                    edges2.discard(())\n\n                if min_support_t2:\n                    support_t2 = dict([\n                        (tuple(sorted(([getattr(n, attr_t2) for n in content if\n                                        hasattr(n, attr_t2) and getattr(n, attr_t2) in common_attrs]))), branch.support)\n                        for branch, content in t2_content.values()])\n\n                # if a support value is passed as a constraint, discard lowly supported branches from the analysis\n                discard_t1, discard_t2 = set(), set()\n                if min_support_t1 and unrooted_trees:\n                    discard_t1 = set(\n                        [p for p in edges1 if support_t1.get(p[0], support_t1.get(p[1], 999999999)) < min_support_t1])\n                elif min_support_t1:\n                    discard_t1 = set([p for p in edges1 if support_t1[p] < min_support_t1])\n\n                if min_support_t2 and unrooted_trees:\n                    discard_t2 = set(\n                        [p for p in edges2 if support_t2.get(p[0], support_t2.get(p[1], 999999999)) < min_support_t2])\n                elif min_support_t2:\n                    discard_t2 = set([p for p in edges2 if support_t2[p] < min_support_t2])\n\n                # rf = len(edges1 ^ edges2) - (len(discard_t1) + len(discard_t2)) - polytomy_correction # poly_corr is 0 if the flag is not enabled\n                # rf = 
len((edges1-discard_t1) ^ (edges2-discard_t2)) - polytomy_correction\n\n                # the two root edges are never counted here, as they are always\n                # present in both trees because of the common attr filters\n                rf = len(((edges1 ^ edges2) - discard_t2) - discard_t1) - polytomy_correction\n\n                if unrooted_trees:\n                    # thought this may work, but it does not, still I don't see why\n                    # max_parts = (len(common_attrs)*2) - 6 - len(discard_t1) - len(discard_t2)\n                    max_parts = (len([p for p in edges1 - discard_t1 if len(p[0]) > 1 and len(p[1]) > 1]) +\n                                 len([p for p in edges2 - discard_t2 if len(p[0]) > 1 and len(p[1]) > 1]))\n                else:\n                    # thought this may work, but it does not, still I don't see why\n                    # max_parts = (len(common_attrs)*2) - 4 - len(discard_t1) - len(discard_t2)\n\n                    # Otherwise we need to count the actual number of valid\n                    # partitions in each tree -2 is to avoid counting the root\n                    # partition of the two trees (only needed in rooted trees)\n                    max_parts = (len([p for p in edges1 - discard_t1 if len(p) > 1]) +\n                                 len([p for p in edges2 - discard_t2 if len(p) > 1])) - 2\n\n                    # print max_parts\n\n                if not min_comparison or min_comparison[0] > rf:\n                    min_comparison = [rf, max_parts, common_attrs, edges1, edges2, discard_t1, discard_t2]\n\n        return min_comparison\n\n    def compare(self, ref_tree, use_collateral=False, min_support_source=0.0, min_support_ref=0.0,\n                has_duplications=False, expand_polytomies=False, unrooted=False,\n                max_treeko_splits_to_be_artifact=1000, ref_tree_attr='name', source_tree_attr='name'):\n\n        \"\"\"compare this tree with another using robinson foulds 
symmetric difference\n        and number of shared edges. Trees of different sizes and with duplicated\n        items allowed.\n\n        returns: a Python dictionary with results\n\n        \"\"\"\n        source_tree = self\n\n        def _safe_div(a, b):\n            if a != 0:\n                return a / float(b)\n            else:\n                return 0.0\n\n        def _compare(src_tree, ref_tree):\n            # calculate partitions and rf distances\n            rf, maxrf, common, ref_p, src_p, ref_disc, src_disc = ref_tree.robinson_foulds(src_tree,\n                                                                                           expand_polytomies=expand_polytomies,\n                                                                                           unrooted_trees=unrooted,\n                                                                                           attr_t1=ref_tree_attr,\n                                                                                           attr_t2=source_tree_attr,\n                                                                                           min_support_t2=min_support_source,\n                                                                                           min_support_t1=min_support_ref)\n\n            # if trees share leaves, count their distances\n            if len(common) > 0 and src_p and ref_p:\n                if unrooted:\n                    valid_ref_edges = set([p for p in (ref_p - ref_disc) if len(p[0]) > 1 and len(p[1]) > 0])\n                    valid_src_edges = set([p for p in (src_p - src_disc) if len(p[0]) > 1 and len(p[1]) > 0])\n                    common_edges = valid_ref_edges & valid_src_edges\n                else:\n\n                    valid_ref_edges = set([p for p in (ref_p - ref_disc) if len(p) > 1])\n                    valid_src_edges = set([p for p in (src_p - src_disc) if len(p) > 1])\n                    common_edges = valid_ref_edges & 
valid_src_edges\n\n            else:\n                valid_ref_edges = set()\n                valid_src_edges = set()\n                common_edges = set()\n\n                # # % of ref edges found in tree\n                # ref_found.append(float(len(p2 & p1)) / reftree_edges)\n\n                # # valid edges in target, discard also leaves\n                # p2bis = set([p for p in (p2-d2) if len(p[0])>1 and len(p[1])>1])\n                # if p2bis:\n                #     incompatible_target_branches = float(len((p2-d2) - p1))\n                #     target_found.append(1 - (incompatible_target_branches / (len(p2-d2))))\n\n            return rf, maxrf, len(common), valid_ref_edges, valid_src_edges, common_edges\n\n        total_valid_ref_edges = len([n for n in ref_tree.traverse() if n.children and n.support > min_support_ref])\n        result = {}\n        if has_duplications:\n            orig_target_size = len(source_tree)\n            ntrees, ndups, sp_trees = source_tree.get_speciation_trees(\n                autodetect_duplications=True, newick_only=True,\n                target_attr=source_tree_attr, map_features=[source_tree_attr, \"support\"])\n\n            if ntrees < max_treeko_splits_to_be_artifact:\n                all_rf = []\n                ref_found = []\n                src_found = []\n                tree_sizes = []\n                all_max_rf = []\n                common_names = 0\n\n                for subtree_nw in sp_trees:\n\n                    # if seedid and not use_collateral and (seedid not in subtree_nw):\n                    #    continue\n                    subtree = source_tree.__class__(subtree_nw, sp_naming_function=source_tree._speciesFunction)\n                    if not subtree.children:\n                        continue\n\n                    # only necessary if rf function is going to filter by support\n                    # value.  
It slows downs the analysis, obviously, as it has to\n                    # find the support for each node in the treeko tree from the\n                    # original one.\n                    if min_support_source > 0:\n                        subtree_content = subtree.get_cached_content(store_attr='name')\n                        for n in subtree.traverse():\n                            if n.children:\n                                n.support = source_tree.get_common_ancestor(subtree_content[n]).support\n\n                    total_rf, max_rf, ncommon, valid_ref_edges, valid_src_edges, common_edges = _compare(subtree,\n                                                                                                         ref_tree)\n\n                    all_rf.append(total_rf)\n                    all_max_rf.append(max_rf)\n                    tree_sizes.append(ncommon)\n\n                    if unrooted:\n                        ref_found_in_src = len(common_edges) / float(len(valid_ref_edges)) if valid_ref_edges else None\n                        src_found_in_ref = len(common_edges) / float(len(valid_src_edges)) if valid_src_edges else None\n                    else:\n                        # in rooted trees, we want to discount the root edge\n                        # from the percentage of congruence. 
Otherwise we will never see a 0%\n                        # congruence for totally different trees\n                        ref_found_in_src = (len(common_edges) - 1) / float(len(valid_ref_edges) - 1) if len(\n                            valid_ref_edges) > 1 else None\n                        src_found_in_ref = (len(common_edges) - 1) / float(len(valid_src_edges) - 1) if len(\n                            valid_src_edges) > 1 else None\n\n                    if ref_found_in_src is not None:\n                        ref_found.append(ref_found_in_src)\n\n                    if src_found_in_ref is not None:\n                        src_found.append(src_found_in_ref)\n\n                if all_rf:\n                    # Treeko speciation distance\n                    alld = [_safe_div(all_rf[i], float(all_max_rf[i])) for i in range(len(all_rf))]\n                    a = sum([alld[i] * tree_sizes[i] for i in range(len(all_rf))])\n                    b = float(sum(tree_sizes))\n                    treeko_d = a / b if a else 0.0\n                    result[\"treeko_dist\"] = treeko_d\n\n                    result[\"rf\"] = np.mean(all_rf)\n                    result[\"max_rf\"] = max(all_max_rf)\n                    result[\"effective_tree_size\"] = np.mean(tree_sizes)\n                    result[\"norm_rf\"] = np.mean(\n                        [_safe_div(all_rf[i], float(all_max_rf[i])) for i in range(len(all_rf))])\n\n                    result[\"ref_edges_in_source\"] = np.mean(ref_found)\n                    result[\"source_edges_in_ref\"] = np.mean(src_found)\n\n                    result[\"source_subtrees\"] = len(all_rf)\n                    result[\"common_edges\"] = set()\n                    result[\"source_edges\"] = set()\n                    result[\"ref_edges\"] = set()\n        else:\n            total_rf, max_rf, ncommon, valid_ref_edges, valid_src_edges, common_edges = _compare(source_tree, ref_tree)\n\n            result[\"rf\"] = float(total_rf) if max_rf 
else \"NA\"\n            result[\"max_rf\"] = float(max_rf)\n            if unrooted:\n                result[\"ref_edges_in_source\"] = len(common_edges) / float(\n                    len(valid_ref_edges)) if valid_ref_edges else \"NA\"\n                result[\"source_edges_in_ref\"] = len(common_edges) / float(\n                    len(valid_src_edges)) if valid_src_edges else \"NA\"\n            else:\n                # in rooted trees, we want to discount the root edge from the\n                # percentage of congruence. Otherwise we will never see a 0%\n                # congruence for totally different trees\n                result[\"ref_edges_in_source\"] = (len(common_edges) - 1) / float(len(valid_ref_edges) - 1) if len(\n                    valid_ref_edges) > 1 else \"NA\"\n                result[\"source_edges_in_ref\"] = (len(common_edges) - 1) / float(len(valid_src_edges) - 1) if len(\n                    valid_src_edges) > 1 else \"NA\"\n\n            result[\"effective_tree_size\"] = ncommon\n            result[\"norm_rf\"] = total_rf / float(max_rf) if max_rf else \"NA\"\n            result[\"treeko_dist\"] = \"NA\"\n            result[\"source_subtrees\"] = 1\n            result[\"common_edges\"] = common_edges\n            result[\"source_edges\"] = valid_src_edges\n            result[\"ref_edges\"] = valid_ref_edges\n        return result\n\n    def _diff(self, t2, output='topology', attr_t1='name', attr_t2='name', color=True):\n        \"\"\"\n        .. 
versionadded:: 2.3\n\n        Show or return the difference between two tree topologies.\n\n        :param [raw|table|topology|diffs|diffs_tab] output: Output type\n\n        \"\"\"\n        from ..tools import ete_diff\n        difftable = ete_diff.treediff(self, t2, attr1=attr_t1, attr2=attr_t2)\n        if output == \"topology\":\n            ete_diff.show_difftable_topo(difftable, attr_t1, attr_t2, usecolor=color)\n        elif output == \"diffs\":\n            ete_diff.show_difftable(difftable)\n        elif output == \"diffs_tab\":\n            ete_diff.show_difftable_tab(difftable)\n        elif output == 'table':\n            rf, rf_max, _, _, _, _, _ = self.robinson_foulds(t2, attr_t1=attr_t1, attr_t2=attr_t2)[:2]\n            ete_diff.show_difftable_summary(difftable, rf, rf_max)\n        else:\n            return difftable\n\n    def iter_edges(self, cached_content=None):\n        '''\n        .. versionadded:: 2.3\n\n        Iterate over the list of edges of a tree. Each edge is represented as a\n        tuple of two elements, each containing the list of nodes separated by\n        the edge.\n        '''\n\n        if not cached_content:\n            cached_content = self.get_cached_content()\n        all_leaves = cached_content[self]\n        for n, side1 in cached_content.items():\n            yield (side1, all_leaves - side1)\n\n    def get_edges(self, cached_content=None):\n        '''\n        .. versionadded:: 2.3\n\n        Returns the list of edges of a tree. Each edge is represented as a\n        tuple of two elements, each containing the list of nodes separated by\n        the edge.\n        '''\n\n        return [edge for edge in self.iter_edges(cached_content)]\n\n    def standardize(self, delete_orphan=True, preserve_branch_length=True):\n        \"\"\"\n        .. 
versionadded:: 2.3\n\n        process current tree structure to produce a standardized topology: nodes\n        with only one child are removed and multifurcations are automatically resolved.\n\n\n        \"\"\"\n        self.resolve_polytomy()\n\n        for n in self.get_descendants():\n            if len(n.children) == 1:\n                n.delete(prevent_nondicotomic=True,\n                         preserve_branch_length=preserve_branch_length)\n\n    # def get_topology_id(self, attr=\"name\"):\n    #     '''\n    #     .. versionadded:: 2.3\n    #\n    #     Returns the unique ID representing the topology of the current tree. Two\n    #     trees with the same topology will produce the same id. If trees are\n    #     unrooted, make sure that the root node is not binary or use the\n    #     tree.unroot() function before generating the topology id.\n    #\n    #     This is useful to detect the number of unique topologies over a bunch of\n    #     trees, without requiring full distance methods.\n    #\n    #     The id is, by default, calculated based on the terminal node's names. Any\n    #     other node attribute could be used instead.\n    #\n    #\n    #     '''\n    #     edge_keys = []\n    #     for s1, s2 in self.get_edges():\n    #         k1 = sorted([getattr(e, attr) for e in s1])\n    #         k2 = sorted([getattr(e, attr) for e in s2])\n    #         edge_keys.append(sorted([k1, k2]))\n    #     return md5(str(sorted(edge_keys)).encode('utf-8')).hexdigest()\n\n    # def get_partitions(self):\n    #     \"\"\"\n    #     .. versionadded: 2.1\n\n    #     It returns the set of all possible partitions under a\n    #     node. 
Note that current implementation is quite inefficient
    #     when used in very large trees.

    #     t = Tree("((a, b), e);")
    #     partitions = t.get_partitions()

    #     # Will return:
    #     # a,b,e
    #     # a,e
    #     # b,e
    #     # a,b
    #     # e
    #     # b
    #     # a
    #     """
    #     all_leaves = frozenset(self.get_leaf_names())
    #     all_partitions = set([all_leaves])
    #     for n in self.iter_descendants():
    #         p1 = frozenset(n.get_leaf_names())
    #         p2 = frozenset(all_leaves - p1)
    #         all_partitions.add(p1)
    #         all_partitions.add(p2)
    #     return all_partitions

    def convert_to_ultrametric(self, tree_length=None, strategy='balanced'):
        """
        .. versionadded: 2.1

        Converts a tree into ultrametric topology (all leaves must have
        the same distance to root). Note that, for visual inspection
        of ultrametric trees, node.img_style["size"] should be set to
        0.
        """

        # Could something like this replace the old algorithm?
        # most_distant_leaf, tree_length = self.get_farthest_leaf()
        # for leaf in self:
        #    d = leaf.get_distance(self)
        #    leaf.dist += (tree_length - d)
        # return

        # pre-calculate how many splits remain under each node
        node2max_depth = {}
        for node in self.traverse("postorder"):
            if not node.is_leaf():
                max_depth = max([node2max_depth[c] for c in node.children]) + 1
                node2max_depth[node] = max_depth
            else:
                node2max_depth[node] = 1
        # node2dist accumulates distance-from-root as nodes are visited.
        node2dist = {self: 0.0}
        # NOTE(review): "not tree_length" also treats an explicit 0 as unset;
        # the unpacked most_distant_leaf is unused (only the length is needed).
        if not tree_length:
            most_distant_leaf, tree_length = self.get_farthest_leaf()
        else:
            tree_length = float(tree_length)

        step = tree_length / node2max_depth[self]
        for node in self.iter_descendants("levelorder"):
            if strategy == "balanced":
                # Spread the remaining length evenly over the remaining splits.
                node.dist = (tree_length - node2dist[node.up]) / node2max_depth[node]
                node2dist[node] = node.dist + node2dist[node.up]
            elif strategy == "fixed":
                # Every internal branch gets the same fixed step; leaves absorb
                # whatever length remains to reach tree_length.
                if not node.is_leaf():
                    node.dist = step
                else:
                    node.dist = tree_length - ((node2dist[node.up]) * step)
                # In "fixed" mode node2dist counts levels, not distance.
                node2dist[node] = node2dist[node.up] + 1
            # NOTE(review): no-op assignment retained from the original code.
            node.dist = node.dist

    def check_monophyly(self, values, target_attr, ignore_missing=False,
                        unrooted=False):
        """
        .. versionadded: 2.2

        Returns True if a given target attribute is monophyletic under
        this node for the provided set of values.

        If not all values are represented in the current tree
        structure, a ValueError exception will be raised to warn that
        strict monophyly could never be reached (this behaviour can be
        avoided by enabling the `ignore_missing` flag).

        :param values: a set of values for which monophyly is
            expected.

        :param target_attr: node attribute being used to check
            monophyly (i.e. species for species trees, names for gene
            family trees, or any custom feature present in the tree).

        :param False ignore_missing: Avoid raising an Exception when
            missing attributes are found.


        .. 
versionchanged: 2.3

        :param False unrooted: If True, tree will be treated as unrooted, thus
          allowing to find monophyly even when current outgroup is splitting a
          monophyletic group.

        :returns: the following tuple
                  IsMonophyletic (boolean),
                  clade type ('monophyletic', 'paraphyletic' or 'polyphyletic'),
                  leaves breaking the monophyly (set)

        """

        if type(values) != set:
            values = set(values)

        # This is the only time I traverse the tree, then I use cached
        # leaf content
        n2leaves = self.get_cached_content()

        # Raise an error if requested attribute values are not even present
        if ignore_missing:
            found_values = set([getattr(n, target_attr) for n in n2leaves[self]])
            # NOTE(review): missing_values is computed but never used.
            missing_values = values - found_values
            values = values & found_values

        # Locate leaves matching requested attribute values
        targets = set([leaf for leaf in n2leaves[self]
                       if getattr(leaf, target_attr) in values])
        if not ignore_missing:
            if values - set([getattr(leaf, target_attr) for leaf in targets]):
                raise ValueError(
                    'The monophyly of the provided values could never be reached, as not all of them exist in the tree.'
                    ' Please check your target attribute and values, or set the ignore_missing flag to True')

        if unrooted:
            # Find the smallest edge side that still contains every target;
            # if its size equals the target set, the group is monophyletic
            # under some rooting.
            smallest = None
            for side1, side2 in self.iter_edges(cached_content=n2leaves):
                if targets.issubset(side1) and (not smallest or len(side1) < len(smallest)):
                    smallest = side1
                elif targets.issubset(side2) and (not smallest or len(side2) < len(smallest)):
                    smallest = side2
                if smallest is not None and len(smallest) == len(targets):
                    break
            foreign_leaves = smallest - targets
        else:
            # Check monophyly with get_common_ancestor. Note that this
            # step does not require traversing the tree again because
            # targets are node instances instead of node names, and
            # get_common_ancestor function is smart enough to detect it
            # and avoid unnecessary traversing.
            common = self.get_common_ancestor(targets)
            observed = n2leaves[common]
            foreign_leaves = set([leaf for leaf in observed
                                  if getattr(leaf, target_attr) not in values])

        if not foreign_leaves:
            return True, "monophyletic", foreign_leaves
        else:
            # if the requested attribute is not monophyletic in this
            # node, let's differentiate between poly and paraphyly.
            poly_common = self.get_common_ancestor(foreign_leaves)
            # if the common ancestor of all foreign leaves is self
            # contained, we have a paraphyly. Otherwise, polyphyly.
            polyphyletic = [leaf for leaf in poly_common if
                            getattr(leaf, target_attr) in values]
            if polyphyletic:
                return False, "polyphyletic", foreign_leaves
            else:
                return False, "paraphyletic", foreign_leaves

    def get_monophyletic(self, values, target_attr):
        """
        .. versionadded:: 2.2

        Returns a list of nodes matching the provided monophyly
        criteria. For a node to be considered a match, all
        `target_attr` values within and node, and exclusively them,
        should be grouped.

        :param values: a set of values for which monophyly is
            expected.

        :param target_attr: node attribute being used to check
            monophyly (i.e. 
species for species trees, names for gene
            family trees).

        """

        if type(values) != set:
            values = set(values)

        # Cache the set of target_attr values under every node.
        n2values = self.get_cached_content(store_attr=target_attr)

        # A node "matches" when the values under it are exactly the requested
        # set; iter_leaves with is_leaf_fn stops descending at matches, so the
        # largest matching clades are yielded.
        is_monophyletic = lambda node: n2values[node] == values
        for match in self.iter_leaves(is_leaf_fn=is_monophyletic):
            if is_monophyletic(match):
                yield match

    def expand_polytomies(self, map_attr="name", polytomy_size_limit=5,
                          skip_large_polytomies=False):
        '''
        .. versionadded:: 2.3

        Given a tree with one or more polytomies, this function returns the
        list of all trees (in newick format) resulting from the combination of
        all possible solutions of the multifurcated nodes.

        .. warning:

           Please note that the number of possible binary trees grows
           exponentially with the number and size of polytomies. Using this
           function with large multifurcations is not feasible:

           polytomy size: 3 number of binary trees: 3
           polytomy size: 4 number of binary trees: 15
           polytomy size: 5 number of binary trees: 105
           polytomy size: 6 number of binary trees: 945
           polytomy size: 7 number of binary trees: 10395
           polytomy size: 8 number of binary trees: 135135
           polytomy size: 9 number of binary trees: 2027025

        http://ajmonline.org/2010/darwin.php
        '''

        # Marker type: a subtree that must not be re-expanded when new leaves
        # are inserted (used for oversized polytomies kept as-is).
        class TipTuple(tuple):
            pass

        def add_leaf(tree, label):
            # Yield every tree obtainable by attaching `label` at each
            # possible position of `tree` (root first, then recursively).
            yield (label, tree)
            if not isinstance(tree, TipTuple) and isinstance(tree, tuple):
                for left in add_leaf(tree[0], label):
                    yield (left, tree[1])
                for right in add_leaf(tree[1], label):
                    yield (tree[0], right)

        def enum_unordered(labels):
            # Enumerate all unordered binary trees over `labels`.
            if len(labels) == 1:
                yield labels[0]
            else:
                for tree in enum_unordered(labels[1:]):
                    for new_tree in add_leaf(tree, labels[0]):
                        yield new_tree

        # Bottom-up: each node collects the cartesian product of its
        # children's solutions, expanded into all binary arrangements.
        n2subtrees = {}
        for n in self.traverse("postorder"):
            if n.is_leaf():
                subtrees = [getattr(n, map_attr)]
            else:
                subtrees = []
                if len(n.children) > polytomy_size_limit:
                    if skip_large_polytomies:
                        for childtrees in itertools.product(*[n2subtrees[ch] for ch in n.children]):
                            subtrees.append(TipTuple(childtrees))
                    else:
                        raise TreeError("Found polytomy larger than current limit: %s" % n)
                else:
                    for childtrees in itertools.product(*[n2subtrees[ch] for ch in n.children]):
                        subtrees.extend([TipTuple(subtree) for subtree in enum_unordered(childtrees)])

            n2subtrees[n] = subtrees
        return ["%s;" % str(nw) for nw in n2subtrees[self]]  # tuples are in newick format ^_^

    def resolve_polytomy(self, default_dist=0.0, default_support=0.0,
                         recursive=True):
        """
        .. versionadded: 2.2

        Resolve all polytomies under current node by creating an
        arbitrary dicotomic structure among the affected nodes. This
        function randomly modifies current tree topology and should
        only be used for compatibility reasons (i.e. programs
        rejecting multifurcated node in the newick representation).

        :param 0.0 default_dist: artificial branch distance of new
            nodes.

        :param 0.0 default_support: artificial branch support of new
            nodes.

        :param True recursive: Resolve any polytomy under this
             node. 
When False, only current node will be checked and fixed.
        """

        def _resolve(node):
            # Turn one multifurcated node into a ladder (caterpillar) of
            # binary nodes carrying the original children.
            if len(node.children) > 2:
                children = list(node.children)
                node.children = []
                next_node = root = node
                # Create len(children)-2 intermediate binary nodes.
                for i in range(len(children) - 2):
                    next_node = next_node.add_child()
                    next_node.dist = default_dist
                    next_node.support = default_support

                # Re-attach original children along the ladder; descend one
                # level after each attachment except at the last pair.
                next_node = root
                for ch in children:
                    next_node.add_child(ch)
                    if ch != children[-2]:
                        next_node = next_node.children[0]

        target = [self]
        if recursive:
            target.extend([n for n in self.get_descendants()])
        for n in target:
            _resolve(n)

    def add_face(self, face, column, position="branch-right"):
        """
        .. versionadded: 2.1

        Add a fixed face to the node.  This type of faces will be
        always attached to nodes, independently of the layout
        function.

        :argument face: a Face or inherited instance
        :argument column: An integer number starting from 0
        :argument "branch-right" position: Possible values are:
          "branch-right", "branch-top", "branch-bottom", "float",
          "aligned"
        """

        # Lazily create the face-area container on first use.
        if not hasattr(self, "_faces"):
            self._faces = _FaceAreas()

        if position not in FACE_POSITIONS:
            raise ValueError("face position not in %s" % FACE_POSITIONS)

        if isinstance(face, Face):
            getattr(self._faces, position).add_face(face, column=column)
        else:
            raise ValueError("not a Face instance")


    @staticmethod
    def from_parent_child_table(parent_child_table):
        """Converts a parent-child table into an ETE Tree instance.

        :argument parent_child_table: a list of tuples containing parent-child
           relationships. For example: [("A", "B", 0.1), ("A", "C", 0.2), ("C",
           "D", 1), ("C", "E", 1.5)]. 
Where each tuple represents: [parent, child,\n           child-parent-dist]\n\n        :returns: A new Tree instance\n\n        :example:\n\n        >>> tree = Tree.from_parent_child_table([(\"A\", \"B\", 0.1), (\"A\", \"C\", 0.2), (\"C\", \"D\", 1), (\"C\", \"E\", 1.5)])\n        >>> print tree\n\n        \"\"\"\n\n        def get_node(nodename, dist=None):\n            if nodename not in nodes_by_name:\n                nodes_by_name[nodename] = Tree(name=nodename, dist=dist)\n            node = nodes_by_name[nodename]\n            if dist is not None:\n                node.dist = dist\n            node.name = nodename\n            return nodes_by_name[nodename]\n\n        nodes_by_name = {}\n        for columns in parent_child_table:\n            if len(columns) == 3:\n                parent_name, child_name, distance = columns\n                dist = float(distance)\n            else:\n                parent_name, child_name = columns\n                dist = None\n            parent = get_node(parent_name)\n            parent.add_child(get_node(child_name, dist=dist))\n\n        root = parent.get_tree_root()\n        return root\n\n    @staticmethod\n    def from_skbio(skbio_tree, map_attributes=None):\n        \"\"\"Converts a scikit-bio TreeNode object into ETE Tree object.\n\n        :argument skbio_tree: a scikit bio TreeNode instance\n\n        :argument None map_attributes: A list of attribute nanes in the\n           scikit-bio tree that should be mapped into the ETE tree\n           instance. 
(name, id and branch length are always mapped)\n\n        :returns: A new Tree instance\n\n        :example:\n\n        >>> tree = Tree.from_skibio(skbioTree, map_attributes=[\"value\"])\n\n        \"\"\"\n        from skbio import TreeNode as skbioTreeNode\n\n        def get_ete_node(skbio_node):\n            ete_node = all_nodes.get(skbio_node, Tree())\n            if skbio_node.length is not None:\n                ete_node.dist = float(skbio_node.length)\n            ete_node.name = skbio_node.name\n            ete_node.add_features(id=skbio_node.id)\n            if map_attributes:\n                for a in map_attributes:\n                    ete_node.add_feature(a, getattr(skbio_node, a, None))\n            return ete_node\n\n        all_nodes = {}\n        if isinstance(skbio_tree, skbioTreeNode):\n            for node in skbio_tree.preorder(include_self=True):\n                all_nodes[node] = get_ete_node(node)\n                ete_node = all_nodes[node]\n                for ch in node.children:\n                    ete_ch = get_ete_node(ch)\n                    ete_node.add_child(ete_ch)\n                    all_nodes[ch] = ete_ch\n            return ete_ch.get_tree_root()\n\n    def phonehome(self):\n        from .. 
import _ph
        _ph.call()


def _translate_nodes(root, *nodes):
    # Resolve a mixed list of node names (str) and node instances into node
    # instances found under `root`. Returns a single node when one item was
    # given, otherwise a list.
    name2node = dict([[n, None] for n in nodes if type(n) is str])
    for n in root.traverse():
        if n.name in name2node:
            if name2node[n.name] is not None:
                raise TreeError("Ambiguous node name: " + str(n.name))
            else:
                name2node[n.name] = n

    if None in list(name2node.values()):
        notfound = [key for key, value in name2node.items() if value is None]
        raise ValueError("Node names not found: " + str(notfound))

    valid_nodes = []
    for n in nodes:
        if type(n) is not str:
            if type(n) is not root.__class__:
                raise TreeError("Invalid target node: " + str(n))
            else:
                valid_nodes.append(n)

    valid_nodes.extend(list(name2node.values()))
    if len(valid_nodes) == 1:
        return valid_nodes[0]
    else:
        return valid_nodes


class HsTree(TreeNode):
    # TreeNode subclass that carries a list of MassMigration events alongside
    # the topology, loading/saving them from a sidecar
    # "<newick>.mass_migrations.json" file.
    # NOTE(review): methods use `tree`/`node` instead of the conventional
    # `self` for the first argument.
#    USE_NODE_DICT = True

    def __init__(tree, *args, **kwa):
        # Newick may come positionally or as the `newick` keyword; it is used
        # to locate the sidecar mass-migrations JSON next to the newick file.
        if args:
            newick = args[0]
        else:
            try:
                newick = kwa['newick']
            except KeyError:
                newick = None

        super(HsTree, tree).__init__(*args, **kwa)
        try:
            tree.mass_migrations = kwa['mass_migrations']
        except KeyError:
            tree.mass_migrations = []
            if newick is not None:
                if os.path.exists(newick+'.mass_migrations.json'):
                    # NOTE(review): the file handle passed to json.load is
                    # never closed explicitly.
                    mass_migration_ls = json.load(open(newick+'.mass_migrations.json'))
                    for ms in mass_migration_ls:
                        tree.mass_migrations.append(MassMigration.from_dict(ms, tree))



#        if HsTree.USE_NODE_DICT:
 #           tree._create_node_dict()

#    def _create_node_dict(tree):
#        max_dist = max([tree.get_distance(l) for l in tree.get_leaves()])
#        tree.node_dict = {}
#        for node in tree.traverse():
#            node.time  = max_dist - tree.get_distance(node)
#            node.name = node.get_name()
#            tree.node_dict.update({node.name: node})

#    def get_node(tree, node_name):
#        return tree.node_dict[node_name]
    def get_time(node):
        # Age of the node: distance from the deepest leaf back to this node.
        rt = node.get_tree_root()
        max_dist = rt.get_farthest_leaf()[1]
        return max_dist - rt.get_distance(node)

    def get_name(tree):
        # Canonical name: topology-only newick of a sorted deep copy, so equal
        # subtrees always map to the same string. Trailing ';' is stripped.
        node1 = copy.deepcopy(tree)
        node1.sort_descendants()
        s = super(HsTree, node1).write(format=9)[:-1]
        #s = node1.write(format=9)[:-1]
        return s



    def write(tree, **kwa):
        """
        ete3 write method with the addition
        of writing mass migrations into
        separate json.

        NOTE(review): unlike ete3's write(), this returns a tuple, which
        callers must unpack (see plot(), which uses write(...)[0]).

        :return: (ete3 write output, mass_migrations)

        """

        output = super().write(**kwa)
        mass_migration_ls = []
        for mm in tree.mass_migrations:
            md = mm.to_dict()
            mass_migration_ls.append(md)

        # Only persist the sidecar JSON when writing to a file.
        try:
            mm_filename = kwa['outfile'] + '.mass_migrations.json'
            with open(mm_filename, 'w') as f:
                json.dump(mass_migration_ls, f)
        except KeyError:
            pass

        return output, mass_migration_ls


    def add_mass_migration(tree, source, destination, fraction, time):
        # Register a MassMigration event unless an equal one already exists.
#        if HsTree.USE_NODE_DICT:
#            source = tree.node_dict[source_name]
#            destination = tree.node_dict[destination_name]
#        else:
#            for node in tree.traverse():
#                if node.get_name() == source_name:
#                    source = node
#                if node.get_name() == destination_name:
#                    destintion = node
        mm = MassMigration(source, destination, fraction, time)
        if mm not in tree.mass_migrations:
            tree.mass_migrations.append(mm)

    def 
add_property_to_nodes(tree, property_name, property_node_dict):\n        \"\"\"\n        Adds the attribute property_name to nodes with newick\n        given as property_node_dict keys and property values as\n        dictionary values.\n\n        TODO: Update for case if node_dict is available.\n\n        Example:\n        print tree\n\n                 /-A1\n              /-|\n           /-|   \\-A2\n          |  |\n        --|   \\-B\n          |\n           \\-C\n\n        property='ne'\n        property_node_dict={'(A2,A1);':4, 'C;':1}\n\n        This adds the property node.ne to the nodes:\n           /-A1\n        --|\n           \\-A2\n        --C\n        \"\"\"\n        dic = {}\n        for k,v in property_node_dict.iteritems():\n            dic[newick_to_node_name(k)] = v\n\n        for node in tree.traverse():\n            node_name = node.get_name()\n            try:\n                setattr(node, property_name, dic[node_name])\n            except KeyError:\n                pass\n\n\n    def add_properties_to_nodes(tree, properties, properties_node_dict):\n        \"\"\"\n        Adds the attributes in the list properties to nodes with newick\n        given as property_node_dict keys and a dictionary of\n        {property:value} as dictionary values.\n\n        Example:\n        print tree\n\n                 /-A1\n              /-|\n           /-|   \\-A2\n          |  |\n        --|   \\-B\n          |\n           \\-C\n\n        properties=['ne', 'color']\n        property_node_dict={'(A2,A1);': {'ne':4, 'color': 'black'},\n                             'C;': {'color': 'green'}}\n\n        \"\"\"\n        dic = {}\n        for k, v in properties_node_dict.iteritems():\n            dic[newick_to_node_name(k)] = v\n\n\n        for node in tree.traverse():\n            node_name = node.get_name()\n            for prop in properties:\n                try:\n                    setattr(node, prop, dic[node_name][prop])\n                except KeyError:\n   
                 pass\n\n    def get_nodes_at_time(tree, time):\n        nodes = []\n        for n in tree.traverse():\n            ancestors = n.get_ancestors()\n            if ancestors:\n                if ancestors[0].get_time() > time > n.get_time():\n                    nodes.append(n)\n        return nodes\n\n\n\n\n    def plot(tree, ax=None, style='orthogonal',\n                      orientation='left_to_right',\n                      ax_pos=None,\n                      origin=0,\n                      use_distances=True,\n                     leaves_to_present=False,\n                      internal_node_to_present=False,\n                  node_name_fun=None,\n                  node_name_format_fun=None,\n                  leaf_name_fun=None,\n                  leaf_name_format_fun=None,\n                  line_format_fun=None,\n                  migration_arrow_format_fun=None,\n                  tick_label_format_fun=None,\n                    debug=False):\n        \"\"\"\n        Plot ete tree.\n        \"\"\"\n\n        default_node_format_args = dict(xycoords='data', ha='center',\n                                    xytext=(0,1),\n                                    textcoords='offset points',\n                                    va='bottom',\n                                    bbox=dict(boxstyle=\"round,pad=0.05\", fc=\"w\", alpha=0.5, lw=0),\n                                        size=11)\n        default_leaf_format_args = {'textcoords':'offset points'}\n                                    #, 'fontname':'monospace'\n        default_line_format_args = {'color':'k'}\n        default_migration_arrow_format_args = dict(arrowstyle=\"->, head_length = 0.5, head_width = .5\",\n                                                  color='r', linestyle='solid',linewidth=2,\n                                                    zorder=-1)\n\n        if not use_distances:\n            tree = HsTree(tree.write(format=9)[0])\n\n\n        if 
internal_node_to_present:\n            leaf_order = [n.get_name() for n in tree.traverse('preorder')]\n        else:\n            leaf_order = tree.get_leaf_names()\n\n        if ax is None:\n            fig = plt.figure(figsize=(12, len(leaf_order)*0.3))\n            ax = plt.gca()\n\n\n\n\n\n        assert style in ['orthogonal', 'diagonal']\n        assert orientation in ['left_to_right', 'bottom_to_top',\n                               'top_to_bottom', 'right_to_left']\n\n\n        if orientation == 'left_to_right':\n            specific_default_leaf_format = {'va':'center',\n                                            'ha':'left'}\n            edgeline = ax.hlines\n            orthogonalline = ax.vlines\n            sign =  1\n            order = 1\n            ax_pos1 = 'bottom'\n\n\n        elif orientation == 'bottom_to_top':\n            specific_default_leaf_format = {'va': 'bottom',\n                                            'ha': 'center',\n                                            'rotation': 90}\n            edgeline = ax.vlines\n            orthogonalline = ax.hlines\n            sign = 1\n            order = -1\n            ax_pos1 = 'right'\n\n\n        elif orientation == 'top_to_bottom':\n            specific_default_leaf_format = {'va':'top',\n                                        'ha': 'center',\n                                            'rotation': 90}\n            edgeline = ax.vlines\n            orthogonalline = ax.hlines\n            sign = -1\n            order = -1\n            ax_pos1 = 'right'\n\n\n        elif orientation == 'right_to_left':\n            specific_default_leaf_format = {'va': 'center',\n                                            'ha': 'right'}\n            edgeline = ax.hlines\n            orthogonalline = ax.vlines\n            sign = -1\n            order = 1\n            ax_pos1 = 'bottom'\n        else:\n            raise ValueError(\"Orientation must be left_to_right,\"\n                             
\"bottom_to_top, top_to_bottom or right_to_left.\")\n\n\n        if ax_pos is None:\n            ax_pos = ax_pos1\n\n        default_leaf_format_args.update(specific_default_leaf_format)\n\n        default_leaf_format_args.update({'xytext': (sign * 5, 0)[::order]})\n\n\n\n\n        # don't plot node names if no function given\n        if node_name_fun is None:\n            node_name_fun = lambda node: False\n        if node_name_format_fun is None:\n            node_name_format_fun = lambda node: {}\n        # plot leaf.name as leaf name by default\n        if leaf_name_fun is None:\n            leaf_name_fun = lambda node: node.name\n        if leaf_name_format_fun is None:\n            leaf_name_format_fun = lambda node: {}\n        if line_format_fun is None:\n            line_format_fun = lambda node: {}\n        if migration_arrow_format_fun is None:\n            migration_arrow_format_fun = lambda node: {}\n        if tick_label_format_fun is None:\n            tick_label_format_fun = lambda x, p: format(-sign * int(x), ',')\n\n\n        max_depth = tree.get_farthest_leaf()[1]\n\n\n        max_label_width = 0\n        leaf_annots = []\n\n        for i, node in enumerate(tree.traverse('postorder')):\n            time =  node.get_time()\n            if node.is_leaf():\n                if leaves_to_present:\n                    time = 0\n\n                node.y = origin - leaf_order.index(node.name)\n\n                leaf_name = leaf_name_fun(node)\n                if leaf_name:\n                    leaf_format_args = copy.deepcopy(default_leaf_format_args)\n                    leaf_format_args.update(leaf_name_format_fun(node))\n                    x = ax.annotate(leaf_name, xy=(-sign * time , sign* node.y)[::order],\n                                xycoords='data', **leaf_format_args)\n                    leaf_annots.append(x)\n\n\n            else:\n                l = node.children[0]\n                r = node.children[1]\n\n                if not 
internal_node_to_present:\n                    node.y = (l.y+r.y)/2.\n                else:\n                    node.y = origin - leaf_order.index(node.get_name())\n\n\n                #print(node.y)\n\n                for c in (l,r):\n                    line_format_args = copy.deepcopy(default_line_format_args)\n                    line_format_args.update(line_format_fun(c))\n                    if style == 'orthogonal':\n                        if c.is_leaf() and leaves_to_present:\n                            ctime = 0\n                        else:\n                            ctime = c.get_time()\n\n                        edgeline(sign * (c.y), -sign * time, -sign * ctime, **line_format_args)\n                        orthogonalline(-sign * time, *sorted([sign * c.y , sign * node.y]), **line_format_args)\n\n                    elif style == 'diagonal':\n                        ax.plot([-time,-c.get_time()],[node.y, c.y])\n\n\n                    if not c.is_leaf():\n                        node_name = node_name_fun(c)\n                        if node_name:\n                            node_format_args = copy.deepcopy(default_node_format_args)\n                            node_format_args.update(node_name_format_fun(c))\n                            ax.annotate(node_name, xy=((-time-c.get_time())/2., c.y),\n                                         **node_format_args)\n\n\n\n\n        for mm in tree.mass_migrations:\n            #print \"plotting migration one\", mm.time, mm.source.get_name(), mm.destination.get_name()\n            #ax.plot([-mm.time, -mm.time],sorted([mm.source.y, mm.destination.y]), color='r')\n            #ax.arrow(-mm.time, mm.destination.y, 0 , mm.source.y - mm.destination.y,\n            #                     length_includes_head=True, color='r', linestyle='dashed')\n            migration_arrow_format_args = copy.deepcopy(default_migration_arrow_format_args)\n            migration_arrow_format_args.update(migration_arrow_format_fun(c))\n  
          ax.annotate(\"\",xytext=(-sign * mm.time, sign * mm.destination.y)[::order],\n                        xy=(-sign * mm.time, sign * mm.source.y)[::order],\n                         arrowprops=migration_arrow_format_args)\n\n            ax.annotate(\"{}%\".format(int(round(mm.fraction*100))),\n                        xy=(-sign * mm.time, sign * (mm.destination.y + mm.source.y)/2.)[::order],\n                         xytext=(-sign * 5,0)[::order],#ha='right',va='center',\n                        bbox=dict(boxstyle=\"round,pad=0.1\", fc=\"w\", alpha=0.5, lw=0),\n                        textcoords='offset points', color='r')\n\n\n\n        ax.spines['left'].set_visible(False)\n        ax.spines['right'].set_visible(False)\n        ax.spines['top'].set_visible(False)\n        ax.spines['bottom'].set_visible(False)\n\n\n\n\n        if ax_pos:\n            ax.spines[ax_pos].set_visible(True)\n        else:\n            if not debug:\n                ax.set_yticks([])\n                ax.set_xticks([])\n                plt.tick_params(\n                    #axis='x',  # changes apply to the x-axis\n                    which='both',  # both major and minor ticks are affected\n                    left=False,  # ticks along the bottom edge are off\n                    right=False,  # ticks along the top edge are off\n                    bottom=False,\n                    top=False,\n                    labelbottom=False)  # labels along the bottom edge are off\n\n\n        ymin, ymax = ax.get_ylim()\n        xmin, xmax = ax.get_xlim()\n        \n\n\n        if ax_pos in ['top','bottom']:\n            ax.set_ylim([ymin - (ymax - ymin) * 0.05, ymax + (ymax - ymin) * 0.01])\n            #ax.xaxis.tick_bottom()\n            if not debug:\n                ax.set_yticks([])\n            ax.get_xaxis().set_major_formatter(\n                        mpl.ticker.FuncFormatter(tick_label_format_fun))\n\n        elif ax_pos in ['left', 'right']:\n\n            ax.set_xlim([ymin - 
(ymax - ymin) * 0.05, ymax + (ymax - ymin) * 0.01])\n            if not debug:\n                ax.yaxis.tick_right()\n                ax.set_xticks([])\n            ax.get_yaxis().set_major_formatter(\n                        mpl.ticker.FuncFormatter(tick_label_format_fun))\n\n            \n        fig = plt.gcf()\n        renderer = fig.canvas.get_renderer()\n        inv = ax.transData.inverted()\n\n        labeledges = []\n\n        for x in leaf_annots:\n            bb = x.get_window_extent(renderer=renderer)\n            ((x0, y0), (x1, y1)) = inv.transform(bb)\n            labeledges.append(((x0, y0), (x1, y1)) )\n\n\n        labeledges = np.array(labeledges)\n        #print(np.sort(labeledges[:, int((sign + 1) / 2), int((order - 1) / -2)])[::-order])\n\n\n        edgecoord = np.sort(labeledges[:, int((sign + 1) / 2), int((order - 1) / -2)])[::-sign][0]\n        #print(edgecoord)\n\n        if internal_node_to_present:\n            for node in tree.traverse():\n                if not node.is_leaf():\n                    edgeline(sign * node.y, -sign * node.get_time(),\n                         edgecoord,\n                         linestyle='dotted')#dotted\n\n             \n        #edgecoord = edgecoord * 1.1\n\n\n        \n        # if orientation == 'left_to_right':\n        #     ec = max(xmax, edgecoord ) + (xmax-xmin)*0.05\n        #     #ax.set_xlim(xmin, ec)\n        #     ax.annotate('Y',xy=(edgecoord, 0),\n        #                ha='left')\n        #     plt.scatter(edgecoord, 0)\n        #     print(labeledges,edgecoord)\n        # elif orientation == 'right_to_left':\n        #     ec = min(xmin, edgecoord) - (xmax-xmin)*0.05\n        #     ax.set_xlim(ec,xmax)\n        # elif orientation == 'top_to_bottom':\n        #     ax.set_ylim(min(ymin, edgecoord), ymax)\n        # elif orientation == 'bottom_to_top':\n        #     ax.set_ylim(ymin, max(ymax, edgecoord ))\n\n        \n\n        #ymin, ymax = ax.get_ylim()\n        
#ax.set_ylim([ymin-(ymax-ymin)*0.05,ymax+(ymax-ymin)*0.01])\n        #ax.xaxis.tick_bottom()\n        #ax.get_xaxis().set_major_formatter(\n        #        mpl.ticker.FuncFormatter(xtick_label_format_fun))\n        #ax.set_xlim([1,10])\n\n\n        \n        \n        return ax, leaf_annots\n\n\n\n    def search_node_by_newick(tree, newick):\n        for n in tree.traverse():\n            if n.get_name() == newick:\n                return n\n        raise Exception('Node not found.')\n\n    def set_leaf_order(tree, order, check_consistent=True):\n        \"\"\"\n        Changes the tree so that the leaves\n        conform to order.\n        The order must be consistent with\n        the branching structure.\n\n        Parameters:\n        tree ... ete3 tree object\n        order ... list of leaf names\n\n        Returns:\n        None (tree changed in place)\n        \"\"\"\n        order = list(order)\n        for i, node in enumerate(tree.traverse('postorder')):\n            if not node.is_leaf():\n                l = node.children[0]\n                r = node.children[1]\n                lnames = l.get_leaf_names()\n                rnames = r.get_leaf_names()\n                if order.index(lnames[0]) > order.index(rnames[0]):\n                    node.swap_children()\n        if check_consistent:\n            if tree.get_leaf_names() != order:\n                raise Exception(\"The provided order is not conistent with tree: \\nResulting order: {}\\nInput order:{}\".format(\n                    tree.get_leaf_names(), order))\n\n\n    def set_outgroup(tree, outgroup, end_at_present=True):\n        \"\"\"\n        Set root of tree.\n\n        :param outgroup: name of the outgroup leaf\n        :param end_at_present: root at a point of the branch so that outgroup ends at time 0.\n        :return: None\n        \"\"\"\n        super().set_outgroup(outgroup)\n        if end_at_present:\n            outgroup_node = tree.search_nodes(name=outgroup)[0]\n            
ingroup_root = [n for n in tree.get_children() if n is not outgroup_node][0]\n            time = outgroup_node.get_time()\n            outgroup_node.dist = outgroup_node.dist + time / 2\n            ingroup_root.dist = ingroup_root.dist - time / 2\n            assert ingroup_root.dist > 0, \\\n                    \"Outgroup branch too short to lead to present. Use end_at_present=False.\"\n\n    def reverse(tree):\n        tree.set_leaf_order(tree, tree.get_leaf_names()[::-1])\n\n\n\n\ndef align_fbranch_with_tree(fbranch, tree, outgroup, ladderize=False):\n    tree_no = copy.deepcopy(tree)\n    # tree_no.ladderize()\n    # remove outgroup\n    tree_no.prune([n for n in tree_no.get_leaf_names() if n != outgroup])\n    #\n    if ladderize:\n        tree_no.ladderize()\n    \n    fb = fbranch.copy()\n    fb.index = fb.index.droplevel(0)\n    fb = fb.drop(outgroup, axis=1)\n    fb.index = [tuple(sorted(i.split(','))) for i in fb.index]\n    row_order = []\n    col_order = tree_no.get_leaf_names()\n    for n in tree_no.iter_descendants(strategy='preorder'):\n        row_order.append(tuple(sorted(n.get_leaf_names())))\n\n    assert fb.shape == (len(row_order), len(col_order)), \\\n        \"{} != ({},{}))\".format(fb.shape, len(row_order), len(col_order))\n\n    assert set(fb.index.values) == set(row_order), \\\n        \"Samples in input tree and fbranch matrix not consistent: {} != {}\".format(fb.index.values, row_order)\n\n    assert set(fb.columns.values) == set(col_order), \\\n        \"Samples in input tree and fbranch matrix not consistent: {} != {}\".format(fb.columns.values, col_order)\n\n    # order fb in a way that is conistent with the tree\n    fb = fb.loc[row_order, col_order].iloc[::-1]  # ,::-1\n\n    return fb, tree_no\n\n\ndef plot_fbranch(fbranch, tree_no_outgroup, leaves_to_present=True,\n                 use_distances=False,\n                 debug=False, tree_label_size=14, max_color_cutoff=None):\n    #print(\"1706\")\n\n    if max_color_cutoff is 
None:\n        max_color_cutoff = fbranch.max().max()\n    \n    n_rows, n_cols = fbranch.shape\n\n    plt.rcParams['font.size'] = tree_label_size\n\n    # depth = tree_sd2_1_ete.get_farthest_leaf(topology_only=True)[1] +2\n\n    # visited = []\n\n    #fig = plt.figure(figsize=(18,20))\n\n    fig = plt.figure(figsize=(n_cols, n_rows * 0.5))\n\n    toptree_ax = plt.subplot2grid((5, 7), (0, 2), rowspan=1, colspan=4)\n\n    \n    tree_no_outgroup.plot(origin=-0.5, ax=toptree_ax,\n                          orientation='top_to_bottom',\n                          use_distances=use_distances,\n                          leaves_to_present=leaves_to_present)\n    \n    if not debug:\n        toptree_ax.axis('off')\n\n    lefttree_ax = plt.subplot2grid((5, 7), (1, 0), rowspan=4, colspan=2)\n\n    \n    tree_no_outgroup.plot(origin=fbranch.shape[0] + 0.5, ax=lefttree_ax, orientation='left_to_right',\n                          use_distances=use_distances,\n                                leaves_to_present=leaves_to_present,\n                                    internal_node_to_present=True)\n    if not debug:\n        lefttree_ax.axis('off')\n    # ax = plot_node_tree(tree_no_outgroup, ax=ax, x0=0,y0=len(branch_mat)-0.5,em=0.5,fontsize=12)\n\n    # ax.set_xlabel('excess allele sharing with')\n    # toptree_ax.xaxis.set_label_position('top')\n\n    # ax.set_xlim([-1,10])\n\n    fbranch_ax = plt.subplot2grid((5, 7), (1, 2), rowspan=4, colspan=4, sharey=lefttree_ax, sharex=toptree_ax)\n\n    \n    branch_mat0 = fbranch.copy()\n    branch_mat0_masked = np.ma.array(branch_mat0, mask=np.isnan(branch_mat0))\n\n    fmax = branch_mat0.max().max()\n    fmin = branch_mat0.min().min()\n    colors = np.concatenate([[[1, 1, 1, 1]], plt.cm.Reds(np.linspace(0., 1 * fmax / max_color_cutoff, 256))])\n    mymap = mpl.colors.LinearSegmentedColormap.from_list('my_colormap', colors)\n\n    plt.pcolormesh(branch_mat0_masked, cmap=mymap, rasterized=True)  # ,cmap=jet\n\n    # 
plt.scatter(zs.index.droplevel(0).values+0.5,zs.index.droplevel(1)+0.43,marker='*',s=10)\n\n    fbranch_ax.set_xticks(np.arange(0.5, fbranch.shape[1]))\n    # fbranch_ax.set_xticklabels(fbranch.columns,rotation=90)\n    fbranch_ax.set_xticklabels([])\n    fbranch_ax.set_yticks(np.arange(0.5, fbranch.shape[0]))\n    fbranch_ax.set_yticklabels([])\n    fbranch_ax.set_facecolor((0.85, 0.85, 0.85))\n    #plt.tight_layout()\n    fbranch_ax.set_ylim(0, len(fbranch))\n    #plt.subplots_adjust(wspace=-0.1)\n    #plt.subplots_adjust(hspace=0.05)\n    fbranch_ax.xaxis.tick_top()\n    fbranch_ax.set_xlim([0, fbranch.shape[1]])\n\n    for b in range(fbranch.shape[0]):\n        l = fbranch_ax.axhline(y=b + 0.04, xmin=0, xmax=1, linewidth=1, color='grey', alpha=0.5)\n\n    for b in range(fbranch.shape[1]):\n        l = fbranch_ax.axvline(x=b - 0.02, ymin=0, ymax=1, linewidth=1, color='grey', alpha=0.5)\n\n    plt.tight_layout()\n\n    cb_ax = fig.add_axes([fbranch_ax.get_position().xmax + 0.02, fbranch_ax.get_position().ymin,\n                           0.03, fbranch_ax.get_position().ymax - fbranch_ax.get_position().ymin])\n\n    # plt.subplot2grid((5,7), (1,6), rowspan=4,colspan=1)\n    mappable = mpl.cm.ScalarMappable(cmap=mymap)\n    mappable.set_array([fmin, fmax])\n    cbar = plt.colorbar(mappable, cax=cb_ax, label='$f_b$')\n    cbar.set_label('$f_b$', size=20*(1+np.log(n_cols/8.)))\n    cbar.solids.set_rasterized(True)\n    cbar.ax.tick_params(labelsize=14*(1+np.log(n_cols/8.)))\n\n    #plt.axis('tight')\n\n    #plt.subplots_adjust(wspace=0.05)\n\n    #plt.tight_layout()\n    \n    return toptree_ax, lefttree_ax, fbranch_ax, cb_ax\n\ndef main():\n    argparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,\n                                        description=\"Plot f-branch statistic as produced by Dsuite. 
\"\n                                                    \"Produces .png and .svg files.\",\n                                                                                 add_help=True)\n    argparser.add_argument(\"fbranch\", type=argparse.FileType('r'),\n                            help=\"Path to file containing f-branch matrix as produced by Dsuite Fbranch.\")\n    argparser.add_argument(\"tree\", type = argparse.FileType('r'),\n                            help=\"Path to .newick tree file as given to Dsuite Fbranch.\")\n\n    argparser.add_argument(\"-n\", \"--run-name\", type=str,\n                            help=\"Base file name for output plots.\",default=\"fbranch\")\n    argparser.add_argument( \"--outgroup\", type=str,\n                        help=\"Outgroup name in newick file.\",default=\"Outgroup\")\n    argparser.add_argument(\"--use_distances\",\n                           help=\"Use actual node distances from newick file when plotting tree.\",\n                           action='store_true')\n    argparser.add_argument(\"--ladderize\",\n                           help=\"Ladderize the input tree before plotting.\",\n                           action='store_true')\n    argparser.add_argument(\"--color-cutoff\",\n                           help=\"Set the darkest red to this f_branch value.\",\n                           type=float)\n    argparser.add_argument(\"--tree-label-size\",\n                           help=\"Set the font size of the tree leaf names.\",\n                           type=float, default=14)\n\n    argparser.add_argument(\"--dpi\",\n                           help=\"Set the dpi for the output .png.\",\n                           type=float, default=150)\n\n    args = argparser.parse_args()\n    fb = pd.read_csv(args.fbranch, sep='\\t',\n                  index_col=[0,1])\n    #this fixes some bug where f_b sometimes is negative still\n    fb[fb<0] = 0\n\n    print(\"Reading tree...\")\n\n    tree = HsTree(args.tree.read())\n    
fb1, tree_no_outgroup = align_fbranch_with_tree(fb, tree,\n                                                     outgroup=args.outgroup,\n                                                     ladderize=args.ladderize)\n    if args.use_distances:\n        leaves_to_present = False\n    else:\n        leaves_to_present = True\n    \n    print(\"Plotting fbranch...\")\n    \n    plot_fbranch(fb1, tree_no_outgroup, use_distances=args.use_distances,\n                 leaves_to_present=leaves_to_present, tree_label_size=args.tree_label_size, max_color_cutoff=args.color_cutoff)\n    \n    print(\"Saving plots...\")\n    \n    plt.savefig(args.run_name+'.svg', bbox_inches='tight')\n    \n    if fb.shape[1] > 100:\n        print(\"png output not supported for more than 100 populations. Only .svg output.\")\n    else:\n        plt.savefig(args.run_name+'.png', bbox_inches='tight', dpi=args.dpi)\n\n    return 0 \n\nif __name__ == \"__main__\":\n    sys.exit(main())\n"
  },
  {
    "path": "utils/setup.py",
    "content": "from setuptools import setup                                                                                                                                                              \n\nsetup(\n    name='dtools',\n    version='0.1',\n    py_modules=['dtools'],\n   description='A python module for plotting fbranch',\n   author='Hannes Svardal',\n   author_email='svardallab@gmail.com',\n   scripts=['dtools.py'],\n    install_requires=[\n        'matplotlib>=3.0.2',\n       'pandas>=0.23.4'],\n    platforms=[\n    'linux-x86_64',\n    'macosx-10.10-x86_64'\n    ],\n    include_package_data=True,\n    zip_safe=False\n)\n"
  }
]